Spaces:

ShahzainHaider
/

OCR

Build error

App Files Files Community

ShahzainHaider committed on Jun 6, 2023

Commit

7bbae49

1 Parent(s): b74a4db

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.env +2 -0
.gitattributes +8 -0
.gitignore +2 -0
README.md +1 -7
__pycache__/gradio.cpython-38.pyc +0 -0
app/.gitignore +4 -0
app/__pycache__/app.cpython-38.pyc +0 -0
app/app.py +64 -0
app/constants/__pycache__/paths.cpython-38.pyc +0 -0
app/constants/paths.py +2 -0
app/constants/regex_expressions.py +1 -0
app/custome.py +0 -0
app/extract_country/__pycache__/country_dictionary.cpython-38.pyc +0 -0
app/extract_country/__pycache__/country_extract.cpython-38.pyc +0 -0
app/extract_country/__pycache__/country_validator.cpython-38.pyc +0 -0
app/extract_country/country_dictionary.py +44 -0
app/extract_country/country_extract.py +23 -0
app/extract_country/country_validator.py +15 -0
app/extract_dates/__pycache__/date_engine.cpython-38.pyc +0 -0
app/extract_dates/__pycache__/validate_date.cpython-38.pyc +0 -0
app/extract_dates/date_engine.py +34 -0
app/extract_dates/validate_date.py +21 -0
app/extract_gender/__pycache__/gender_extractor.cpython-38.pyc +0 -0
app/extract_gender/gender_extractor.py +26 -0
app/extract_identity_number/__pycache__/doc_number_extractor.cpython-38.pyc +0 -0
app/extract_identity_number/__pycache__/identity_number.cpython-38.pyc +0 -0
app/extract_identity_number/doc_number_extractor.py +13 -0
app/extract_identity_number/identity_number.py +2 -0
app/extract_mrz/__pycache__/mrz_detect.cpython-38.pyc +0 -0
app/extract_mrz/__pycache__/mrz_engine.cpython-38.pyc +0 -0
app/extract_mrz/mrz_detect.py +34 -0
app/extract_mrz/mrz_engine.py +72 -0
app/images/idcards/1.png +0 -0
app/images/idcards/BlacksSharpen.jpg +0 -0
app/images/idcards/EO3kzEEUcAEeNbf.jpg +0 -0
app/images/idcards/Genunie-ID-Card-Online.jpg +0 -0
app/images/idcards/IMG_20221130_180809.jpg +3 -0
app/images/idcards/IMG_20230210_171555.jpg +0 -0
app/images/idcards/IMG_20230210_171610.jpg +0 -0
app/images/idcards/ShahzainCNIC.jpg +3 -0
app/images/idcards/_13.jpg +0 -0
app/images/idcards/aadhaar backside image .jpg +3 -0
app/images/idcards/aadhaar frontside image.jpg +3 -0
app/images/idcards/cnic.jpg +0 -0
app/images/idcards/d.jpg +3 -0
app/images/idcards/driving licence backside image.jpg +3 -0
app/images/idcards/driving licence frontside image.jpg +3 -0
app/images/idcards/e4d62a4127719bb07ed88275c3802bf907f026978d1de2a531c1e7967e60bea9.webp +0 -0
app/images/idcards/front.jpg +3 -0
app/images/idcards/image.jpg +0 -0

.env ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ api_key = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJnb29nbGUtb2F1dGgyfDExNjY1NDE1MzQ0MDY1NjEzNTI5MSIsImVtYWlsIjoic2hhaHphaW5oYWlkZXJuYXF2aUBnbWFpbC5jb20iLCJlbWFpbF92ZXJpZmllZCI6dHJ1ZSwiaWF0IjoxNjg1MDAxMjgzMDExfQ.Fb5ODO7KUchlLnrK0KBvSR4pkfIAfYiECRVWXj44RTQ'
2	+ queue_id = 'c0f9e6f8-73d0-42f9-bd4f-700bdf002c04'

.gitattributes CHANGED Viewed

@@ -32,3 +32,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+app/images/idcards/IMG_20221130_180809.jpg filter=lfs diff=lfs merge=lfs -text
+app/images/idcards/ShahzainCNIC.jpg filter=lfs diff=lfs merge=lfs -text
+app/images/idcards/aadhaar[[:space:]]backside[[:space:]]image .jpg filter=lfs diff=lfs merge=lfs -text
+app/images/idcards/aadhaar[[:space:]]frontside[[:space:]]image.jpg filter=lfs diff=lfs merge=lfs -text
+app/images/idcards/d.jpg filter=lfs diff=lfs merge=lfs -text
+app/images/idcards/driving[[:space:]]licence[[:space:]]backside[[:space:]]image.jpg filter=lfs diff=lfs merge=lfs -text
+app/images/idcards/driving[[:space:]]licence[[:space:]]frontside[[:space:]]image.jpg filter=lfs diff=lfs merge=lfs -text
+app/images/idcards/front.jpg filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__/app.cpython-38.pyc
2	+ *pyc

README.md CHANGED Viewed

@@ -1,12 +1,6 @@
 ---
 title: OCR
-emoji: 🌖
-colorFrom: yellow
-colorTo: pink
 sdk: gradio
 sdk_version: 3.33.1
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: OCR
+app_file: deploy.py
 sdk: gradio
 sdk_version: 3.33.1
 ---

__pycache__/gradio.cpython-38.pyc ADDED Viewed

Binary file (441 Bytes). View file

app/.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+images/*
+output/*
+*.pyc
+.vscode

app/__pycache__/app.cpython-38.pyc ADDED Viewed

Binary file (2.05 kB). View file

app/app.py ADDED Viewed

	@@ -0,0 +1,64 @@

+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi import FastAPI, File, UploadFile
+from app.extract_country.country_dictionary import find_country
+from app.extract_dates.date_engine import date_extractor
+from app.extract_gender.gender_extractor import gender_extract
+from app.extract_identity_number.doc_number_extractor import doc_number
+from app.ocr_engine.ocr import OCR
+from paddleocr import PaddleOCR
+from app.layoutLM_api.api import custom_ocr
app = FastAPI()
# NOTE(review): `origins` is never referenced in the visible code and no CORS
# middleware is registered on `app`, so this list currently has no effect.
origins = ["*"]

# Location the uploaded image is written to before the OCR back ends read it.
_UPLOAD_PATH = 'app/images/idcards/input.jpg'

# (field, log message, fallback extractor) -- applied in this order to every
# field the primary OCR left empty. Each extractor takes the list of OCR text
# lines and returns a (found, value, ocr_list) tuple.
_FALLBACK_EXTRACTORS = (
    ("gender", "Gender Missing", gender_extract),
    ("dob", "Dob Missing", date_extractor),
    ("country", "Country Missing", find_country),
    ("document_number", "document Number missing", doc_number),
)

# PaddleOCR engine shared by all requests; created on first use.
_paddle_engine = None


def _get_paddle_engine():
    """Return the shared PaddleOCR engine, building it on the first call."""
    global _paddle_engine
    if _paddle_engine is None:
        # Building the engine downloads/loads the model, so do it only once.
        _paddle_engine = PaddleOCR(use_angle_cls=True, lang='en')
    return _paddle_engine


@app.post('/extract_info')
async def ocr(Id_card: UploadFile = File(...)):
    """Extract identity fields from an uploaded ID-card image.

    The upload is saved to a fixed path and passed to ``custom_ocr``. If any
    value of the returned dictionary is an empty string, the image is also
    run through PaddleOCR and each empty field among ``gender``, ``dob``,
    ``country`` and ``document_number`` is filled from the fallback
    extractors (or set to ``None`` when the extractor finds nothing).

    Args:
        Id_card: The uploaded image file.

    Returns:
        ``{"Status": 200, "OCR": <field dictionary>}``.
    """
    with open(_UPLOAD_PATH, "wb") as file_object:
        file_object.write(Id_card.file.read())

    dictionary = custom_ocr(_UPLOAD_PATH)
    if '' in dictionary.values():
        print("Missing value found in Dic")
        result = _get_paddle_engine().ocr(_UPLOAD_PATH, cls=True)
        # Skip empty pages so an image without detectable text yields [].
        extract_text = [
            line[1][0]
            for res in (result or [])
            if res
            for line in res
        ]
        print("extract_text", extract_text)
        for field, message, extractor in _FALLBACK_EXTRACTORS:
            if len(dictionary[field]) == 0:
                print(message)
                # The same list is passed to every extractor on purpose:
                # date_extractor removes the dates it finds from it in place.
                found, value, _ = extractor(extract_text)
                dictionary[field] = value if found else None
    print("Updated Dict ", dictionary)
    return {"Status": 200, "OCR": dictionary}

app/constants/__pycache__/paths.cpython-38.pyc ADDED Viewed

Binary file (251 Bytes). View file

app/constants/paths.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ IMAGE_PATH = 'app/images/idcards/front.jpg'
2	+ EXTRACTED_MRZ_PATH = "output/mrz_image.jpg"

app/constants/regex_expressions.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ DATA_PATTERN = r"\b\d{2}\.\d{2}\.\d{4}\b"

app/custome.py ADDED Viewed

File without changes

app/extract_country/__pycache__/country_dictionary.cpython-38.pyc ADDED Viewed

Binary file (1.61 kB). View file

app/extract_country/__pycache__/country_extract.cpython-38.pyc ADDED Viewed

Binary file (868 Bytes). View file

app/extract_country/__pycache__/country_validator.cpython-38.pyc ADDED Viewed

Binary file (563 Bytes). View file

app/extract_country/country_dictionary.py ADDED Viewed

	@@ -0,0 +1,44 @@

# Maps the country name reported to callers to the text fragments that
# identify it in OCR output. Matching is case-insensitive substring search.
country_data = {
    'PAKISTAN': ['PAK', 'PAKISTAN'],
    'SWITZERLAND': ['SCHWEIZERISCHE EIDGENOSSENSCHAFT', 'CONFEDERATION SUISSE', 'CONFEDERAZIONESVIZZERA', 'CONFEDERAZIUN SVIZRA', 'SWISS CONFEDERATION'],
    'INDIA': ['INDIA', 'Government of India'],
    'GERMANY': ['Bundesrepublik Deutschland', 'Germany', 'Westdeutschland', 'Ostdeutschland'],
    'BANGLADESH': ['Bangladesh'],
    'UNITED KINGDOM': ['British Citizen', 'UNITED KINGDOM'],
    'NETHERLANDS': ['NETHERLANDS', 'NEDERLANDSE', 'NEDERLANDEN'],
    'CANADA': ['canada'],
    'UNITED ARAB EMIRATES': ['Arab Emirates', 'UAE'],
    'OMAN': ['OMAN'],
    'JORDAN': ['jordan'],
    'BAHRAIN': ['BAHRAIN'],
    'KUWAIT': ['KUWAIT'],
    'QATAR': ['Qatar'],
    'LIBYA': ['AFRiN MAHALLi MECLiSi'],
    'SOUTH SUDAN': ['Akon'],
    'CHILLE': ['DECHILE'],
    'COLOMBIA': ['COLOMBIA'],
    'BRAZIL': ['BRAZIL', 'BRASIL'],
    'PERU': ['DELPERU', 'CASADO'],
    'URUGUAY': ['DELURUGUAY'],
    'Coasta Rica': ['COSTARICA'],
    'PARAGUAY': ['PARAGUAY'],
    'ECUADOR': ['ECUADOR'],
    'GUATEMALA': ['GUATEMALA'],
    'Bolivia': ['Bolivariano', 'Bolivia'],
    'El Salvador': ['Salvador'],
    'Dominican Republic': ['REPUBLICADOMINICANA'],
}


def find_country(ocr_list):
    """Find the first country whose alias appears in the OCR text.

    Entries of ``ocr_list`` are scanned in order; for each entry the aliases
    in ``country_data`` are tried in dictionary order, and the first alias
    contained (case-insensitively) in the entry wins. Entries that are not
    strings are skipped.

    Args:
        ocr_list: Iterable of text lines produced by OCR.

    Returns:
        ``(True, country_key, ocr_list)`` on a match, otherwise
        ``(False, None, ocr_list)``. ``ocr_list`` is returned unmodified.
    """
    try:
        # Upper-case every alias once per call instead of once per comparison.
        aliases = [
            (alias.upper(), country)
            for country, names in country_data.items()
            for alias in names
        ]
        for text in ocr_list:
            if not isinstance(text, str):
                continue
            normalized = text.upper()
            # NOTE(review): plain substring search, so short aliases such as
            # 'OMAN' or 'PAK' also match inside longer unrelated words.
            for alias, country in aliases:
                if alias in normalized:
                    return True, country, ocr_list
        return False, None, ocr_list
    except TypeError as e:
        # Raised when ocr_list is not iterable (e.g. None).
        print(f"An error occurred: {e}")
        return False, None, ocr_list

app/extract_country/country_extract.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from extract_country.country_validator import extract_country_name
def country_extractor(ocr_list):
    """Look for a country name among the individual words of the OCR text.

    All entries of ``ocr_list`` are joined with spaces, split on whitespace,
    and each word is passed to ``extract_country_name`` until one is
    recognised.

    NOTE(review): because words are checked one at a time, multi-word names
    cannot be passed to the validator as a whole -- confirm this is intended.

    Args:
        ocr_list: List of text lines produced by OCR.

    Returns:
        ``(country_found, country, ocr_list)``; ``country`` is ``None`` when
        nothing is recognised or when the input cannot be processed.
    """
    # Initialised before the try block so the error path can always return them.
    country_found = False
    country = None
    try:
        for word in ' '.join(ocr_list).split():
            matched, country_name = extract_country_name(word)
            if matched:
                country_found, country = True, country_name
                break
    except (TypeError, AttributeError) as e:
        # Raised when ocr_list is not an iterable of strings.
        print("[Exception in country_extractor ] ", str(e))
    return country_found, country, ocr_list

app/extract_country/country_validator.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import pycountry
+import re
def extract_country_name(string):
    """Return the name of the first country recognised inside ``string``.

    Every run of ASCII letters and spaces in ``string`` is handed to
    ``pycountry.countries.lookup``; the first run that resolves wins.

    NOTE(review): presumably the lookup also accepts short country codes, in
    which case ordinary short words could resolve to a country -- verify
    against the pycountry documentation.

    Args:
        string: Text to search.

    Returns:
        ``(True, country_name)`` on success, otherwise ``(False, None)``.
    """
    candidates = re.findall(r"\b[A-Za-z ]+\b", string)
    for candidate in candidates:
        try:
            found = pycountry.countries.lookup(candidate)
        except LookupError:
            # Not a known country; try the next candidate.
            continue
        print("COUNTRY : ", found.name)
        return True, found.name
    return False, None

app/extract_dates/__pycache__/date_engine.cpython-38.pyc ADDED Viewed

Binary file (805 Bytes). View file

app/extract_dates/__pycache__/validate_date.cpython-38.pyc ADDED Viewed

Binary file (880 Bytes). View file

app/extract_dates/date_engine.py ADDED Viewed

	@@ -0,0 +1,34 @@

+from app.extract_dates.validate_date import find_smallest_date, validate_date
def date_extractor(ocr_list):
    """Pick the date of birth out of the OCR text.

    Every entry accepted by ``validate_date`` is treated as a date. All such
    entries are removed from ``ocr_list`` **in place**, and the earliest one
    (as determined by ``find_smallest_date``) is reported as the date of
    birth.

    Args:
        ocr_list: Mutable list of text lines produced by OCR.

    Returns:
        ``(dob_found, dob, ocr_list)``. ``dob`` is the value returned by
        ``find_smallest_date`` when a date is found, otherwise ``''``.
    """
    dob_found = False
    dob = ''
    try:
        # Partition in a single pass. Popping from the list while enumerating
        # it would skip the entry that follows every removed date.
        dates_list = []
        remaining = []
        for word in ocr_list:
            date_valid, _ = validate_date(word)
            (dates_list if date_valid else remaining).append(word)
        # Slice assignment keeps the caller's list object, minus the dates.
        ocr_list[:] = remaining

        if dates_list:
            dob = find_smallest_date(dates_list)  # smallest date will be DOB
            dob_found = True
            print("DATE OF BIRTH : ", dob)
        else:
            print("Date not found")
    except Exception as e:
        # Best effort: validation or parsing problems must not propagate.
        print("[Exception in date_extractor] : ", str(e))
        dob_found, dob = False, ''
    return dob_found, dob, ocr_list

app/extract_dates/validate_date.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import re
+from dateutil import parser
def find_smallest_date(dates):
    """Return the earliest of ``dates`` formatted as ``DD-MM-YYYY``.

    Strings without a ``/`` (the ``-`` and ``.`` separated forms) are parsed
    day-first so that the day and month are not swapped in the ``%d-%m-%Y``
    output; strings containing ``/`` keep the parser's default month-first
    reading.

    Args:
        dates: Non-empty iterable of date strings.

    Returns:
        The earliest date as a ``'%d-%m-%Y'`` string.

    Raises:
        ValueError: If ``dates`` is empty or an entry cannot be parsed.
    """
    parsed_dates = [
        parser.parse(date, dayfirst='/' not in date) for date in dates
    ]
    smallest_date = min(parsed_dates).strftime('%d-%m-%Y')
    print(smallest_date)
    return smallest_date
def validate_date(date_string):
    """Check whether ``date_string`` looks like a supported numeric date.

    Supported shapes are ``NN/NN/NNNN``, ``NN-NN-NNNN`` and ``NN.NN.NNNN``.
    Only the shape is checked, not whether the date exists.

    Args:
        date_string: Candidate text; anything that is not a ``str`` is
            rejected.

    Returns:
        ``(True, pattern)`` with the regular expression that matched, or
        ``(False, pattern)`` with the last pattern tried (the second element
        carries no meaning on failure and is kept for compatibility).
    """
    patterns = [
        r'^\d{2}/\d{2}/\d{4}$',    # MM/DD/YYYY
        r'^\d{2}-\d{2}-\d{4}$',    # DD-MM-YYYY
        # The dots are escaped: a bare '.' would accept any separator.
        r'^\d{2}\.\d{2}\.\d{4}$',  # DD.MM.YYYY
    ]
    if isinstance(date_string, str):
        for pattern in patterns:
            if re.match(pattern, date_string):
                return True, pattern
    return False, patterns[-1]

app/extract_gender/__pycache__/gender_extractor.cpython-38.pyc ADDED Viewed

Binary file (810 Bytes). View file

app/extract_gender/gender_extractor.py ADDED Viewed

	@@ -0,0 +1,26 @@

# Tokens that are returned as-is when a whole word equals one of them.
gender_labels = ["M","MALE", "F", "FEMALE", "V/F","N/F","FEMENINO","MASCULINA"]

# (substring, reported gender) checked in this order when a word is not an
# exact label. "FEMALE" must precede "MALE" because it contains it.
_GENDER_SUBSTRINGS = (
    ("FEMALE", "F"),
    ("MALE", "M"),
    ("FEMENINO", "F"),
    ("MASCULINA", "M"),
)


def gender_extract(ocr_list):
    """Find a gender marker in the OCR text.

    Each entry is split on whitespace and upper-cased word by word. A word
    that equals one of ``gender_labels`` is returned unchanged; otherwise a
    word containing one of the known substrings yields ``'F'`` or ``'M'``.
    Entries that are not strings are skipped.

    Args:
        ocr_list: Iterable of text lines produced by OCR.

    Returns:
        ``(True, gender, ocr_list)`` for the first marker found, otherwise
        ``(False, None, ocr_list)``.
    """
    if not ocr_list:
        return False, None, ocr_list
    try:
        for words in ocr_list:
            if not isinstance(words, str):
                continue
            for word in words.split():
                word = word.upper()
                if word in gender_labels:
                    return True, word, ocr_list
                for fragment, gender in _GENDER_SUBSTRINGS:
                    if fragment in word:
                        return True, gender, ocr_list
        return False, None, ocr_list
    except TypeError as e:
        # Raised when ocr_list is not iterable.
        print("[Exception in gender_extract] ", str(e))
        return False, None, ocr_list

app/extract_identity_number/__pycache__/doc_number_extractor.cpython-38.pyc ADDED Viewed

Binary file (625 Bytes). View file

app/extract_identity_number/__pycache__/identity_number.cpython-38.pyc ADDED Viewed

Binary file (300 Bytes). View file

app/extract_identity_number/doc_number_extractor.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import re
def doc_number(ocr_list):
    """Pick the first OCR entry that looks like a document number.

    An entry qualifies when it is at least six characters long and either
    starts with a numeric character or starts with a letter followed by a
    numeric character. Entries that are not strings are skipped.

    NOTE(review): numeric dates such as ``12.05.1990`` also satisfy this
    rule -- confirm callers remove dates from the list first.

    Args:
        ocr_list: Iterable of text lines produced by OCR.

    Returns:
        ``(True, item, ocr_list)`` for the first qualifying entry, otherwise
        ``(False, None, ocr_list)``.
    """
    if not ocr_list:
        return False, None, ocr_list
    try:
        for item in ocr_list:
            if not isinstance(item, str) or len(item) < 6:
                continue
            starts_numeric = item[0].isnumeric()
            letter_then_numeric = item[0].isalpha() and item[1].isnumeric()
            if starts_numeric or letter_then_numeric:
                print("DOCUMENT NUMBRE IS : ", item)
                return True, item, ocr_list
        return False, None, ocr_list
    except TypeError as e:
        # Raised when ocr_list is not iterable.
        print("Exception in doc_number : " ,str(e))
        return False, None, ocr_list

app/extract_identity_number/identity_number.py ADDED Viewed

	@@ -0,0 +1,2 @@


def identity_card_extractor(ocr_list):
    """Placeholder for identity-card number extraction (not implemented).

    Args:
        ocr_list: List of text lines produced by OCR (currently unused).

    Returns:
        ``None``.
    """
    pass

app/extract_mrz/__pycache__/mrz_detect.cpython-38.pyc ADDED Viewed

Binary file (849 Bytes). View file

app/extract_mrz/__pycache__/mrz_engine.cpython-38.pyc ADDED Viewed

Binary file (1.59 kB). View file

app/extract_mrz/mrz_detect.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import cv2
+from readmrz import MrzDetector
+from constants.paths import EXTRACTED_MRZ_PATH
def MRZ_detector(img_path, show=False):
    """Locate the machine-readable zone in an image and save it as grayscale.

    The image is read and cropped with ``MrzDetector``, the crop is converted
    to grayscale and, when the crop passes the size check below, written to
    ``EXTRACTED_MRZ_PATH``.

    Args:
        img_path: Path of the image to analyse.
        show: When true, display the cropped area in a window and wait for a
            key press. Off by default, because the blocking window cannot be
            used when running without a display.

    Returns:
        ``(mrz_found, gray)`` where ``gray`` is the grayscale crop, or ``[]``
        when the crop could not be produced.
    """
    # Initialised before the try block so the error path can always return them.
    mrz_found = False
    gray = []
    try:
        detector = MrzDetector()
        image = detector.read(img_path)
        cropped = detector.crop_area(image)
        gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
        if show:
            cv2.imshow('Image', cropped)
            cv2.waitKey(0)
            cv2.destroyAllWindows()
        # NOTE(review): the meaning of 13 is not visible here; presumably it
        # is the size of the crop returned when no MRZ is detected -- confirm.
        if len(cropped) != 13:
            mrz_found = True
            cv2.imwrite(EXTRACTED_MRZ_PATH, gray)
            print("MRZ FOUND")
    except Exception as e:
        # Best effort: detector/OpenCV failures are reported, not raised.
        print("[Exception in MRZ_detector] : ", str(e))
    return mrz_found, gray

app/extract_mrz/mrz_engine.py ADDED Viewed

	@@ -0,0 +1,72 @@

+from mrz.checker.td1 import TD1CodeChecker
+from mrz.checker.td2 import TD2CodeChecker
+from mrz.checker.td3 import TD3CodeChecker
def mrz_engine(mrz_list, lenghtOfChars):
    """Validate MRZ lines with the checker selected by total character count.

    A count of 90 uses ``TD1CodeChecker`` on the first three lines, 72 uses
    ``TD2CodeChecker`` and 88 uses ``TD3CodeChecker`` on the first two lines;
    the lines are joined with newlines. Any other count is treated as a
    failed check.

    Args:
        mrz_list: Sequence of MRZ text lines.
        lenghtOfChars: Total number of MRZ characters.

    Returns:
        The result of ``check.fields()`` when the checker evaluates as true,
        otherwise an empty dict (also on any exception).
    """
    user_data = {}
    try:
        if lenghtOfChars == 90:
            mrz_text = "{}\n{}\n{}".format(mrz_list[0], mrz_list[1], mrz_list[2])
            check = TD1CodeChecker(mrz_text)
        elif lenghtOfChars == 72:
            mrz_text = "{}\n{}".format(mrz_list[0], mrz_list[1])
            check = TD2CodeChecker(mrz_text)
        elif lenghtOfChars == 88:
            mrz_text = "{}\n{}".format(mrz_list[0], mrz_list[1])
            check = TD3CodeChecker(mrz_text)
        else:
            # Unsupported length: falls through to the failure branch.
            check = False

        if check:
            user_data = check.fields()
        else:
            print("FAILED")
    except Exception as e:
        print("[Exception in mrz_engine] : ", str(e))
    return user_data
def mrz_corrector(capital_strings, lenghtOfCharaters):
    """Pad short MRZ lines with ``<`` so they reach a standard line width.

    * Three lines whose total is not 90 are padded to 30 characters each.
    * Two lines whose total is neither 72 nor 88 are padded to 36 characters
      each when the longest line fits in 36, otherwise to 44.
    * Anything else is returned unchanged.

    Lines longer than the target width are left as they are, so the returned
    total then still differs from the standard size. ``capital_strings`` is
    updated in place.

    Args:
        capital_strings: Mutable list of MRZ text lines.
        lenghtOfCharaters: Total number of characters across the lines.

    Returns:
        ``(total_length, capital_strings)`` where ``total_length`` is the
        summed length of the (possibly padded) lines, or ``(0, [])`` when the
        input cannot be processed.
    """
    try:
        lines = len(capital_strings)
        if lines == 2 and lenghtOfCharaters not in (72, 88):
            # Two-line formats are 36 or 44 characters wide (72 / 88 total);
            # choose the narrowest width that holds the longest line.
            longest = max(len(line) for line in capital_strings)
            target = 36 if longest <= 36 else 44
        elif lines == 3 and lenghtOfCharaters != 90:
            target = 30
        else:
            return lenghtOfCharaters, capital_strings

        for index, line in enumerate(capital_strings):
            # ljust never truncates, so over-long lines pass through as-is.
            capital_strings[index] = line.ljust(target, '<')
        # Sum the real lengths rather than assuming every line is equal.
        lenghtOfCharaters = sum(len(line) for line in capital_strings)
        return lenghtOfCharaters, capital_strings
    except (TypeError, AttributeError) as e:
        # Raised when capital_strings is None or holds non-string lines.
        print("[Exception in mrz_corrector] : ", str(e))
        return 0, []