diff --git a/.env b/.env new file mode 100644 index 0000000000000000000000000000000000000000..0f0d13d37036fa844a11cb0489a07f41bafebcbe --- /dev/null +++ b/.env @@ -0,0 +1,2 @@ +api_key = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJnb29nbGUtb2F1dGgyfDExNjY1NDE1MzQ0MDY1NjEzNTI5MSIsImVtYWlsIjoic2hhaHphaW5oYWlkZXJuYXF2aUBnbWFpbC5jb20iLCJlbWFpbF92ZXJpZmllZCI6dHJ1ZSwiaWF0IjoxNjg1MDAxMjgzMDExfQ.Fb5ODO7KUchlLnrK0KBvSR4pkfIAfYiECRVWXj44RTQ' +queue_id = 'c0f9e6f8-73d0-42f9-bd4f-700bdf002c04' \ No newline at end of file diff --git a/.gitattributes b/.gitattributes index c7d9f3332a950355d5a77d85000f05e6f45435ea..409be2039650102c902192e630dba5afc84b5cd7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -32,3 +32,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +app/images/idcards/IMG_20221130_180809.jpg filter=lfs diff=lfs merge=lfs -text +app/images/idcards/ShahzainCNIC.jpg filter=lfs diff=lfs merge=lfs -text +app/images/idcards/aadhaar[[:space:]]backside[[:space:]]image .jpg filter=lfs diff=lfs merge=lfs -text +app/images/idcards/aadhaar[[:space:]]frontside[[:space:]]image.jpg filter=lfs diff=lfs merge=lfs -text +app/images/idcards/d.jpg filter=lfs diff=lfs merge=lfs -text +app/images/idcards/driving[[:space:]]licence[[:space:]]backside[[:space:]]image.jpg filter=lfs diff=lfs merge=lfs -text +app/images/idcards/driving[[:space:]]licence[[:space:]]frontside[[:space:]]image.jpg filter=lfs diff=lfs merge=lfs -text +app/images/idcards/front.jpg filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..93de3958d909678c150c49292601ced1cbd17b5b --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +__pycache__/app.cpython-38.pyc +*pyc \ No newline at end of file diff --git a/README.md b/README.md index 
@app.post('/extract_info')
async def ocr(Id_card: UploadFile = File(...)):
    """Extract identity fields from an uploaded ID-card image.

    The image is first passed to ``custom_ocr``.  Any of ``gender``, ``dob``,
    ``country`` or ``document_number`` that comes back empty is then looked up
    in the text of a local OCR pass using the rule-based extractors; a field
    that still cannot be found is reported as ``None``.

    Args:
        Id_card: The uploaded image.

    Returns:
        ``{"Status": 200, "OCR": <dict of extracted fields>}``.
    """
    # Function-scope imports keep this block self-contained (stdlib only).
    import os
    import tempfile

    # One temp file per request instead of a single shared 'input.jpg':
    # concurrent uploads can no longer overwrite each other's image, and the
    # uploaded ID card is not left behind on disk.
    suffix = os.path.splitext(Id_card.filename or '')[1] or '.jpg'
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as image_file:
        image_file.write(Id_card.file.read())
        file_path = image_file.name

    # (result key, log message, fallback extractor).  Order matters:
    # date_extractor is called before doc_number on the same list.
    fallbacks = (
        ('gender', "Gender Missing", gender_extract),
        ('dob', "Dob Missing", date_extractor),
        ('country', "Country Missing", find_country),
        ('document_number', "document Number missing", doc_number),
    )

    try:
        dictionary = custom_ocr(file_path)

        if '' in list(dictionary.values()):
            print("Missing value found in Dic")

        # ``not value`` treats '', None and an absent key alike, where the
        # original ``len(dictionary[key]) == 0`` raised on the latter two.
        missing = [key for key, _, _ in fallbacks if not dictionary.get(key)]

        if missing:
            # Use the shared OCR instance imported at the top of the file
            # rather than loading a new PaddleOCR model for every request.
            result = OCR.ocr(file_path, cls=True)

            extract_text = []
            # NOTE(review): guards against an empty/None page entry in the
            # OCR result - confirm against the installed PaddleOCR version.
            for page in result or []:
                for line in page or []:
                    extract_text.append(line[1][0])
            print("extract_text", extract_text)

            for key, message, extractor in fallbacks:
                if key not in missing:
                    continue
                print(message)
                found, value, _ = extractor(extract_text)
                dictionary[key] = value if found else None
    finally:
        os.remove(file_path)

    # NOTE(review): custom_ocr and OCR.ocr are blocking calls inside an
    # ``async def`` handler; consider a plain ``def`` endpoint or a threadpool.
    print("Updated Dict ", dictionary)
    response = {"Status": 200, "OCR": dictionary}
    return response
# Canonical country name -> text fragments that identify it on a document.
# Matching is case-insensitive substring search; several aliases are written
# without spaces ('DECHILE', 'COSTARICA', ...), presumably because the OCR
# joins words - so whole-word matching is deliberately NOT used.
# NOTE(review): short aliases can hit unrelated words ('OMAN' in 'WOMAN',
# 'PAK', 'UAE'), and 'CASADO' under PERU looks like a marital status, not a
# country marker - confirm before relying on these.
country_data = {
    'PAKISTAN' : ['PAK', 'PAKISTAN'],
    'SWITZERLAND' : ['SCHWEIZERISCHE EIDGENOSSENSCHAFT', 'CONFEDERATION SUISSE', 'CONFEDERAZIONESVIZZERA', 'CONFEDERAZIUN SVIZRA', 'SWISS CONFEDERATION'],
    'INDIA' : ['INDIA', 'Government of India'],
    'GERMANY' : ['Bundesrepublik Deutschland','Germany', 'Westdeutschland' , 'Ostdeutschland'],
    'BANGLADESH' : ['Bangladesh'],
    'UNITED KINGDOM' : ['British Citizen', 'UNITED KINGDOM'],
    'NETHERLANDS' : ['NETHERLANDS', 'NEDERLANDSE', 'NEDERLANDEN'],
    'CANADA' : ['canada'],
    'UNITED ARAB EMIRATES' : ['Arab Emirates', 'UAE'],
    'OMAN' : ['OMAN'],
    'JORDAN' : ['jordan'],
    'BAHRAIN' : ['BAHRAIN'],
    'KUWAIT' : ['KUWAIT'],
    'QATAR' : ['Qatar'],
    'LIBYA' : ['AFRiN MAHALLi MECLiSi'],
    'SOUTH SUDAN' : ['Akon'],
    'CHILE' : ['DECHILE'],  # was misspelled 'CHILLE' in the returned name
    'COLOMBIA' : ['COLOMBIA'],
    'BRAZIL' : ['BRAZIL', 'BRASIL'],
    'PERU' : ['DELPERU', 'CASADO'],
    'URUGUAY' : ['DELURUGUAY'],
    'Costa Rica' : ['COSTARICA'],  # was misspelled 'Coasta Rica'
    'PARAGUAY' : ['PARAGUAY'],
    'ECUADOR' : ['ECUADOR'],
    'GUATEMALA' : ['GUATEMALA'],
    'Bolivia' : ['Bolivariano', 'Bolivia'],
    'El Salvador' : ['Salvador'],
    'Dominican Republic' : ['REPUBLICADOMINICANA']
}


def find_country(ocr_list):
    """Find the first country mentioned in a list of OCR text fragments.

    Fragments are scanned in order; for each one the entries of
    ``country_data`` are tried in dictionary order and the first alias that
    occurs (case-insensitively) as a substring wins.

    Args:
        ocr_list: Iterable of OCR'd strings.  Non-string entries are skipped.

    Returns:
        ``(found, country, ocr_list)`` - ``found`` is a bool, ``country`` is
        the matching ``country_data`` key or ``None``, and ``ocr_list`` is the
        argument, returned unchanged.
    """
    try:
        for text in ocr_list:
            # Skip junk entries instead of letting one of them abort the scan.
            if not isinstance(text, str):
                continue
            haystack = text.upper()
            for country, aliases in country_data.items():
                if any(alias.upper() in haystack for alias in aliases):
                    return True, country, ocr_list
    except TypeError as e:
        # ocr_list itself is not iterable (e.g. None): report "not found".
        print(f"An error occurred: {e}")
    # Single fall-through "not found" exit replaces the original for/else.
    return False, None, ocr_list
+ + result_string = ' '.join(string for string in ocr_list) + words = result_string.split() + + for index,word in enumerate(words): + country_found, country_name = extract_country_name(word) + if country_found: + country_found = True + country = country_name + break + + return country_found, country, ocr_list + + except Exception as e: + print("[Exception in country_extractor ] ", str(e)) + return country_found, country, ocr_list \ No newline at end of file diff --git a/app/extract_country/country_validator.py b/app/extract_country/country_validator.py new file mode 100644 index 0000000000000000000000000000000000000000..3b36b0b40b9fe72c836a78e5d67795274d46e682 --- /dev/null +++ b/app/extract_country/country_validator.py @@ -0,0 +1,15 @@ +import pycountry +import re + +def extract_country_name(string): + pattern = r"\b[A-Za-z ]+\b" # Matches any sequence of letters and spaces + matches = re.findall(pattern, string) + for match in matches: + try: + country = pycountry.countries.lookup(match) + print("COUNTRY : ",country.name) + return True, country.name + except LookupError: + pass + return False, None + diff --git a/app/extract_dates/__pycache__/date_engine.cpython-38.pyc b/app/extract_dates/__pycache__/date_engine.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6eb6c127a66886016dcb839d1ca0962603b05fb0 Binary files /dev/null and b/app/extract_dates/__pycache__/date_engine.cpython-38.pyc differ diff --git a/app/extract_dates/__pycache__/validate_date.cpython-38.pyc b/app/extract_dates/__pycache__/validate_date.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21bb23035f584cf8e0de85b71c6eba6b9acfc0ec Binary files /dev/null and b/app/extract_dates/__pycache__/validate_date.cpython-38.pyc differ diff --git a/app/extract_dates/date_engine.py b/app/extract_dates/date_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..72ca2735ce6022f199fcc19013360e1f96beabf0 --- /dev/null +++ 
import re
from datetime import datetime

# Accepted date layouts: (anchored regex, strptime formats tried in order).
# The first format of each pair is the documented reading of that layout; the
# swapped one is only a fallback for values that are impossible in the first
# reading (e.g. "25/12/2000" cannot be MM/DD).
_DATE_LAYOUTS = (
    (r'^\d{2}/\d{2}/\d{4}$', ('%m/%d/%Y', '%d/%m/%Y')),    # MM/DD/YYYY
    (r'^\d{2}-\d{2}-\d{4}$', ('%d-%m-%Y', '%m-%d-%Y')),    # DD-MM-YYYY
    (r'^\d{2}\.\d{2}\.\d{4}$', ('%d.%m.%Y', '%m.%d.%Y')),  # DD.MM.YYYY
)


def _parse_date(date_string):
    """Parse one date string into a ``datetime``; raise ``ValueError`` if impossible."""
    text = date_string.strip()
    for pattern, formats in _DATE_LAYOUTS:
        if re.match(pattern, text):
            for fmt in formats:
                try:
                    return datetime.strptime(text, fmt)
                except ValueError:
                    continue
            raise ValueError(f"not a calendar date: {date_string!r}")
    # Any other layout keeps the original behavior: defer to dateutil.
    from dateutil import parser
    return parser.parse(date_string)


def find_smallest_date(dates):
    """Return the earliest of ``dates`` formatted as ``DD-MM-YYYY``.

    Strings in one of the layouts accepted by ``validate_date`` are parsed
    with that layout's documented day/month order, so an ambiguous
    "05.03.1990" is read as 5 March rather than month-first.

    Args:
        dates: Non-empty iterable of date strings.

    Returns:
        The earliest date as a ``'%d-%m-%Y'`` string.

    Raises:
        ValueError: If ``dates`` is empty or a string is not a valid date.
    """
    smallest_date = min(_parse_date(date) for date in dates)
    return smallest_date.strftime('%d-%m-%Y')


def validate_date(date_string):
    """Check whether ``date_string`` looks like one of the accepted date layouts.

    Only the shape is checked (two digits, two digits, four digits with a
    ``/``, ``-`` or ``.`` separator), not whether it is a real calendar date.
    The ``.`` separator is matched literally, so a plain 10-digit number is
    no longer mistaken for a date.

    Returns:
        ``(True, pattern)`` with the regex that matched, or ``(False, None)``.
    """
    for pattern, _ in _DATE_LAYOUTS:
        if re.match(pattern, date_string):
            return True, pattern

    return False, None


def date_extractor(ocr_list):
    """Extract the date of birth from a list of OCR text fragments.

    Every fragment that looks like a date is removed from ``ocr_list`` (in
    place) and the earliest of them is taken to be the date of birth.

    Args:
        ocr_list: List of OCR'd strings.  Non-string entries are left alone.

    Returns:
        ``(dob_found, dob, ocr_list)`` - ``dob`` is a ``DD-MM-YYYY`` string,
        or ``''`` when no usable date was found; ``ocr_list`` is the same
        list object with the date fragments removed.
    """
    dob_found = False
    dob = ''
    try:
        dates_list = []
        remaining = []
        for word in ocr_list:
            if isinstance(word, str) and validate_date(word)[0]:
                dates_list.append(word)
            else:
                remaining.append(word)
        # Rebuild in one step: popping while enumerating skipped the element
        # right after each date, so adjacent dates were missed.  The list is
        # still updated in place because callers pass it to later extractors.
        ocr_list[:] = remaining

        if dates_list:
            dob = find_smallest_date(dates_list)  # smallest date will be DOB
            dob_found = True
            print("DATE OF BIRTH : ", dob)
        else:
            print("Date not found")

        return dob_found, dob, ocr_list

    except (TypeError, ValueError, OverflowError) as e:
        # Best effort: an unusable input or impossible date means "no DOB".
        print("[Exception in date_extractor] : ", str(e))
        return False, '', ocr_list
# Tokens that, on their own, are accepted as a gender value.
# "MASCULINO" is added next to the existing "MASCULINA" so the masculine
# counterpart of "FEMENINO" is recognised as well.
gender_labels = ["M", "MALE", "F", "FEMALE", "V/F", "N/F", "FEMENINO", "MASCULINA", "MASCULINO"]

# Fragments looked for inside longer tokens (e.g. "FEMALE."), in priority
# order - "FEMALE" must be tested before "MALE", which it contains.
_GENDER_FRAGMENTS = (
    ("FEMALE", "F"),
    ("MALE", "M"),
    ("FEMENINO", "F"),
    ("MASCULINA", "M"),
    ("MASCULINO", "M"),
)


def gender_extract(ocr_list):
    """Find the first gender marker in a list of OCR text fragments.

    Each fragment is split on whitespace and every token is upper-cased.  A
    token equal to one of ``gender_labels`` is returned as-is; otherwise a
    token containing one of the known fragments is mapped to ``'F'``/``'M'``.

    NOTE(review): exact matches return the raw token ("MALE", "FEMENINO",
    "V/F", ...) while partial matches return 'M'/'F'.  This is kept for
    backward compatibility; callers needing a uniform value must normalise.

    Args:
        ocr_list: Iterable of OCR'd strings.  Non-string entries are skipped.

    Returns:
        ``(found, gender, ocr_list)`` - ``found`` is a bool, ``gender`` the
        detected value or ``None``, and ``ocr_list`` the unchanged argument.
    """
    try:
        for words in ocr_list:
            # Skip junk entries instead of aborting the whole scan on them.
            if not isinstance(words, str):
                continue
            for word in words.split():
                word = word.upper()
                if word in gender_labels:
                    return True, word, ocr_list
                for fragment, gender in _GENDER_FRAGMENTS:
                    if fragment in word:
                        return True, gender, ocr_list
    except TypeError as e:
        # ocr_list itself is not iterable (e.g. None).
        print("[Exception in gender_extract] ", str(e))
    # The flag is always a bool now (the error path used to return None).
    return False, None, ocr_list
def doc_number(ocr_list):
    """Pick the first OCR fragment that looks like a document number.

    A fragment qualifies when it is at least six characters long and either
    starts with a numeric character, or starts with a letter immediately
    followed by a numeric character.

    Args:
        ocr_list: Iterable of OCR'd strings.  Non-string entries are skipped.

    Returns:
        ``(found, number, ocr_list)`` - ``found`` is a bool, ``number`` the
        matching fragment or ``None``, and ``ocr_list`` the unchanged argument.
    """
    try:
        for item in ocr_list:
            # Skip junk and too-short entries; a single non-string entry used
            # to raise here and hide every valid candidate after it.
            if not isinstance(item, str) or len(item) < 6:
                continue
            if item[0].isnumeric() or (item[0].isalpha() and item[1].isnumeric()):
                print("DOCUMENT NUMBRE IS : ", item)
                return True, item, ocr_list
    except TypeError as e:
        # ocr_list itself is not iterable (e.g. None).
        print("Exception in doc_number : ", str(e))
    return False, None, ocr_list
def mrz_corrector(capital_strings, lenghtOfCharaters):
    """Pad truncated MRZ lines with ``'<'`` so they reach a standard width.

    Standard machine-readable-zone layouts are 3 lines of 30 characters
    (TD1), 2 lines of 36 (TD2) and 2 lines of 44 (TD3).  When the total
    length passed in does not already match the layout implied by the number
    of lines, every line shorter than the target width is right-padded with
    the MRZ filler character.  For two lines the target is 36 when no line is
    longer than that, otherwise 44.  Lines longer than the target are left
    untouched.

    Args:
        capital_strings: List of MRZ lines; it is updated in place.
        lenghtOfCharaters: Total number of characters across the lines, as
            counted by the caller.

    Returns:
        ``(total_length, capital_strings)`` - the total length after padding
        and the same list object.  Inputs that need no correction (or have a
        line count other than 2 or 3) are returned unchanged.  On unusable
        input ``(0, [])`` is returned.
    """
    try:
        lines = len(capital_strings)

        if lines == 3 and lenghtOfCharaters != 90:
            line_length = 30
        elif lines == 2 and lenghtOfCharaters not in (72, 88):
            # The old code looked the width up in a dict keyed by 72/88 right
            # after excluding both values, so this branch always raised
            # KeyError and the whole MRZ was discarded.
            longest = max(len(line) for line in capital_strings)
            line_length = 36 if longest <= 36 else 44
        else:
            return lenghtOfCharaters, capital_strings

        for index, line in enumerate(capital_strings):
            capital_strings[index] = line.ljust(line_length, '<')

        # Report the real total rather than lines * len(last line).
        lenghtOfCharaters = sum(len(line) for line in capital_strings)
        return lenghtOfCharaters, capital_strings
    except (TypeError, AttributeError) as e:
        # None instead of a list, or a non-string line.
        print("[Exception in mrz_corrector] : ", str(e))
        lenghtOfCharaters = 0
        capital_strings = []
        return lenghtOfCharaters, capital_strings
diff --git a/app/images/idcards/IMG_20221130_180809.jpg b/app/images/idcards/IMG_20221130_180809.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6c5bc50bff712a5c02d0d8b2f111361fff836257 --- /dev/null +++ b/app/images/idcards/IMG_20221130_180809.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4eb72de3c358fd6239c9a27b7fcfb0b16c5403389677fa6d4edf3754cfc93aa6 +size 2901719 diff --git a/app/images/idcards/IMG_20230210_171555.jpg b/app/images/idcards/IMG_20230210_171555.jpg new file mode 100644 index 0000000000000000000000000000000000000000..31b0ab8683f04d3f4eee55e0b3219fbe4875f2a6 Binary files /dev/null and b/app/images/idcards/IMG_20230210_171555.jpg differ diff --git a/app/images/idcards/IMG_20230210_171610.jpg b/app/images/idcards/IMG_20230210_171610.jpg new file mode 100644 index 0000000000000000000000000000000000000000..315b3912bce79f5327ea3027e3a166f9fc08f493 Binary files /dev/null and b/app/images/idcards/IMG_20230210_171610.jpg differ diff --git a/app/images/idcards/ShahzainCNIC.jpg b/app/images/idcards/ShahzainCNIC.jpg new file mode 100644 index 0000000000000000000000000000000000000000..24715d3a4028451f86dc9592556b0ae186234c4c --- /dev/null +++ b/app/images/idcards/ShahzainCNIC.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04eaf3e2a6c46ea3551c56a99dbd2334d00d1776c3fbd27925bdd7f21d084470 +size 1668246 diff --git a/app/images/idcards/_13.jpg b/app/images/idcards/_13.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ee0ad063be8573d436daa6f143a00a7aa06d9142 Binary files /dev/null and b/app/images/idcards/_13.jpg differ diff --git "a/app/images/idcards/aadhaar backside image\302\240.jpg" "b/app/images/idcards/aadhaar backside image\302\240.jpg" new file mode 100644 index 0000000000000000000000000000000000000000..6bb9cff2d22df93791cf0a5bf5d4e92b4c733866 --- /dev/null +++ "b/app/images/idcards/aadhaar backside image\302\240.jpg" @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:0b33cf99463058edcd1474456b5c0a8e25bc002c1527fa67d972b129af10a4d4 +size 2028688 diff --git a/app/images/idcards/aadhaar frontside image.jpg b/app/images/idcards/aadhaar frontside image.jpg new file mode 100644 index 0000000000000000000000000000000000000000..69d07dec2f9ffa0fea6e474235e87ba2776cb885 --- /dev/null +++ b/app/images/idcards/aadhaar frontside image.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41905aa6b4558a2abb16a6ef709915f802d4edc3ec2969aec5a23e3dfa6a54b5 +size 1474221 diff --git a/app/images/idcards/cnic.jpg b/app/images/idcards/cnic.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9150aa442548baca9485b1f0ec9a2edabb1e7638 Binary files /dev/null and b/app/images/idcards/cnic.jpg differ diff --git a/app/images/idcards/d.jpg b/app/images/idcards/d.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a646034e35d1ef2d13486941351511e270030bfc --- /dev/null +++ b/app/images/idcards/d.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:015dac6cbf5f50177323f5f7da4be88bd1a110f2c573ead0989b1fd3995ddeb7 +size 1447958 diff --git a/app/images/idcards/driving licence backside image.jpg b/app/images/idcards/driving licence backside image.jpg new file mode 100644 index 0000000000000000000000000000000000000000..00e93e07891dc7a3524eb78f0c01348ddd45b8df --- /dev/null +++ b/app/images/idcards/driving licence backside image.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b95b081250dde945a8118865fb21045c27fd5d04f795684ddffaf98dcbf490 +size 1532900 diff --git a/app/images/idcards/driving licence frontside image.jpg b/app/images/idcards/driving licence frontside image.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3ff86dc7610d02a9fa5790223510ef690daccfd1 --- /dev/null +++ b/app/images/idcards/driving licence frontside image.jpg @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:2cc3754647dea187a11bfdfcff6c8cb84f97d860beb6a4ca40bb8872b3d8e06c +size 2011567 diff --git a/app/images/idcards/e4d62a4127719bb07ed88275c3802bf907f026978d1de2a531c1e7967e60bea9.webp b/app/images/idcards/e4d62a4127719bb07ed88275c3802bf907f026978d1de2a531c1e7967e60bea9.webp new file mode 100644 index 0000000000000000000000000000000000000000..f51396a4cff1ee3b2cdb005c6d1eee49b9c344c0 Binary files /dev/null and b/app/images/idcards/e4d62a4127719bb07ed88275c3802bf907f026978d1de2a531c1e7967e60bea9.webp differ diff --git a/app/images/idcards/front.jpg b/app/images/idcards/front.jpg new file mode 100644 index 0000000000000000000000000000000000000000..dfd8b36261045e9d4ee743b7dbbe265331f1c152 --- /dev/null +++ b/app/images/idcards/front.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40e4f511129548855a1d5b68a2aa60eb8877ab69406499fec43bc1566c6804bd +size 2186537 diff --git a/app/images/idcards/image.jpg b/app/images/idcards/image.jpg new file mode 100644 index 0000000000000000000000000000000000000000..77b92c4ee8f3072d17026e42fbdb37620fa1cf2e Binary files /dev/null and b/app/images/idcards/image.jpg differ diff --git a/app/images/idcards/image_with_text.jpg b/app/images/idcards/image_with_text.jpg new file mode 100644 index 0000000000000000000000000000000000000000..beac74464d9844c5f7587f17f8bee466c4325a61 Binary files /dev/null and b/app/images/idcards/image_with_text.jpg differ diff --git a/app/images/idcards/input.jpg b/app/images/idcards/input.jpg new file mode 100644 index 0000000000000000000000000000000000000000..358b1b73d6ddae8a4607198579a4573af1ae7c79 Binary files /dev/null and b/app/images/idcards/input.jpg differ diff --git a/app/images/idcards/m.jpg b/app/images/idcards/m.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ca81f9ba68c197723fadb03b55ec47a59528b4f5 Binary files /dev/null and b/app/images/idcards/m.jpg differ diff --git a/app/images/idcards/maxresdefault.jpg 
b/app/images/idcards/maxresdefault.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2ee9a1c0c104baa2afa59ee8837410aa6836a7da Binary files /dev/null and b/app/images/idcards/maxresdefault.jpg differ diff --git a/app/images/idcards/mm.jpg b/app/images/idcards/mm.jpg new file mode 100644 index 0000000000000000000000000000000000000000..acdf6792fd3009a2a0e1ce4286ddf5ca3607b638 Binary files /dev/null and b/app/images/idcards/mm.jpg differ diff --git a/app/images/idcards/mrz-1.webp b/app/images/idcards/mrz-1.webp new file mode 100644 index 0000000000000000000000000000000000000000..8aecc8c19254b1bc05bd7e22191701afc54123da Binary files /dev/null and b/app/images/idcards/mrz-1.webp differ diff --git a/app/images/idcards/pakistan.jpg b/app/images/idcards/pakistan.jpg new file mode 100644 index 0000000000000000000000000000000000000000..997ca82d51404a3ddb0ae755da016e8e26e6e6eb Binary files /dev/null and b/app/images/idcards/pakistan.jpg differ diff --git a/app/images/idcards/rafay Cnic.jpg b/app/images/idcards/rafay Cnic.jpg new file mode 100644 index 0000000000000000000000000000000000000000..77b92c4ee8f3072d17026e42fbdb37620fa1cf2e Binary files /dev/null and b/app/images/idcards/rafay Cnic.jpg differ diff --git a/app/images/idcards/sehrish non corp.png b/app/images/idcards/sehrish non corp.png new file mode 100644 index 0000000000000000000000000000000000000000..40534720bc2a426c3dc5fde7dddbe85e9c55fe9e Binary files /dev/null and b/app/images/idcards/sehrish non corp.png differ diff --git a/app/images/idcards/shefqet.png b/app/images/idcards/shefqet.png new file mode 100644 index 0000000000000000000000000000000000000000..37dbaeed0395146e0c96cfa47fe9fb5e740c09fd Binary files /dev/null and b/app/images/idcards/shefqet.png differ diff --git a/app/images/passports/ID_POLAND_BACK.png b/app/images/passports/ID_POLAND_BACK.png new file mode 100644 index 0000000000000000000000000000000000000000..879b10b176237dc2a7a2e8fd8e6e3e3286ef3cc4 Binary files /dev/null and 
b/app/images/passports/ID_POLAND_BACK.png differ diff --git a/app/images/passports/Sweden.png b/app/images/passports/Sweden.png new file mode 100644 index 0000000000000000000000000000000000000000..3ed64b892d712a4fabfd83e466e845d9b7fe7c6b Binary files /dev/null and b/app/images/passports/Sweden.png differ diff --git a/app/images/passports/bean.jpg b/app/images/passports/bean.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e0dbd0a8f778cf2e8fd3903025a753b786484d61 Binary files /dev/null and b/app/images/passports/bean.jpg differ diff --git a/app/images/passports/czech-passport-2006-mrz-data-KJDTYE.jpg b/app/images/passports/czech-passport-2006-mrz-data-KJDTYE.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2ec0c2db17f75eef4496e28415306cbe764a150b Binary files /dev/null and b/app/images/passports/czech-passport-2006-mrz-data-KJDTYE.jpg differ diff --git a/app/images/passports/foreignpassportrus.jpg b/app/images/passports/foreignpassportrus.jpg new file mode 100644 index 0000000000000000000000000000000000000000..37fe20051aeb6774659d9b568ad46a0aed504275 Binary files /dev/null and b/app/images/passports/foreignpassportrus.jpg differ diff --git a/app/images/passports/image_with_text.jpg b/app/images/passports/image_with_text.jpg new file mode 100644 index 0000000000000000000000000000000000000000..beac74464d9844c5f7587f17f8bee466c4325a61 Binary files /dev/null and b/app/images/passports/image_with_text.jpg differ diff --git a/app/images/passports/passport old.jpg b/app/images/passports/passport old.jpg new file mode 100644 index 0000000000000000000000000000000000000000..00f2d12e0d209024df8dd9590da11d443ed06ee2 Binary files /dev/null and b/app/images/passports/passport old.jpg differ diff --git a/app/images/passports/passport.jpg b/app/images/passports/passport.jpg new file mode 100644 index 0000000000000000000000000000000000000000..616d1e8a2e512ef5f7a41119b5ee68214fddabba Binary files /dev/null and 
def custom_ocr(image_path):
    """Send an image to the hosted extraction model and map its form fields.

    The document at ``image_path`` is submitted through ``Client`` using the
    module-level ``api_key`` and ``queue_id``, and the ``formFields`` of the
    response are copied into a flat dictionary.

    Args:
        image_path: Path of the image file to submit.

    Returns:
        A dict with the keys ``name``, ``dob``, ``country``, ``gender`` and
        ``document_number``.  A value is ``''`` when the response did not
        supply it; ``gender`` is never filled in by this function.
    """
    extracted_field = {
        'name': '',
        'dob': '',
        'country': '',
        'gender': '',
        'document_number': '',
    }

    name_fields = ('Last Name', 'First Name', 'Middle Name')
    country_fields = ('State', 'County', 'Place of Birth')

    # Response is a strongly typed object
    response = Client(api_key).extract_document(queue_id, image_path)
    dictionary = response.to_dict()
    print("dictionary : ", dictionary)

    # A response without 'formFields' is treated as "nothing extracted"
    # instead of raising KeyError.
    for field in dictionary.get('formFields') or []:
        field_name = field.get('fieldName')
        # Normalise a missing/None value to '' so callers can rely on every
        # entry being a string (the endpoint calls len() on them).
        value = field.get('value') or ''

        # NOTE(review): only the first non-empty name part in response order
        # is kept, as before - confirm whether a full name is wanted.
        if field_name in name_fields and extracted_field['name'] == '':
            extracted_field['name'] = value

        if field_name == 'Document Number':
            extracted_field['document_number'] = value

        if field_name == 'Date of Birth':
            extracted_field['dob'] = value

        if field_name in country_fields and value:
            extracted_field['country'] = value

    print("MODEL EXTRACTED FIELDS : ", extracted_field)
    return extracted_field
extract_dates.date_engine import date_extractor +from extract_gender.gender_extractor import gender_extract +from extract_identity_number.identity_number import identity_card_extractor +from extract_mrz.mrz_detect import MRZ_detector +from ocr_engine.ocr import ocr +from extract_mrz import mrz_engine +import re + +def main(): + global extract_text + extract_text = [] + + mrz_found, cropped = MRZ_detector(IMAGE_PATH) + + result = ocr.ocr(cropped, cls=True) #extracting MRZ text with ocr + + if mrz_found and not (len(result) <= 1): + + lenghtOfCharaters = 0 + capital_strings = [] + print('\n') + for x,line in enumerate(result[0]): + text = line[1][0] + + print(text) + + capital_strings.append(text.upper()) + lenghtOfCharaters = len(text.upper()) + lenghtOfCharaters + + print('\n') + print("Extracted Text : ",capital_strings) + print("The lenght of charaters is : ", lenghtOfCharaters) + + lenghtOfCharaters, capital_strings = mrz_engine.mrz_corrector(capital_strings,lenghtOfCharaters) + + if lenghtOfCharaters != 0 and len(capital_strings) != 0: + user_data = mrz_engine(capital_strings,lenghtOfCharaters) + print("USER DATA : ", user_data) + else: + print("MRZ NOT FOUND \nExtracting full card \n") + + result = ocr.ocr(IMAGE_PATH, cls=True) + print("\n") + + # extract_text = [] #storing extracting text in a list + + for idx in range(len(result)): + res = result[idx] + for line in res: + string_value = line[1][0] + extract_text.append(string_value) + + print("LIST : ", extract_text) + + dob_found, dob, ocr_list = date_extractor(extract_text) + # gender_found, gender, ocr_list = gender_extract(ocr_list) + # country_found, country, ocr_list = country_extractor(ocr_list) + # identity_number_found, identity_number, ocr_list = identity_card_extractor(ocr_list) + # dob_found, DOB = date_extractor(extract_text) + # dob_found, DOB = date_extractor(extract_text) + # print("UPDAETD LIST : ", ocr_list) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git 
a/deploy.py b/deploy.py new file mode 100644 index 0000000000000000000000000000000000000000..ca6f00b26125e4087d2f3d32c69391d3515ad7f5 --- /dev/null +++ b/deploy.py @@ -0,0 +1,63 @@ +import numpy as np +import gradio as gr +from fastapi import FastAPI, File, UploadFile +from app.extract_country.country_dictionary import find_country +from app.extract_dates.date_engine import date_extractor +from app.extract_gender.gender_extractor import gender_extract +from app.extract_identity_number.doc_number_extractor import doc_number +from app.ocr_engine.ocr import OCR +from paddleocr import PaddleOCR +from app.layoutLM_api.api import custom_ocr +import cv2 + +def ocr(input_img): + print(type(input_img)) + ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to download and load model into memory + + file_path = 'app/images/idcards/input.jpg' + cv2.imwrite(file_path, input_img) + + # with open(file_path, "wb+") as file_object: + # file_object.write(input_img.file.read()) + + dictionary = custom_ocr(file_path) + + if '' in list(dictionary.values()): + print("Missing value found in Dic") + + extract_text = [] + result = ocr.ocr(file_path, cls=True) + + extract_text = [line[1][0] for res in result for line in res] + print("extract_text", extract_text) + + if len(dictionary['gender']) == 0: + print("Gender Missing") + gender_found, gender, ocr_list = gender_extract(extract_text) + + dictionary["gender"] = gender if gender_found else None + + if len(dictionary['dob']) == 0: + print("Dob Missing") + dob_found, dob, ocr_list = date_extractor(extract_text) + + dictionary["dob"] = dob if dob_found else None + + if len(dictionary['country']) == 0: + print("Country Missing") + country_found, country, ocr_list = find_country(extract_text) + dictionary["country"] = country if country_found else None + + if len(dictionary['document_number']) == 0: + print("document Number missing") + + document_number_found, document_number, ocr_list = doc_number(extract_text) + 
dictionary["document_number"] = document_number if document_number_found else None + + + print("Updated Dict ",dictionary) + response = {"Status" : 200, "OCR" : dictionary} + return response + +demo = gr.Interface(ocr, gr.Image(), "json") +demo.launch(share=True) \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000000000000000000000000000000000000..659c32f56e9899212da81947c34ecb9c1386fad1 --- /dev/null +++ b/main.py @@ -0,0 +1,5 @@ +import uvicorn + +if __name__ == "__main__": + uvicorn.run("app.app:app",host="0.0.0.0", port=8000, reload=True) + \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000000000000000000000000000000000000..abe353d7514046ec352d06bafbce09bd8d5af684 --- /dev/null +++ b/test.py @@ -0,0 +1,19 @@ +import re + +def identify_document_id(data_list): + keywords = ["Document ID", "Document Number", "Passport Number", "ID Number"] # Add other possible keywords + + for item in data_list: + for keyword in keywords: + if keyword in item: + # Extract document ID based on format and length + document_id = re.findall(r'\b[A-Za-z0-9]+\b', item) + # Additional checks for format and length can be added here + return document_id[0] if document_id else None + + return None + +# Test the function with the given data list +data_list = ["Govermment of the People's Republic of Bangladesh", 'NationalIDCard', '12May 1975', 'HETH', 'Caaaat', 'Name', 'ROMANARAHMAN', 'Date of tn 12 May 1975', '8673674936'] +document_id = identify_document_id(data_list) +print(document_id)