OCR / test.py
ShahzainHaider's picture
Upload folder using huggingface_hub
7bbae49
raw
history blame
834 Bytes
import re
def identify_document_id(data_list):
    """Return the document ID that follows a known label in a list of text lines.

    Each entry of ``data_list`` is searched, case-insensitively, for one of the
    label keywords. When a label is found, only the text *after* the label is
    split into alphanumeric tokens, and the first token that contains at least
    one digit is returned. Entries whose label is not followed by such a token
    are skipped and the scan continues with the next entry.

    Args:
        data_list: Iterable of strings to search (for example OCR text lines).

    Returns:
        The ID token as a string, or ``None`` if no entry contains a label
        followed by a digit-bearing token.
    """
    keywords = [
        "Document ID",
        "Document Number",
        "Passport Number",
        "ID Number",
    ]  # Add other possible keywords
    # One alternation over all labels; re.escape keeps any future keyword that
    # contains regex metacharacters literal.
    label_pattern = re.compile(
        "|".join(re.escape(keyword) for keyword in keywords),
        re.IGNORECASE,
    )
    token_pattern = re.compile(r'\b[A-Za-z0-9]+\b')

    for item in data_list:
        label = label_pattern.search(item)
        if label is None:
            continue
        # Tokenise only what follows the label. Tokenising the whole item would
        # return the first word of the line (often the label itself) instead
        # of the ID value.
        for token in token_pattern.findall(item[label.end():]):
            # Require a digit so filler words between the label and the value
            # (e.g. "No", "is") are not mistaken for the ID.
            if any(ch.isdigit() for ch in token):
                return token
    return None
# Demo run: feed a sample list of recognised text fragments to the
# identifier and print whatever it returns.
data_list = [
    "Govermment of the People's Republic of Bangladesh",
    'NationalIDCard',
    '12May 1975',
    'HETH',
    'Caaaat',
    'Name',
    'ROMANARAHMAN',
    'Date of tn 12 May 1975',
    '8673674936',
]
document_id = identify_document_id(data_list)
print(document_id)