Spaces:
Sleeping
Sleeping
import gradio as gr | |
from transformers import pipeline | |
import openpyxl | |
import tempfile | |
import pycountry | |
from deep_translator import GoogleTranslator | |
from langdetect import detect | |
# Zero-shot classification pipeline used by the scoring helpers in this file
# (compaire_two, get_ai_position, compare_by_mid). It is constructed once, when
# this module-level statement runs.
classifier = pipeline("zero-shot-classification",
                      model="LogicSpine/address-large-text-classifier")
def translate_text(text: str) -> str:
    """Return *text* in English, translating it only when necessary.

    The text is stripped first. Blank input is returned as-is (empty string)
    without touching the language detector or the translator. Text detected
    as English is returned unchanged; everything else is sent to Google
    Translate with automatic source-language detection.

    Args:
        text (str): The text to translate.

    Returns:
        str: The stripped text, translated to English when it was not
        already detected as English.
    """
    text = text.strip()
    if not text:
        # Nothing to detect or translate; langdetect raises on empty input.
        return text

    # Imported here so the block stays self-contained; langdetect is already
    # a dependency of this file.
    from langdetect import LangDetectException

    try:
        detected_lang = detect(text)
    except LangDetectException:
        # Raised for text without usable features (digits, punctuation only).
        # Fall through and let the translator handle it with source='auto'.
        detected_lang = None

    if detected_lang == 'en':
        return text

    # Build the translator only when a translation is actually needed.
    translator = GoogleTranslator(source='auto', target='en')
    return translator.translate(text)
# Historical, colloquial or abbreviated country names that pycountry's
# `name` field does not list verbatim.
_LEGACY_COUNTRY_NAMES = frozenset({
    "burma",
    "ceylon",
    "persia",
    "zaire",
    "upper volta",
    "swaziland",
    "macedonia",
    "czech republic",
    "turkey",
    "holland",
    "kampuchea",
    "dahomey",
    "bechuanaland",
    "gold coast",
    "nyasaland",
    "korea",
    "russia",
    "usa",
    "uk",
})

# Lower-cased pycountry names, filled on first use so the list is built once
# per process instead of once per call.
_OFFICIAL_COUNTRY_NAMES = set()


def _official_country_names():
    """Return the cached set of lower-cased pycountry country names."""
    if not _OFFICIAL_COUNTRY_NAMES:
        _OFFICIAL_COUNTRY_NAMES.update(
            country.name.lower() for country in pycountry.countries
        )
    return _OFFICIAL_COUNTRY_NAMES


def check_for_third(address: str) -> bool:
    """Tell whether any comma-separated part of *address* is a country name.

    Each part is lower-cased and stripped, then compared for exact equality
    against the legacy names above and against pycountry's country names.

    Args:
        address (str): The address text, optionally comma-separated.

    Returns:
        bool: True if at least one part is a known country name.
    """
    # str.split returns [address] when there is no comma, so no special case.
    parts = [part.lower().strip() for part in address.split(",")]

    if any(part in _LEGACY_COUNTRY_NAMES for part in parts):
        return True

    official = _official_country_names()
    return any(part in official for part in parts)
def check_for_first(address: str) -> bool:
    """Tell whether *address* mentions a first-level unit.

    The match is a case-insensitive substring search for "school",
    "laboratory" or "department".

    Args:
        address (str): The address text to inspect.

    Returns:
        bool: True if any of the keywords occurs in the address.
    """
    lowered = address.lower()
    return any(
        marker in lowered
        for marker in ("school", "laboratory", "department")
    )
def check_for_second(address: str) -> bool:
    """Tell whether *address* mentions a university.

    The match is a case-insensitive substring search for "university".

    Args:
        address (str): The address text to inspect.

    Returns:
        bool: True if the keyword occurs in the address.
    """
    normalised = address.lower().strip()
    return "university" in normalised
def compaire_two(bigger: str, smaller: str, mid: int) -> bool:
    """Decide whether *bigger* fits position *mid* better than *smaller*.

    A keyword match on *bigger* for the requested position wins immediately.
    Otherwise both texts are scored by the zero-shot classifier against the
    label set of that position and their best label scores are compared.

    Args:
        bigger (str): The candidate that is favoured when it scores higher.
        smaller (str): The candidate it is compared against.
        mid (int): Position to test: 1 (school/department/laboratory),
            2 (university/polytechnic) or 3 (state/district/country).

    Raises:
        ValueError: If *mid* is not 1, 2 or 3.

    Returns:
        bool: True if *bigger* has the higher priority for *mid*, else False.
    """
    if mid == 1:
        if check_for_first(bigger):
            return True
        lab = ["School", "Department", "Laboratory"]
    elif mid == 2:
        if check_for_second(bigger):
            return True
        lab = ["University", "Polytechnic"]
    elif mid == 3:
        if check_for_third(bigger):
            return True
        lab = ["State", "District", "Country"]
    else:
        raise ValueError(f"Invalid value passed in mid : {mid}")

    # In the pipeline's default single-label mode the scores are normalised
    # across the candidate labels and always add up to ~1.0, so the sum of
    # scores carries no information about the text. multi_label=True scores
    # every label independently, which makes the two texts comparable.
    sb = classifier(bigger, lab, multi_label=True)
    ss = classifier(smaller, lab, multi_label=True)
    return max(sb["scores"]) > max(ss["scores"])
def get_ai_position(address: str) -> int:
    """Find the position (1, 2 or 3) that *address* most likely belongs to.

    Keyword checks are tried first, in position order. Only when none of them
    matches is the zero-shot classifier consulted: the address is classified
    once against the labels of all three positions and the position owning
    the top-ranked label is returned.

    Args:
        address (str): The address text to classify.

    Returns:
        int: 1 for the first address slot, 2 for the second, 3 for the third.
    """
    if check_for_first(address):
        return 1
    if check_for_second(address):
        return 2
    if check_for_third(address):
        return 3

    label_to_position = {
        "School": 1,
        "Department": 1,
        "Laboratory": 1,
        "University": 2,
        "Polytechnic": 2,
        "State": 3,
        "District": 3,
        "Country": 3,
    }
    # Classifying each label group separately and summing its scores cannot
    # work: single-label scores are normalised within a call and sum to ~1.0
    # for every group. One call over all labels gives a real ranking, and the
    # pipeline returns the labels sorted by descending score.
    result = classifier(address, list(label_to_position))
    top_label = result["labels"][0]
    # Position 3 stays the fallback, as before.
    return label_to_position.get(top_label, 3)
def compare_by_mid(bigger: int, smaller: int, address: str, threshold: float = 0.1) -> bool:
    """Decide whether *address* belongs to position *bigger* rather than *smaller*.

    A keyword match for the *bigger* position wins immediately. Otherwise the
    address is classified against the label sets of both positions, and the
    *bigger* position wins only if its best label score beats the best score
    of the *smaller* position by more than *threshold*.

    Args:
        bigger (int): The favoured position: 1, 2 or (any other value) 3.
        smaller (int): The competing position: 1, 2 or (any other value) 3.
        address (str): The address text to classify.
        threshold (float): Minimum score margin required for *bigger* to win.

    Returns:
        bool: True if *address* fits *bigger* better, else False.
    """
    first_labels = ["School", "Department", "Laboratory"]
    second_labels = ["University", "Polytechnic"]
    third_labels = ["State", "District", "Country"]

    if bigger == 1:
        if check_for_first(address):
            return True
        bigger_l = first_labels
    elif bigger == 2:
        if check_for_second(address):
            return True
        bigger_l = second_labels
    else:
        if check_for_third(address):
            return True
        bigger_l = third_labels

    if smaller == 1:
        smaller_l = first_labels
    elif smaller == 2:
        smaller_l = second_labels
    else:
        smaller_l = third_labels

    max_bigger = max(classifier(address, bigger_l)["scores"])
    max_smaller = max(classifier(address, smaller_l)["scores"])

    # True must mean "bigger wins", matching the keyword fast paths above and
    # the way callers use the result, so the margin is bigger minus smaller
    # (the previous code subtracted in the opposite direction).
    # NOTE(review): both calls use the pipeline's default single-label mode,
    # where scores are relative to each label set; a 2-label set can never
    # score below 0.5. Consider multi_label=True and re-tuning the threshold.
    return (max_bigger - max_smaller) > threshold
def find_missing_data(data1: str, data2: str, data3: str, var1: str, var2: str, var3: str) -> str:
    """Report which of the three data values is not held by any variable.

    Args:
        data1 (str): The first address.
        data2 (str): The second address.
        data3 (str): The third address.
        var1 (str): Value currently assigned to the first slot (may be None).
        var2 (str): Value currently assigned to the second slot (may be None).
        var3 (str): Value currently assigned to the third slot (may be None).

    Returns:
        str: The unassigned value. When several are unassigned they are
        joined with ", " in the order data1, data2, data3 (duplicates
        reported once). When nothing is missing, the fixed message
        "All data has been assigned correctly." is returned.
    """
    assigned = (var1, var2, var3)
    # dict.fromkeys de-duplicates while keeping argument order, so the result
    # no longer depends on set iteration (hash) order.
    missing_data = [
        item for item in dict.fromkeys((data1, data2, data3))
        if item not in assigned
    ]
    if missing_data:
        return ', '.join(missing_data)
    return "All data has been assigned correctly."
def _take_unassigned(pool, assigned, fallback):
    """Pop and return the first item of *pool* equal to none of *assigned*.

    Returns *fallback* (leaving *pool* untouched) when every item is taken.
    """
    for index, candidate in enumerate(pool):
        if candidate not in assigned:
            return pool.pop(index)
    return fallback


def swapper(i1, i2, i3):
    """Re-order three addresses into (first, second, third) by keyword.

    Each input is translated to English once and then classified:

    * a country match takes the third slot; a later country match replaces
      it, and the displaced value is re-classified as first/second;
    * a school/laboratory/department match becomes a first-slot candidate;
    * a university match takes the second slot (the latest match wins).

    The first candidate fills the first slot, and a second candidate fills
    the second slot if that is still empty. Slots that are still empty are
    then filled, in slot order, with the earliest input not yet used, falling
    back to i1 / i2 / i3 respectively when no unused input is left.

    Args:
        i1: The value currently in the first slot.
        i2: The value currently in the second slot.
        i3: The value currently in the third slot.

    Returns:
        tuple: The values as (first, second, third).
    """
    first, second, third = None, None, None
    third_text = None  # translation of the value currently held in `third`
    first_candidates = []

    for data in (i1, i2, i3):
        # Translate once per item; the result is reused by every check below.
        text = translate_text(data)

        if check_for_third(text):
            if third is None:
                third, third_text = data, text
                continue
            # The newer country match keeps the third slot; the displaced
            # value goes through the first/second checks below.
            third, data = data, third
            third_text, text = text, third_text

        if check_for_first(text):
            first_candidates.append(data)
        elif check_for_second(text):
            second = data

    if first_candidates:
        first = first_candidates[0]
        if len(first_candidates) > 1 and second is None:
            second = first_candidates[1]

    remaining_data = [i1, i2, i3]
    if first is None:
        first = _take_unassigned(remaining_data, (first, second, third), i1)
    if second is None:
        second = _take_unassigned(remaining_data, (first, second, third), i2)
    if third is None:
        third = _take_unassigned(remaining_data, (first, second, third), i3)
    return first, second, third
def _place_remaining_pair(slots, fixed, addresses):
    """Fill the two slots other than *fixed* once *fixed* holds an address.

    Does nothing when both other slots are already filled, or when the value
    in the fixed slot equals none of *addresses*. Otherwise the two remaining
    addresses (in input order) are distributed over the two remaining slots:
    compare_by_mid decides whether the earlier of them goes to the lower or
    to the higher slot. Both slots are overwritten in that case.
    """
    lower, upper = [slot for slot in (1, 2, 3) if slot != fixed]
    if slots[lower] is not None and slots[upper] is not None:
        return

    for index, candidate in enumerate(addresses):
        if slots[fixed] == candidate:
            break
    else:
        return

    probe, other = [a for i, a in enumerate(addresses) if i != index]
    if compare_by_mid(lower, upper, probe):
        slots[lower], slots[upper] = probe, other
    else:
        slots[lower], slots[upper] = other, probe


def settle_all_address(address_first: str, address_second: str, address_third: str):
    """Assign three addresses to the first / second / third slot.

    Steps:

    1. Keyword pass: the first address matching each of check_for_first,
       check_for_second and check_for_third claims that slot.
    2. Conflict pass: when one address claimed two slots, the single
       unassigned address takes one of them (compaire_two decides which).
       If all three slots hold the same value they are all cleared. If more
       than one address is unassigned, the two conflicting slots are cleared
       so that step 3 decides instead.
    3. AI pass: while any slot is empty, each address is given the position
       reported by get_ai_position, and the other two slots are filled via
       compare_by_mid.
    4. The result is passed through swapper for the final ordering.

    Args:
        address_first (str): The first address of the row.
        address_second (str): The second address of the row.
        address_third (str): The third address of the row.

    Returns:
        tuple: The addresses as (first, second, third), as returned by swapper.
    """
    addresses = (address_first, address_second, address_third)

    def first_match(predicate):
        # First address (in input order) accepted by the predicate, else None.
        return next((a for a in addresses if predicate(a)), None)

    r_add1 = first_match(check_for_first)
    r_add2 = first_match(check_for_second)
    r_add3 = first_match(check_for_third)

    if r_add1 == r_add2 or r_add1 == r_add3 or r_add2 == r_add3:
        if r_add1 == r_add2 == r_add3:
            r_add1 = r_add2 = r_add3 = None
        else:
            # Distinct addresses that no slot holds yet, in input order.
            unassigned = []
            for candidate in addresses:
                if candidate not in (r_add1, r_add2, r_add3) and candidate not in unassigned:
                    unassigned.append(candidate)
            # Only an unambiguous single leftover may be moved into a slot.
            # Previously a ", "-joined string of several addresses, or a
            # status message, could end up being stored as an address here.
            m_add = unassigned[0] if len(unassigned) == 1 else None

            if r_add1 == r_add2 and r_add1 is not None:
                if m_add is not None:
                    # NOTE(review): on True the other two branches give m_add
                    # the slot named by `mid`; this one gives it slot 2 while
                    # testing mid=1. Kept as found - confirm the intent.
                    if compaire_two(m_add, r_add2, 1):
                        r_add2 = m_add
                    else:
                        r_add1 = m_add
                elif unassigned:
                    r_add1 = r_add2 = None
            elif r_add1 == r_add3 and r_add1 is not None:
                if m_add is not None:
                    if compaire_two(m_add, r_add3, 1):
                        r_add1 = m_add
                    else:
                        r_add3 = m_add
                elif unassigned:
                    r_add1 = r_add3 = None
            elif r_add2 == r_add3 and r_add2 is not None:
                if m_add is not None:
                    if compaire_two(m_add, r_add3, 3):
                        r_add3 = m_add
                    else:
                        r_add2 = m_add
                elif unassigned:
                    r_add2 = r_add3 = None

    if r_add1 is None or r_add2 is None or r_add3 is None:
        slots = {1: r_add1, 2: r_add2, 3: r_add3}
        positions = [get_ai_position(address) for address in addresses]
        for address, position in zip(addresses, positions):
            # Any position other than 2 or 3 is treated as the first slot.
            fixed = position if position in (2, 3) else 1
            if not slots[fixed]:
                slots[fixed] = address
            _place_remaining_pair(slots, fixed, addresses)
        r_add1, r_add2, r_add3 = slots[1], slots[2], slots[3]

    return swapper(r_add1, r_add2, r_add3)
def process_file(filepath: str):
    """Re-order the address columns of an Excel sheet and save a copy.

    The active sheet of the workbook at *filepath* is copied into a new
    workbook. Row 1 is copied verbatim. For every later row, all columns
    except 4, 5 and 6 are copied; when all three of those cells have a value
    they are re-ordered with settle_all_address and written back, otherwise
    they are left empty. Processing stops once more than three consecutive
    rows lack a complete address triple, which is treated as the end of the
    data.

    Args:
        filepath (str): Path of the .xlsx file to read.

    Returns:
        str: Path of the newly written temporary .xlsx file. The file is not
        deleted automatically.
    """
    wb = openpyxl.load_workbook(filepath, data_only=True)
    ws = wb.active
    new_wb = openpyxl.Workbook()
    new_ws = new_wb.active

    columns_to_process = (4, 5, 6)
    max_column = ws.max_column

    for col in range(1, max_column + 1):
        new_ws.cell(row=1, column=col).value = ws.cell(row=1, column=col).value

    # Counts consecutive incomplete rows. A cumulative count would silently
    # drop every row after the 4th incomplete one, even in the middle of the
    # data.
    empty_rows = 0
    for row in ws.iter_rows(min_row=2, max_row=ws.max_row):
        if empty_rows > 3:
            break
        row_num = row[0].row

        for col in range(1, max_column + 1):
            target = new_ws.cell(row=row_num, column=col)
            if col in columns_to_process:
                target.value = None
            else:
                target.value = ws.cell(row=row_num, column=col).value

        address_first = ws.cell(row=row_num, column=4).value
        address_second = ws.cell(row=row_num, column=5).value
        address_third = ws.cell(row=row_num, column=6).value

        if address_first is None or address_second is None or address_third is None:
            empty_rows += 1
            continue
        empty_rows = 0

        ad1, ad2, ad3 = settle_all_address(address_first, address_second, address_third)
        new_ws.cell(row=row_num, column=4).value = ad1
        new_ws.cell(row=row_num, column=5).value = ad2
        new_ws.cell(row=row_num, column=6).value = ad3
        print(f"Adding : {ad1} | {ad2} | {ad3}")

    # Only the name is needed; close the handle before openpyxl writes to the
    # path, so no descriptor leaks and the save also works on platforms that
    # forbid opening a file twice.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as temp_file:
        temp_path = temp_file.name
    new_wb.save(temp_path)
    return temp_path
def gradio_process(file):
    """Gradio callback: process the uploaded file and return the result path.

    Args:
        file: The uploaded file object; its ``name`` attribute is used as the
            path of the workbook to process.

    Returns:
        The path returned by process_file for the processed workbook.
    """
    return process_file(file.name)
# Gradio UI: one file input and one file output, both handled by gradio_process.
iface = gr.Interface(
    fn=gradio_process,
    inputs=gr.File(),
    outputs=gr.File(),
    title="AI Address Processor",
    description="Upload an Excel file, and the AI will process the addresses."
)
# Launches the interface whenever this module is executed (no __main__ guard).
iface.launch()