# OttoYu's picture
# Update app.py
# 83be062 verified
import re
from difflib import SequenceMatcher
import requests
import xml.etree.ElementTree as ET
import gradio as gr
from concurrent.futures import ThreadPoolExecutor
# Nested lookup table: region -> district -> list of area names.
# Each list holds every name once; the repeated "Tsing Lung Bridge"
# (Tsuen Wan) and "Ma On Shan" (Sha Tin) entries were dropped.
# NOTE(review): "Kowloon Tong" and "Tsz Wan Shan" each appear under two
# different districts; left as-is because that may be intentional - confirm.
areaData = {
    "Hong Kong": {
        "Central and Western": [
            "Sai Ying Pun", "Kennedy Town", "Shek Tong Tsui", "Sai Wan", "Sheung Wan",
            "Central", "Admiralty", "Mid-Levels West", "Mid-Levels", "The Peak",
        ],
        "Wan Chai": [
            "Wan Chai", "Causeway Bay", "Happy Valley", "Tai Hang", "Stubbs Road",
            "Jardine's Lookout",
        ],
        "Eastern": [
            "Tin Hau", "Braemar Hill", "North Point", "Quarry Bay", "Sai Wan Ho",
            "Shau Kei Wan", "Chai Wan", "Siu Sai Wan",
        ],
        "Southern": [
            "Pok Fu Lam", "Aberdeen", "Ap Lei Chau", "Wong Chuk Hang", "Shouson Hill",
            "Repulse Bay", "Chung Hom Kok", "Stanley", "Tai Tam", "Shek O", "Telegraph Bay",
        ],
    },
    "Kowloon": {
        "Yau Tsim Mong": [
            "Tsim Sha Tsui", "Yau Ma Tei", "West Kowloon", "Kowloon Tong", "Mong Kok",
            "Tai Kok Tsui", "Jordan", "Prince Edward",
        ],
        "Sham Shui Po": [
            "Mei Foo", "Lai Chi Kok", "Cheung Sha Wan", "Sham Shui Po", "Shek Kip Mei",
            "Tai Wo Ping", "Stonecutters Island",
        ],
        "Kowloon City": [
            "Hung Hom", "To Kwa Wan", "Ma Tau Kok", "Ma Tau Wai", "Kai Tak", "Kowloon City",
            "Ho Man Tin", "Kowloon Tong", "Beacon Hill",
        ],
        "Wong Tai Sin": [
            "San Po Kong", "Wong Tai Sin", "Tung Tau", "Wang Tau Hom", "Lok Fu", "Diamond Hill",
            "Tsz Wan Shan", "Ngau Chi Wan",
        ],
        "Kwun Tong": [
            "Ping Shek", "Kowloon Bay", "Ngau Tau Kok", "Tsz Wan Shan", "Kwun Tong",
            "Sau Mau Ping", "Lam Tin", "Yau Tong", "Lei Yue Mun",
        ],
    },
    "New Territories": {
        "Kwai Tsing": [
            "Kwai Chung", "Tsing Yi", "Kwai Fong",
        ],
        "Tsuen Wan": [
            "Tsuen Wan", "Tsing Lung Bridge", "Tsing Hung Bridge", "Shen Tsuen", "Tsing Chung Koon",
            "Ma Wan",
        ],
        "Tuen Mun": [
            "Tai Lam Chung", "Siu Lam", "Tuen Mun", "Lam Tei",
        ],
        "Yuen Long": [
            "Hung Shui Kiu", "Ha Tsuen", "Lau Fau Shan", "Tin Shui Wai", "Yuen Long", "San Tin",
            "Lok Ma Chau", "Kam Tin", "Shek Kong", "Pat Heung",
        ],
        "North": [
            "Fanling", "Luen Wo Hui", "Sheung Shui", "Shek Wu Hui", "Sha Tau Kok", "Lok Keng",
            "Wu Kau Tang",
        ],
        "Tai Po": [
            "Tai Po Market", "Tai Po", "Tai Po Kau", "Tai Mei Tuk", "Plover Cove", "Cheung Uk Tau",
            "Tai Wo",
        ],
        "Sha Tin": [
            "Tai Wai", "Sha Tin", "Fo Tan", "Ma On Shan", "Shui Chuen O",
        ],
        "Sai Kung": [
            "Clear Water Bay", "Sai Kung", "Tai Mong Tsai", "Tseung Kwan O", "Hang Hau",
            "Tiu Keng Leng", "Ma Yau Tong",
        ],
        "Islands": [
            "Cheung Chau", "Peng Chau", "Lantau Island", "Tung Chung", "Lamma Island",
        ],
    },
}
def normalize_text(text):
    """Return *text* lower-cased and trimmed, with whitespace runs collapsed.

    Leading and trailing whitespace is removed first; every remaining run
    of whitespace characters is then replaced by a single space.
    """
    trimmed = text.lower().strip()
    collapsed = re.sub(r'\s+', ' ', trimmed)
    return collapsed
def normalize_address(address):
    """Return a canonical upper-case form of *address* for comparison.

    Every character that is neither a word character nor whitespace is
    removed (underscores count as word characters and are kept), whitespace
    runs are collapsed to a single space, and the result is trimmed and
    upper-cased.

    Punctuation is stripped *before* whitespace is collapsed; doing it the
    other way round leaves a double space wherever a punctuation mark stood
    between two spaces (e.g. "A - B").
    """
    without_punctuation = re.sub(r'[^\w\s]', '', address)
    return re.sub(r'\s+', ' ', without_punctuation).strip().upper()
def load_and_normalize_address_pool(file_paths):
    """Read address files and return ``(original, normalized)`` pairs.

    Each file is read line by line; blank lines are skipped and every other
    line is stripped and paired with ``normalize_address`` of itself.

    Files are decoded as UTF-8 (a leading byte-order mark is dropped) so the
    result does not depend on the platform's default encoding.

    Args:
        file_paths: iterable of paths to text files, one address per line.

    Returns:
        A list of ``(address, normalized_address)`` tuples, in file order.
        A file that is missing or cannot be read is reported on stdout and
        skipped; lines appended before the failure are kept.
    """
    address_pool = []
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                for line in f:
                    address = line.strip()
                    if address:
                        address_pool.append((address, normalize_address(address)))
        except FileNotFoundError:
            print(f"File not found: {file_path}")
        except Exception as e:
            # Best effort: one unreadable file must not stop the others
            # from being loaded.
            print(f"Error reading file {file_path}: {e}")
    return address_pool
def similarity(a, b):
    """Return the SequenceMatcher ratio of *a* and *b*, ignoring spaces.

    All space characters are removed from both strings before comparing,
    so only the remaining characters influence the score.
    """
    left = a.replace(' ', '')
    right = b.replace(' ', '')
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
def extract_relevant_part(user_input):
    """Split *user_input* into a number and the remaining address text.

    The input is trimmed first so that a number preceded by whitespace is
    still removed from the address part (previously "  12 Foo" produced
    ``("12", "12 Foo")``).

    Returns:
        ``(number_part, address_part)`` where ``number_part`` is the first
        run of digits found anywhere in the input (``''`` if none) and
        ``address_part`` is the input with a *leading* run of digits removed
        and surrounding whitespace stripped. A number that is not at the
        start stays in ``address_part``.
    """
    text = user_input.strip()
    first_number = re.search(r'\d+', text)
    number_part = first_number.group(0) if first_number else ''
    address_part = re.sub(r'^\d+', '', text).strip()
    return number_part, address_part
def match_address(user_input, address_pool):
    """Find the pool entry most similar to *user_input*.

    The input is split with ``extract_relevant_part``; its address part is
    normalized and scored against the normalized form of every pool entry
    with ``similarity``. The first entry with the strictly highest score
    wins.

    Args:
        user_input: address text to look up.
        address_pool: iterable of ``(original, normalized)`` pairs.

    Returns:
        ``(best_match, score)``. ``best_match`` is the winning original
        address, prefixed with the extracted number when there is one, or
        ``None`` when no entry scored above 0 (the score is then ``0``).
    """
    number_part, address_part = extract_relevant_part(user_input)
    query = normalize_address(address_part)

    top_score = 0
    top_candidate = None
    for candidate, candidate_key in address_pool:
        score = similarity(query, candidate_key)
        if score <= top_score:
            continue
        top_score, top_candidate = score, candidate

    if top_candidate and number_part:
        top_candidate = f"{number_part} {top_candidate}".strip()
    return top_candidate, top_score
def _als_premises_fields(premises):
    """Collect the address fields read from one premises-address element."""
    return {
        'Estate': premises.findtext(".//EstateName", ''),
        'Street': premises.findtext(".//StreetName", ''),
        'Building No': premises.findtext(".//BuildingNoFrom", ''),
        'District': premises.findtext(".//DcDistrict", ''),
        'Region': premises.findtext(".//Region", ''),
    }


def _als_geo_fields(geo_info):
    """Collect the coordinate fields read from a geospatial element."""
    return {
        'Latitude': geo_info.findtext(".//Latitude", ''),
        'Longitude': geo_info.findtext(".//Longitude", ''),
        'Northing': geo_info.findtext(".//Northing", ''),
        'Easting': geo_info.findtext(".//Easting", ''),
    }


def fetch_address_from_als_api(user_input):
    """Query the ALS lookup endpoint and summarise the first result.

    Sends ``user_input`` (URL-quoted) as the ``q`` parameter, parses the
    response body as XML and reads the first ``EngPremisesAddress``,
    ``ChiPremisesAddress`` and ``GeospatialInformation`` elements found.

    Args:
        user_input: address text to look up.

    Returns:
        On success, a dict with up to three keys ('English Address',
        'Chinese Address', 'Geospatial Information'), each mapping to a dict
        of field name -> text ('' when a field is absent). A section whose
        element is missing from the response is omitted.
        On failure, an error message string: callers distinguish the two
        cases by type.
    """
    api_url = f"https://www.als.gov.hk/lookup?q={requests.utils.quote(user_input)}"
    try:
        # A timeout keeps a stalled connection from blocking the calling
        # thread forever; timeouts are RequestException subclasses.
        response = requests.get(api_url, timeout=10)
        response.raise_for_status()
        root = ET.fromstring(response.content)
    except requests.RequestException as e:
        return f"Error fetching data from ALS API: {e}"
    except ET.ParseError as e:
        # The body was not well-formed XML (e.g. an HTML error page).
        return f"Error parsing ALS API response: {e}"

    result = {}
    premises_sections = (
        ('English Address', ".//EngPremisesAddress"),
        ('Chinese Address', ".//ChiPremisesAddress"),
    )
    for label, path in premises_sections:
        premises = root.find(path)
        if premises is not None:
            result[label] = _als_premises_fields(premises)

    geo_info = root.find(".//GeospatialInformation")
    if geo_info is not None:
        result['Geospatial Information'] = _als_geo_fields(geo_info)
    return result
def extract_building_from_address(user_input):
    """Return the part of the normalized input that precedes the first comma.

    The input is passed through ``normalize_text`` first. If the normalized
    text is empty or begins with a comma, it is returned unchanged;
    otherwise the text before the first comma is returned, stripped.
    """
    cleaned = normalize_text(user_input)
    head, _, _ = cleaned.partition(',')
    if not head:
        # Nothing before the first comma: fall back to the whole text.
        return cleaned
    return head.strip()
def address_search(user_inputs):
    """Match each input line against the address pool and query the ALS API.

    Lines that are empty or whitespace-only are skipped, so stray blank
    lines in the text box no longer produce "Best match: None" blocks.
    The remaining lines are processed concurrently in a thread pool and
    reported in input order.

    Args:
        user_inputs: text with one address per line (``None`` is treated as
            empty).

    Returns:
        One report per processed line, joined by a blank line; ``""`` when
        there is nothing to process.
    """
    queries = [line for line in (user_inputs or "").splitlines() if line.strip()]
    if not queries:
        return ""

    def process_input(user_input):
        """Build the text report for a single address line."""
        building_part = extract_building_from_address(user_input)
        normalized_input = normalize_address(building_part)
        best_match, similarity_score = match_address(normalized_input, address_pool)
        als_result = fetch_address_from_als_api(best_match) if best_match else "No match found."

        parts = [f"Best match: {best_match} (Similarity: {similarity_score:.2f})\n"]
        if isinstance(als_result, dict):
            for address_type, details in als_result.items():
                parts.append(f"\n{address_type}:\n")
                parts.extend(f"{key}: {value}\n" for key, value in details.items())
        else:
            # A string result is either an error message or "No match found."
            parts.append(als_result)
        return "".join(parts)

    with ThreadPoolExecutor() as executor:
        # executor.map yields results in the order of the inputs.
        return "\n\n".join(executor.map(process_input, queries))
def clean_area_data(area_data):
    """Return a copy of *area_data* with every name run through normalize_text.

    *area_data* maps region -> district -> list of names. Region and
    district keys are normalized; names containing "Non-Building",
    "Invalid" or "Other" (case-insensitive) are dropped and the rest are
    normalized.
    """
    excluded = re.compile(r'Non-Building|Invalid|Other', re.I)
    result = {}
    for region in area_data:
        result[normalize_text(region)] = {
            normalize_text(district): [
                normalize_text(name) for name in names if excluded.search(name) is None
            ]
            for district, names in area_data[region].items()
        }
    return result
# Normalized copy of areaData, built once at import time.
# NOTE(review): not referenced by any other code visible in this file.
cleaned_area_data = clean_area_data(areaData)
# Text files (one address per line) that make up the matching pool.
# The paths are relative, so they are resolved against the working directory.
file_paths = [
'EngBuilding.txt',
'EngEstate.txt',
'EngStreet.txt',
'EngVillage.txt'
]
# (original, normalized) address pairs, loaded at import time and read by
# address_search as a module-level global.
address_pool = load_and_normalize_address_pool(file_paths)
# Gradio UI: one multi-line text box in (one address per line), one text box
# out, wired to address_search.
interface = gr.Interface(
fn=address_search,
inputs=gr.Textbox(label="Enter Addresses (one per line, allow Batch Processing)", lines=10),
outputs=gr.Textbox(label="ALS API Results"),
title="Address Lookup and Matching (English)",
description="Enter addresses to find the closest matches and fetch details from the ALS API."
)
# NOTE(review): launch() runs whenever this module is executed or imported;
# consider an `if __name__ == "__main__":` guard if it is ever imported.
interface.launch()