"""Gradio app that matches Cantonese text against saved names by sound.

The user's text is converted character by character to Jyutping, then
compared with pre-computed records loaded from a JSON file.  A record whose
Jyutping contains every syllable of the input is reported as an "exact"
match; otherwise the three best-scoring records are returned.
"""

import json
from difflib import SequenceMatcher
from functools import lru_cache

import gradio as gr

# Relative weights of the three signals blended into a candidate's score.
_PHONETIC_WEIGHT = 0.6
_TEXT_WEIGHT = 0.3
_LENGTH_WEIGHT = 0.1

# How many ranked candidates are returned when there is no exact match.
_TOP_MATCH_COUNT = 3


@lru_cache(maxsize=1)
def load_json_file(json_file):
    """Parse the UTF-8 JSON file at *json_file* and return the decoded data.

    Only the most recently requested file is kept in the cache.  That is
    enough for this module because ``get_char_to_jyutping`` caches its own
    derived table, so the only file requested repeatedly is the results file.

    The returned object is shared between calls and must be treated as
    read-only.
    """
    with open(json_file, encoding='utf-8') as file:
        return json.load(file)


def preprocess_jyutping_data(jyutping_data):
    """Invert a ``syllable -> mappings`` table into ``character -> syllable``.

    Each mapping is expected to carry its characters under the ``"漢字"`` key.
    When a character is listed under several syllables, the syllable that is
    visited last wins.
    """
    char_to_syllable = {}
    for syllable, mappings in jyutping_data.items():
        for mapping in mappings:
            for char in mapping["漢字"]:
                char_to_syllable[char] = syllable
    return char_to_syllable


def chinese_to_jyutping(text, char_to_jyutping):
    """Return one entry per character of *text*: its Jyutping, or the
    character itself when the lookup table has no entry for it."""
    return [char_to_jyutping.get(char, char) for char in text]


def get_similar_initials():
    """Return a table mapping an initial to the initials treated as
    similar-sounding."""
    return {
        'b': ['d', 'p'],
        'c': ['s'],
        'd': ['b', 't'],
        'f': ['h'],
        'g': ['gw'],
        'gw': ['g'],
        'h': ['f'],
        'j': ['z'],
        'jw': ['w'],
        'l': ['n'],
        'n': ['l'],
        'ng': ['n'],
        'p': ['b'],
        's': ['c'],
        't': ['d'],
        'w': ['jw'],
        'z': ['j'],
    }


def get_lazy_pronunciations():
    """Return a table mapping an initial to the initials it is commonly
    swapped with in "lazy" pronunciation."""
    return {
        # The two alternatives for 'n' must share one entry: a dict literal
        # keeps only the last value given for a repeated key.
        'n': ['l', 'ng'],
        'l': ['n'],
        'gw': ['g'],
        'g': ['gw'],
        'k': ['t'],
        't': ['k'],
        'ng': ['n'],
    }


def _leading_initial(jyutping, similar_initials):
    """Return the initial of *jyutping*: its first two characters when they
    form a key of *similar_initials*, otherwise its first character.

    Slicing (rather than indexing) makes an empty syllable yield ``""``
    instead of raising ``IndexError``.
    """
    two_chars = jyutping[:2]
    if two_chars in similar_initials:
        return two_chars
    return jyutping[:1]


def are_jyutping_similar(jyutping1, jyutping2, similar_initials, lazy_pronunciations):
    """Report whether two Jyutping syllables start with compatible initials.

    The initials are compatible when they are equal, or when the second one
    is listed for the first in *similar_initials* or *lazy_pronunciations*.
    The lookup is directional: only the first syllable's entry is consulted.
    Two empty syllables compare as similar; an empty syllable is never
    similar to a non-empty one.
    """
    initial1 = _leading_initial(jyutping1, similar_initials)
    initial2 = _leading_initial(jyutping2, similar_initials)
    if initial1 == initial2:
        return True
    if initial2 in similar_initials.get(initial1, []):
        return True
    return initial2 in lazy_pronunciations.get(initial1, [])


@lru_cache(maxsize=1)
def get_char_to_jyutping():
    """Load ``lexi-can_key.json`` and return its character -> Jyutping table.

    The table is built once and then served from the cache.
    """
    jyutping_data = load_json_file('lexi-can_key.json')
    return preprocess_jyutping_data(jyutping_data)


def calculate_phonetic_similarity(user_jyutping, result_jyutping, similar_initials, lazy_pronunciations):
    """Score how alike two syllable sequences sound.

    Every syllable of *user_jyutping* is compared with every syllable of
    *result_jyutping*; the number of similar pairs is divided by the length
    of the longer sequence.  Because all pairs are counted, the score is not
    capped at 1.  Returns ``0.0`` when both sequences are empty.
    """
    longest = max(len(user_jyutping), len(result_jyutping))
    if longest == 0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0
    similar_count = sum(
        1
        for uj in user_jyutping
        for rj in result_jyutping
        if are_jyutping_similar(uj, rj, similar_initials, lazy_pronunciations)
    )
    return similar_count / longest


def match_user_input(user_input):
    """Find the saved record(s) that best match *user_input* by sound.

    Records are read from ``jyutping_results_largec.json``; each one is
    accessed through its ``"jyutping"`` and ``"text"`` keys.

    Returns a dict with ``"input_text"`` and ``"input_jyutping"`` plus either

    * ``"match"`` and ``"match_type": "exact"`` for the first record whose
      Jyutping contains every syllable of the input (a subset test, so
      syllable order and repetition are ignored), or
    * ``"matches"``: up to three ``{"match", "score", "match_type"}`` entries
      ordered by descending score.
    """
    char_to_jyutping = get_char_to_jyutping()
    similar_initials = get_similar_initials()
    lazy_pronunciations = get_lazy_pronunciations()
    saved_results = load_json_file('jyutping_results_largec.json')

    user_jyutping = chinese_to_jyutping(user_input, char_to_jyutping)

    # Built once instead of once per record.
    user_syllables = set(user_jyutping)
    exact_match = next(
        (result for result in saved_results
         if user_syllables.issubset(result["jyutping"])),
        None,
    )
    if exact_match:
        return {
            "input_text": user_input,
            "input_jyutping": user_jyutping,
            "match": exact_match,
            "match_type": "exact"
        }

    matches = []
    for result in saved_results:
        phonetic_score = calculate_phonetic_similarity(
            user_jyutping, result["jyutping"], similar_initials, lazy_pronunciations
        )
        text_similarity = SequenceMatcher(None, user_input, result["text"]).ratio()
        # Records of the same length as the input get the full length bonus;
        # the bonus shrinks as the length difference grows.
        length_diff = abs(len(user_input) - len(result["text"]))
        length_penalty = 1 / (1 + length_diff)
        total_score = (
            (phonetic_score * _PHONETIC_WEIGHT)
            + (text_similarity * _TEXT_WEIGHT)
            + (length_penalty * _LENGTH_WEIGHT)
        )
        matches.append((result, total_score))

    # sorted() is stable, so equally scored records keep their file order.
    top_matches = sorted(matches, key=lambda x: x[1], reverse=True)[:_TOP_MATCH_COUNT]

    return {
        "input_text": user_input,
        "input_jyutping": user_jyutping,
        "matches": [
            {
                "match": record,
                "score": score,
                "match_type": "phonetic_similarity"
            }
            for record, score in top_matches
        ]
    }


sample_cases = [
    "龍民大廈",
    "得輔導西",
    "賀民天街",
    "荔枝支道",
    "黎知覺道",
    "元周街",
    "謝非道",
    "金中道",
    "得立街",
    "地梨根得里"
]


def gradio_app(custom_input, sample_case):
    """Gradio callback: match the chosen text and return the result as JSON.

    *sample_case* takes precedence over *custom_input* when both are given.
    The return value is always a JSON document encoded as a string, because
    the output component parses string values as JSON; for that reason the
    "nothing entered" message is wrapped in an ``{"error": ...}`` object
    rather than returned as a bare sentence.
    """
    user_input = sample_case if sample_case else custom_input

    if not user_input:
        return json.dumps(
            {"error": "Please enter text or select a sample case."},
            ensure_ascii=False,
            indent=4,
        )

    result = match_user_input(user_input)

    if "match" in result:
        return json.dumps(result, ensure_ascii=False, indent=4)

    # Flatten each ranked candidate and round its score for display.
    formatted_result = {
        "input_text": result["input_text"],
        "input_jyutping": result["input_jyutping"],
        "matches": [
            {
                "text": match["match"]["text"],
                "jyutping": match["match"]["jyutping"],
                "score": round(match["score"], 4),
                "match_type": match["match_type"]
            }
            for match in result["matches"]
        ]
    }
    return json.dumps(formatted_result, ensure_ascii=False, indent=4)


interface = gr.Interface(
    fn=gradio_app,
    inputs=[
        gr.Textbox(placeholder="Enter text", label="Placename/Street/Building name"),
        gr.Dropdown(choices=[None] + sample_cases, label="Choose a Sample Case")
    ],
    outputs=gr.JSON(label="Matching Result"),
    title="Cantonese Homophone and Phonetic Matching 粵語同音異字處理",
    description="Enter Cantonese text or select a sample case, and the app will return a match or the closest matches based on phonetic similarity. 輸入粵語文本或選擇一個範例案例,應用程式將傳回粵拼匹配或基於語音相似的最接近匹配。"
)

# Start the server only when run as a script, so that importing this module
# (for reuse or inspection) does not launch the web app.
if __name__ == "__main__":
    interface.launch()