Spaces:
Runtime error
Runtime error
Upload CantonesePhonetics.js
Browse files- CantonesePhonetics.js +186 -0
CantonesePhonetics.js
ADDED
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/**
 * Converts Chinese text to Jyutping syllables and ranks a list of saved
 * phrases by how closely they sound (and read) like the user's input.
 *
 * Call `initialize()` before `matchUserInput()`; until then the character
 * dictionary and the saved-results list are empty.
 */
class CantonesePhonetics {
  constructor() {
    // Character -> Jyutping syllable; filled by initialize().
    this.charToJyutping = {};
    // Candidate phrases to match against; filled by initialize().
    this.savedResults = [];
    // Initials (onsets) treated as sounding alike. Lookups are done in both
    // directions, so the table does not have to be symmetric.
    this.similarInitials = {
      b: ["p", "m"],
      c: ["z", "s"],
      d: ["t", "n"],
      f: ["h", "w"],
      g: ["k", "ng"],
      gw: ["kw"],
      h: ["f", "w"],
      j: ["z", "c"],
      k: ["g", "h"],
      kw: ["gw"],
      l: ["n"],
      m: ["n", "b"],
      n: ["l", "m", "ng"],
      ng: ["g", "n"],
      p: ["b", "m"],
      s: ["c", "z"],
      t: ["d", "n"],
      w: ["f", "h"],
      z: ["c", "j"]
    };
    // Finals (rimes) treated as sounding alike; also checked in both directions.
    this.similarFinals = {
      aa: ["a", "aai", "aau"],
      aai: ["aa", "ai"],
      aau: ["aa", "au"],
      ai: ["ei", "aai"],
      au: ["ou", "aau"],
      e: ["i", "ei"],
      ei: ["ai", "i"],
      i: ["e", "ei", "yu"],
      o: ["u", "ou"],
      oi: ["ui"],
      ou: ["u", "au"],
      u: ["o", "ou", "yu"],
      ui: ["oi"],
      yu: ["i", "u"]
    };
  }

  /**
   * Downloads the character dictionary and the saved phrases in parallel and
   * stores them on the instance.
   *
   * @returns {Promise<void>}
   * @throws {Error} If either request fails or answers with a non-2xx status.
   */
  async initialize() {
    const fetchJson = async (url) => {
      const response = await fetch(url);
      // fetch() only rejects on network failure; surface HTTP errors here
      // instead of as an opaque JSON parse error on an error page.
      if (!response.ok) {
        throw new Error(`Failed to load ${url}: ${response.status} ${response.statusText}`);
      }
      return response.json();
    };

    const [jyutpingData, results] = await Promise.all([
      fetchJson("https://huggingface.co/spaces/OttoYu/Cantonese-Phonetics/raw/main/lexi-can_key.json"),
      fetchJson("https://huggingface.co/spaces/OttoYu/Cantonese-Phonetics/raw/main/jyutping_results_largec.json")
    ]);

    this.charToJyutping = this.preprocessJyutpingData(jyutpingData);
    this.savedResults = results;
  }

  /**
   * Inverts a `syllable -> [{ "漢字": characters }, ...]` dictionary into a
   * `character -> syllable` lookup table.
   *
   * When a character is listed under several syllables the last one wins.
   * Entries without an iterable "漢字" value are skipped rather than aborting
   * the whole conversion.
   *
   * @param {Object} jyutpingData Dictionary keyed by syllable.
   * @returns {Object} Lookup table keyed by single character.
   */
  preprocessJyutpingData(jyutpingData) {
    const result = {};
    for (const [syllable, mappings] of Object.entries(jyutpingData)) {
      if (!Array.isArray(mappings)) {
        continue;
      }
      for (const mapping of mappings) {
        const characters = mapping ? mapping["漢字"] : undefined;
        if (typeof characters !== "string" && !Array.isArray(characters)) {
          continue;
        }
        // for...of walks code points, so characters outside the BMP stay whole.
        for (const char of characters) {
          result[char] = syllable;
        }
      }
    }
    return result;
  }

  /**
   * Maps every character of `text` to its Jyutping syllable. Characters that
   * are not in the dictionary are passed through unchanged.
   *
   * @param {string} text
   * @returns {string[]} One entry per character (code point) of `text`.
   */
  chineseToJyutping(text) {
    // Spread instead of split(""): split cuts supplementary-plane characters
    // into two surrogates, which can never match the code-point keys produced
    // by preprocessJyutpingData().
    return [...text].map(char => this.charToJyutping[char] || char);
  }

  /**
   * Splits a syllable into `[initial, final]`.
   *
   * The initial is one of the Jyutping onsets (`ng`, `gw`, `kw` or a single
   * consonant) and is only recognised when a vowel letter follows it.
   * Anything else - vowel-initial syllables, syllabic `m`/`ng`, or characters
   * that were passed through untranslated - yields an empty initial with the
   * whole input as the final. A trailing tone digit, if present, stays part
   * of the final.
   *
   * @param {string} jyutping
   * @returns {[string, string]}
   */
  splitJyutping(jyutping) {
    const onset = /^(ng|gw|kw|[bpmfdtnlgkhwzcsj])(?=[aeiouy])/.exec(jyutping);
    if (onset === null) {
      return ["", jyutping];
    }
    const initial = onset[1];
    return [initial, jyutping.slice(initial.length)];
  }

  /**
   * Tells whether two sounds are identical or listed as similar in `table`
   * (checked in both directions).
   *
   * @param {Object} table `similarInitials` or `similarFinals`.
   * @param {string} a
   * @param {string} b
   * @returns {boolean}
   */
  isRelatedSound(table, a, b) {
    if (a === b) {
      return true;
    }
    // Array.isArray also shields against inherited keys such as "constructor".
    const relatedTo = sound => (Array.isArray(table[sound]) ? table[sound] : []);
    return relatedTo(a).includes(b) || relatedTo(b).includes(a);
  }

  /**
   * Two syllables are similar when both their initials and their finals are
   * identical or listed as similar.
   *
   * @param {string} jyutping1
   * @param {string} jyutping2
   * @returns {boolean}
   */
  areJyutpingSimilar(jyutping1, jyutping2) {
    const [initial1, final1] = this.splitJyutping(jyutping1);
    const [initial2, final2] = this.splitJyutping(jyutping2);

    return (
      this.isRelatedSound(this.similarInitials, initial1, initial2) &&
      this.isRelatedSound(this.similarFinals, final1, final2)
    );
  }

  /**
   * Fraction of syllables that can be paired up between the two lists,
   * relative to the longer list. Always within [0, 1].
   *
   * Every result syllable is used for at most one user syllable (identical
   * syllables are preferred over merely similar ones), so repeated syllables
   * cannot push the score above 1.
   *
   * @param {string[]} userJyutping
   * @param {string[]} resultJyutping
   * @returns {number} 0 when both lists are empty.
   */
  calculatePhoneticSimilarity(userJyutping, resultJyutping) {
    const longest = Math.max(userJyutping.length, resultJyutping.length);
    if (longest === 0) {
      return 0;
    }

    const used = new Set();
    let matched = 0;
    for (const uj of userJyutping) {
      let index = resultJyutping.findIndex((rj, i) => !used.has(i) && rj === uj);
      if (index === -1) {
        index = resultJyutping.findIndex(
          (rj, i) => !used.has(i) && this.areJyutpingSimilar(uj, rj)
        );
      }
      if (index !== -1) {
        used.add(index);
        matched += 1;
      }
    }
    return matched / longest;
  }

  /**
   * Normalised, case-insensitive text similarity: 1 for identical strings
   * (including two empty strings), 0 when every character differs.
   *
   * @param {string} s1
   * @param {string} s2
   * @returns {number}
   */
  similarity(s1, s2) {
    // Measure in the same unit editDistance() uses (lower-cased code points).
    const longerLength = Math.max(
      [...s1.toLowerCase()].length,
      [...s2.toLowerCase()].length
    );
    if (longerLength === 0) {
      return 1.0;
    }
    return (longerLength - this.editDistance(s1, s2)) / longerLength;
  }

  /**
   * Case-insensitive Levenshtein distance, counted in characters (code
   * points) rather than UTF-16 units.
   *
   * @param {string} s1
   * @param {string} s2
   * @returns {number}
   */
  editDistance(s1, s2) {
    const source = [...s1.toLowerCase()];
    const target = [...s2.toLowerCase()];

    // Row-by-row dynamic programming; `previous` is the row for i - 1.
    let previous = Array.from({ length: target.length + 1 }, (_, j) => j);
    for (let i = 1; i <= source.length; i++) {
      const current = [i];
      for (let j = 1; j <= target.length; j++) {
        const substitution = previous[j - 1] + (source[i - 1] === target[j - 1] ? 0 : 1);
        current[j] = Math.min(substitution, previous[j] + 1, current[j - 1] + 1);
      }
      previous = current;
    }
    return previous[target.length];
  }

  /**
   * Returns a record's syllables as an array, or null when the record has
   * none. The data file's schema is not visible from this code; arrays are
   * what the scoring needs, and a whitespace-separated string is accepted too.
   *
   * @param {*} jyutping
   * @returns {string[]|null}
   */
  normalizeJyutping(jyutping) {
    if (Array.isArray(jyutping)) {
      return jyutping;
    }
    if (typeof jyutping === "string" && jyutping.trim() !== "") {
      return jyutping.trim().split(/\s+/);
    }
    return null;
  }

  /**
   * Looks the input up among the saved results.
   *
   * - If some record contains every input syllable, that record is returned
   *   as `{ input_text, input_jyutping, match, match_type: "exact" }`.
   * - Otherwise the three best-scoring records are returned as
   *   `{ input_text, input_jyutping, matches: [{ match, score, match_type }] }`.
   * - Empty input yields an empty `matches` list.
   *
   * Records without usable `jyutping` are ignored; records without `text`
   * are ignored for similarity scoring.
   *
   * @param {string} userInput
   * @returns {Object}
   */
  matchUserInput(userInput) {
    const userJyutping = this.chineseToJyutping(userInput);

    // every() is vacuously true on an empty list, which would otherwise
    // report the first saved record as an "exact" match for empty input.
    if (userJyutping.length === 0) {
      return {
        input_text: userInput,
        input_jyutping: userJyutping,
        matches: []
      };
    }

    const candidates = [];
    for (const result of this.savedResults) {
      const jyutping = result ? this.normalizeJyutping(result.jyutping) : null;
      if (jyutping !== null) {
        candidates.push({ result, jyutping });
      }
    }

    const exact = candidates.find(({ jyutping }) =>
      userJyutping.every(uj => jyutping.includes(uj))
    );
    if (exact) {
      return {
        input_text: userInput,
        input_jyutping: userJyutping,
        match: exact.result,
        match_type: "exact"
      };
    }

    // chineseToJyutping() yields one entry per character of the input.
    const inputLength = userJyutping.length;
    const matches = candidates
      .filter(({ result }) => Boolean(result.text))
      .map(({ result, jyutping }) => {
        const phoneticScore = this.calculatePhoneticSimilarity(userJyutping, jyutping);
        const textSimilarity = this.similarity(userInput, result.text);
        const lengthDiff = Math.abs(inputLength - [...result.text].length);
        const lengthPenalty = 1 / (1 + lengthDiff);

        // Sound dominates; spelling and length act as tie-breakers.
        const totalScore = phoneticScore * 0.7 + textSimilarity * 0.2 + lengthPenalty * 0.1;
        return {
          result,
          score: totalScore
        };
      });

    matches.sort((a, b) => b.score - a.score);
    const topMatches = matches.slice(0, 3);

    return {
      input_text: userInput,
      input_jyutping: userJyutping,
      matches: topMatches.map(match => ({
        match: match.result,
        score: match.score,
        match_type: "phonetic_similarity"
      }))
    };
  }
}
|
185 |
+
|
186 |
+
// Module-level instance. The constructor does no I/O: the instance starts
// with an empty character dictionary and no saved results, both of which are
// populated by `await phonetics.initialize()`.
const phonetics = new CantonesePhonetics();
|