AkitoP committed on
Commit
0888de7
·
1 Parent(s): 5024e84
__pycache__/parse_accent.cpython-310.pyc ADDED
Binary file (1.28 kB). View file
 
__pycache__/surface2katakana_with_acc.cpython-310.pyc ADDED
Binary file (7.97 kB). View file
 
app.py CHANGED
@@ -8,7 +8,8 @@ import librosa
8
  import spaces
9
  import torch
10
  from transformers import pipeline, WhisperConfig
11
-
 
12
  warnings.filterwarnings("ignore")
13
 
14
  is_hf = os.getenv("SYSTEM") == "spaces"
@@ -29,7 +30,7 @@ pipe = pipeline(
29
  def transcribe(audio: str) -> str:
30
  result = pipe(audio, generate_kwargs=generate_kwargs)["text"]
31
  print(result)
32
- return result
33
 
34
 
35
  initial_md = """
@@ -43,6 +44,8 @@ with gr.Blocks() as app:
43
  audio = gr.Audio(type="filepath")
44
  transcribe_btn = gr.Button("Transcribe")
45
  output = gr.Textbox(label="Result")
46
- transcribe_btn.click(fn=transcribe,inputs=[audio], outputs=[output])
 
 
47
 
48
  app.launch(inbrowser=True)
 
8
  import spaces
9
  import torch
10
  from transformers import pipeline, WhisperConfig
11
+ from parse_accent import parse_pitch_accent
12
+ from surface2katakana_with_acc import katakana_to_phones
13
  warnings.filterwarnings("ignore")
14
 
15
  is_hf = os.getenv("SYSTEM") == "spaces"
 
30
  def transcribe(audio: str) -> str:
31
  result = pipe(audio, generate_kwargs=generate_kwargs)["text"]
32
  print(result)
33
+ return result, parse_pitch_accent(result), katakana_to_phones(result)
34
 
35
 
36
  initial_md = """
 
44
  audio = gr.Audio(type="filepath")
45
  transcribe_btn = gr.Button("Transcribe")
46
  output = gr.Textbox(label="Result")
47
+ output_HL_style = gr.Textbox(label="HL Result (SBV2 style)")
48
+ output_UPDOWN_style = gr.Textbox(label="↑↓ Result (GSV style)")
49
+ transcribe_btn.click(fn=transcribe,inputs=[audio], outputs=[output, output_HL_style, output_UPDOWN_style])
50
 
51
  app.launch(inbrowser=True)
parse_accent.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def parse_pitch_accent(s):
    """Convert an accent-annotated kana string into a string of '0'/'1' marks.

    The markers '^', '#' and '$' are discarded. Every remaining character that
    is not an accent arrow (kana, '_' and '?') produces exactly one mark, so
    the result has the same length as the input with all markers and arrows
    removed.

    Rules:
        * '↑' switches the running mark to '1', '↓' switches it to '0'.
        * Characters before the first arrow get '0' if that arrow is '↑' and
          '1' if it is '↓'.
        * When the same arrow occurs twice in a row (ignoring the characters
          in between), the character right before the second arrow is forced
          to '0' for '↑' and to '1' for '↓'.
        * If the string contains no arrow at all, every character is marked
          '0'. Previously this case (and a '_' or '?' placed before the first
          arrow) left ``None`` in the mark list and raised ``TypeError``.

    Args:
        s: Annotated string such as '^ト↓シコニ#ワ↑タシワ$'.

    Returns:
        str: One '0' or '1' per non-marker character.
    """
    text = s.replace('^', '').replace('#', '').replace('$', '')

    marks = []           # one '0'/'1' entry per emitted character
    current_mark = None  # mark applied to the next character; None until known
    last_accent = None   # most recent arrow seen, if any

    for index, char in enumerate(text):
        if char in ('↑', '↓'):
            # Same arrow twice in a row: re-mark the character just before it.
            if char == last_accent and marks:
                marks[-1] = '0' if char == '↑' else '1'
            current_mark = '1' if char == '↑' else '0'
            last_accent = char
            continue

        if current_mark is None:
            # No arrow seen yet: the first upcoming arrow decides the starting
            # level; default to '0' when the string has no arrow at all.
            upcoming = next((c for c in text[index:] if c in ('↑', '↓')), None)
            current_mark = '1' if upcoming == '↓' else '0'

        marks.append(current_mark)

    return ''.join(marks)
53
def katakana_normalize(s):
    """Return *s* with every prosody marker ('^', '#', '↑', '↓', '$') removed.

    All other characters, including '_' and '?', are kept unchanged.
    """
    drop_markers = {ord(marker): None for marker in "^#↑↓$"}
    return s.translate(drop_markers)
55
+ # Example usage
56
+ # input_str = '^ト↓シコニ#ワ↑タシワ_ホ↓ボ#マ↓イニチ_オ↑ニ↓イソンニ#ナ↑クダシオ#サ↑レテマスシ$'
57
+ # output = parse_pitch_accent(input_str)
58
+ # output_str = katakana_normalize(input_str)
59
+ # print(output_str)
60
+ # assert len(output) == len(output_str)
61
+ # for i in range(len(output)):
62
+ # print(f"{output_str[i]}: {output[i]}")
requirements.txt CHANGED
@@ -3,4 +3,5 @@ librosa
3
  numpy
4
  spaces
5
  torch
6
- transformers
 
 
3
  numpy
4
  spaces
5
  torch
6
+ transformers
7
+ pyopenjtalk
surface2katakana_with_acc.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pyopenjtalk
2
+ import re
3
+ import sys
4
+ import os
5
+
6
# Temporarily silence stdout and stderr around a noisy call.
# The devnull handle is closed when the ``with`` block exits, and the streams
# that were installed before (which may be wrappers, not sys.__stdout__ /
# sys.__stderr__) are put back even if the silenced call raises.
_saved_stdout, _saved_stderr = sys.stdout, sys.stderr
with open(os.devnull, 'w') as _devnull:
    sys.stdout = sys.stderr = _devnull
    try:
        # Call the function that produces the warning
        # e.g., pyopenjtalk.some_function()
        pass
    finally:
        # Restore stdout and stderr
        sys.stdout, sys.stderr = _saved_stdout, _saved_stderr
16
+
17
# Hiragana -> katakana translation table.
# The hiragana code points ぁ (U+3041) .. ゖ (U+3096) are contiguous and each
# one sits exactly 0x60 below its katakana counterpart ァ (U+30A1) .. ヶ (U+30F6).
hiragana_to_katakana = {
    code_point: code_point + 0x60
    for code_point in range(ord("ぁ"), ord("ゖ") + 1)
}


def hiragana_to_katakana_func(text):
    """Return *text* with every hiragana character replaced by its katakana.

    Characters outside the hiragana range are left untouched.
    """
    converted = text.translate(hiragana_to_katakana)
    return converted
32
+
33
def split_into_moras(kana):
    """Split a kana string into a list of morae.

    A mora is one kana character (or the long-vowel mark 'ー') optionally
    followed by a single small katakana from ァィゥェォャュョ, which covers
    contracted sounds such as 'キャ'. Characters the pattern does not match
    are silently dropped from the result.
    """
    return re.findall(r"(?:[ァ-ヴー]|[ぁ-ゖ]|ー)[ァィゥェォャュョ]?|ー", kana)
41
+
42
def annotate_kana_with_accent(moras, acc):
    """Join *moras* into one string, inserting pitch arrows according to *acc*.

    * ``acc == 0``: '↑' after the first mora, no fall.
    * ``acc == 1``: '↓' after the first mora.
    * ``acc > 1``: '↑' after the first mora and '↓' after the ``acc``-th mora.
    """
    rises_after_first = acc == 0 or acc > 1

    pieces = []
    for position, mora in enumerate(moras, start=1):
        if position == 1 and rises_after_first:
            arrow = '↑'
        elif acc > 0 and position == acc:
            arrow = '↓'
        else:
            arrow = ''
        pieces.append(mora + arrow)
    return ''.join(pieces)
57
+
58
def get_katakana_with_accent(text):
    """Return the katakana reading of *text* annotated with pitch arrows.

    The text is analysed with ``pyopenjtalk.run_frontend``; each token is
    handled according to its ``mora_size``:

    * more than one mora: the token's ``pron`` is re-read with
      ``pyopenjtalk.g2p(..., kana=True)``, converted to katakana, split into
      morae and annotated with '↑'/'↓' from the token's ``acc`` value;
    * zero morae (punctuation and similar), or a ``pron`` that is just '’':
      the token's original ``string`` is copied through;
    * otherwise (a single mora): the token's ``pron`` is copied through.

    Finally every '’' left in the result is replaced by '↑'. The original
    code called ``result.replace`` without using its return value, so this
    replacement never took effect.
    """
    pieces = []
    for token in pyopenjtalk.run_frontend(text):
        mora_size = token['mora_size']
        if mora_size > 1:
            pron = token['pron']
            acc = token['acc']
            # Kana reading of the pronunciation, normalised to katakana.
            kana = hiragana_to_katakana_func(pyopenjtalk.g2p(pron, kana=True))
            # Split into morae so the arrows land on mora boundaries.
            moras = split_into_moras(kana)
            pieces.append(annotate_kana_with_accent(moras, acc))
        elif mora_size == 0 or token['pron'] == '’':
            # Punctuation and the like: keep the surface string as is.
            pieces.append(token['string'])
        else:
            pieces.append(token['pron'])
    # str.replace returns a new string; the result must be kept.
    return ''.join(pieces).replace('’', '↑')
87
+
88
+ import pyopenjtalk
89
+ import re
90
def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
    """Extract phoneme + prosody symbol sequence from input full-context labels.

    The algorithm is based on `Prosodic features control by symbols as input of
    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

    Symbols emitted besides phonemes: '^' (leading silence), '$' or '?'
    (trailing silence, statement or question), '_' (pause), '#' (accent
    phrase border), '[' (pitch rising) and ']' (pitch falling).

    Args:
        text (str): Input text.
        drop_unvoiced_vowels (bool): When true, the upper-case (unvoiced)
            vowels A/E/I/O/U are lower-cased, i.e. treated as normal vowels.

    Returns:
        List[str]: List of phoneme + prosody symbols.

    Examples:
        >>> pyopenjtalk_g2p_prosody("こんにちは。")
        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104

    """
    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
    last_index = len(labels) - 1

    phones = []
    for index, label in enumerate(labels):
        # Current phoneme: the field between '-' and '+' in the label.
        phoneme = re.search(r"\-(.*?)\+", label).group(1)
        # Treat unvoiced vowels as normal vowels.
        if drop_unvoiced_vowels and phoneme in "AEIOU":
            phoneme = phoneme.lower()

        # Silence is only expected at the very beginning and end of the text.
        if phoneme == "sil":
            assert index == 0 or index == last_index
            if index == 0:
                phones.append("^")
            elif index == last_index:
                # Question form or not.
                e3 = _numeric_feature_by_regex(r"!(\d+)_", label)
                if e3 == 0:
                    phones.append("$")
                elif e3 == 1:
                    phones.append("?")
            continue

        if phoneme == "pau":
            phones.append("_")
            continue

        phones.append(phoneme)

        # Accent type and position info (forward or backward).
        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", label)
        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", label)
        a3 = _numeric_feature_by_regex(r"\+(\d+)/", label)

        # Number of morae in the accent phrase.
        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", label)

        # Forward position of the following label; a label always follows
        # here because the sil/pau branches above have already continued.
        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[index + 1])

        if a3 == 1 and a2_next == 1 and phoneme in "aeiouAEIOUNcl":
            # Accent phrase border.
            phones.append("#")
        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
            # Pitch falling.
            phones.append("]")
        elif a2 == 1 and a2_next == 2:
            # Pitch rising.
            phones.append("[")

    return phones
165
+
166
def _numeric_feature_by_regex(regex, s):
    """Return capture group 1 of *regex* in *s* as an int, or -50 if no match."""
    found = re.search(regex, s)
    return int(found.group(1)) if found is not None else -50
171
+ import pyopenjtalk
172
def build_phone_to_katakana():
    """Build the phoneme <-> katakana lookup tables with pyopenjtalk.

    Every entry of the katakana inventory below is passed to
    ``pyopenjtalk.g2p`` and the returned phoneme string becomes its value.

    Returns:
        tuple: ``(phone_to_katakana, katakana_to_phone)`` where
        ``katakana_to_phone`` maps each kana unit to its phoneme string and
        ``phone_to_katakana`` is the inverse mapping.
    """
    # All basic katakana syllables (some contracted forms are listed twice;
    # the repeats are harmless because both tables are keyed dictionaries).
    basic_katakana = [
        'ア', 'イ', 'ウ', 'エ', 'オ',
        'カ', 'キ', 'ク', 'ケ', 'コ',
        'サ', 'シ', 'ス', 'セ', 'ソ',
        'タ', 'チ', 'ツ', 'テ', 'ト',
        'ナ', 'ニ', 'ヌ', 'ネ', 'ノ',
        'ハ', 'ヒ', 'フ', 'ヘ', 'ホ',
        'マ', 'ミ', 'ム', 'メ', 'モ',
        'ヤ', 'ユ', 'ヨ',
        'ラ', 'リ', 'ル', 'レ', 'ロ',
        'ワ', 'ヲ', 'ン',
        'ガ', 'ギ', 'グ', 'ゲ', 'ゴ',
        'ザ', 'ジ', 'ズ', 'ゼ', 'ゾ',
        'ダ', 'ヂ', 'ヅ', 'デ', 'ド',
        'バ', 'ビ', 'ブ', 'ベ', 'ボ',
        'パ', 'ピ', 'プ', 'ペ', 'ポ',
        'キャ', 'キュ', 'キョ',
        'シャ', 'シュ', 'ショ',
        'チャ', 'チュ', 'チョ',
        'ニャ', 'ニュ', 'ニョ',
        'ヒャ', 'ヒュ', 'ヒョ',
        'ミャ', 'ミュ', 'ミョ',
        'リャ', 'リュ', 'リョ',
        'ギャ', 'ギュ', 'ギョ',
        'ジャ', 'ジュ', 'ジョ',
        'ビャ', 'ビュ', 'ビョ',
        'ピャ', 'ピュ', 'ピョ',
        'ヴァ', 'ヴィ', 'ヴ', 'ヴェ', 'ヴォ',
        'ファ', 'フィ', 'フェ', 'フォ',
        'ウィ', 'ウェ', 'ウォ',
        'ティ', 'トゥ',
        'ディ', 'ドゥ',
        'ツァ', 'ツィ', 'ツェ', 'ツォ',
        'デュ', 'デョ',
        'ジェ', 'ジョ',
        'チェ', 'チョ',
        'シェ', 'ショ',
        'ヂェ', 'ヂョ',
        'ヒェ', 'ヒョ',
        'ビェ', 'ビョ',
        'ピェ', 'ピョ',
        'キェ', 'キョ',
        'ギェ', 'ギョ',
        'ミェ', 'ミョ',
        'リェ', 'リョ',
        'アァ', 'イィ', 'ウゥ', 'エェ', 'オォ',
        'ヴャ', 'ヴュ', 'ヴョ',
        'ッ', 'ー'
    ]

    # Katakana unit -> phoneme string as returned by pyopenjtalk.g2p.
    # NOTE(review): strip('') removes nothing (an empty character set); the
    # original comment said it should drop leading/trailing "pau" marks.
    # Kept as is to preserve behavior -- confirm what was intended.
    katakana_to_phone = {
        kana: pyopenjtalk.g2p(kana).strip('') for kana in basic_katakana
    }

    # Inverse table. When several kana share one phoneme string, the kana
    # listed LAST wins.
    # NOTE(review): the original comment here read "check whether the same
    # phoneme mapping already exists", but no such check is performed --
    # confirm whether first-wins was intended before changing it.
    phone_to_katakana = {
        phones: kana for kana, phones in katakana_to_phone.items()
    }
    return phone_to_katakana, katakana_to_phone
244
def phones_list_to_katakana(phone_list, phone_to_katakana):
    """Convert a phoneme/prosody symbol list into an annotated katakana string.

    Prosody symbols are copied through unchanged. Phonemes are matched
    greedily: at each position the longest run of up to five phonemes whose
    space-joined form is a key of *phone_to_katakana* is replaced by its
    kana. A phoneme that cannot be matched is reported on stdout and skipped.

    Args:
        phone_list: Sequence of phoneme and prosody symbol strings.
        phone_to_katakana: Mapping from space-joined phoneme strings to kana.

    Returns:
        str: The katakana string with '[' rewritten to '↑' and ']' to '↓',
        or "…" when nothing could be produced.
    """
    special_symbols = {'^', '_', '[', ']', '#', '$', '?', '↑', '↓'}
    max_match_length = 5

    pieces = []
    i = 0
    length = len(phone_list)
    while i < length:
        phone = phone_list[i]
        if phone in special_symbols:
            pieces.append(phone)
            i += 1
            continue

        # Longest match first; never look past the end of the list.
        for size in range(min(max_match_length, length - i), 0, -1):
            candidate = ' '.join(phone_list[i:i + size])
            if candidate in phone_to_katakana:
                pieces.append(phone_to_katakana[candidate])
                i += size
                break
        else:
            # The size-1 attempt above already covered the single phoneme, so
            # there is nothing further to try. The message text ("unmappable
            # phoneme") was mojibake in the source and is restored here.
            print(f"无法映射的音素: {phone}")
            i += 1

    output = ''.join(pieces)
    if not output:
        return "…"
    return output.replace("[", "↑").replace("]", "↓")
278
def katakana_to_phones_list(katakana_list, katakana_to_phone):
    """Convert a list of katakana characters into a list of phoneme strings.

    Prosody symbols are copied through unchanged. Kana are matched greedily:
    at each position the longest run of up to five characters whose
    concatenation is a key of *katakana_to_phone* is replaced by its phoneme
    string. A character that cannot be matched is reported on stdout and
    skipped.

    Returns:
        list: Phoneme strings and prosody symbols in order, or ``["…"]``
        when nothing could be produced.
    """
    markers = {'^', '_', '[', ']', '#', '$', '?', '↑', '↓'}
    window = 5
    total = len(katakana_list)

    phones = []
    pos = 0
    while pos < total:
        current = katakana_list[pos]
        if current in markers:
            phones.append(current)
            pos += 1
            continue

        # Try the longest candidate first, shrinking down to one character.
        for span in range(window, 0, -1):
            if pos + span > total:
                continue
            chunk = ''.join(katakana_list[pos:pos + span])
            if chunk in katakana_to_phone:
                phones.append(katakana_to_phone[chunk])
                pos += span
                break
        else:
            print(f"无法映射的片假名: {current}")
            pos += 1

    return phones if phones else ["…"]
311
+
312
# Both lookup tables are built once, when the module is imported.
phone_to_katakana, katakana_to_phone = build_phone_to_katakana()


def surface_to_katakana_with_accent(text):
    """Return the accent-annotated katakana transcription of *text*.

    Every "…" is removed from the text first; the remainder is turned into
    phonemes plus prosody symbols and then mapped to katakana.
    """
    cleaned = text.replace("…", "")
    return phones_list_to_katakana(
        pyopenjtalk_g2p_prosody(cleaned), phone_to_katakana
    )
318
+
319
def katakana_to_phones(katakana, katakana_to_phone = katakana_to_phone):
    """Convert an accent-annotated katakana string into spaced phonemes.

    The string is split into characters and mapped with
    ``katakana_to_phones_list``. The markers '^', '#' and '$' are then
    removed from the space-joined result, while the other prosody symbols
    ('_', '?', '↑', '↓', ...) are kept.

    Removing a marker leaves two adjacent spaces behind. The visible
    ``replace(" ", " ")`` did not collapse them, so whitespace is now
    normalised explicitly: tokens are separated by exactly one space, with
    no leading or trailing space.

    Args:
        katakana: Annotated katakana string, e.g. '^キョ↓オワ#ワ↑タシノ$'.
        katakana_to_phone: Mapping from kana units to phoneme strings;
            defaults to the module-level table.

    Returns:
        str: Space-separated phonemes and prosody symbols.
    """
    phone_list = katakana_to_phones_list(list(katakana), katakana_to_phone)
    joined = ' '.join(phone_list)
    for marker in ('^', '#', '$'):
        joined = joined.replace(marker, '')
    # split() with no argument drops empty fields, collapsing space runs.
    return ' '.join(joined.split())
323
+
324
+ # 处理文本中的标点符号和空格
325
+ # def preprocess_text(text):
326
+ # # 定义日语字符的正则表达式
327
+ # japanese_characters = re.compile(
328
+ # r"[ぁ-ゟ゠-ヿ一-龯]"
329
+ # )
330
+ # # 定义非日语字符(包括标点符号、空格等)的正则表达式
331
+ # non_japanese_characters = re.compile(
332
+ # r"[^ぁ-ゟ゠-ヿ一-龯]+"
333
+ # )
334
+ # sentences = re.split(non_japanese_characters, text)
335
+ # marks = re.findall(non_japanese_characters, text)
336
+ # processed_text = []
337
+ # for i, sentence in enumerate(sentences):
338
+ # if sentence:
339
+ # annotated_sentence = get_katakana_with_accent(sentence)
340
+ # processed_text.append(annotated_sentence)
341
+ # if i < len(marks):
342
+ # mark = marks[i]
343
+ # if mark.strip():
344
+ # processed_text.append(mark)
345
+ # temp = ''.join(processed_text)
346
+ # return_text = temp.replace("’", "↑")
347
+ # return return_text
348
def preprocess_text(text):
    """Return *text* as accent-annotated katakana.

    Thin wrapper around ``surface_to_katakana_with_accent``.
    """
    annotated = surface_to_katakana_with_accent(text)
    return annotated
351
# Example usage: print the phoneme sequence for one annotated sample.
if __name__ == "__main__":
    sample = "^キョ↓オワ#ワ↑タシノ#マ↑ホオ#エ↑ネル↓キイノ#ホ↑キュウノ#タ↑メ↓ギ$"
    print(katakana_to_phones(sample))