litagin committed on
Commit
a89499b
1 Parent(s): b6b44c3

Delete tools

tools/__init__.py DELETED
@@ -1,3 +0,0 @@
- """
- Tool package
- """
 
tools/classify_language.py DELETED
@@ -1,197 +0,0 @@
- import regex as re
-
- try:
-     from config import config
-
-     LANGUAGE_IDENTIFICATION_LIBRARY = (
-         config.webui_config.language_identification_library
-     )
- except Exception:
-     LANGUAGE_IDENTIFICATION_LIBRARY = "langid"
-
- module = LANGUAGE_IDENTIFICATION_LIBRARY.lower()
-
- langid_languages = [
-     "af",
-     "am",
-     "an",
-     "ar",
-     "as",
-     "az",
-     "be",
-     "bg",
-     "bn",
-     "br",
-     "bs",
-     "ca",
-     "cs",
-     "cy",
-     "da",
-     "de",
-     "dz",
-     "el",
-     "en",
-     "eo",
-     "es",
-     "et",
-     "eu",
-     "fa",
-     "fi",
-     "fo",
-     "fr",
-     "ga",
-     "gl",
-     "gu",
-     "he",
-     "hi",
-     "hr",
-     "ht",
-     "hu",
-     "hy",
-     "id",
-     "is",
-     "it",
-     "ja",
-     "jv",
-     "ka",
-     "kk",
-     "km",
-     "kn",
-     "ko",
-     "ku",
-     "ky",
-     "la",
-     "lb",
-     "lo",
-     "lt",
-     "lv",
-     "mg",
-     "mk",
-     "ml",
-     "mn",
-     "mr",
-     "ms",
-     "mt",
-     "nb",
-     "ne",
-     "nl",
-     "nn",
-     "no",
-     "oc",
-     "or",
-     "pa",
-     "pl",
-     "ps",
-     "pt",
-     "qu",
-     "ro",
-     "ru",
-     "rw",
-     "se",
-     "si",
-     "sk",
-     "sl",
-     "sq",
-     "sr",
-     "sv",
-     "sw",
-     "ta",
-     "te",
-     "th",
-     "tl",
-     "tr",
-     "ug",
-     "uk",
-     "ur",
-     "vi",
-     "vo",
-     "wa",
-     "xh",
-     "zh",
-     "zu",
- ]
-
-
- def classify_language(text: str, target_languages: list = None) -> str:
-     if module == "fastlid" or module == "fasttext":
-         from fastlid import fastlid, supported_langs
-
-         classifier = fastlid
-         if target_languages is not None:
-             target_languages = [
-                 lang for lang in target_languages if lang in supported_langs
-             ]
-             fastlid.set_languages = target_languages
-     elif module == "langid":
-         import langid
-
-         classifier = langid.classify
-         if target_languages is not None:
-             target_languages = [
-                 lang for lang in target_languages if lang in langid_languages
-             ]
-             langid.set_languages(target_languages)
-     else:
-         raise ValueError(f"Wrong module {module}")
-
-     lang = classifier(text)[0]
-
-     return lang
-
-
- def classify_zh_ja(text: str) -> str:
-     for idx, char in enumerate(text):
-         unicode_val = ord(char)
-
-         # Detect Japanese kana (hiragana / katakana)
-         if 0x3040 <= unicode_val <= 0x309F or 0x30A0 <= unicode_val <= 0x30FF:
-             return "ja"
-
-         # Detect CJK ideographs
-         if 0x4E00 <= unicode_val <= 0x9FFF:
-             # Check the following character
-             next_char = text[idx + 1] if idx + 1 < len(text) else None
-
-             if next_char and (
-                 0x3040 <= ord(next_char) <= 0x309F or 0x30A0 <= ord(next_char) <= 0x30FF
-             ):
-                 return "ja"
-
-     return "zh"
-
-
- def split_alpha_nonalpha(text, mode=1):
-     if mode == 1:
-         pattern = r"(?<=[\u4e00-\u9fff\u3040-\u30FF\d\s])(?=[\p{Latin}])|(?<=[\p{Latin}\s])(?=[\u4e00-\u9fff\u3040-\u30FF\d])"
-     elif mode == 2:
-         pattern = r"(?<=[\u4e00-\u9fff\u3040-\u30FF\s])(?=[\p{Latin}\d])|(?<=[\p{Latin}\d\s])(?=[\u4e00-\u9fff\u3040-\u30FF])"
-     else:
-         raise ValueError("Invalid mode. Supported modes are 1 and 2.")
-
-     return re.split(pattern, text)
-
-
- if __name__ == "__main__":
-     text = "这是一个测试文本"
-     print(classify_language(text))
-     print(classify_zh_ja(text))  # "zh"
-
-     text = "これはテストテキストです"
-     print(classify_language(text))
-     print(classify_zh_ja(text))  # "ja"
-
-     text = "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days"
-
-     print(split_alpha_nonalpha(text, mode=1))
-     # output: ['vits', '和', 'Bert-VITS', '2是', 'tts', '模型。花费3', 'days.花费3天。Take 3 days']
-
-     print(split_alpha_nonalpha(text, mode=2))
-     # output: ['vits', '和', 'Bert-VITS2', '是', 'tts', '模型。花费', '3days.花费', '3', '天。Take 3 days']
-
-     text = "vits 和 Bert-VITS2 是 tts 模型。花费3days.花费3天。Take 3 days"
-     print(split_alpha_nonalpha(text, mode=1))
-     # output: ['vits ', '和 ', 'Bert-VITS', '2 ', '是 ', 'tts ', '模型。花费3', 'days.花费3天。Take ', '3 ', 'days']
-
-     text = "vits 和 Bert-VITS2 是 tts 模型。花费3days.花费3天。Take 3 days"
-     print(split_alpha_nonalpha(text, mode=2))
-     # output: ['vits ', '和 ', 'Bert-VITS2 ', '是 ', 'tts ', '模型。花费', '3days.花费', '3', '天。Take ', '3 ', 'days']
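A note on the deleted heuristic: classify_zh_ja labels a CJK ideograph as Japanese only when kana appears at or after it, so kanji-only input falls through to "zh". A minimal sketch of the expected behavior (hypothetical inputs, assuming the module were still importable):

    from tools.classify_language import classify_zh_ja

    print(classify_zh_ja("漢字"))      # "zh": ideographs only, no kana anywhere
    print(classify_zh_ja("漢字です"))  # "ja": an ideograph is followed by kana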
 
tools/log.py DELETED
@@ -1,16 +0,0 @@
- """
- Logger wrapper
- """
- from loguru import logger
- import sys
-
-
- # Remove all default handlers
- logger.remove()
-
- # Add a handler with a custom format on stdout
- log_format = (
-     "<g>{time:MM-DD HH:mm:ss}</g> |<lvl>{level:^8}</lvl>| {file}:{line} | {message}"
- )
-
- logger.add(sys.stdout, format=log_format, backtrace=True, diagnose=True)
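The deleted module configured a process-wide loguru logger as a side effect of import. A minimal usage sketch, assuming tools.log were still on the import path:

    from tools.log import logger

    logger.info("model loaded")
    # prints something like: 01-23 12:34:56 |  INFO  | app.py:10 | model loaded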
 
tools/sentence.py DELETED
@@ -1,173 +0,0 @@
- import logging
-
- import regex as re
-
- from tools.classify_language import classify_language, split_alpha_nonalpha
-
-
- def check_is_none(item) -> bool:
-     """Return True if item is None, an empty string, or whitespace-only."""
-     return (
-         item is None
-         or (isinstance(item, str) and str(item).isspace())
-         or str(item) == ""
-     )
-
-
- def markup_language(text: str, target_languages: list = None) -> str:
-     pattern = (
-         r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
-         r"\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
-         r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
-     )
-     sentences = re.split(pattern, text)
-
-     pre_lang = ""
-     p = 0
-
-     if target_languages is not None:
-         sorted_target_languages = sorted(target_languages)
-         if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
-             new_sentences = []
-             for sentence in sentences:
-                 new_sentences.extend(split_alpha_nonalpha(sentence))
-             sentences = new_sentences
-
-     for sentence in sentences:
-         if check_is_none(sentence):
-             continue
-
-         lang = classify_language(sentence, target_languages)
-
-         if pre_lang == "":
-             text = text[:p] + text[p:].replace(
-                 sentence, f"[{lang.upper()}]{sentence}", 1
-             )
-             p += len(f"[{lang.upper()}]")
-         elif pre_lang != lang:
-             text = text[:p] + text[p:].replace(
-                 sentence, f"[{pre_lang.upper()}][{lang.upper()}]{sentence}", 1
-             )
-             p += len(f"[{pre_lang.upper()}][{lang.upper()}]")
-         pre_lang = lang
-         p += text[p:].index(sentence) + len(sentence)
-     text += f"[{pre_lang.upper()}]"
-
-     return text
-
-
- def split_by_language(text: str, target_languages: list = None) -> list:
-     pattern = (
-         r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
-         r"\!?\。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
-         r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
-     )
-     sentences = re.split(pattern, text)
-
-     pre_lang = ""
-     start = 0
-     end = 0
-     sentences_list = []
-
-     if target_languages is not None:
-         sorted_target_languages = sorted(target_languages)
-         if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
-             new_sentences = []
-             for sentence in sentences:
-                 new_sentences.extend(split_alpha_nonalpha(sentence))
-             sentences = new_sentences
-
-     for sentence in sentences:
-         if check_is_none(sentence):
-             continue
-
-         lang = classify_language(sentence, target_languages)
-
-         end += text[end:].index(sentence)
-         if pre_lang != "" and pre_lang != lang:
-             sentences_list.append((text[start:end], pre_lang))
-             start = end
-         end += len(sentence)
-         pre_lang = lang
-     sentences_list.append((text[start:], pre_lang))
-
-     return sentences_list
-
-
- def sentence_split(text: str, max: int) -> list:
-     pattern = r"[!(),—+\-.:;??。,、;:]+"
-     sentences = re.split(pattern, text)
-     discarded_chars = re.findall(pattern, text)
-
-     sentences_list, count, p = [], 0, 0
-
-     # Walk through the separators that were split out
-     for i, discarded_char in enumerate(discarded_chars):
-         count += len(sentences[i]) + len(discarded_char)
-         if count >= max:
-             sentences_list.append(text[p : p + count].strip())
-             p += count
-             count = 0
-
-     # Append whatever text remains
-     if p < len(text):
-         sentences_list.append(text[p:])
-
-     return sentences_list
-
-
- def sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None):
-     # If the speaker supports only one language
-     if speaker_lang is not None and len(speaker_lang) == 1:
-         if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
-             logging.debug(
-                 f'lang "{lang}" is not in speaker_lang {speaker_lang}, automatically set lang={speaker_lang[0]}'
-             )
-             lang = speaker_lang[0]
-
-     sentences_list = []
-     if lang.upper() != "MIX":
-         if max <= 0:
-             sentences_list.append(
-                 markup_language(text, speaker_lang)
-                 if lang.upper() == "AUTO"
-                 else f"[{lang.upper()}]{text}[{lang.upper()}]"
-             )
-         else:
-             for i in sentence_split(text, max):
-                 if check_is_none(i):
-                     continue
-                 sentences_list.append(
-                     markup_language(i, speaker_lang)
-                     if lang.upper() == "AUTO"
-                     else f"[{lang.upper()}]{i}[{lang.upper()}]"
-                 )
-     else:
-         sentences_list.append(text)
-
-     for i in sentences_list:
-         logging.debug(i)
-
-     return sentences_list
-
-
- if __name__ == "__main__":
-     text = "这几天心里颇不宁静。今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。我悄悄地披了大衫,带上门出去。"
-     print(markup_language(text, target_languages=None))
-     print(sentence_split(text, max=50))
-     print(sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None))
-
-     text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了GAN Duration predictor和transformer flow,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
-     print(split_by_language(text, ["zh", "ja", "en"]))
-
-     text = "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days"
-
-     print(split_by_language(text, ["zh", "ja", "en"]))
-     # output: [('vits', 'en'), ('和', 'ja'), ('Bert-VITS', 'en'), ('2是', 'zh'), ('tts', 'en'), ('模型。花费3', 'zh'), ('days.', 'en'), ('花费3天。', 'zh'), ('Take 3 days', 'en')]
-
-     print(split_by_language(text, ["zh", "en"]))
-     # output: [('vits', 'en'), ('和', 'zh'), ('Bert-VITS', 'en'), ('2是', 'zh'), ('tts', 'en'), ('模型。花费3', 'zh'), ('days.', 'en'), ('花费3天。', 'zh'), ('Take 3 days', 'en')]
-
-     text = "vits 和 Bert-VITS2 是 tts 模型。花费 3 days. 花费 3天。Take 3 days"
-     print(split_by_language(text, ["zh", "en"]))
-     # output: [('vits ', 'en'), ('和 ', 'zh'), ('Bert-VITS2 ', 'en'), ('是 ', 'zh'), ('tts ', 'en'), ('模型。花费 ', 'zh'), ('3 days. ', 'en'), ('花费 3天。', 'zh'), ('Take 3 days', 'en')]
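For reference, sentence_split greedily packs clauses (plus the punctuation that ended them) into chunks of at least max characters. A small worked example with a hypothetical input, if I read the chunking logic right:

    from tools.sentence import sentence_split

    print(sentence_split("一二三。四五六,七八九。", max=5))
    # expected: ['一二三。四五六,', '七八九。']
    # the first chunk closes once its running length (8) reaches max; the tail is appended as-is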
 
tools/translate.py DELETED
@@ -1,61 +0,0 @@
- """
- Translation API
- """
- from config import config
-
- import random
- import hashlib
- import requests
-
-
- def translate(Sentence: str, to_Language: str = "jp", from_Language: str = ""):
-     """
-     :param Sentence: text to translate
-     :param from_Language: language of the source text
-     :param to_Language: target language
-     :return: the translated text, or the original text on error
-
-     Common language codes: Chinese zh, English en, Japanese jp
-     """
-     appid = config.translate_config.app_key
-     key = config.translate_config.secret_key
-     if appid == "" or key == "":
-         return "Please configure app_key and secret_key in config.yml"
-     url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
-     texts = Sentence.splitlines()
-     outTexts = []
-     for t in texts:
-         if t != "":
-             # Sign computation, see https://api.fanyi.baidu.com/product/113
-             salt = str(random.randint(1, 100000))
-             signString = appid + t + salt + key
-             hs = hashlib.md5()
-             hs.update(signString.encode("utf-8"))
-             signString = hs.hexdigest()
-             if from_Language == "":
-                 from_Language = "auto"
-             headers = {"Content-Type": "application/x-www-form-urlencoded"}
-             payload = {
-                 "q": t,
-                 "from": from_Language,
-                 "to": to_Language,
-                 "appid": appid,
-                 "salt": salt,
-                 "sign": signString,
-             }
-             # Send the request
-             try:
-                 response = requests.post(
-                     url=url, data=payload, headers=headers, timeout=3
-                 )
-                 response = response.json()
-                 if "trans_result" in response.keys():
-                     result = response["trans_result"][0]
-                     if "dst" in result.keys():
-                         dst = result["dst"]
-                         outTexts.append(dst)
-             except Exception:
-                 return Sentence
-         else:
-             outTexts.append(t)
-     return "\n".join(outTexts)
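For reference, the Baidu request signing the deleted helper implemented is an MD5 digest over the concatenation appid + query + salt + secret_key, per the API documentation linked in the code. A standalone sketch with placeholder credentials (my_appid / my_secret are hypothetical, configured via config.yml in the real module):

    import hashlib

    appid, key = "my_appid", "my_secret"  # placeholders, not real credentials
    q, salt = "你好", "12345"
    sign = hashlib.md5((appid + q + salt + key).encode("utf-8")).hexdigest()
    # "sign" goes into the POST payload alongside q, from, to, appid, and salt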