PoTaTo721 committed on
Commit
efef05d
•
1 Parent(s): 574a682

Fix text clean.py

Browse files
Files changed (2) hide show
  1. app.py +3 -0
  2. fish_speech/text/clean.py +9 -47
app.py CHANGED
@@ -1,4 +1,7 @@
1
  import os
 
 
 
2
  import queue
3
  from huggingface_hub import snapshot_download
4
  import hydra
 
1
  import os
2
+
3
+ os.environ["TORCHAUDIO_USE_FFMPEG"] = "1"
4
+
5
  import queue
6
  from huggingface_hub import snapshot_download
7
  import hydra
fish_speech/text/clean.py CHANGED
@@ -1,61 +1,24 @@
1
- import itertools
2
  import re
3
 
4
- LANGUAGE_UNICODE_RANGE_MAP = {
5
- "ZH": [(0x4E00, 0x9FFF)],
6
- "JP": [(0x4E00, 0x9FFF), (0x3040, 0x309F), (0x30A0, 0x30FF), (0x31F0, 0x31FF)],
7
- "EN": [(0x0000, 0x007F)],
8
- }
9
-
10
  SYMBOLS_MAPPING = {
11
- ":": ",",
12
- "；": ",",
13
- ",": ",",
14
- "。": ".",
15
- "!": "!",
16
- "?": "?",
17
- "\n": ".",
18
- "·": ",",
19
- "、": ",",
20
- "...": "…",
21
  "“": "'",
22
  "”": "'",
23
  "‘": "'",
24
  "’": "'",
25
- "(": "'",
26
- "）": "'",
27
- "(": "'",
28
- ")": "'",
29
- "《": "'",
30
- "》": "'",
31
- "【": "'",
32
- "】": "'",
33
- "[": "'",
34
- "]": "'",
35
- "—": "-",
36
- "~": "-",
37
- "~": "-",
38
- "・": "-",
39
- "「": "'",
40
- "」": "'",
41
- ";": ",",
42
- ":": ",",
43
  }
44
 
45
  REPLACE_SYMBOL_REGEX = re.compile(
46
  "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
47
  )
48
- ALL_KNOWN_UTF8_RANGE = list(
49
- itertools.chain.from_iterable(LANGUAGE_UNICODE_RANGE_MAP.values())
50
- )
51
- REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
52
- "[^"
53
- + "".join(
54
- f"{re.escape(chr(start))}-{re.escape(chr(end))}"
55
- for start, end in ALL_KNOWN_UTF8_RANGE
56
- )
57
- + "]"
58
- )
59
 
60
 
61
  def clean_text(text):
@@ -64,6 +27,5 @@ def clean_text(text):
64
 
65
  # Replace all chinese symbols with their english counterparts
66
  text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
67
- # text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
68
 
69
  return text
 
 
1
  import re
2
 
 
 
 
 
 
 
3
  SYMBOLS_MAPPING = {
 
 
 
 
 
 
 
 
 
 
4
  "“": "'",
5
  "”": "'",
6
  "‘": "'",
7
  "’": "'",
8
+ "【": "",
9
+ "】": "",
10
+ "[": "",
11
+ "]": "",
12
+ "(": "",
13
+ "）": "",
14
+ "(": "",
15
+ ")": "",
16
+ "・": "·",
 
 
 
 
 
 
 
 
 
17
  }
18
 
19
  REPLACE_SYMBOL_REGEX = re.compile(
20
  "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
21
  )
 
 
 
 
 
 
 
 
 
 
 
22
 
23
 
24
  def clean_text(text):
 
27
 
28
  # Replace all chinese symbols with their english counterparts
29
  text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
 
30
 
31
  return text