gyroing committed on
Commit
a82f51b
1 Parent(s): 4e24051

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -1
app.py CHANGED
@@ -5,7 +5,43 @@ from io import BytesIO
5
  from huggingface_hub import hf_hub_download
6
  from piper import PiperVoice
7
  from transformers import pipeline
8
- import hazm_correction
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  def synthesize_speech(text):
11
 
 
5
  from huggingface_hub import hf_hub_download
6
  from piper import PiperVoice
7
  from transformers import pipeline
8
+ import hazm
9
+ import typing
10
+
11
# Shared hazm helpers, created once at import time and reused by the
# text-preprocessing functions below.
normalizer = hazm.Normalizer()
sent_tokenizer = hazm.SentenceTokenizer()
word_tokenizer = hazm.WordTokenizer()

# NOTE(review): this string looks like a Hugging Face repo id plus a file
# name rather than a local path; confirm the model file really exists at this
# relative location at runtime (or fetch it first) before relying on it.
tagger = hazm.POSTagger(
    model="gyroing/PersianTextCorrection_Hazm/pos_tagger.model",
)
18
+
19
def preprocess_text(text: str) -> str:
    """Normalize text with hazm and return it as one processed string.

    The input is normalized, split into sentences, and each sentence is
    split into words. Every sentence's word list is passed through
    ``fix_words`` and re-joined with single spaces; the processed sentences
    are then joined with single spaces as well.

    Args:
        text: Raw input text.

    Returns:
        A single string containing all processed sentences separated by
        single spaces (an empty string when no sentences are found).
    """
    text = normalizer.normalize(text)
    return " ".join(
        " ".join(fix_words(word_tokenizer.tokenize(sentence)))
        for sentence in sent_tokenizer.tokenize(text)
    )
30
+
31
def fix_words(words: typing.List[str]) -> typing.List[str]:
    """Append a kasra to every word whose POS tag ends in ``"Z"``.

    Words are tagged with the module-level ``tagger``. For each word whose
    tag ends in ``"Z"`` (presumably hazm's ezafe marker -- confirm against
    the tagger model) and that does not already end in a kasra:

    * if the word ends in "ه" not preceded by "ا", a zero-width non-joiner
      followed by "ی" is appended first;
    * a kasra is then appended.

    All other words are passed through unchanged. Empty tokens, empty tags
    and one-character words are handled without raising ``IndexError``.

    Args:
        words: Tokens of a single sentence.

    Returns:
        A new list with the same number of tokens, adjusted as described.
    """
    kasra = "\u0650"
    # Zero-width non-joiner (written as an escape so it stays visible in
    # the source) followed by the letter yeh.
    zwnj_yeh = "\u200cی"

    fixed_words = []
    for word, pos in tagger.tag(words):
        # `word and ...` guards against empty tokens; `endswith` is safe on
        # empty strings, unlike indexing with [-1].
        if word and pos.endswith("Z") and not word.endswith(kasra):
            # word[-2:-1] is "" for a one-character word, so a lone "ه"
            # counts as "not preceded by alef" instead of raising.
            if word.endswith("ه") and word[-2:-1] != "ا":
                word += zwnj_yeh
            word += kasra
        fixed_words.append(word)

    return fixed_words
45
 
46
  def synthesize_speech(text):
47