shakhovak committed on
Commit
d2febe5
·
1 Parent(s): 96fdf72

update with new dicts

Browse files
dictionaries/{accents.json → file_norm.json} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:adb807918505efc4f2707e6536f52951e2be3bc3f714a7285fecdc7434c7f7b8
3
- size 178733505
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebf4187d80e9702f94253d81a48fa3a14d484e2befaeb939fdca99eb6c42f1d5
3
+ size 178087540
dictionaries/{omographs.json → file_omo.json} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:055502f1eb755ff7f1cd8831d7b44105dd0c190183f81312f209db35885d4ad6
3
- size 1552979
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba98b20c885cee2f54da731bb068df53fa6960bd3c8ef36417d8f6ffc90acbff
3
+ size 4240115
ruaccent.py CHANGED
@@ -8,25 +8,34 @@ from text_split import split_by_sentences
8
 
9
  class RUAccent:
10
  vowels = "аеёиоуыэюя"
 
11
  def __init__(self):
12
  self.omographs = None
13
  self.accents = None
14
  self.workdir = os.getcwd()
15
 
16
-
17
  def load(self, custom_accent=None, custom_omographs=None):
18
-
19
  if custom_omographs is None:
20
  custom_omographs = {}
21
 
22
  if custom_accent is None:
23
  custom_accent = {}
24
 
25
- self.omographs = json.load(open(join_path(self.workdir, "dictionaries", "omographs.json"), encoding='utf-8'))
 
 
 
 
 
26
 
27
  self.omographs.update(custom_omographs)
28
 
29
- self.accents = json.load(open(join_path(self.workdir, "dictionaries", "accents.json"), encoding='utf-8'))
 
 
 
 
 
30
 
31
  self.accents.update(custom_accent)
32
 
@@ -55,24 +64,24 @@ class RUAccent:
55
  outputs = []
56
  for sentence in sentences:
57
  text = self.split_by_words(sentence)
58
- # processed_text = self._process_yo(text)
59
 
60
- # processed_text = self._process_omographs(text)
61
  founded_omographs = self._process_omographs(text)
62
  omographs_list.extend(founded_omographs)
63
 
64
- processed_text, unknown_words = self._process_accent(text, founded_omographs)
 
 
65
  unknown_list.extend(unknown_words)
66
 
67
  processed_text = " ".join(processed_text)
68
  processed_text = self.delete_spaces_before_punc(processed_text)
69
- # outputs.append(processed_text)
70
 
71
  accented_sentence.append(processed_text)
72
- # " ".join(outputs)
73
 
74
- omographs_list = [f"{key}: {value}" for elem in omographs_list for key, value in elem.items()]
75
- return accented_sentence, omographs_list, unknown_list
 
 
76
 
77
  def _process_yo(self, text):
78
  splitted_text = text
@@ -88,16 +97,8 @@ class RUAccent:
88
  for i, word in enumerate(splitted_text):
89
  variants = self.omographs.get(word)
90
  if variants:
91
- founded_omographs.append(
92
- {word: variants}
93
- )
94
-
95
 
96
- # for omograph in founded_omographs:
97
- # splitted_text[omograph["position"]] = f"<w>{splitted_text[omograph['position']]}</w>"
98
- # cls = omograph["variants"][0] # Just take the first variant from the dictionary
99
- # splitted_text[omograph["position"]] = cls
100
- # return splitted_text
101
  return founded_omographs
102
 
103
  def _process_accent(self, text, founded_omographs):
@@ -105,23 +106,17 @@ class RUAccent:
105
  unknown_words = []
106
  for i, word in enumerate(splitted_text):
107
  stressed_word = self.accents.get(word, word)
108
- if stressed_word == word:
109
- # if len(word) > 4:
110
- if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
111
- unknown_words.append(word)
112
- splitted_text[i] = word
113
 
114
- elif stressed_word != word and word in [list(d.keys())[0] for d in founded_omographs]:
115
  splitted_text[i] = word
116
 
117
- else:
118
- splitted_text[i] = stressed_word
119
 
120
-
121
-
122
-
123
- # stressed_word = self.accents.get(word, word)
124
- # splitted_text[i] = stressed_word
125
 
126
  return splitted_text, unknown_words
127
 
@@ -136,7 +131,7 @@ class RUAccent:
136
  # ru_accent = RUAccent()
137
  # ru_accent.load()
138
  #
139
- # text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига из-за этого сучонка"
140
  # processed_text = ru_accent.process_all(text_to_process)
141
  #
142
  # print(processed_text)
 
8
 
9
  class RUAccent:
10
  vowels = "аеёиоуыэюя"
11
+
12
  def __init__(self):
13
  self.omographs = None
14
  self.accents = None
15
  self.workdir = os.getcwd()
16
 
 
17
  def load(self, custom_accent=None, custom_omographs=None):
 
18
  if custom_omographs is None:
19
  custom_omographs = {}
20
 
21
  if custom_accent is None:
22
  custom_accent = {}
23
 
24
+ self.omographs = json.load(
25
+ open(
26
+ join_path(self.workdir, "dictionaries", "file_omo.json"),
27
+ encoding="utf-8",
28
+ )
29
+ )
30
 
31
  self.omographs.update(custom_omographs)
32
 
33
+ self.accents = json.load(
34
+ open(
35
+ join_path(self.workdir, "dictionaries", "file_norm.json"),
36
+ encoding="utf-8",
37
+ )
38
+ )
39
 
40
  self.accents.update(custom_accent)
41
 
 
64
  outputs = []
65
  for sentence in sentences:
66
  text = self.split_by_words(sentence)
 
67
 
 
68
  founded_omographs = self._process_omographs(text)
69
  omographs_list.extend(founded_omographs)
70
 
71
+ processed_text, unknown_words = self._process_accent(
72
+ text, founded_omographs
73
+ )
74
  unknown_list.extend(unknown_words)
75
 
76
  processed_text = " ".join(processed_text)
77
  processed_text = self.delete_spaces_before_punc(processed_text)
 
78
 
79
  accented_sentence.append(processed_text)
 
80
 
81
+ omographs_list = [
82
+ f"{key}: {value}" for elem in omographs_list for key, value in elem.items()
83
+ ]
84
+ return accented_sentence, list(set(omographs_list)), list(set(unknown_list))
85
 
86
  def _process_yo(self, text):
87
  splitted_text = text
 
97
  for i, word in enumerate(splitted_text):
98
  variants = self.omographs.get(word)
99
  if variants:
100
+ founded_omographs.append({word: self.omographs[word]["acc_variants"]})
 
 
 
101
 
 
 
 
 
 
102
  return founded_omographs
103
 
104
  def _process_accent(self, text, founded_omographs):
 
106
  unknown_words = []
107
  for i, word in enumerate(splitted_text):
108
  stressed_word = self.accents.get(word, word)
 
 
 
 
 
109
 
110
+ if stressed_word in [list(d.keys())[0] for d in founded_omographs]:
111
  splitted_text[i] = word
112
 
113
+ elif stressed_word != word:
114
+ splitted_text[i] = stressed_word["accent"]
115
 
116
+ else:
117
+ if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
118
+ unknown_words.append(word)
119
+ splitted_text[i] = word
 
120
 
121
  return splitted_text, unknown_words
122
 
 
131
  # ru_accent = RUAccent()
132
  # ru_accent.load()
133
  #
134
+ # text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига нига нига из-за этого сучонка"
135
  # processed_text = ru_accent.process_all(text_to_process)
136
  #
137
  # print(processed_text)
templates/index.html CHANGED
@@ -13,4 +13,11 @@
13
  <input type="submit" value="Process">
14
  </form>
15
  </body>
 
 
 
 
 
 
 
16
  </html>
 
13
  <input type="submit" value="Process">
14
  </form>
15
  </body>
16
+ <body>
17
+ <h1>Upload a Text File</h1>
18
+ <form action="/upload" method="post" enctype="multipart/form-data">
19
+ <input type="file" name="file" accept=".txt" required>
20
+ <button type="submit">Upload</button>
21
+ </form>
22
+ </body>
23
  </html>
web_interface.py CHANGED
@@ -8,37 +8,83 @@ app = Flask(__name__)
8
  ru_accent = RUAccent()
9
  ru_accent.load()
10
 
11
- @app.route('/')
 
12
  def index():
13
- return render_template('index.html')
 
14
 
15
- @app.route('/process', methods=['POST'])
16
  def process():
17
- if request.method == 'POST':
18
- input_text = request.form['input_text']
19
  processed_text = ru_accent.process_all(input_text)
20
 
21
  # Create three text files with the same content
22
 
23
- file_name = 'accented_text.txt'
24
- with open(file_name, 'w', encoding="utf-8") as file:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  file.write(" ".join(processed_text[0]))
26
 
27
- file_name = 'omographs.txt'
28
- with open(file_name, 'w', encoding="utf-8") as file:
29
  file.write("\n".join(processed_text[1]))
30
 
31
- file_name = 'unknown.txt'
32
- with open(file_name, 'w', encoding="utf-8") as file:
33
  file.write("\n".join(processed_text[2]))
34
 
 
35
 
36
- return render_template('result.html')
 
37
 
38
- @app.route('/download/<file_name>')
 
39
  def download(file_name):
40
- file_name = f'{file_name}'
41
- return send_file(file_name, as_attachment=True, download_name=f'{file_name}')
 
42
 
43
- if __name__ == '__main__':
44
- app.run(debug=True, host='0.0.0.0', port=7860)
 
8
  ru_accent = RUAccent()
9
  ru_accent.load()
10
 
11
+
12
@app.route("/")
def index():
    """Render the landing page template, ``index.html``."""
    landing_page = render_template("index.html")
    return landing_page
15
+
16
 
17
@app.route("/process", methods=["POST"])
def process():
    """Accent the text posted in the ``input_text`` form field.

    The text is passed to ``ru_accent.process_all`` and the three parts of
    its result are written, in order, to ``accented_text.txt`` (joined with
    spaces), ``omographs.txt`` and ``unknown.txt`` (joined with newlines).
    The files are opened in ``"w"`` mode, so earlier results are overwritten.
    Finally the ``result.html`` template is rendered.
    """
    if request.method == "POST":
        submitted_text = request.form["input_text"]
        result = ru_accent.process_all(submitted_text)

        # One output file per part of the result: (file name, joiner).
        targets = (
            ("accented_text.txt", " "),
            ("omographs.txt", "\n"),
            ("unknown.txt", "\n"),
        )
        for position, (target_name, joiner) in enumerate(targets):
            with open(target_name, "w", encoding="utf-8") as handle:
                handle.write(joiner.join(result[position]))

        return render_template("result.html")
38
+
39
+
40
@app.route("/upload", methods=["POST"])
def upload():
    """Accent the contents of an uploaded ``.txt`` file.

    The upload is read from the ``file`` part of the request, decoded as
    UTF-8 and passed to ``ru_accent.process_all``. The three parts of the
    result are written to ``accented_text.txt`` (joined with spaces),
    ``omographs.txt`` and ``unknown.txt`` (joined with newlines), overwriting
    earlier results, and ``result.html`` is rendered.

    Returns:
        The rendered ``result.html`` page on success, or a plain-text error
        message with HTTP status 400 when the file part is missing, no file
        was selected, the name does not end in ``.txt``, or the content is
        not valid UTF-8.
    """
    # The request must carry a "file" part at all.
    if "file" not in request.files:
        return "No file part", 400

    uploaded = request.files["file"]
    uploaded_name = uploaded.filename or ""

    # An empty file name means the form was submitted without choosing a file.
    if uploaded_name == "":
        return "No selected file", 400

    # Accept the extension regardless of letter case (e.g. "NOTES.TXT").
    if not uploaded_name.lower().endswith(".txt"):
        return "Invalid file format. Please upload a text file.", 400

    # Decode straight from the request stream. The client-supplied file name
    # is never used as a path on the server, so it cannot point outside the
    # working directory or overwrite application files.
    try:
        content = uploaded.stream.read().decode("utf-8")
    except UnicodeDecodeError:
        return "Invalid file encoding. Please upload a UTF-8 text file.", 400

    # Match text-mode reading, which translates all line endings to "\n".
    content = content.replace("\r\n", "\n").replace("\r", "\n")

    processed_text = ru_accent.process_all(content)

    # One output file per part of the result: (file name, joiner).
    targets = (
        ("accented_text.txt", " "),
        ("omographs.txt", "\n"),
        ("unknown.txt", "\n"),
    )
    for position, (target_name, joiner) in enumerate(targets):
        with open(target_name, "w", encoding="utf-8") as handle:
            handle.write(joiner.join(processed_text[position]))

    return render_template("result.html")
81
 
82
+
83
@app.route("/download/<file_name>")
def download(file_name):
    """Send one of the generated result files as an attachment.

    Args:
        file_name: Name taken from the URL. Only ``accented_text.txt``,
            ``omographs.txt`` and ``unknown.txt`` are served.

    Returns:
        The file as an attachment, or a plain-text message with HTTP status
        404 for any other name.
    """
    # The name comes from the URL, so only the known result files are served;
    # otherwise any file next to the application could be downloaded.
    downloadable = ("accented_text.txt", "omographs.txt", "unknown.txt")
    if file_name not in downloadable:
        return "File not found", 404

    return send_file(file_name, as_attachment=True, download_name=file_name)
87
+
88
 
89
if __name__ == "__main__":
    import os

    # The server listens on every interface, and the interactive debugger
    # lets anyone who can reach it run arbitrary code. Debug mode is therefore
    # off unless explicitly requested with FLASK_DEBUG=1 (or "true").
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true")
    app.run(debug=debug_enabled, host="0.0.0.0", port=7860)