Spaces:
Sleeping
Sleeping
shakhovak
committed on
Commit
·
d2febe5
1
Parent(s):
96fdf72
update with new dicts
Browse files- dictionaries/{accents.json → file_norm.json} +2 -2
- dictionaries/{omographs.json → file_omo.json} +2 -2
- ruaccent.py +29 -34
- templates/index.html +7 -0
- web_interface.py +63 -17
dictionaries/{accents.json → file_norm.json}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ebf4187d80e9702f94253d81a48fa3a14d484e2befaeb939fdca99eb6c42f1d5
|
3 |
+
size 178087540
|
dictionaries/{omographs.json → file_omo.json}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ba98b20c885cee2f54da731bb068df53fa6960bd3c8ef36417d8f6ffc90acbff
|
3 |
+
size 4240115
|
ruaccent.py
CHANGED
@@ -8,25 +8,34 @@ from text_split import split_by_sentences
|
|
8 |
|
9 |
class RUAccent:
|
10 |
vowels = "аеёиоуыэюя"
|
|
|
11 |
def __init__(self):
|
12 |
self.omographs = None
|
13 |
self.accents = None
|
14 |
self.workdir = os.getcwd()
|
15 |
|
16 |
-
|
17 |
def load(self, custom_accent=None, custom_omographs=None):
|
18 |
-
|
19 |
if custom_omographs is None:
|
20 |
custom_omographs = {}
|
21 |
|
22 |
if custom_accent is None:
|
23 |
custom_accent = {}
|
24 |
|
25 |
-
self.omographs = json.load(
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
self.omographs.update(custom_omographs)
|
28 |
|
29 |
-
self.accents = json.load(
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
self.accents.update(custom_accent)
|
32 |
|
@@ -55,24 +64,24 @@ class RUAccent:
|
|
55 |
outputs = []
|
56 |
for sentence in sentences:
|
57 |
text = self.split_by_words(sentence)
|
58 |
-
# processed_text = self._process_yo(text)
|
59 |
|
60 |
-
# processed_text = self._process_omographs(text)
|
61 |
founded_omographs = self._process_omographs(text)
|
62 |
omographs_list.extend(founded_omographs)
|
63 |
|
64 |
-
processed_text, unknown_words = self._process_accent(
|
|
|
|
|
65 |
unknown_list.extend(unknown_words)
|
66 |
|
67 |
processed_text = " ".join(processed_text)
|
68 |
processed_text = self.delete_spaces_before_punc(processed_text)
|
69 |
-
# outputs.append(processed_text)
|
70 |
|
71 |
accented_sentence.append(processed_text)
|
72 |
-
# " ".join(outputs)
|
73 |
|
74 |
-
omographs_list = [
|
75 |
-
|
|
|
|
|
76 |
|
77 |
def _process_yo(self, text):
|
78 |
splitted_text = text
|
@@ -88,16 +97,8 @@ class RUAccent:
|
|
88 |
for i, word in enumerate(splitted_text):
|
89 |
variants = self.omographs.get(word)
|
90 |
if variants:
|
91 |
-
founded_omographs.append(
|
92 |
-
{word: variants}
|
93 |
-
)
|
94 |
-
|
95 |
|
96 |
-
# for omograph in founded_omographs:
|
97 |
-
# splitted_text[omograph["position"]] = f"<w>{splitted_text[omograph['position']]}</w>"
|
98 |
-
# cls = omograph["variants"][0] # Just take the first variant from the dictionary
|
99 |
-
# splitted_text[omograph["position"]] = cls
|
100 |
-
# return splitted_text
|
101 |
return founded_omographs
|
102 |
|
103 |
def _process_accent(self, text, founded_omographs):
|
@@ -105,23 +106,17 @@ class RUAccent:
|
|
105 |
unknown_words = []
|
106 |
for i, word in enumerate(splitted_text):
|
107 |
stressed_word = self.accents.get(word, word)
|
108 |
-
if stressed_word == word:
|
109 |
-
# if len(word) > 4:
|
110 |
-
if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
|
111 |
-
unknown_words.append(word)
|
112 |
-
splitted_text[i] = word
|
113 |
|
114 |
-
|
115 |
splitted_text[i] = word
|
116 |
|
117 |
-
|
118 |
-
splitted_text[i] = stressed_word
|
119 |
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
# splitted_text[i] = stressed_word
|
125 |
|
126 |
return splitted_text, unknown_words
|
127 |
|
@@ -136,7 +131,7 @@ class RUAccent:
|
|
136 |
# ru_accent = RUAccent()
|
137 |
# ru_accent.load()
|
138 |
#
|
139 |
-
# text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига из-за этого сучонка"
|
140 |
# processed_text = ru_accent.process_all(text_to_process)
|
141 |
#
|
142 |
# print(processed_text)
|
|
|
8 |
|
9 |
class RUAccent:
|
10 |
vowels = "аеёиоуыэюя"
|
11 |
+
|
12 |
def __init__(self):
|
13 |
self.omographs = None
|
14 |
self.accents = None
|
15 |
self.workdir = os.getcwd()
|
16 |
|
|
|
17 |
def load(self, custom_accent=None, custom_omographs=None):
    """Load the omograph and accent dictionaries and apply custom overrides.

    Reads ``dictionaries/file_omo.json`` into ``self.omographs`` and
    ``dictionaries/file_norm.json`` into ``self.accents``, both resolved
    relative to ``self.workdir``, then merges the caller-supplied entries
    on top so that custom entries win over the bundled ones.

    Args:
        custom_accent: Optional mapping merged into ``self.accents``.
        custom_omographs: Optional mapping merged into ``self.omographs``.

    Raises:
        OSError: If a dictionary file cannot be opened.
        json.JSONDecodeError: If a dictionary file is not valid JSON.
    """
    if custom_omographs is None:
        custom_omographs = {}

    if custom_accent is None:
        custom_accent = {}

    dictionaries_dir = join_path(self.workdir, "dictionaries")

    # Use context managers so the file handles are closed deterministically
    # instead of being left for the garbage collector.
    with open(
        join_path(dictionaries_dir, "file_omo.json"), encoding="utf-8"
    ) as omographs_file:
        self.omographs = json.load(omographs_file)

    self.omographs.update(custom_omographs)

    with open(
        join_path(dictionaries_dir, "file_norm.json"), encoding="utf-8"
    ) as accents_file:
        self.accents = json.load(accents_file)

    self.accents.update(custom_accent)
|
41 |
|
|
|
64 |
outputs = []
|
65 |
for sentence in sentences:
|
66 |
text = self.split_by_words(sentence)
|
|
|
67 |
|
|
|
68 |
founded_omographs = self._process_omographs(text)
|
69 |
omographs_list.extend(founded_omographs)
|
70 |
|
71 |
+
processed_text, unknown_words = self._process_accent(
|
72 |
+
text, founded_omographs
|
73 |
+
)
|
74 |
unknown_list.extend(unknown_words)
|
75 |
|
76 |
processed_text = " ".join(processed_text)
|
77 |
processed_text = self.delete_spaces_before_punc(processed_text)
|
|
|
78 |
|
79 |
accented_sentence.append(processed_text)
|
|
|
80 |
|
81 |
+
omographs_list = [
|
82 |
+
f"{key}: {value}" for elem in omographs_list for key, value in elem.items()
|
83 |
+
]
|
84 |
+
return accented_sentence, list(set(omographs_list)), list(set(unknown_list))
|
85 |
|
86 |
def _process_yo(self, text):
|
87 |
splitted_text = text
|
|
|
97 |
for i, word in enumerate(splitted_text):
|
98 |
variants = self.omographs.get(word)
|
99 |
if variants:
|
100 |
+
founded_omographs.append({word: self.omographs[word]["acc_variants"]})
|
|
|
|
|
|
|
101 |
|
|
|
|
|
|
|
|
|
|
|
102 |
return founded_omographs
|
103 |
|
104 |
def _process_accent(self, text, founded_omographs):
|
|
|
106 |
unknown_words = []
|
107 |
for i, word in enumerate(splitted_text):
|
108 |
stressed_word = self.accents.get(word, word)
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
+
if stressed_word in [list(d.keys())[0] for d in founded_omographs]:
|
111 |
splitted_text[i] = word
|
112 |
|
113 |
+
elif stressed_word != word:
|
114 |
+
splitted_text[i] = stressed_word["accent"]
|
115 |
|
116 |
+
else:
|
117 |
+
if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
|
118 |
+
unknown_words.append(word)
|
119 |
+
splitted_text[i] = word
|
|
|
120 |
|
121 |
return splitted_text, unknown_words
|
122 |
|
|
|
131 |
# ru_accent = RUAccent()
|
132 |
# ru_accent.load()
|
133 |
#
|
134 |
+
# text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига нига нига из-за этого сучонка"
|
135 |
# processed_text = ru_accent.process_all(text_to_process)
|
136 |
#
|
137 |
# print(processed_text)
|
templates/index.html
CHANGED
@@ -13,4 +13,11 @@
|
|
13 |
<input type="submit" value="Process">
|
14 |
</form>
|
15 |
</body>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
</html>
|
|
|
13 |
<input type="submit" value="Process">
|
14 |
</form>
|
15 |
</body>
|
16 |
+
<body>
|
17 |
+
<h1>Upload a Text File</h1>
|
18 |
+
<form action="/upload" method="post" enctype="multipart/form-data">
|
19 |
+
<input type="file" name="file" accept=".txt" required>
|
20 |
+
<button type="submit">Upload</button>
|
21 |
+
</form>
|
22 |
+
</body>
|
23 |
</html>
|
web_interface.py
CHANGED
@@ -8,37 +8,83 @@ app = Flask(__name__)
|
|
8 |
ru_accent = RUAccent()
|
9 |
ru_accent.load()
|
10 |
|
11 |
-
|
|
|
12 |
def index():
|
13 |
-
return render_template(
|
|
|
14 |
|
15 |
-
@app.route(
|
16 |
def process():
|
17 |
-
if request.method ==
|
18 |
-
input_text = request.form[
|
19 |
processed_text = ru_accent.process_all(input_text)
|
20 |
|
21 |
# Create three text files with the same content
|
22 |
|
23 |
-
file_name =
|
24 |
-
with open(file_name,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
file.write(" ".join(processed_text[0]))
|
26 |
|
27 |
-
file_name =
|
28 |
-
with open(file_name,
|
29 |
file.write("\n".join(processed_text[1]))
|
30 |
|
31 |
-
file_name =
|
32 |
-
with open(file_name,
|
33 |
file.write("\n".join(processed_text[2]))
|
34 |
|
|
|
35 |
|
36 |
-
|
|
|
37 |
|
38 |
-
|
|
|
39 |
def download(file_name):
|
40 |
-
file_name = f
|
41 |
-
return send_file(file_name, as_attachment=True, download_name=f
|
|
|
42 |
|
43 |
-
if __name__ ==
|
44 |
-
app.run(debug=True, host=
|
|
|
8 |
ru_accent = RUAccent()
|
9 |
ru_accent.load()
|
10 |
|
11 |
+
|
12 |
+
@app.route("/")
def index():
    """Render the landing page (``index.html``)."""
    landing_page = render_template("index.html")
    return landing_page
|
15 |
+
|
16 |
|
17 |
+
def _write_results(processed_text):
    """Write the three result lists to the fixed output files.

    ``processed_text`` is the value returned by ``ru_accent.process_all``;
    index 0 is joined with spaces into ``accented_text.txt``, indexes 1 and 2
    are joined with newlines into ``omographs.txt`` and ``unknown.txt``.
    """
    outputs = (
        ("accented_text.txt", " ".join(processed_text[0])),
        ("omographs.txt", "\n".join(processed_text[1])),
        ("unknown.txt", "\n".join(processed_text[2])),
    )
    for file_name, payload in outputs:
        with open(file_name, "w", encoding="utf-8") as out_file:
            out_file.write(payload)


@app.route("/process", methods=["POST"])
def process():
    """Accent the text submitted through the ``input_text`` form field.

    The route only accepts POST, so no extra method check is needed. The
    results are written to the three output files and ``result.html`` is
    rendered.
    """
    input_text = request.form["input_text"]
    _write_results(ru_accent.process_all(input_text))
    return render_template("result.html")


@app.route("/upload", methods=["POST"])
def upload():
    """Accent the contents of an uploaded ``.txt`` file.

    Returns a short plain-text message with HTTP 400 when the request has no
    file part, no file was selected, the name does not end in ``.txt`` or the
    content is not valid UTF-8; otherwise writes the three output files and
    renders ``result.html``.
    """
    # Check if the POST request has a file part
    if "file" not in request.files:
        return "No file part", 400

    uploaded = request.files["file"]

    # If the user submits an empty form
    if uploaded.filename == "":
        return "No selected file", 400

    # Check if the file is a text file
    if not uploaded.filename.lower().endswith(".txt"):
        return "Invalid file format. Please upload a text file.", 400

    # Read the upload straight from the request stream. Saving it under the
    # client-supplied name would let a crafted filename overwrite files on
    # the server, and the saved copy was never needed afterwards.
    try:
        content = uploaded.read().decode("utf-8")
    except UnicodeDecodeError:
        return "Invalid file format. Please upload a text file.", 400

    # Match text-mode reading, which normalises line endings to "\n".
    content = content.replace("\r\n", "\n").replace("\r", "\n")

    _write_results(ru_accent.process_all(content))
    return render_template("result.html")
|
81 |
|
82 |
+
|
83 |
+
@app.route("/download/<file_name>")
def download(file_name):
    """Send one of the generated result files as an attachment.

    Only the three files written by the processing routes can be requested;
    any other name gets a 404 so that arbitrary files in the serving
    directory (source code, dictionaries, ...) cannot be downloaded.
    """
    downloadable = {"accented_text.txt", "omographs.txt", "unknown.txt"}
    if file_name not in downloadable:
        return "File not found", 404

    return send_file(file_name, as_attachment=True, download_name=file_name)
|
87 |
+
|
88 |
|
89 |
+
if __name__ == "__main__":
    import os

    # The Werkzeug debugger allows arbitrary code execution, so it must not
    # be enabled unconditionally on a server bound to all interfaces.
    # Opt in explicitly with FLASK_DEBUG=1 during local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in {"1", "true", "yes"}
    app.run(debug=debug_enabled, host="0.0.0.0", port=7860)
|