Spaces:

Shakhovak
/

RU_accent_flask

Sleeping

App Files Community

shakhovak committed on Dec 18, 2023

Commit

d2febe5

1 Parent(s): 96fdf72

update with new dicts

Browse files

Files changed (5) hide show

dictionaries/{accents.json → file_norm.json} +2 -2
dictionaries/{omographs.json → file_omo.json} +2 -2
ruaccent.py +29 -34
templates/index.html +7 -0
web_interface.py +63 -17

dictionaries/{accents.json → file_norm.json} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:adb807918505efc4f2707e6536f52951e2be3bc3f714a7285fecdc7434c7f7b8
-size 178733505

 version https://git-lfs.github.com/spec/v1
+oid sha256:ebf4187d80e9702f94253d81a48fa3a14d484e2befaeb939fdca99eb6c42f1d5
+size 178087540

dictionaries/{omographs.json → file_omo.json} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:055502f1eb755ff7f1cd8831d7b44105dd0c190183f81312f209db35885d4ad6
-size 1552979

 version https://git-lfs.github.com/spec/v1
+oid sha256:ba98b20c885cee2f54da731bb068df53fa6960bd3c8ef36417d8f6ffc90acbff
+size 4240115

ruaccent.py CHANGED Viewed

@@ -8,25 +8,34 @@ from text_split import split_by_sentences
 class RUAccent:
     vowels = "аеёиоуыэюя"
     def __init__(self):
         self.omographs = None
         self.accents = None
         self.workdir = os.getcwd()
     def load(self, custom_accent=None, custom_omographs=None):
         if custom_omographs is None:
             custom_omographs = {}
         if custom_accent is None:
             custom_accent = {}
-        self.omographs = json.load(open(join_path(self.workdir, "dictionaries", "omographs.json"), encoding='utf-8'))
         self.omographs.update(custom_omographs)
-        self.accents = json.load(open(join_path(self.workdir, "dictionaries", "accents.json"), encoding='utf-8'))
         self.accents.update(custom_accent)
@@ -55,24 +64,24 @@ class RUAccent:
         outputs = []
         for sentence in sentences:
             text = self.split_by_words(sentence)
-            # processed_text = self._process_yo(text)
-            # processed_text = self._process_omographs(text)
             founded_omographs = self._process_omographs(text)
             omographs_list.extend(founded_omographs)
-            processed_text, unknown_words = self._process_accent(text, founded_omographs)
             unknown_list.extend(unknown_words)
             processed_text = " ".join(processed_text)
             processed_text = self.delete_spaces_before_punc(processed_text)
-            # outputs.append(processed_text)
             accented_sentence.append(processed_text)
-            # " ".join(outputs)
-        omographs_list = [f"{key}: {value}" for elem in omographs_list for key, value in elem.items()]
-        return accented_sentence, omographs_list, unknown_list
     def _process_yo(self, text):
         splitted_text = text
@@ -88,16 +97,8 @@ class RUAccent:
         for i, word in enumerate(splitted_text):
             variants = self.omographs.get(word)
             if variants:
-                founded_omographs.append(
-                    {word: variants}
-                )
-        # for omograph in founded_omographs:
-        #     splitted_text[omograph["position"]] = f"<w>{splitted_text[omograph['position']]}</w>"
-        #     cls = omograph["variants"][0]  # Just take the first variant from the dictionary
-        #     splitted_text[omograph["position"]] = cls
-        # return splitted_text
         return founded_omographs
     def _process_accent(self, text, founded_omographs):
@@ -105,23 +106,17 @@ class RUAccent:
         unknown_words = []
         for i, word in enumerate(splitted_text):
             stressed_word = self.accents.get(word, word)
-            if stressed_word == word:
-                # if len(word) > 4:
-                if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
-                    unknown_words.append(word)
-                splitted_text[i] = word
-            elif stressed_word != word and word in [list(d.keys())[0] for d in founded_omographs]:
                 splitted_text[i] = word
-            else:
-                splitted_text[i] = stressed_word
-            # stressed_word = self.accents.get(word, word)
-            # splitted_text[i] = stressed_word
         return splitted_text, unknown_words
@@ -136,7 +131,7 @@ class RUAccent:
 # ru_accent = RUAccent()
 # ru_accent.load()
 #
-# text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига из-за этого сучонка"
 # processed_text = ru_accent.process_all(text_to_process)
 #
 # print(processed_text)

 class RUAccent:
     vowels = "аеёиоуыэюя"
     def __init__(self):
         self.omographs = None
         self.accents = None
         self.workdir = os.getcwd()
     def load(self, custom_accent=None, custom_omographs=None):
         if custom_omographs is None:
             custom_omographs = {}
         if custom_accent is None:
             custom_accent = {}
+        self.omographs = json.load(
+            open(
+                join_path(self.workdir, "dictionaries", "file_omo.json"),
+                encoding="utf-8",
+            )
+        )
         self.omographs.update(custom_omographs)
+        self.accents = json.load(
+            open(
+                join_path(self.workdir, "dictionaries", "file_norm.json"),
+                encoding="utf-8",
+            )
+        )
         self.accents.update(custom_accent)
         outputs = []
         for sentence in sentences:
             text = self.split_by_words(sentence)
             founded_omographs = self._process_omographs(text)
             omographs_list.extend(founded_omographs)
+            processed_text, unknown_words = self._process_accent(
+                text, founded_omographs
+            )
             unknown_list.extend(unknown_words)
             processed_text = " ".join(processed_text)
             processed_text = self.delete_spaces_before_punc(processed_text)
             accented_sentence.append(processed_text)
+        omographs_list = [
+            f"{key}: {value}" for elem in omographs_list for key, value in elem.items()
+        ]
+        return accented_sentence, list(set(omographs_list)), list(set(unknown_list))
     def _process_yo(self, text):
         splitted_text = text
         for i, word in enumerate(splitted_text):
             variants = self.omographs.get(word)
             if variants:
+                founded_omographs.append({word: self.omographs[word]["acc_variants"]})
         return founded_omographs
     def _process_accent(self, text, founded_omographs):
         unknown_words = []
         for i, word in enumerate(splitted_text):
             stressed_word = self.accents.get(word, word)
+            if stressed_word in [list(d.keys())[0] for d in founded_omographs]:
                 splitted_text[i] = word
+            elif stressed_word != word:
+                splitted_text[i] = stressed_word["accent"]
+            else:
+                if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
+                    unknown_words.append(word)
+                splitted_text[i] = word
         return splitted_text, unknown_words
 # ru_accent = RUAccent()
 # ru_accent.load()
 #
+# text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига нига нига из-за этого сучонка"
 # processed_text = ru_accent.process_all(text_to_process)
 #
 # print(processed_text)

templates/index.html CHANGED Viewed

@@ -13,4 +13,11 @@
         <input type="submit" value="Process">
     </form>
 </body>
 </html>

         <input type="submit" value="Process">
     </form>
 </body>
+<body>
+    <h1>Upload a Text File</h1>
+    <form action="/upload" method="post" enctype="multipart/form-data">
+        <input type="file" name="file" accept=".txt" required>
+        <button type="submit">Upload</button>
+    </form>
+</body>
 </html>

web_interface.py CHANGED Viewed

@@ -8,37 +8,83 @@ app = Flask(__name__)
 ru_accent = RUAccent()
 ru_accent.load()
-@app.route('/')
 def index():
-    return render_template('index.html')
-@app.route('/process', methods=['POST'])
 def process():
-    if request.method == 'POST':
-        input_text = request.form['input_text']
         processed_text = ru_accent.process_all(input_text)
         # Create three text files with the same content
-        file_name = 'accented_text.txt'
-        with open(file_name, 'w', encoding="utf-8") as file:
             file.write(" ".join(processed_text[0]))
-        file_name = 'omographs.txt'
-        with open(file_name, 'w', encoding="utf-8") as file:
             file.write("\n".join(processed_text[1]))
-        file_name = 'unknown.txt'
-        with open(file_name, 'w', encoding="utf-8") as file:
             file.write("\n".join(processed_text[2]))
-        return render_template('result.html')
-@app.route('/download/<file_name>')
 def download(file_name):
-    file_name = f'{file_name}'
-    return send_file(file_name, as_attachment=True, download_name=f'{file_name}')
-if __name__ == '__main__':
-    app.run(debug=True, host='0.0.0.0', port=7860)

 ru_accent = RUAccent()
 ru_accent.load()
+@app.route("/")
 def index():
+    return render_template("index.html")
+@app.route("/process", methods=["POST"])
 def process():
+    if request.method == "POST":
+        input_text = request.form["input_text"]
         processed_text = ru_accent.process_all(input_text)
         # Create three text files with the same content
+        file_name = "accented_text.txt"
+        with open(file_name, "w", encoding="utf-8") as file:
+            file.write(" ".join(processed_text[0]))
+        file_name = "omographs.txt"
+        with open(file_name, "w", encoding="utf-8") as file:
+            file.write("\n".join(processed_text[1]))
+        file_name = "unknown.txt"
+        with open(file_name, "w", encoding="utf-8") as file:
+            file.write("\n".join(processed_text[2]))
+        return render_template("result.html")
+@app.route("/upload", methods=["POST"])
+def upload():
+    # Check if the POST request has a file part
+    if "file" not in request.files:
+        return "No file part"
+    file = request.files["file"]
+    # If the user submits an empty form
+    if file.filename == "":
+        return "No selected file"
+    # Check if the file is a text file
+    if file and file.filename.endswith(".txt"):
+        # Save the uploaded file to the server (you might want to store it in a more secure way)
+        file.save(file.filename)
+        # Process the file content (replace this with your actual processing logic)
+        with open(file.filename, "r", encoding="utf-8") as f:
+            content = f.read()
+        processed_text = ru_accent.process_all(content)
+        # Create three text files with the same content
+        file_name = "accented_text.txt"
+        with open(file_name, "w", encoding="utf-8") as file:
             file.write(" ".join(processed_text[0]))
+        file_name = "omographs.txt"
+        with open(file_name, "w", encoding="utf-8") as file:
             file.write("\n".join(processed_text[1]))
+        file_name = "unknown.txt"
+        with open(file_name, "w", encoding="utf-8") as file:
             file.write("\n".join(processed_text[2]))
+        return render_template("result.html")
+    else:
+        return "Invalid file format. Please upload a text file."
+@app.route("/download/<file_name>")
 def download(file_name):
+    file_name = f"{file_name}"
+    return send_file(file_name, as_attachment=True, download_name=f"{file_name}")
+if __name__ == "__main__":
+    app.run(debug=True, host="0.0.0.0", port=7860)