|
import streamlit as st |
|
from io import BytesIO |
|
|
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
import nltk |
|
from nltk.tokenize import sent_tokenize |
|
from nltk.tokenize import LineTokenizer |
|
nltk.download('punkt') |
|
import math |
|
import torch |
|
from docx import Document |
|
from time import sleep |
|
from stqdm import stqdm |
|
|
|
import docx |
|
def getText(filename):
    """Return the full plain text of a .docx file.

    Opens *filename* with python-docx and joins the text of every
    paragraph with a newline, preserving paragraph order.
    """
    document = docx.Document(filename)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return '\n'.join(paragraph_texts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def btTranslator(docxfile):
    """Round-trip translate a .docx file: English -> Russian -> French -> English.

    The document text is pushed through three Helsinki-NLP MarianMT models in
    sequence. After each stage the intermediate translation is appended as a
    paragraph to the returned python-docx ``Document``, so the result contains
    the Russian, French, and back-to-English versions in order.

    Args:
        docxfile: A path or file-like object accepted by ``docx.Document``.

    Returns:
        A ``docx.Document`` with one paragraph per translation stage.

    Fixes vs. the previous version:
      * ``st.spinner(...)`` was being called without ``with`` — a no-op — so
        the calls (and the artificial ``sleep(0.5)`` delays) are removed.
      * The text is no longer split and re-joined with a stray leading
        ``" \\n"``; ``getText`` already returns newline-joined paragraphs.
      * ``zip(stqdm(xs), xs)`` is simplified to iterating ``stqdm(xs)``.
    """
    # Prefer GPU when available; MarianMT inference is much faster on CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    bigtext = getText(docxfile)
    output_doc = Document()

    model_names = [
        "Helsinki-NLP/opus-mt-en-ru",
        "Helsinki-NLP/opus-mt-ru-fr",
        "Helsinki-NLP/opus-mt-fr-en",
    ]

    # Loop-invariant configuration hoisted out of the per-model loop.
    line_tokenizer = LineTokenizer()
    batch_size = 64  # sentences per model.generate() call

    for model_name in stqdm(model_names):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

        translated_paragraphs = []
        # LineTokenizer splits on newlines (blank lines discarded), so each
        # document paragraph is translated independently.
        for paragraph in stqdm(line_tokenizer.tokenize(bigtext)):
            sentences = sent_tokenize(paragraph)
            batches = math.ceil(len(sentences) / batch_size)
            translated = []
            for i in range(batches):
                sent_batch = sentences[i * batch_size:(i + 1) * batch_size]
                model_inputs = tokenizer(
                    sent_batch,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=500,
                ).to(device)
                # Inference only — no gradients needed.
                with torch.no_grad():
                    translated += model.generate(**model_inputs)
            decoded = [
                tokenizer.decode(t, skip_special_tokens=True) for t in translated
            ]
            translated_paragraphs.append(" ".join(decoded))

        # The output of this stage becomes the input of the next one.
        bigtext = "\n".join(translated_paragraphs)
        output_doc.add_paragraph(bigtext)

    return output_doc
|
|
|
|
|
|
|
# --- Streamlit UI ------------------------------------------------------------
st.title('Translator App')
st.markdown("Translate from Docx file")
st.subheader("File Upload")

datas = st.file_uploader("Original File")
name = st.text_input('Enter New File Name: ')

# Buffer receiving the translated .docx; empty until a translation runs.
binary_output = BytesIO()

if st.button(label='Translate'):
    if datas is None:
        # Guard: previously pressing Translate with no upload crashed with an
        # unhandled exception inside docx.Document(None).
        st.error("Please upload a .docx file before translating.")
    else:
        # st.spinner is a context manager; calling it without `with`
        # (as before) displayed nothing.
        with st.spinner('Waiting...'):
            btTranslator(datas).save(binary_output)
        st.success("Translated")

# Rendered unconditionally so the button survives Streamlit reruns; the
# payload is empty until a translation has been produced.
st.download_button(
    label='Download Translated File',
    file_name=f"{name}_Translated.docx",
    data=binary_output.getvalue(),
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|