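"""Streamlit app: upload a .docx file, back-translate its text through a chain of
MarianMT models (en -> ru -> fr -> en), and download the result as a new .docx."""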
import streamlit as st
from io import BytesIO
# Def_04: Docx file -> translated Docx file
from transformers import MarianMTModel, MarianTokenizer
import nltk
from nltk.tokenize import sent_tokenize, LineTokenizer
nltk.download('punkt')  # sentence tokenizer models needed by sent_tokenize
import math
import torch
from docx import Document
from stqdm import stqdm
def getText(filename):
    # Read every paragraph of a .docx file and join them into a single string.
    doc = Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)
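# Run model inference on the GPU when PyTorch can see one, otherwise on the CPU.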
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
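# Back-translation: chaining en -> ru -> fr -> en yields an English paraphrase
# of the original document, which is what this app hands back to the user.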
#@st.cache
def btTranslator(docxfile):
    """Back-translate a .docx file (en -> ru -> fr -> en) and return a new Document."""
    bigtext = getText(docxfile)
    files = Document()
    langs = [
        "Helsinki-NLP/opus-mt-en-ru",
        "Helsinki-NLP/opus-mt-ru-fr",
        "Helsinki-NLP/opus-mt-fr-en",
        # "Helsinki-NLP/opus-mt-es-en",
    ]
    for lang in stqdm(langs):  # stqdm renders a progress bar inside Streamlit
        tokenizer = MarianTokenizer.from_pretrained(lang)
        model = MarianMTModel.from_pretrained(lang)
        model.to(device)
        lt = LineTokenizer()
        batch_size = 8
        paragraphs = lt.tokenize(bigtext)
        translated_paragraphs = []
        for paragraph in stqdm(paragraphs):
            # Translate each paragraph sentence-by-sentence in small batches.
            sentences = sent_tokenize(paragraph)
            batches = math.ceil(len(sentences) / batch_size)
            translated = []
            for i in range(batches):
                sent_batch = sentences[i * batch_size:(i + 1) * batch_size]
                model_inputs = tokenizer(sent_batch, return_tensors="pt", padding=True,
                                         truncation=True, max_length=500).to(device)
                with torch.no_grad():
                    translated_batch = model.generate(**model_inputs)
                translated += translated_batch
            translated = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
            translated_paragraphs.append(" ".join(translated))
        # The output of this pass becomes the input of the next model in the chain.
        bigtext = "\n".join(translated_paragraphs)
    files.add_paragraph(bigtext)
    return files
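# Usage sketch outside Streamlit (hypothetical file names; Document/getText
# accept a path as well as a file-like object):
#   translated_doc = btTranslator("Original.docx")
#   translated_doc.save("Translated.docx")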
st.title('Translator App')
st.markdown("Translate from Docx file")
st.sidebar.subheader("File Upload")
datas = st.sidebar.file_uploader("Original File")

# Only run the (slow) translation once a file has actually been uploaded;
# calling btTranslator(None) would raise inside python-docx.
if datas is not None:
    binary_output = BytesIO()
    btTranslator(datas).save(binary_output)  # Document.save accepts a file-like object
    st.sidebar.download_button(label='Download Translated File',
                               file_name='Translated.docx',
                               data=binary_output.getvalue())
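# To run locally (assuming this script is saved as app.py):
#   streamlit run app.py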