Spaces:
Build error
Build error
File size: 3,974 Bytes
a4821cd 4cd28f3 a4821cd e8e9287 a4821cd e8e9287 a4821cd 9404cec a4821cd 4cd28f3 a4821cd 4f2119e a4821cd 9404cec a4821cd d9d284e a4821cd 2f2160a a4821cd 38861df a4821cd 38861df 99a70c2 a4821cd b128f72 a9d44d4 6cbbe03 018e211 068434c c4c9ef6 22a6b7a 018e211 9baada4 3049e45 b128f72 3e815a9 cb7c3ae a4821cd cb7c3ae a4821cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import streamlit as st
from io import BytesIO
# import gradio as gr
# Def_04 Docx file to translated_Docx file
#from transformers import MarianMTModel, MarianTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import LineTokenizer
nltk.download('punkt')
import math
import torch
from docx import Document
from time import sleep
from stqdm import stqdm
import docx
def getText(filename):
    """Return the text of a .docx document as one newline-joined string.

    ``filename`` is handed straight to ``docx.Document``; the ``text`` of
    every entry in the document's ``paragraphs`` is collected in order and
    joined with ``'\\n'``.
    """
    document = docx.Document(filename)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
# mname = 'Helsinki-NLP/opus-mt-en-hi'
# tokenizer = MarianTokenizer.from_pretrained(mname)
# model = MarianMTModel.from_pretrained(mname)
# model.to(device)
#@st.cache
def _translate_text(text, model_name, device, batch_size=64):
    """Translate ``text`` line by line with one pretrained seq2seq model.

    Args:
        text: Newline-separated text; each line is treated as a paragraph.
        model_name: Hugging Face model id passed to ``from_pretrained``.
        device: ``torch.device`` the model and its inputs are moved to.
        batch_size: Maximum number of sentences per ``generate`` call.

    Returns:
        The translated paragraphs joined with ``'\\n'``. Sentences inside a
        paragraph are re-joined with a single space.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.to(device)

    paragraphs = LineTokenizer().tokenize(text)
    translated_paragraphs = []
    for paragraph in stqdm(paragraphs):
        sentences = sent_tokenize(paragraph)
        decoded = []
        # Feed the model at most ``batch_size`` sentences at a time.
        for start in range(0, len(sentences), batch_size):
            sent_batch = sentences[start:start + batch_size]
            model_inputs = tokenizer(
                sent_batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=500,
            ).to(device)
            with torch.no_grad():
                generated = model.generate(**model_inputs)
            decoded.extend(
                tokenizer.decode(tokens, skip_special_tokens=True)
                for tokens in generated
            )
        translated_paragraphs.append(" ".join(decoded))
    return "\n".join(translated_paragraphs)


def btTranslator(docxfile):
    """Run a .docx file through an en -> ru -> fr -> en model chain.

    The document text is read with ``getText`` and passed through the three
    Helsinki-NLP opus-mt models in sequence, each stage consuming the output
    of the previous one. The final text is written as a single paragraph of
    a new ``Document``.

    Args:
        docxfile: Whatever ``getText`` accepts (it is forwarded unchanged).

    Returns:
        A new ``Document`` containing the round-tripped text. The caller is
        responsible for saving it.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_chain = (
        "Helsinki-NLP/opus-mt-en-ru",
        "Helsinki-NLP/opus-mt-ru-fr",
        "Helsinki-NLP/opus-mt-fr-en",
        # "Helsinki-NLP/opus-mt-es-en",
    )

    text = getText(docxfile)
    files = Document()

    if text.strip():
        for model_name in stqdm(model_chain):
            # st.spinner only shows while used as a context manager; calling
            # it as a bare statement displays nothing.
            with st.spinner('Wait for it...'):
                text = _translate_text(text, model_name, device)
    else:
        # Nothing to translate: skip loading three models. NLTK's
        # LineTokenizer discards blank lines by default, so the pipeline
        # would have produced an empty string here anyway.
        text = ""

    files.add_paragraph(text)
    return files
# Streamlit page: upload a .docx file, translate it with btTranslator, and
# offer the resulting document as a download.
st.title('Translator App')
st.markdown("Translate from Docx file")
st.subheader("File Upload")
datas = st.file_uploader("Original File")
name = st.text_input('Enter New File Name: ')

binary_output = BytesIO()
if st.button(label='Translate'):
    if datas is None:
        # Without this guard the pipeline still runs when no file was
        # uploaded: python-docx opens its blank default template for None,
        # so the user would be offered an empty "translated" document.
        st.warning('Upload File and Start the process')
    else:
        # The spinner is only displayed while its ``with`` block is running.
        with st.spinner('Waiting...'):
            btTranslator(datas).save(binary_output)
        st.success("Translated")
        st.download_button(
            label='Download Translated File',
            file_name=(f"{name}_Translated.docx"),
            data=binary_output.getvalue(),
        )
# Footer
|