File size: 3,974 Bytes
a4821cd
 
 
 
4cd28f3
 
a4821cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8e9287
a4821cd
 
 
 
 
 
 
 
e8e9287
 
 
 
 
a4821cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9404cec
a4821cd
 
4cd28f3
 
a4821cd
 
4f2119e
a4821cd
 
 
 
9404cec
a4821cd
 
 
 
 
 
 
 
 
 
 
 
 
d9d284e
a4821cd
 
 
 
 
2f2160a
 
a4821cd
 
 
 
 
 
 
 
 
38861df
a4821cd
38861df
99a70c2
a4821cd
b128f72
a9d44d4
6cbbe03
018e211
068434c
 
c4c9ef6
22a6b7a
018e211
9baada4
3049e45
b128f72
 
3e815a9
cb7c3ae
a4821cd
 
cb7c3ae
a4821cd
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import streamlit as st
from io import BytesIO
# import gradio as gr
# Def_04 Docx file to translated_Docx file
#from transformers import MarianMTModel, MarianTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import LineTokenizer
nltk.download('punkt')
import math
import torch
from docx import Document
from time import sleep
from stqdm import stqdm

import docx
def getText(filename):
    """Return the text of a .docx document as one newline-joined string.

    Args:
        filename: Passed straight to ``docx.Document``.

    Returns:
        The ``text`` of every paragraph in the document, in order, joined
        with ``'\\n'``.
    """
    document = docx.Document(filename)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
    


 
# mname = 'Helsinki-NLP/opus-mt-en-hi'
# tokenizer = MarianTokenizer.from_pretrained(mname)
# model = MarianMTModel.from_pretrained(mname)
# model.to(device)

#@st.cache
def _translate_paragraph(paragraph, tokenizer, model, device, batch_size=64):
    """Translate one paragraph sentence by sentence.

    Args:
        paragraph: Text of a single paragraph.
        tokenizer: Tokenizer matching ``model``.
        model: Seq2seq model already moved to ``device``.
        device: ``torch.device`` the model inputs are moved to.
        batch_size: Maximum number of sentences sent to the model at once.

    Returns:
        The translated sentences joined with single spaces, or ``None``
        when the paragraph yields no sentences.
    """
    # NOTE(review): sent_tokenize uses its default language for every stage
    # of the chain, including non-English intermediate text -- confirm that
    # this is acceptable.
    sentences = sent_tokenize(paragraph)
    if not sentences:
        return None

    decoded = []
    for start in range(0, len(sentences), batch_size):
        sent_batch = sentences[start:start + batch_size]
        model_inputs = tokenizer(
            sent_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=500,
        ).to(device)
        with torch.no_grad():
            generated = model.generate(**model_inputs)
        # Decode each batch right away so the accumulator only ever holds
        # strings, however many batches the paragraph needs.
        decoded.extend(
            tokenizer.decode(tokens, skip_special_tokens=True)
            for tokens in generated
        )
    return " ".join(decoded)


def btTranslator(docxfile):
    """Back-translate a .docx document and return the result as a Document.

    The document text is run through a chain of three Helsinki-NLP OPUS-MT
    models (en-ru, ru-fr, fr-en); the output of each stage is the input of
    the next.

    Args:
        docxfile: Whatever ``getText`` accepts as its ``filename`` argument.

    Returns:
        A new ``Document`` containing the final text as a single paragraph,
        with the translated paragraphs separated by newlines.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_chain = (
        "Helsinki-NLP/opus-mt-en-ru",
        "Helsinki-NLP/opus-mt-ru-fr",
        "Helsinki-NLP/opus-mt-fr-en",
    )
    batch_size = 64
    line_tokenizer = LineTokenizer()

    text = getText(docxfile)
    for model_name in stqdm(model_chain):
        # st.spinner only shows while used as a context manager.
        with st.spinner('Wait for it...'):
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            model.to(device)

            translated_paragraphs = []
            for paragraph in stqdm(line_tokenizer.tokenize(text)):
                translated = _translate_paragraph(
                    paragraph, tokenizer, model, device, batch_size
                )
                if translated is not None:
                    translated_paragraphs.append(translated)
            # Feed this stage's output into the next model in the chain.
            text = "\n".join(translated_paragraphs)

    document = Document()
    document.add_paragraph(text)
    return document


  #return translated_text
st.title('Translator App')
st.markdown("Translate from Docx file")
st.subheader("File Upload")

datas = st.file_uploader("Original File")
name = st.text_input('Enter New File Name: ')

# Streamlit re-executes this script on every widget interaction, so the
# translated bytes are kept in session state; otherwise they would be lost
# on the rerun that follows any later click.
if st.button(label='Translate'):
    if datas is None:
        st.warning("Upload a file before translating.")
    else:
        # st.spinner only shows while used as a context manager.
        with st.spinner('Waiting...'):
            binary_output = BytesIO()
            btTranslator(datas).save(binary_output)
        st.session_state["translated_docx"] = binary_output.getvalue()
        st.success("Translated")

# Offer the download only once a translation exists, so an empty file is
# never served.
if "translated_docx" in st.session_state:
    st.download_button(
        label='Download Translated File',
        file_name=(f"{name}_Translated.docx"),
        data=st.session_state["translated_docx"],
    )
#files.save(f"{name}_Translated.docx")
#else:
 #   st.text('Upload File and Start the process')
        

#f4=binary_output(f3)

#st.sidebar.download_button(label='Download Translated File',file_name='Translated.docx', data=binary_output.getvalue()) 
# st.text_area(label="",value=btTranslator(datas),height=100)
# Footer