ambreshrc committed on
Commit
8e853e1
1 Parent(s): 13dfb69

Create new file

Browse files
Files changed (1) hide show
  1. streamlit_app.py +105 -0
streamlit_app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from io import BytesIO
3
+ # import gradio as gr
4
+ # Def_04 Docx file to translated_Docx file
5
+ from transformers import MarianMTModel, MarianTokenizer
6
+ import nltk
7
+ from nltk.tokenize import sent_tokenize
8
+ from nltk.tokenize import LineTokenizer
9
+ nltk.download('punkt')
10
+ import math
11
+ import torch
12
+ from docx import Document
13
+ from time import sleep
14
+ from stqdm import stqdm
15
+
16
+ import docx
17
def getText(filename):
    """Return the text of a .docx document as a single string.

    ``filename`` is handed unchanged to ``docx.Document``. The text of each
    paragraph in the document is collected in order and the pieces are
    joined with a newline character.
    """
    paragraphs = docx.Document(filename).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)
23
+
24
+
25
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
dev = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
30
+
31
+ # mname = 'Helsinki-NLP/opus-mt-en-hi'
32
+ # tokenizer = MarianTokenizer.from_pretrained(mname)
33
+ # model = MarianMTModel.from_pretrained(mname)
34
+ # model.to(device)
35
+
36
+ #@st.cache
37
def btTranslator(docxfile, model_names=None, batch_size=8):
    """Back-translate a .docx document through a chain of MarianMT models.

    The document text is read with ``getText``, split into non-blank lines
    (one per paragraph) and passed through each model in ``model_names`` in
    order; the output of one model is the input of the next.

    Args:
        docxfile: Whatever ``getText`` / ``docx.Document`` accepts as the
            source document.
        model_names: Iterable of model identifiers for
            ``MarianTokenizer.from_pretrained`` /
            ``MarianMTModel.from_pretrained``. Defaults to the
            en->ru, ru->fr, fr->en chain.
        batch_size: Number of sentences sent to ``model.generate`` at once.
            Must be a positive integer.

    Returns:
        A new ``Document`` holding one paragraph per translated paragraph.
        If the source has no non-blank paragraphs, an empty ``Document`` is
        returned without loading any model.
    """
    if model_names is None:
        model_names = (
            "Helsinki-NLP/opus-mt-en-ru",
            "Helsinki-NLP/opus-mt-ru-fr",
            "Helsinki-NLP/opus-mt-fr-en",
        )

    # LineTokenizer discards blank lines by default, so only paragraphs
    # that actually contain text are translated.
    paragraphs = LineTokenizer().tokenize(getText(docxfile))

    translated_doc = Document()
    if not paragraphs:
        # Nothing to translate: skip the expensive model loading entirely.
        return translated_doc

    # NOTE: the artificial sleep() calls of the earlier version were removed;
    # they only slowed the run down and the progress bars work without them.
    for model_name in stqdm(model_names):
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        model.to(device)

        translated_paragraphs = []
        for paragraph in stqdm(paragraphs):
            sentences = sent_tokenize(paragraph)
            translated_sentences = []
            for start in range(0, len(sentences), batch_size):
                sent_batch = sentences[start:start + batch_size]
                model_inputs = tokenizer(
                    sent_batch,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=500,
                ).to(device)
                # Inference only: no gradients are needed.
                with torch.no_grad():
                    generated = model.generate(**model_inputs)
                translated_sentences.extend(
                    tokenizer.decode(tokens, skip_special_tokens=True)
                    for tokens in generated
                )
            translated_paragraphs.append(" ".join(translated_sentences))

        # The output of this model feeds the next one in the chain.
        paragraphs = translated_paragraphs

    # One docx paragraph per translated paragraph keeps the document's
    # paragraph structure instead of a single paragraph with line breaks.
    for paragraph in paragraphs:
        translated_doc.add_paragraph(paragraph)
    return translated_doc
90
+
91
+
92
+ #return translated_text
93
# Page layout: title, short description and the upload widget in the sidebar.
st.title('Translator App')
st.markdown("Translate from Docx file")
st.sidebar.subheader("File Upload")

# file_uploader returns None until the user has picked a file.
datas = st.sidebar.file_uploader("Original File", type=["docx"])

if datas is None:
    # Without this guard the translation pipeline would run (and load every
    # model) on each page load even though nothing was uploaded.
    st.info("Upload a .docx file from the sidebar to start the translation.")
else:
    # Serialize the translated document in memory so it can be offered
    # for download without touching the file system.
    binary_output = BytesIO()
    btTranslator(datas).save(binary_output)

    st.sidebar.download_button(
        label='Download Translated File',
        file_name='Translated.docx',
        data=binary_output.getvalue(),
        mime='application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    )
104
+ # st.text_area(label="",value=btTranslator(datas),height=100)
105
+ # Footer