Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import re
import tempfile
from multiprocessing import Pool, cpu_count

import fitz
import streamlit as st
from transformers import pipeline, MBart50TokenizerFast, MBartForConditionalGeneration
|
5 |
+
|
6 |
+
# Load the summarization pipeline and the translation model/tokenizer.
# Streamlit re-executes this script on every widget interaction, so the
# loading is wrapped in a cached function: the models are built once per
# server process and reused on later reruns instead of being reloaded.
@st.cache_resource
def _load_models():
    """Build and return (summarizer, translation model, translation tokenizer)."""
    summarization_pipeline = pipeline("summarization", model="Falconsai/text_summarization")
    translation_model = MBartForConditionalGeneration.from_pretrained(
        "facebook/mbart-large-50-one-to-many-mmt"
    )
    translation_tokenizer = MBart50TokenizerFast.from_pretrained(
        "facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX"
    )
    return summarization_pipeline, translation_model, translation_tokenizer


# Module-level names used by the functions below.
summarizer, model, tokenizer = _load_models()

# Define max chunk length (in characters)
max_chunk_length = 1024
|
15 |
+
|
16 |
+
# Function to chunk text
|
17 |
+
def chunk_text(text, max_chunk_length):
    """Split *text* into chunks of at most *max_chunk_length* characters.

    Sentences are kept whole where possible and packed greedily into chunks,
    joined by a single space. A sentence ends at a period that is followed by
    whitespace, so the period stays attached to its sentence and decimals
    such as "3.14" are not split.

    Args:
        text: The text to split.
        max_chunk_length: Maximum number of characters per chunk; must be
            positive.

    Returns:
        A list of non-empty chunks in original order. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If *max_chunk_length* is not positive.
    """
    if max_chunk_length <= 0:
        raise ValueError("max_chunk_length must be a positive integer")

    chunks = []
    current_chunk = ""
    for sentence in re.split(r"(?<=\.)\s+", text):
        sentence = sentence.strip()
        if not sentence:
            # Leading/trailing whitespace produces empty fragments.
            continue

        if len(sentence) > max_chunk_length:
            # A single sentence that cannot fit is hard-split so that no
            # chunk ever exceeds the limit.
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            chunks.extend(
                sentence[start:start + max_chunk_length]
                for start in range(0, len(sentence), max_chunk_length)
            )
            continue

        if not current_chunk:
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) <= max_chunk_length:
            # +1 accounts for the joining space.
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)
    return chunks
|
31 |
+
|
32 |
+
# Function to summarize and translate a chunk
|
33 |
+
def summarize_and_translate_chunk(chunk, lang):
    """Summarize one text chunk and translate the summary.

    Args:
        chunk: Source text passed to the module-level ``summarizer``.
        lang: Target language code forwarded to ``translate_summary``.

    Returns:
        The translated summary text for this chunk.
    """
    # Deterministic summarization (no sampling), bounded to 30-150 tokens.
    results = summarizer(chunk, max_length=150, min_length=30, do_sample=False)

    # The pipeline returns a list of result dicts; the first one holds the text.
    return translate_summary(results[0]['summary_text'], lang)
|
40 |
+
|
41 |
+
# Function to translate the summary
|
42 |
+
def translate_summary(summary, lang):
    """Translate *summary* into the language identified by *lang*.

    Args:
        summary: Text to translate.
        lang: Language code looked up in ``tokenizer.lang_code_to_id``.

    Returns:
        The translated pieces joined by single spaces.
    """
    # Only text longer than the character limit is split up; shorter text is
    # translated in one piece.
    if len(summary) <= max_chunk_length:
        pieces = [summary]
    else:
        pieces = chunk_text(summary, max_chunk_length)

    outputs = []
    for piece in pieces:
        encoded = tokenizer(piece, return_tensors="pt", padding=True, truncation=True)
        # forced_bos_token_id makes generation start with the target-language token.
        token_ids = model.generate(
            **encoded,
            forced_bos_token_id=tokenizer.lang_code_to_id[lang],
            max_length=1024,
            num_beams=4,
            early_stopping=True,
            length_penalty=2.0,
        )
        decoded = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        outputs.append(decoded[0])

    return " ".join(outputs)
|
64 |
+
|
65 |
+
# Function to read PDF and summarize and translate chunk by chunk
|
66 |
+
def summarize_and_translate_pdf(pdf_path, lang):
    """Summarize and translate the text of a PDF, chunk by chunk.

    Args:
        pdf_path: Filesystem path of the PDF, raw PDF bytes, or a binary
            file-like object holding the PDF (for example the object
            returned by ``st.file_uploader``).
        lang: Target language code forwarded to
            ``summarize_and_translate_chunk``.

    Returns:
        A list with one translated summary per non-blank text chunk, in
        document order. A PDF without extractable text yields an empty list.
    """
    chunks = []
    # The context manager closes the document even if text extraction fails.
    with _open_pdf(pdf_path) as doc:
        for page in doc:
            text = page.get_text()
            # Fixed-width character slices keep each chunk within the limit.
            chunks.extend(
                text[start:start + max_chunk_length]
                for start in range(0, len(text), max_chunk_length)
            )

    # NOTE: chunks are processed in this process rather than in a
    # multiprocessing pool. The summarizer and translation model are
    # module-level objects; sharing them with worker processes (fork or
    # re-import) is fragile and memory-hungry.
    # Whitespace-only chunks are skipped: there is nothing to summarize.
    return [
        summarize_and_translate_chunk(chunk, lang)
        for chunk in chunks
        if chunk.strip()
    ]


def _open_pdf(source):
    """Open *source* (path, bytes, or binary file-like object) as a PDF document."""
    if isinstance(source, (bytes, bytearray)):
        return fitz.open(stream=bytes(source), filetype="pdf")
    if hasattr(source, "getvalue"):
        # In-memory buffers: getvalue() returns the full content regardless
        # of the current read position.
        return fitz.open(stream=source.getvalue(), filetype="pdf")
    if hasattr(source, "read"):
        return fitz.open(stream=source.read(), filetype="pdf")
    return fitz.open(source)
|
81 |
+
|
82 |
+
# Streamlit UI
st.title("PDF Summarization and Translation")

# File upload
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file:
    # Display uploaded file
    st.write("Uploaded PDF file:", uploaded_file.name)

    # Language selection: display name -> language code understood by the
    # translation tokenizer.
    languages = {
        "Arabic": "ar_AR", "Czech": "cs_CZ", "German": "de_DE", "English": "en_XX", "Spanish": "es_XX",
        "Estonian": "et_EE", "Finnish": "fi_FI", "French": "fr_XX", "Gujarati": "gu_IN", "Hindi": "hi_IN",
        "Italian": "it_IT", "Japanese": "ja_XX", "Kazakh": "kk_KZ", "Korean": "ko_KR", "Lithuanian": "lt_LT",
        "Latvian": "lv_LV", "Burmese": "my_MM", "Nepali": "ne_NP", "Dutch": "nl_XX", "Romanian": "ro_RO",
        "Russian": "ru_RU", "Sinhala": "si_LK", "Turkish": "tr_TR", "Vietnamese": "vi_VN", "Chinese": "zh_CN",
        "Afrikaans": "af_ZA", "Azerbaijani": "az_AZ", "Bengali": "bn_IN", "Persian": "fa_IR", "Hebrew": "he_IL",
        "Croatian": "hr_HR", "Indonesian": "id_ID", "Georgian": "ka_GE", "Khmer": "km_KH", "Macedonian": "mk_MK",
        "Malayalam": "ml_IN", "Mongolian": "mn_MN", "Marathi": "mr_IN", "Polish": "pl_PL", "Pashto": "ps_AF",
        "Portuguese": "pt_XX", "Swedish": "sv_SE", "Swahili": "sw_KE", "Tamil": "ta_IN", "Telugu": "te_IN",
        "Thai": "th_TH", "Tagalog": "tl_XX", "Ukrainian": "uk_UA", "Urdu": "ur_PK", "Xhosa": "xh_ZA",
        "Galician": "gl_ES", "Slovene": "sl_SI",
    }

    lang = st.selectbox("Select language for translation", list(languages))

    # Translate PDF
    if st.button("Summarize and Translate"):
        # The upload is an in-memory buffer, not a file on disk, while
        # summarize_and_translate_pdf takes a path: spool it to a temporary
        # file first and remove that file once processing is done.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
            tmp_pdf.write(uploaded_file.getvalue())
            pdf_path = tmp_pdf.name
        try:
            translated_chunks = summarize_and_translate_pdf(pdf_path, languages[lang])
        finally:
            os.remove(pdf_path)

        # Display translated text
        st.header("Translated Summary")
        for chunk in translated_chunks:
            st.write(chunk)
|