import streamlit as st
from transformers import T5Tokenizer, T5ForConditionalGeneration
from pathlib import Path
from pdfminer.high_level import extract_text

# Checkpoint used for both the tokenizer and the model.
MODEL_NAME = "t5-small"

# Prompt prefix prepended to each page, keyed by target language.
TRANSLATION_PREFIXES = {
    "german": "translate English to German: ",
    "french": "translate English to French: ",
}

# Upper bound on tokens fed to the model per page; longer pages are truncated.
MAX_INPUT_TOKENS = 512

# Upper bound on tokens generated per page (unchanged from the original code).
MAX_OUTPUT_TOKENS = 50


def _split_pages(text):
    """Split text returned by ``extract_text`` into a list of per-page strings.

    pdfminer's text output is expected to end every page with a form feed
    ("\\f"), so splitting on it yields one entry per page plus an empty
    trailing entry, which is dropped here.
    NOTE(review): relies on pdfminer's form-feed page separator -- confirm
    against the installed pdfminer.six version.
    """
    pages = text.split("\f")
    if pages and not pages[-1].strip():
        pages.pop()
    return pages


def _translate_pages(pages, prefix, tokenizer, model):
    """Translate each page with ``model`` and return the translated strings.

    The returned list has exactly one entry per input page, in order, so the
    position of a translation still matches its page number. Blank pages are
    not sent to the model; they produce an empty string.
    """
    translated = []
    for page in pages:
        if not page.strip():
            translated.append("")
            continue
        input_ids = tokenizer(
            prefix + page,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_INPUT_TOKENS,
        ).input_ids
        outputs = model.generate(
            input_ids=input_ids,
            max_length=MAX_OUTPUT_TOKENS,
            num_beams=4,
            no_repeat_ngram_size=2,
        )
        translated.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return translated


def main():
    """Run the Streamlit app: upload a PDF, then translate it on request.

    The text is extracted and the model is loaded only after one of the
    translate buttons has been clicked, so merely uploading a file does not
    trigger any translation work.
    """
    st.title("PDF Translation")
    st.write("Upload a PDF file and we will translate the text inside to German and French.")

    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    # Buttons to trigger translation.
    translate_german = st.button("Translate to German")
    translate_french = st.button("Translate to French")
    if not (translate_german or translate_french):
        return

    # Rewind in case the upload buffer was already read during an earlier run.
    uploaded_file.seek(0)
    # extract_text returns a single string, not a sequence of page objects.
    pages = _split_pages(extract_text(uploaded_file))
    if not pages:
        st.warning("No text could be extracted from the PDF.")
        return

    # NOTE(review): the model is reloaded on every translation request;
    # consider Streamlit's resource caching if the installed version has it.
    tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
    model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

    # Translate and display only the language whose button was clicked.
    if translate_german:
        german = _translate_pages(pages, TRANSLATION_PREFIXES["german"], tokenizer, model)
        display_translations(german, "German")
    if translate_french:
        french = _translate_pages(pages, TRANSLATION_PREFIXES["french"], tokenizer, model)
        display_translations(french, "French")


def display_translations(translations, language):
    """Write a language header followed by one numbered line per page.

    Args:
        translations: Iterable of translated page strings, in page order.
        language: Display name of the target language.
    """
    st.write(f"\nLanguage: {language}")
    for idx, translation in enumerate(translations, 1):
        st.write(f"Page {idx}: {translation}")


if __name__ == "__main__":
    main()