# Streamlit demo: multilingual text translation with Spark NLP's
# MarianTransformer. The user picks a language pair in the sidebar, selects a
# sample sentence (or types one), and the translated text is rendered below.

import html
import os

import streamlit as st
import sparknlp
# NOTE: the relative order of the imports below is kept as in the original
# file; with wildcard imports a later import can shadow an earlier name.
from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline

# Page Configuration
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto",
)

# Custom CSS for Styling
# NOTE(review): the string below holds no CSS in the visible source --
# presumably a <style> block was stripped somewhere upstream; confirm and
# restore it from version control if so.
st.markdown(""" """, unsafe_allow_html=True)


# Initialize Spark NLP
@st.cache_resource
def init_spark():
    """Start the Spark NLP session.

    Wrapped in ``st.cache_resource`` so Streamlit reuses the returned object
    across script reruns instead of calling ``sparknlp.start()`` each time.

    Returns:
        Whatever ``sparknlp.start()`` returns (the Spark session).
    """
    return sparknlp.start()


# Create a Spark NLP Pipeline for MarianTransformer
@st.cache_resource
def create_pipeline(model_name):
    """Build the unfitted translation pipeline for one MarianMT model.

    Stages and their columns:
      * ``DocumentAssembler``: ``text`` -> ``document``
      * ``SentenceDetectorDLModel`` (``sentence_detector_dl``, ``xx``):
        ``document`` -> ``sentences``
      * ``MarianTransformer`` (``model_name``, ``xx``):
        ``sentences`` -> ``translation``

    Args:
        model_name: Name of the pretrained Marian model, e.g.
            ``"opus_mt_en_fr"``.

    Returns:
        A ``pyspark.ml.Pipeline`` containing the three stages above.
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    # ``pretrained`` is called on the class (as for MarianTransformer below)
    # rather than on a throwaway instance.
    sentence_detector = (
        SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")
        .setInputCols(["document"])
        .setOutputCol("sentences")
    )

    marian_translator = (
        MarianTransformer.pretrained(model_name, "xx")
        .setInputCols(["sentences"])
        .setOutputCol("translation")
    )

    return Pipeline(
        stages=[document_assembler, sentence_detector, marian_translator]
    )


# Process the Input Text Through the Pipeline
def fit_data(pipeline, text):
    """Run *text* through *pipeline* and collect the translated sentences.

    Args:
        pipeline: Pipeline as returned by ``create_pipeline``.
        text: The text to translate.

    Returns:
        The collected rows of ``translation.result``; one row is produced
        for the single input row, holding the list of translated sentences.
    """
    # Fetch the cached session here instead of depending on a module-level
    # ``spark`` global that is only assigned later in the script.
    session = init_spark()
    data = session.createDataFrame([[text]]).toDF("text")
    result = pipeline.fit(data).transform(data)
    return result.select("translation.result").collect()


def _load_examples(folder_path):
    """Return the second line of each ``.txt`` file in *folder_path*.

    Files with fewer than two lines are skipped. A missing folder yields an
    empty list instead of crashing the app, and every file handle is closed
    as soon as the file has been read.
    """
    try:
        # Sorted so the sample order is stable; os.listdir order is
        # unspecified.
        filenames = sorted(os.listdir(folder_path))
    except (FileNotFoundError, NotADirectoryError):
        return []

    samples = []
    for filename in filenames:
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as handle:
            lines = handle.readlines()
        if len(lines) >= 2:
            samples.append(lines[1].strip())
    return samples


# Title and Subtitle
title = 'Multilingual Text Translation with Spark NLP and MarianMT'
sub_title = (
    " The MarianTransformer is a powerful, state-of-the-art machine translation model "
    "based on the Transformer architecture. Developed by the MarianMT project, this "
    "annotator supports over 1,000 translation directions, making it one of the most "
    "versatile tools for multilingual natural language processing. Integrated within "
    "Spark NLP, the MarianTransformer Annotator allows for scalable and efficient text "
    "translation, leveraging the parallel processing capabilities of Apache Spark. "
    "Whether you're translating large documents or handling multiple languages "
    "simultaneously, this tool ensures high-quality translations with minimal latency. "
)

# NOTE(review): as with the CSS above, these wrappers contain only newlines
# around the value -- the surrounding HTML tags look stripped; confirm.
st.markdown(f"\n\n{title}\n\n", unsafe_allow_html=True)
st.markdown(f"\n\n{sub_title}\n\n", unsafe_allow_html=True)

# Mapping Models to Descriptions
# NOTE(review): "cpp" is, as far as I know, the OPUS-MT code for
# Portuguese-based creoles and pidgins rather than Portuguese itself --
# confirm this is the intended model.
model_mappings = {
    "opus_mt_en_fr": "Translate text from English to French",
    "opus_mt_en_it": "Translate text from English to Italian",
    "opus_mt_en_es": "Translate text from English to Spanish",
    "opus_mt_en_de": "Translate text from English to German",
    "opus_mt_en_cpp": "Translate text from English to Portuguese",
    "opus_mt_fr_en": "Translate text from French to English",
    "opus_mt_it_en": "Translate text from Italian to English",
    "opus_mt_es_en": "Translate text from Spanish to English",
    "opus_mt_de_en": "Translate text from German to English",
    "opus_mt_cpp_en": "Translate text from Portuguese to English",
}

# Sidebar for Language Selection
st.sidebar.title("Language Selection")

language_mapping = {
    "English": 'en',
    "French": 'fr',
    "Italian": 'it',
    "Spanish": 'es',
    "German": 'de',
    "Portuguese": 'cpp',
}

from_language = st.sidebar.selectbox("Translate From", list(language_mapping.keys()))

# Only pairs with English on one side have an entry in model_mappings.
if from_language == 'English':
    to_language = st.sidebar.selectbox(
        "Translate To",
        ['French', 'Italian', 'Spanish', 'German', 'Portuguese'],
    )
else:
    to_language = st.sidebar.selectbox("Translate To", ['English'])

selected_model = f'opus_mt_{language_mapping[from_language]}_{language_mapping[to_language]}'
st.subheader(model_mappings[selected_model])

# Reference Notebook Link in Sidebar
# NOTE(review): the link markup is empty in the visible source (only
# newlines remain); presumably an <a>/<img> badge was stripped -- confirm.
link = "\n\n"
st.sidebar.title('')
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Load Sample Text Files
folder_path = f"inputs/{selected_model}"
examples = _load_examples(folder_path)

selected_text = st.selectbox("Select a Sample Text", examples)
custom_input = st.text_input("Try it for yourself!")

# Free-form input takes precedence over the selected sample.
if custom_input:
    selected_text = custom_input

# With no samples on disk and no custom input there is nothing to translate;
# stop here rather than pass None/"" to Spark.
if not selected_text:
    st.warning("No text to translate. Select a sample or enter your own text.")
    st.stop()

# Display the Selected or Entered Text
st.subheader('Selected Text')
st.write(selected_text)

# Perform Translation and Display the Result
st.subheader("Translation Result")

spark = init_spark()  # kept as a module-level name for backward compatibility
pipeline = create_pipeline(selected_model)
output = fit_data(pipeline, selected_text)

# One translated string per detected sentence; join with a space so that
# consecutive sentences are not glued together.
translated_sentences = output[0][0] if output else []
res = " ".join(translated_sentences)

# NOTE(review): wrapper markup appears stripped here as well -- confirm.
HTML_WRAPPER = "\n\n{}\n\n"

# The translation derives from user-supplied text and is rendered with
# unsafe_allow_html=True, so escape &, < and > before interpolating it.
st.markdown(
    HTML_WRAPPER.format(html.escape(res, quote=False)),
    unsafe_allow_html=True,
)