# MarianMT / Demo.py
import streamlit as st
import sparknlp
import os
from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
# Page Configuration
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)
# Custom CSS for Styling
st.markdown("""
    <style>
        .main-title {
            font-size: 36px;
            color: #4A90E2;
            font-weight: bold;
            text-align: center;
        }
        .section {
            background-color: #f9f9f9;
            padding: 10px;
            border-radius: 10px;
            margin-top: 10px;
        }
        .section p, .section ul {
            color: #666666;
        }
    </style>
""", unsafe_allow_html=True)
# Initialize Spark NLP
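# sparknlp.start() creates (or reuses) a SparkSession with the Spark NLP jars on the
# classpath; st.cache_resource keeps a single session alive for the Streamlit server process.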
@st.cache_resource
def init_spark():
    return sparknlp.start()
# Create a Spark NLP Pipeline for MarianTransformer
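# The pipeline chains three annotators: DocumentAssembler wraps the raw text into a
# document annotation, SentenceDetectorDLModel splits it into sentences, and
# MarianTransformer translates each sentence. The "xx" language code marks the
# pretrained models as multilingual.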
@st.cache_resource
def create_pipeline(model_name):
    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") \
        .setInputCols(["document"]) \
        .setOutputCol("sentences")

    marian_translator = MarianTransformer.pretrained(model_name, "xx") \
        .setInputCols(["sentences"]) \
        .setOutputCol("translation")

    return Pipeline(stages=[document_assembler, sentence_detector, marian_translator])
# Process the Input Text Through the Pipeline
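# None of the stages are trainable, so fit() simply materialises a PipelineModel;
# 'translation.result' holds the translated sentences for each input row.
# For single-string requests, Spark NLP's LightPipeline would be a lighter-weight
# alternative, e.g. (sketch, not used in this demo):
#   light = LightPipeline(pipeline.fit(spark.createDataFrame([[""]]).toDF("text")))
#   translated = light.annotate(text)["translation"]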
def fit_data(pipeline, text):
    data = spark.createDataFrame([[text]]).toDF("text")
    result = pipeline.fit(data).transform(data)
    return result.select('translation.result').collect()
# Title and Subtitle
title = 'Multilingual Text Translation with Spark NLP and MarianMT'
sub_title = """
MarianTransformer is a powerful, state-of-the-art machine translation annotator based on the Transformer architecture. Built on pretrained models from the MarianMT project, it supports over 1,000 translation directions, making it one of the most versatile tools for multilingual natural language processing. Integrated within Spark NLP, the annotator enables scalable and efficient text translation by leveraging the parallel processing capabilities of Apache Spark. Whether you're translating large documents or handling multiple languages at once, it delivers high-quality translations with minimal latency.
"""
st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
st.markdown(f'<div class="section"><p>{sub_title}</p></div>', unsafe_allow_html=True)
# Mapping Models to Descriptions
model_mappings = {
"opus_mt_en_fr": "Translate text from English to French",
"opus_mt_en_it": "Translate text from English to Italian",
"opus_mt_en_es": "Translate text from English to Spanish",
"opus_mt_en_de": "Translate text from English to German",
"opus_mt_en_cpp": "Translate text from English to Portuguese",
"opus_mt_fr_en": "Translate text from French to English",
"opus_mt_it_en": "Translate text from Italian to English",
"opus_mt_es_en": "Translate text from Spanish to English",
"opus_mt_de_en": "Translate text from German to English",
"opus_mt_cpp_en": "Translate text from Portuguese to English"
}
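# Note: the English<->Portuguese pair is served by the opus_mt_en_cpp / opus_mt_cpp_en
# models; "cpp" is the OPUS group code for Portuguese-based creoles and pidgins rather
# than a plain "pt" pair name.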
# Sidebar for Language Selection
st.sidebar.title("Language Selection")
language_mapping = {
"English": 'en',
"French": 'fr',
"Italian": 'it',
"Spanish": 'es',
"German": 'de',
"Portuguese": 'cpp'
}
from_language = st.sidebar.selectbox("Translate From", list(language_mapping.keys()))
if from_language == 'English':
    to_language = st.sidebar.selectbox("Translate To", ['French', 'Italian', 'Spanish', 'German', 'Portuguese'])
else:
    to_language = st.sidebar.selectbox("Translate To", ['English'])
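# Only English<->X pairs are exposed: non-English sources translate to English only.
# The model name below is assembled as opus_mt_<src>_<tgt> to match the keys in model_mappings.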
selected_model = f'opus_mt_{language_mapping[from_language]}_{language_mapping[to_language]}'
st.subheader(model_mappings[selected_model])
# Reference Notebook Link in Sidebar
link= """<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/TRANSLATION_MARIAN.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/></a>"""
st.sidebar.title('')
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)
# Load Sample Text Files
folder_path = f"inputs/{selected_model}"
examples = [
    lines[1].strip()
    for filename in os.listdir(folder_path)
    if filename.endswith('.txt')
    for lines in [open(os.path.join(folder_path, filename), 'r', encoding='utf-8').readlines()]
    if len(lines) >= 2
]
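# Each sample file under inputs/<model_name>/ is expected to hold a title on its first
# line and the sample text to translate on its second line (lines[1]).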
selected_text = st.selectbox("Select a Sample Text", examples)
custom_input = st.text_input("Try it for yourself!")
if custom_input:
    selected_text = custom_input
# Display the Selected or Entered Text
st.subheader('Selected Text')
st.write(selected_text)
# Perform Translation and Display the Result
st.subheader("Translation Result")
spark = init_spark()
pipeline = create_pipeline(selected_model)
output = fit_data(pipeline, selected_text)
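# collect() returns one Row per input row; output[0][0] is the list of per-sentence
# translations for the single row submitted above.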
res = "".join(output[0][0])
HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
st.markdown(HTML_WRAPPER.format(res), unsafe_allow_html=True)