import streamlit as st
import sparknlp
from johnsnowlabs import nlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
import pandas as pd

# Page Configuration
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded"
)

# CSS Styling
st.markdown(""" """, unsafe_allow_html=True)


# Initialize Spark
@st.cache_resource
def init_spark():
    """Start Spark NLP and return the session.

    Wrapped in ``st.cache_resource`` so the session is created once and
    reused across Streamlit reruns instead of being restarted each time.
    """
    return sparknlp.start()


# Create NLP Pipeline
@st.cache_resource
def create_pipeline():
    """Build the (unfitted) dependency-parsing pipeline.

    Stages, in order: document assembler -> tokenizer -> POS tagger ->
    dependency parser -> typed dependency parser. The result is cached
    with ``st.cache_resource`` so the pretrained models are only fetched
    once per process.

    Returns:
        A ``pyspark.ml.Pipeline`` reading the ``text`` column and producing
        ``token``, ``pos``, ``dependency`` and ``dependency_type`` columns.
    """
    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    tokenizer = Tokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")

    pos_tagger = PerceptronModel.pretrained("pos_anc", 'en') \
        .setInputCols(["document", "token"]) \
        .setOutputCol("pos")

    dep_parser = DependencyParserModel.pretrained('dependency_conllu') \
        .setInputCols(["document", "pos", "token"]) \
        .setOutputCol("dependency")

    typed_dep_parser = TypedDependencyParserModel.pretrained('dependency_typed_conllu') \
        .setInputCols(["token", "pos", "dependency"]) \
        .setOutputCol("dependency_type")

    pipeline = Pipeline(stages=[
        document_assembler,
        tokenizer,
        pos_tagger,
        dep_parser,
        typed_dep_parser,
    ])
    return pipeline


# Fit Data to Pipeline
def fit_data(pipeline, text):
    """Run ``text`` through ``pipeline`` and return the annotated DataFrame.

    Args:
        pipeline: The pipeline built by ``create_pipeline``.
        text: The sentence(s) to analyze.

    Returns:
        The Spark DataFrame produced by fitting and transforming a
        single-row DataFrame whose ``text`` column holds ``text``.
    """
    # Obtain the session from the cached initializer instead of relying on
    # a module-level ``spark`` name that only exists later in the script;
    # init_spark() is cached, so this is the same session object.
    spark = init_spark()
    df = spark.createDataFrame([[text]]).toDF("text")
    return pipeline.fit(df).transform(df)


# Render DataFrame as HTML Table
def render_table(df, sidebar=False):
    """Render a pandas DataFrame as an HTML table in the page or sidebar.

    Args:
        df: The pandas DataFrame to display.
        sidebar: When true, render into ``st.sidebar``; otherwise into the
            main page.
    """
    # Cells hold tokens from user-supplied text and model labels, so they
    # must be HTML-escaped: the markup is rendered with
    # unsafe_allow_html=True, and an unescaped angle bracket or ampersand
    # would be interpreted as HTML instead of being shown.
    table_html = df.to_html(classes="table", index=False, escape=True)
    target = st.sidebar if sidebar else st
    target.markdown(table_html, unsafe_allow_html=True)


def explain_tags(tag_type, tags, tag_dict):
    """Show a sidebar table explaining the tags that have a known meaning.

    Args:
        tag_type: Label used in the column headers (e.g. ``"POS"``).
        tags: Iterable of tag strings to explain.
        tag_dict: Mapping from tag to its human-readable meaning; tags that
            are not in this mapping are skipped.

    Nothing is rendered when none of ``tags`` is found in ``tag_dict``.
    """
    explanations = [(tag, tag_dict[tag]) for tag in tags if tag in tag_dict]
    if not explanations:
        return
    # render_table() emits the table with index=False, so the index needs
    # no special handling here.
    df = pd.DataFrame(
        explanations,
        columns=[f"{tag_type} Tag", f"{tag_type} Meaning"],
    )
    render_table(df, sidebar=True)


# Page Title and Subtitle
title = "Grammar Analysis & Dependency Parsing"
# Imported here because this section is the only user of it: the text the
# visitor types is escaped before it is embedded in rendered HTML.
import html

sub_title = "Visualize the syntactic structure of a sentence as a directed labeled graph."

# Triple-quoted so the literal may legally span several lines; the string
# content is unchanged.
st.markdown(f'''

{title}

''', unsafe_allow_html=True)
st.markdown(f'''

{sub_title}

''', unsafe_allow_html=True)

# Example Sentences
examples = [
    "John Snow is a good man. He knows a lot about science.",
    "In what country is the WTO headquartered?",
    "I was wearing my dark blue shirt and tie.",
    "The Geneva Motor Show is the most popular car show of the year.",
    "Bill Gates and Steve Jobs had periods of civility.",
]

# Text Selection
selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own sentence!")

# A custom sentence takes priority over the selected example, but a
# whitespace-only entry is treated as "nothing typed".
text_to_analyze = custom_input if custom_input.strip() else selected_text

st.write('Text to analyze:')
HTML_WRAPPER = """

{}

"""
# The text is user-controlled and rendered with unsafe_allow_html=True, so
# escape it to display it literally rather than as markup.
st.markdown(HTML_WRAPPER.format(html.escape(text_to_analyze)), unsafe_allow_html=True)

# Initialize Spark and Pipeline
spark = init_spark()
pipeline = create_pipeline()
output = fit_data(pipeline, text_to_analyze)

# Display Dependency Tree
st.write("Dependency Tree:")
nlp.load('dep.typed').viz_streamlit_dep_tree(
    text=text_to_analyze,
    title='',
    sub_title='',
    set_wide_layout_CSS=False,
    generate_code_sample=False,
    key="NLU_streamlit",
    show_infos=False,
    show_logo=False,
    show_text_input=False,
)

# Display Raw Result
st.write("Raw Result:")
# Zip the per-token annotation arrays together and explode them so that
# each token becomes one row; the zipped struct fields are then read by
# position ('0'..'5') and given readable column names.
zipped = F.arrays_zip(
    output.token.result,
    output.token.begin,
    output.token.end,
    output.pos.result,
    output.dependency.result,
    output.dependency_type.result,
)
df = (
    output.select(F.explode(zipped).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']").alias("pos"),
        F.expr("cols['4']").alias("dependency"),
        F.expr("cols['5']").alias("dependency_type"),
    )
    .toPandas()
)
render_table(df)

# Sidebar Content
# POS and Dependency dictionaries
pos_dict = {
    "CC": "Coordinating conjunction",
    "CD": "Cardinal number",
    "DT": "Determiner",
    "EX": "Existential there",
    "FW": "Foreign word",
    "IN": "Preposition or subordinating conjunction",
    "JJ": "Adjective",
    "JJR": "Adjective, comparative",
    "JJS": "Adjective, superlative",
    "LS": "List item marker",
    "MD": "Modal",
    "NN": "Noun, singular or mass",
    "NNS": "Noun, plural",
    "NNP": "Proper noun, singular",
    "NNPS": "Proper noun, plural",
    "PDT": "Predeterminer",
    "POS": "Possessive ending",
    "PRP": "Personal pronoun",
    "PRP$": "Possessive pronoun",
    "RB": "Adverb",
    "RBR": "Adverb, comparative",
    "RBS": "Adverb, superlative",
    "RP": "Particle",
    "SYM": "Symbol",
    "TO": "to",
    "UH": "Interjection",
    "VB": "Verb, base form",
    "VBD": "Verb, past tense",
    "VBG": "Verb, gerund or present participle",
    "VBN": "Verb, past participle",
    "VBP": "Verb, non-3rd person singular present",
    "VBZ": "Verb, 3rd person singular present",
    "WDT": "Wh-determiner",
    "WP": "Wh-pronoun",
    "WP$": "Possessive wh-pronoun",
    "WRB": "Wh-adverb",
}

dependency_dict = {
    "acl": "clausal modifier of noun (adjectival clause)",
    "advcl": "adverbial clause modifier",
    "advmod": "adverbial modifier",
    "amod": "adjectival modifier",
    "appos": "appositional modifier",
    "aux": "auxiliary",
    "case": "case marking",
    "cc": "coordinating conjunction",
    "ccomp": "clausal complement",
    "clf": "classifier",
    "compound": "compound",
    "conj": "conjunct",
    "cop": "copula",
    "csubj": "clausal subject",
    "dep": "unspecified dependency",
    "det": "determiner",
    "discourse": "discourse element",
    "dislocated": "dislocated elements",
    "expl": "expletive",
    "fixed": "fixed multiword expression",
    "flat": "flat multiword expression",
    "goeswith": "goes with",
    "iobj": "indirect object",
    "list": "list",
    "mark": "marker",
    "nmod": "nominal modifier",
    "nsubj": "nominal subject",
    "nummod": "numeric modifier",
    "obj": "object",
    "obl": "oblique nominal",
    "orphan": "orphan",
    "parataxis": "parataxis",
    "punct": "punctuation",
    "reparandum": "overridden disfluency",
    "root": "root",
    "vocative": "vocative",
    "xcomp": "open clausal complement",
}

# Get unique POS and dependency tags
unique_pos = df['pos'].unique()
unique_dep = df['dependency_type'].unique()

# Sidebar options for explanations
if st.sidebar.checkbox("Explain POS Tags"):
    explain_tags("POS", unique_pos, pos_dict)

if st.sidebar.checkbox("Explain Dependency Types"):
    explain_tags("Dependency", unique_dep, dependency_dict)

# Sidebar with Reference Notebook Link
colab_link = """

"""
st.sidebar.markdown('Reference Notebook:')
st.sidebar.markdown(colab_link, unsafe_allow_html=True)