Spaces:

spark-nlp
/

sparknlp-grammar-analysis-and-dependency-parsing

Sleeping

sparknlp-grammar-analysis-and-dependency-parsing

File size: 8,646 Bytes

6520bbf

import html

import streamlit as st
import sparknlp
from johnsnowlabs import nlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
import pandas as pd

# Stylesheet injected into the page: title banner, ".section" cards and the
# ".table" class applied to the HTML tables rendered further down.
_PAGE_CSS = """

    <style>

        .main-title {

            font-size: 36px;

            color: #4A90E2;

            font-weight: bold;

            text-align: center;

        }

        .section {

            background-color: #f9f9f9;

            padding: 10px;

            border-radius: 10px;

            margin-top: 10px;

        }

        .section p, .section ul {

            color: #666666;

        }

        .table {

            width: 100%;

            border-collapse: collapse;

            margin-top: 20px;

        }

        .table th, .table td {

            border: 1px solid #ddd;

            padding: 8px;

            text-align: left;

        }

        .table th {

            background-color: #4A90E2;

            color: white;

        }

        .table td {

            background-color: #f2f2f2;

        }

    </style>

"""

# Page Configuration: wide layout with the sidebar opened on load.
st.set_page_config(initial_sidebar_state="expanded", layout="wide")

# CSS Styling
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

# Initialize Spark
@st.cache_resource
def init_spark():
    """Start Spark NLP with ``sparknlp.start()`` and return its result.

    Decorated with ``st.cache_resource`` so the started object is cached by
    Streamlit instead of being created again on later calls.
    """
    session = sparknlp.start()
    return session

# Create NLP Pipeline
@st.cache_resource
def create_pipeline():
    """Assemble the five-stage pipeline used for the grammar analysis.

    Stages, in order, with the column each one writes:
        1. ``DocumentAssembler``: "text" -> "document"
        2. ``Tokenizer``: "document" -> "token"
        3. ``PerceptronModel`` ("pos_anc", English) -> "pos"
        4. ``DependencyParserModel`` ("dependency_conllu") -> "dependency"
        5. ``TypedDependencyParserModel`` ("dependency_typed_conllu")
           -> "dependency_type"

    Returns:
        The unfitted ``Pipeline``; cached through ``st.cache_resource``.
    """
    assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    token_stage = (
        Tokenizer()
        .setInputCols(["document"])
        .setOutputCol("token")
    )

    pos_stage = (
        PerceptronModel.pretrained("pos_anc", 'en')
        .setInputCols("document", "token")
        .setOutputCol("pos")
    )

    untyped_parser = (
        DependencyParserModel.pretrained('dependency_conllu')
        .setInputCols(["document", "pos", "token"])
        .setOutputCol("dependency")
    )

    typed_parser = (
        TypedDependencyParserModel.pretrained('dependency_typed_conllu')
        .setInputCols(["token", "pos", "dependency"])
        .setOutputCol("dependency_type")
    )

    stages = [assembler, token_stage, pos_stage, untyped_parser, typed_parser]
    return Pipeline(stages=stages)

# Fit Data to Pipeline
def fit_data(pipeline, text):
    """Run ``text`` through ``pipeline`` and return the transformed DataFrame.

    A one-row DataFrame with a single "text" column is built from ``text``;
    the pipeline is fitted on it and then applied to the same DataFrame.

    Note: reads the module-level ``spark`` object, which must be assigned
    before this function is called.
    """
    single_row = [[text]]
    input_df = spark.createDataFrame(single_row).toDF("text")
    fitted_model = pipeline.fit(input_df)
    return fitted_model.transform(input_df)

# Render DataFrame as HTML Table
def render_table(df, sidebar=False, escape=True):
    """Render a pandas DataFrame as an HTML table using the ``table`` CSS class.

    Args:
        df: DataFrame to display. Its index is not rendered.
        sidebar: When true the table is written to the sidebar, otherwise to
            the main page.
        escape: Forwarded to ``DataFrame.to_html``. Defaults to ``True`` so
            that ``<``, ``>`` and ``&`` in cell values are shown literally.
            The markup is emitted with ``unsafe_allow_html=True``, so
            unescaped cell text (e.g. tokens taken from user input) would
            otherwise be interpreted as HTML. Pass ``False`` only for frames
            whose cells intentionally contain trusted markup.
    """
    table_html = df.to_html(classes="table", index=False, escape=escape)
    target = st.sidebar if sidebar else st
    target.markdown(table_html, unsafe_allow_html=True)

def explain_tags(tag_type, tags, tag_dict):
    """Show a sidebar table explaining the tags that have a known meaning.

    Args:
        tag_type: Label used in the two column headers
            ("<tag_type> Tag" / "<tag_type> Meaning").
        tags: Iterable of tags to explain; tags missing from ``tag_dict``
            are skipped.
        tag_dict: Mapping from tag to its description.

    Nothing is rendered when none of ``tags`` is present in ``tag_dict``.
    """
    rows = []
    for tag in tags:
        if tag in tag_dict:
            rows.append((tag, tag_dict[tag]))

    if not rows:
        return

    headers = [f"{tag_type} Tag", f"{tag_type} Meaning"]
    table = pd.DataFrame(rows, columns=headers)
    table.index = [''] * len(table)  # Hide the index
    render_table(table, sidebar=True)

# Page Title and Subtitle
title = "Grammar Analysis & Dependency Parsing"
sub_title = "Visualize the syntactic structure of a sentence as a directed labeled graph."

# Title first, then the centered grey subtitle underneath it.
for header_html in (
    f'<div class="main-title">{title}</div>',
    f'<div style="text-align: center; color: #666666;">{sub_title}</div>',
):
    st.markdown(header_html, unsafe_allow_html=True)

# Example Sentences
examples = [
    "John Snow is a good man. He knows a lot about science.",
    "In what country is the WTO headquartered?",
    "I was wearing my dark blue shirt and tie.",
    "The Geneva Motor Show is the most popular car show of the year.",
    "Bill Gates and Steve Jobs had periods of civility.",
]

# Text Selection
selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own sentence!")

# A non-empty custom sentence takes priority over the selected example.
text_to_analyze = custom_input or selected_text

st.write('Text to analyze:')
# Bordered, horizontally scrollable box; the "{}" slot receives the text.
HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; 

                   border: 1px solid #e6e9ef; border-radius: 0.25rem; 

                   padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
# The text can come straight from the free-form input box and the wrapper is
# rendered with unsafe_allow_html=True, so escape it first: characters such
# as "<" or "&" must be displayed literally rather than parsed as markup.
st.markdown(HTML_WRAPPER.format(html.escape(text_to_analyze)), unsafe_allow_html=True)

# Initialize Spark and Pipeline
spark = init_spark()
pipeline = create_pipeline()
output = fit_data(pipeline, text_to_analyze)


@st.cache_resource
def _load_dep_tree_pipeline():
    """Load the 'dep.typed' pipeline used to draw the dependency tree.

    Cached with ``st.cache_resource`` (like the Spark session and the
    analysis pipeline in this script) so the load happens once instead of
    on every rerun of the script.
    """
    return nlp.load('dep.typed')


# Display Dependency Tree
st.write("Dependency Tree:")
_load_dep_tree_pipeline().viz_streamlit_dep_tree(
    text=text_to_analyze,
    title='',
    sub_title='',
    set_wide_layout_CSS=False,
    generate_code_sample=False,
    key="NLU_streamlit",
    show_infos=False,
    show_logo=False,
    show_text_input=False,
)

# Display Raw Result
st.write("Raw Result:")

# Zip the per-token arrays so that each token becomes one struct; the struct
# fields are addressed positionally ('0'..'5') in the projection below.
zipped_annotations = F.arrays_zip(
    output.token.result,
    output.token.begin,
    output.token.end,
    output.pos.result,
    output.dependency.result,
    output.dependency_type.result,
)

# Output column names, listed in the same order as the zipped arrays.
result_columns = ["chunk", "begin", "end", "pos", "dependency", "dependency_type"]

exploded = output.select(F.explode(zipped_annotations).alias("cols"))
df = exploded.select(
    *[
        F.expr(f"cols['{position}']").alias(column_name)
        for position, column_name in enumerate(result_columns)
    ]
).toPandas()

render_table(df)

# Sidebar Content
# Lookup table: part-of-speech tag -> human-readable meaning.
pos_dict = {
    "CC": "Coordinating conjunction",
    "CD": "Cardinal number",
    "DT": "Determiner",
    "EX": "Existential there",
    "FW": "Foreign word",
    "IN": "Preposition or subordinating conjunction",
    # Adjectives
    "JJ": "Adjective",
    "JJR": "Adjective, comparative",
    "JJS": "Adjective, superlative",
    "LS": "List item marker",
    "MD": "Modal",
    # Nouns
    "NN": "Noun, singular or mass",
    "NNS": "Noun, plural",
    "NNP": "Proper noun, singular",
    "NNPS": "Proper noun, plural",
    "PDT": "Predeterminer",
    "POS": "Possessive ending",
    # Pronouns
    "PRP": "Personal pronoun",
    "PRP$": "Possessive pronoun",
    # Adverbs
    "RB": "Adverb",
    "RBR": "Adverb, comparative",
    "RBS": "Adverb, superlative",
    "RP": "Particle",
    "SYM": "Symbol",
    "TO": "to",
    "UH": "Interjection",
    # Verbs
    "VB": "Verb, base form",
    "VBD": "Verb, past tense",
    "VBG": "Verb, gerund or present participle",
    "VBN": "Verb, past participle",
    "VBP": "Verb, non-3rd person singular present",
    "VBZ": "Verb, 3rd person singular present",
    # Wh-words
    "WDT": "Wh-determiner",
    "WP": "Wh-pronoun",
    "WP$": "Possessive wh-pronoun",
    "WRB": "Wh-adverb",
}

# Lookup table: dependency relation label -> human-readable meaning.
# Entries are kept in alphabetical order, grouped by leading letter.
dependency_dict = {
    # a
    "acl": "clausal modifier of noun (adjectival clause)",
    "advcl": "adverbial clause modifier", "advmod": "adverbial modifier",
    "amod": "adjectival modifier", "appos": "appositional modifier",
    "aux": "auxiliary",
    # c
    "case": "case marking", "cc": "coordinating conjunction",
    "ccomp": "clausal complement", "clf": "classifier",
    "compound": "compound", "conj": "conjunct",
    "cop": "copula", "csubj": "clausal subject",
    # d
    "dep": "unspecified dependency", "det": "determiner",
    "discourse": "discourse element", "dislocated": "dislocated elements",
    # e - l
    "expl": "expletive",
    "fixed": "fixed multiword expression", "flat": "flat multiword expression",
    "goeswith": "goes with",
    "iobj": "indirect object",
    "list": "list",
    # m - o
    "mark": "marker",
    "nmod": "nominal modifier", "nsubj": "nominal subject",
    "nummod": "numeric modifier",
    "obj": "object", "obl": "oblique nominal", "orphan": "orphan",
    # p - x
    "parataxis": "parataxis", "punct": "punctuation",
    "reparandum": "overridden disfluency", "root": "root",
    "vocative": "vocative",
    "xcomp": "open clausal complement",
}

# Distinct POS tags and dependency types found in the analyzed text
unique_pos = df['pos'].unique()
unique_dep = df['dependency_type'].unique()

# Sidebar options for explanations, one checkbox per tag family:
# (checkbox label, header prefix, tags found, lookup table)
for checkbox_label, tag_kind, found_tags, meanings in (
    ("Explain POS Tags", "POS", unique_pos, pos_dict),
    ("Explain Dependency Types", "Dependency", unique_dep, dependency_dict),
):
    if st.sidebar.checkbox(checkbox_label):
        explain_tags(tag_kind, found_tags, meanings)

# Sidebar with Reference Notebook Link
st.sidebar.markdown('Reference Notebook:')

# "Open In Colab" badge linking to the reference notebook.
colab_link = """

<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/GRAMMAR_EN.ipynb">

    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>

</a>

"""
st.sidebar.markdown(colab_link, unsafe_allow_html=True)