|
import html

import streamlit as st
import sparknlp
from johnsnowlabs import nlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
import pandas as pd
|
|
|
|
|
|
# Page configuration: wide layout with the sidebar expanded on load.
st.set_page_config(layout="wide", initial_sidebar_state="expanded")
|
|
|
|
|
|
# Stylesheet for the page title, the grey "section" cards and the HTML tables
# emitted by render_table (which uses the "table" class).
PAGE_STYLES = """
<style>
    .main-title {
        font-size: 36px;
        color: #4A90E2;
        font-weight: bold;
        text-align: center;
    }
    .section {
        background-color: #f9f9f9;
        padding: 10px;
        border-radius: 10px;
        margin-top: 10px;
    }
    .section p, .section ul {
        color: #666666;
    }
    .table {
        width: 100%;
        border-collapse: collapse;
        margin-top: 20px;
    }
    .table th, .table td {
        border: 1px solid #ddd;
        padding: 8px;
        text-align: left;
    }
    .table th {
        background-color: #4A90E2;
        color: white;
    }
    .table td {
        background-color: #f2f2f2;
    }
</style>
"""

# unsafe_allow_html is required so the <style> element is emitted as markup.
st.markdown(PAGE_STYLES, unsafe_allow_html=True)
|
|
|
|
|
|
@st.cache_resource
def init_spark():
    """Start Spark NLP and return whatever ``sparknlp.start()`` produces.

    The function is wrapped in ``st.cache_resource``, so Streamlit caches the
    returned object instead of invoking ``sparknlp.start()`` on every call.
    """
    session = sparknlp.start()
    return session
|
|
|
|
|
|
@st.cache_resource
def create_pipeline():
    """Assemble the five-stage pipeline used for grammar analysis.

    Stages, in order:
        1. ``DocumentAssembler``: ``text`` -> ``document``
        2. ``Tokenizer``: ``document`` -> ``token``
        3. ``PerceptronModel`` (pretrained ``pos_anc``, ``en``) -> ``pos``
        4. ``DependencyParserModel`` (pretrained ``dependency_conllu``)
           -> ``dependency``
        5. ``TypedDependencyParserModel`` (pretrained
           ``dependency_typed_conllu``) -> ``dependency_type``

    Returns:
        An unfitted ``Pipeline`` holding the stages above. The result is
        cached through ``st.cache_resource``.
    """
    assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    token_stage = (
        Tokenizer()
        .setInputCols(["document"])
        .setOutputCol("token")
    )

    pos_stage = (
        PerceptronModel.pretrained("pos_anc", 'en')
        .setInputCols("document", "token")
        .setOutputCol("pos")
    )

    untyped_parser = (
        DependencyParserModel.pretrained('dependency_conllu')
        .setInputCols(["document", "pos", "token"])
        .setOutputCol("dependency")
    )

    typed_parser = (
        TypedDependencyParserModel.pretrained('dependency_typed_conllu')
        .setInputCols(["token", "pos", "dependency"])
        .setOutputCol("dependency_type")
    )

    # Stage order matters: each stage consumes columns produced earlier.
    return Pipeline(stages=[
        assembler,
        token_stage,
        pos_stage,
        untyped_parser,
        typed_parser,
    ])
|
|
|
|
|
|
def fit_data(pipeline, text):
    """Fit ``pipeline`` on a one-row DataFrame built from ``text`` and apply it.

    Args:
        pipeline: Object exposing ``fit(df)`` whose result exposes
            ``transform(df)`` (here, the pipeline from ``create_pipeline``).
        text: The string to analyze; it becomes the single value of the
            ``text`` column.

    Returns:
        The DataFrame produced by transforming the one-row input.

    Note:
        Relies on the module-level ``spark`` object, which must be assigned
        before this function is called.
    """
    single_row = spark.createDataFrame([[text]]).toDF("text")
    fitted = pipeline.fit(single_row)
    return fitted.transform(single_row)
|
|
|
|
|
|
def render_table(df, sidebar=False):
    """Render a pandas DataFrame as an HTML table styled with the ``table`` class.

    Args:
        df: DataFrame to display; the index is not rendered.
        sidebar: When true the table is written to ``st.sidebar``, otherwise
            to the main page.

    Cell values are HTML-escaped. The tables shown by this app contain tokens
    taken from user-typed text, and the markup is emitted with
    ``unsafe_allow_html=True``; without escaping, a token such as ``<b>`` or
    ``a<b`` would be interpreted as markup instead of being displayed.
    """
    table_markup = df.to_html(classes="table", index=False, escape=True)
    target = st.sidebar if sidebar else st
    target.markdown(table_markup, unsafe_allow_html=True)
|
|
|
|
def explain_tags(tag_type, tags, tag_dict):
    """Show a sidebar table explaining every tag that has a known meaning.

    Args:
        tag_type: Label used in the column headers (``"<tag_type> Tag"`` and
            ``"<tag_type> Meaning"``).
        tags: Iterable of tags to look up.
        tag_dict: Mapping from tag to its human-readable meaning.

    Tags missing from ``tag_dict`` are skipped. Nothing is rendered when no
    tag has a known meaning.
    """
    known = [(tag, tag_dict[tag]) for tag in tags if tag in tag_dict]
    if not known:
        return

    headers = [f"{tag_type} Tag", f"{tag_type} Meaning"]
    # Blank index labels, matching one empty string per row.
    table = pd.DataFrame(known, columns=headers, index=[''] * len(known))
    render_table(table, sidebar=True)
|
|
|
|
|
|
# Page heading: a styled title followed by a centered, grey subtitle.
title = "Grammar Analysis & Dependency Parsing"
sub_title = "Visualize the syntactic structure of a sentence as a directed labeled graph."

title_markup = f'<div class="main-title">{title}</div>'
subtitle_markup = f'<div style="text-align: center; color: #666666;">{sub_title}</div>'

st.markdown(title_markup, unsafe_allow_html=True)
st.markdown(subtitle_markup, unsafe_allow_html=True)
|
|
|
|
|
|
# Example sentences offered in the select box.
examples = [
    "John Snow is a good man. He knows a lot about science.",
    "In what country is the WTO headquartered?",
    "I was wearing my dark blue shirt and tie.",
    "The Geneva Motor Show is the most popular car show of the year.",
    "Bill Gates and Steve Jobs had periods of civility.",
]

selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own sentence!")

# A custom sentence takes precedence over the selected example, but only when
# it contains something other than whitespace; a blank entry would otherwise
# send empty text through the pipeline.
text_to_analyze = custom_input if custom_input.strip() else selected_text

st.write('Text to analyze:')
HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto;
border: 1px solid #e6e9ef; border-radius: 0.25rem;
padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""

# The text may be user-typed and is rendered with unsafe_allow_html=True, so
# it is HTML-escaped first: characters such as < > & are shown literally
# instead of being interpreted as markup.
st.markdown(HTML_WRAPPER.format(html.escape(text_to_analyze)), unsafe_allow_html=True)
|
|
|
|
|
|
# Start (or reuse) Spark, build (or reuse) the pipeline, and annotate the text.
# `spark` must stay a module-level name: fit_data reads it as a global.
spark = init_spark()
pipeline = create_pipeline()
output = fit_data(pipeline, text_to_analyze)


@st.cache_resource
def _load_dep_tree_model():
    """Load the ``dep.typed`` model used for the dependency-tree drawing.

    Streamlit re-executes this script on every interaction; caching the loaded
    object avoids calling ``nlp.load`` again on each rerun, in line with how
    ``init_spark`` and ``create_pipeline`` are cached.
    """
    return nlp.load('dep.typed')


st.write("Dependency Tree:")
_load_dep_tree_model().viz_streamlit_dep_tree(
    text=text_to_analyze,
    title='',
    sub_title='',
    set_wide_layout_CSS=False,
    generate_code_sample=False,
    key="NLU_streamlit",
    show_infos=False,
    show_logo=False,
    show_text_input=False,
)
|
|
|
|
|
|
st.write("Raw Result:")

# Zip the per-token annotation arrays together so that exploding them yields
# one struct (and therefore one output row) per token.
zipped_annotations = F.arrays_zip(
    output.token.result,
    output.token.begin,
    output.token.end,
    output.pos.result,
    output.dependency.result,
    output.dependency_type.result,
)
exploded = output.select(F.explode(zipped_annotations).alias("cols"))

# Output column names, in the same order as the zipped arrays above; the
# struct fields are addressed by position ('0', '1', ...).
result_columns = ["chunk", "begin", "end", "pos", "dependency", "dependency_type"]
df = exploded.select(
    *[
        F.expr(f"cols['{position}']").alias(column_name)
        for position, column_name in enumerate(result_columns)
    ]
).toPandas()

render_table(df)
|
|
|
|
|
|
|
|
# Part-of-speech tag -> human-readable meaning, used for the sidebar glossary.
pos_dict = {
    "CC": "Coordinating conjunction",
    "CD": "Cardinal number",
    "DT": "Determiner",
    "EX": "Existential there",
    "FW": "Foreign word",
    "IN": "Preposition or subordinating conjunction",
    "JJ": "Adjective",
    "JJR": "Adjective, comparative",
    "JJS": "Adjective, superlative",
    "LS": "List item marker",
    "MD": "Modal",
    "NN": "Noun, singular or mass",
    "NNS": "Noun, plural",
    "NNP": "Proper noun, singular",
    "NNPS": "Proper noun, plural",
    "PDT": "Predeterminer",
    "POS": "Possessive ending",
    "PRP": "Personal pronoun",
    "PRP$": "Possessive pronoun",
    "RB": "Adverb",
    "RBR": "Adverb, comparative",
    "RBS": "Adverb, superlative",
    "RP": "Particle",
    "SYM": "Symbol",
    "TO": "to",
    "UH": "Interjection",
    "VB": "Verb, base form",
    "VBD": "Verb, past tense",
    "VBG": "Verb, gerund or present participle",
    "VBN": "Verb, past participle",
    "VBP": "Verb, non-3rd person singular present",
    "VBZ": "Verb, 3rd person singular present",
    "WDT": "Wh-determiner",
    "WP": "Wh-pronoun",
    "WP$": "Possessive wh-pronoun",
    "WRB": "Wh-adverb",
}
|
|
|
|
# Dependency relation label -> human-readable meaning, used for the sidebar
# glossary. Labels absent from this mapping are simply not explained.
dependency_dict = {
    "acl": "clausal modifier of noun (adjectival clause)",
    "advcl": "adverbial clause modifier",
    "advmod": "adverbial modifier",
    "amod": "adjectival modifier",
    "appos": "appositional modifier",
    "aux": "auxiliary",
    "case": "case marking",
    "cc": "coordinating conjunction",
    "ccomp": "clausal complement",
    "clf": "classifier",
    "compound": "compound",
    "conj": "conjunct",
    "cop": "copula",
    "csubj": "clausal subject",
    "dep": "unspecified dependency",
    "det": "determiner",
    "discourse": "discourse element",
    "dislocated": "dislocated elements",
    "expl": "expletive",
    "fixed": "fixed multiword expression",
    "flat": "flat multiword expression",
    "goeswith": "goes with",
    "iobj": "indirect object",
    "list": "list",
    "mark": "marker",
    "nmod": "nominal modifier",
    "nsubj": "nominal subject",
    "nummod": "numeric modifier",
    "obj": "object",
    "obl": "oblique nominal",
    "orphan": "orphan",
    "parataxis": "parataxis",
    "punct": "punctuation",
    "reparandum": "overridden disfluency",
    "root": "root",
    "vocative": "vocative",
    "xcomp": "open clausal complement",
}
|
|
|
|
|
|
# Distinct tags that actually occur in the analyzed text.
unique_pos = df['pos'].unique()
unique_dep = df['dependency_type'].unique()

# One sidebar checkbox per glossary; ticking it renders the matching table
# directly beneath that checkbox.
sidebar_glossaries = (
    ("Explain POS Tags", "POS", unique_pos, pos_dict),
    ("Explain Dependency Types", "Dependency", unique_dep, dependency_dict),
)
for checkbox_label, tag_type, found_tags, meanings in sidebar_glossaries:
    if st.sidebar.checkbox(checkbox_label):
        explain_tags(tag_type, found_tags, meanings)
|
|
|
|
|
|
# "Open In Colab" badge linking to the reference notebook for this demo.
colab_link = """
<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/GRAMMAR_EN.ipynb">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""

sidebar = st.sidebar
sidebar.markdown('Reference Notebook:')
# The badge is raw HTML, so it has to be rendered with unsafe_allow_html.
sidebar.markdown(colab_link, unsafe_allow_html=True)