File size: 8,646 Bytes
6520bbf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
import html

import streamlit as st
import sparknlp
from johnsnowlabs import nlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
import pandas as pd

# Page Configuration
# Settings are collected in one mapping and handed to Streamlit in a single call.
page_settings = {
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**page_settings)

# CSS Styling
# Injects one <style> element into the page. It has to go through
# unsafe_allow_html=True because the payload is raw HTML rather than Markdown.
# Classes defined here and where this script uses them:
#   .main-title - the page title <div> emitted further down
#   .table      - the HTML tables built by render_table() (classes="table")
#   .section    - not referenced anywhere else in this script
st.markdown("""

    <style>

        .main-title {

            font-size: 36px;

            color: #4A90E2;

            font-weight: bold;

            text-align: center;

        }

        .section {

            background-color: #f9f9f9;

            padding: 10px;

            border-radius: 10px;

            margin-top: 10px;

        }

        .section p, .section ul {

            color: #666666;

        }

        .table {

            width: 100%;

            border-collapse: collapse;

            margin-top: 20px;

        }

        .table th, .table td {

            border: 1px solid #ddd;

            padding: 8px;

            text-align: left;

        }

        .table th {

            background-color: #4A90E2;

            color: white;

        }

        .table td {

            background-color: #f2f2f2;

        }

    </style>

""", unsafe_allow_html=True)

# Initialize Spark
@st.cache_resource
def init_spark():
    """Start Spark NLP and return the object produced by ``sparknlp.start()``.

    Decorated with ``st.cache_resource`` so the startup work is not repeated
    on every Streamlit rerun of this script.
    """
    session = sparknlp.start()
    return session

# Create NLP Pipeline
@st.cache_resource
def create_pipeline():
    """Assemble the unfitted Spark ML pipeline used for grammar analysis.

    Stages, in execution order:
        1. ``DocumentAssembler``: "text" -> "document"
        2. ``Tokenizer``: "document" -> "token"
        3. ``PerceptronModel`` ("pos_anc", English) -> "pos"
        4. ``DependencyParserModel`` ("dependency_conllu") -> "dependency"
        5. ``TypedDependencyParserModel`` ("dependency_typed_conllu")
           -> "dependency_type"

    Returns:
        A ``pyspark.ml.Pipeline`` holding the five stages above.
    """
    assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    token_stage = (
        Tokenizer()
        .setInputCols(["document"])
        .setOutputCol("token")
    )

    pos_stage = (
        PerceptronModel.pretrained("pos_anc", 'en')
        .setInputCols("document", "token")
        .setOutputCol("pos")
    )

    # Untyped parser: emits the head of each token.
    untyped_parser = (
        DependencyParserModel.pretrained('dependency_conllu')
        .setInputCols(["document", "pos", "token"])
        .setOutputCol("dependency")
    )

    # Typed parser: consumes the untyped parse and emits the relation label.
    typed_parser = (
        TypedDependencyParserModel.pretrained('dependency_typed_conllu')
        .setInputCols(["token", "pos", "dependency"])
        .setOutputCol("dependency_type")
    )

    ordered_stages = [
        assembler,
        token_stage,
        pos_stage,
        untyped_parser,
        typed_parser,
    ]
    return Pipeline(stages=ordered_stages)

# Fit Data to Pipeline
def fit_data(pipeline, text):
    """Run ``text`` through ``pipeline`` and return the annotated DataFrame.

    Args:
        pipeline: Object exposing ``fit(df)`` whose result exposes
            ``transform(df)`` (here, the pipeline from ``create_pipeline``).
        text: The sentence(s) to analyze; becomes the single row of a
            one-column DataFrame named "text".

    Returns:
        The DataFrame produced by transforming the one-row input.

    Note:
        Reads the module-level ``spark`` session, so that name must be bound
        before this function is called.
    """
    single_row = [[text]]
    input_df = spark.createDataFrame(single_row).toDF("text")
    fitted_model = pipeline.fit(input_df)
    return fitted_model.transform(input_df)

# Render DataFrame as HTML Table
def render_table(df, sidebar=False):
    """Render ``df`` as an HTML table styled by the page's ``.table`` CSS class.

    Args:
        df: pandas DataFrame to display; its index is not rendered.
        sidebar: When true the table is written to the Streamlit sidebar,
            otherwise to the main page body.

    Cell values are HTML-escaped. The tables shown by this app contain tokens
    taken from the sentence the user typed, and the markup is emitted with
    ``unsafe_allow_html=True``; without escaping, a token such as ``<b>`` or
    ``&`` would be interpreted as markup instead of being displayed.
    """
    table_html = df.to_html(classes="table", index=False, escape=True)
    target = st.sidebar if sidebar else st
    target.markdown(table_html, unsafe_allow_html=True)

def explain_tags(tag_type, tags, tag_dict):
    """Show a sidebar table explaining each tag in ``tags`` found in ``tag_dict``.

    Args:
        tag_type: Label used in the two column headers
            ("<tag_type> Tag" and "<tag_type> Meaning").
        tags: Iterable of tag values to explain; values missing from
            ``tag_dict`` are skipped.
        tag_dict: Mapping from tag to its human-readable meaning.

    Nothing is rendered when none of the tags has an entry in ``tag_dict``.
    """
    rows = []
    for tag in tags:
        if tag in tag_dict:
            rows.append((tag, tag_dict[tag]))

    if not rows:
        return

    headers = [f"{tag_type} Tag", f"{tag_type} Meaning"]
    table = pd.DataFrame(rows, columns=headers)
    table.index = [''] * len(table)  # Hide the index
    render_table(table, sidebar=True)

# Page Title and Subtitle
title = "Grammar Analysis & Dependency Parsing"
sub_title = "Visualize the syntactic structure of a sentence as a directed labeled graph."

# Both header lines are raw HTML, so each is emitted with unsafe_allow_html=True.
title_html = f'<div class="main-title">{title}</div>'
sub_title_html = f'<div style="text-align: center; color: #666666;">{sub_title}</div>'
for header_html in (title_html, sub_title_html):
    st.markdown(header_html, unsafe_allow_html=True)

# Example Sentences
examples = [
    "John Snow is a good man. He knows a lot about science.",
    "In what country is the WTO headquartered?",
    "I was wearing my dark blue shirt and tie.",
    "The Geneva Motor Show is the most popular car show of the year.",
    "Bill Gates and Steve Jobs had periods of civility."
]

# Text Selection
selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own sentence!")

# A custom sentence wins over the selected example, but only when it contains
# something other than whitespace; a blank box falls back to the example.
has_custom_text = bool(custom_input and custom_input.strip())
text_to_analyze = custom_input if has_custom_text else selected_text

st.write('Text to analyze:')
HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; 

                   border: 1px solid #e6e9ef; border-radius: 0.25rem; 

                   padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
# The sentence is user-supplied and is placed inside raw HTML, so it is escaped
# first; otherwise characters such as "<" or "&" would be read as markup.
st.markdown(HTML_WRAPPER.format(html.escape(text_to_analyze)), unsafe_allow_html=True)

# Initialize Spark and Pipeline
# `spark` has to be bound at module level before fit_data() is called:
# fit_data() reads it as a global instead of receiving it as a parameter.
spark = init_spark()
pipeline = create_pipeline()
# Annotate the chosen sentence; `output` is the transformed Spark DataFrame
# that the "Raw Result" section below selects its columns from.
output = fit_data(pipeline, text_to_analyze)

# Display Dependency Tree
st.write("Dependency Tree:")

# Load the 'dep.typed' model, then ask it to draw the tree for the chosen
# sentence. Title, subtitle and every optional extra (wide-layout CSS, code
# sample, infos, logo, text input) are blanked or switched off.
typed_dep_model = nlp.load('dep.typed')
dep_tree_options = {
    "text": text_to_analyze,
    "title": '',
    "sub_title": '',
    "set_wide_layout_CSS": False,
    "generate_code_sample": False,
    "key": "NLU_streamlit",
    "show_infos": False,
    "show_logo": False,
    "show_text_input": False,
}
typed_dep_model.viz_streamlit_dep_tree(**dep_tree_options)

# Display Raw Result
st.write("Raw Result:")

# Zip the parallel annotation arrays into one array of structs; exploding it
# then yields one row per zipped element.
zipped_annotations = F.arrays_zip(
    output.token.result,
    output.token.begin,
    output.token.end,
    output.pos.result,
    output.dependency.result,
    output.dependency_type.result,
)
per_token_rows = output.select(F.explode(zipped_annotations).alias("cols"))

# Struct fields are addressed by position ('0', '1', ...), in the same order
# as the arrays passed to arrays_zip above.
result_columns = ["chunk", "begin", "end", "pos", "dependency", "dependency_type"]
selected_fields = [
    F.expr(f"cols['{position}']").alias(column_name)
    for position, column_name in enumerate(result_columns)
]
df = per_token_rows.select(*selected_fields).toPandas()

render_table(df)

# Sidebar Content 
# POS and Dependency dictionaries
# pos_dict maps a part-of-speech tag (as it appears in the "pos" column of the
# result table) to a human-readable description. It is only used to build the
# "Explain POS Tags" sidebar table; tags without an entry here are skipped.
# NOTE(review): the keys look like Penn Treebank tags - confirm they match the
# tag set emitted by the "pos_anc" model.
pos_dict = {
    "CC": "Coordinating conjunction", "CD": "Cardinal number", "DT": "Determiner", 
    "EX": "Existential there", "FW": "Foreign word", "IN": "Preposition or subordinating conjunction", 
    "JJ": "Adjective", "JJR": "Adjective, comparative", "JJS": "Adjective, superlative", 
    "LS": "List item marker", "MD": "Modal", "NN": "Noun, singular or mass", 
    "NNS": "Noun, plural", "NNP": "Proper noun, singular", "NNPS": "Proper noun, plural", 
    "PDT": "Predeterminer", "POS": "Possessive ending", "PRP": "Personal pronoun", 
    "PRP$": "Possessive pronoun", "RB": "Adverb", "RBR": "Adverb, comparative", 
    "RBS": "Adverb, superlative", "RP": "Particle", "SYM": "Symbol", "TO": "to", 
    "UH": "Interjection", "VB": "Verb, base form", "VBD": "Verb, past tense", 
    "VBG": "Verb, gerund or present participle", "VBN": "Verb, past participle", 
    "VBP": "Verb, non-3rd person singular present", "VBZ": "Verb, 3rd person singular present", 
    "WDT": "Wh-determiner", "WP": "Wh-pronoun", "WP$": "Possessive wh-pronoun", 
    "WRB": "Wh-adverb"
}

# dependency_dict maps a dependency relation label (as it appears in the
# "dependency_type" column of the result table) to a short description. It is
# only used to build the "Explain Dependency Types" sidebar table; labels
# without an entry here are skipped.
# NOTE(review): the keys look like Universal Dependencies relation names -
# confirm they cover the labels emitted by "dependency_typed_conllu".
dependency_dict = {
    "acl": "clausal modifier of noun (adjectival clause)", 
    "advcl": "adverbial clause modifier", 
    "advmod": "adverbial modifier", 
    "amod": "adjectival modifier", 
    "appos": "appositional modifier", 
    "aux": "auxiliary", 
    "case": "case marking", 
    "cc": "coordinating conjunction", 
    "ccomp": "clausal complement", 
    "clf": "classifier", 
    "compound": "compound", 
    "conj": "conjunct", 
    "cop": "copula", 
    "csubj": "clausal subject", 
    "dep": "unspecified dependency", 
    "det": "determiner", 
    "discourse": "discourse element", 
    "dislocated": "dislocated elements", 
    "expl": "expletive", 
    "fixed": "fixed multiword expression", 
    "flat": "flat multiword expression", 
    "goeswith": "goes with", 
    "iobj": "indirect object", 
    "list": "list", 
    "mark": "marker", 
    "nmod": "nominal modifier", 
    "nsubj": "nominal subject", 
    "nummod": "numeric modifier", 
    "obj": "object", 
    "obl": "oblique nominal", 
    "orphan": "orphan", 
    "parataxis": "parataxis", 
    "punct": "punctuation", 
    "reparandum": "overridden disfluency", 
    "root": "root", 
    "vocative": "vocative", 
    "xcomp": "open clausal complement"
}

# Get unique POS and dependency tags
unique_pos = df['pos'].unique()
unique_dep = df['dependency_type'].unique()

# Sidebar options for explanations
# One opt-in checkbox per tag family, created in the order listed; a ticked
# box renders the matching explanation table directly beneath it.
explanation_options = (
    ("Explain POS Tags", "POS", unique_pos, pos_dict),
    ("Explain Dependency Types", "Dependency", unique_dep, dependency_dict),
)
for checkbox_label, tag_family, present_tags, meanings in explanation_options:
    if st.sidebar.checkbox(checkbox_label):
        explain_tags(tag_family, present_tags, meanings)

# Sidebar with Reference Notebook Link
# colab_link is a raw HTML anchor wrapping the "Open In Colab" badge image; it
# points at the GRAMMAR_EN.ipynb notebook in the spark-nlp-workshop repository.
colab_link = """

<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/GRAMMAR_EN.ipynb">

    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>

</a>

"""
# The caption is plain Markdown; the badge needs unsafe_allow_html=True because
# it is raw HTML.
st.sidebar.markdown('Reference Notebook:')
st.sidebar.markdown(colab_link, unsafe_allow_html=True)