Update pages/Text Preprocessing.py
Browse files- pages/Text Preprocessing.py +35 -6
pages/Text Preprocessing.py
CHANGED
@@ -10,7 +10,29 @@ from sparknlp.annotator import *
|
|
10 |
from sparknlp.base import *
|
11 |
from sparknlp.pretrained import PretrainedPipeline
|
12 |
from pyspark.sql.types import StringType, IntegerType
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
@st.cache_resource
|
15 |
def init_spark():
|
16 |
spark = sparknlp.start()
|
@@ -71,8 +93,15 @@ def fit_data(pipeline, data):
|
|
71 |
def extract_annotations(output, annotation_type):
    """Return the `result` strings for every annotation of *annotation_type*
    found in the first row of a pipeline's annotated output.

    `output` is expected to be a sequence whose first element maps
    annotation-type names to lists of annotation objects carrying a
    `.result` attribute (Spark NLP LightPipeline `fullAnnotate` shape —
    assumed from usage; confirm against the caller).
    """
    first_row = output[0]
    results = []
    for annotation in first_row[annotation_type]:
        results.append(annotation.result)
    return results
|
73 |
|
74 |
-
|
75 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
model_name = "SentenceDetector|Tokenizer|Stemmer|Lemmatizer|Normalizer|Stop Words Remover"
|
78 |
#model = st.sidebar.selectbox("Choose the pretrained model", model_name, help="For more info about the models visit: https://sparknlp.org/models",)
|
@@ -105,7 +134,7 @@ examples = [
|
|
105 |
|
106 |
]
|
107 |
|
108 |
-
st.
|
109 |
|
110 |
selected_text = st.selectbox("Select an example", examples)
|
111 |
|
@@ -116,7 +145,7 @@ if custom_input:
|
|
116 |
elif selected_text:
|
117 |
selected_text = selected_text
|
118 |
|
119 |
-
st.
|
120 |
st.write(selected_text)
|
121 |
|
122 |
spark = init_spark()
|
@@ -151,5 +180,5 @@ if selected_models[5]:
|
|
151 |
|
152 |
if data_dict:
|
153 |
df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data_dict.items()]))
|
154 |
-
st.
|
155 |
st.dataframe(df)
|
|
|
10 |
from sparknlp.base import *
|
11 |
from sparknlp.pretrained import PretrainedPipeline
|
12 |
from pyspark.sql.types import StringType, IntegerType
|
13 |
+
|
14 |
+
# Configure Streamlit page
|
15 |
+
st.set_page_config(
|
16 |
+
layout="wide",
|
17 |
+
page_title="Spark NLP Demos App",
|
18 |
+
initial_sidebar_state="auto"
|
19 |
+
)
|
20 |
+
|
21 |
+
# Custom CSS for better styling
|
22 |
+
st.markdown("""
|
23 |
+
<style>
|
24 |
+
.main-title {
|
25 |
+
font-size: 36px;
|
26 |
+
color: #4A90E2;
|
27 |
+
font-weight: bold;
|
28 |
+
text-align: center;
|
29 |
+
}
|
30 |
+
.section p, .section ul {
|
31 |
+
color: #666666;
|
32 |
+
}
|
33 |
+
</style>
|
34 |
+
""", unsafe_allow_html=True)
|
35 |
+
|
36 |
@st.cache_resource
|
37 |
def init_spark():
|
38 |
spark = sparknlp.start()
|
|
|
def extract_annotations(output, annotation_type):
    """Return the `result` strings for every annotation of *annotation_type*
    found in the first row of a pipeline's annotated output.

    `output` is expected to be a sequence whose first element maps
    annotation-type names to lists of annotation objects carrying a
    `.result` attribute (Spark NLP LightPipeline `fullAnnotate` shape —
    assumed from usage; confirm against the caller).
    """
    first_row = output[0]
    results = []
    for annotation in first_row[annotation_type]:
        results.append(annotation.result)
    return results
|
95 |
|
96 |
+
# st.title("Summarize Text")
|
97 |
+
st.markdown('<div class="main-title">State-of-the-Art Text Preprocessing with Spark NLP</div>', unsafe_allow_html=True)
|
98 |
+
st.write("")
|
99 |
+
st.write("")
|
100 |
+
st.markdown("""
|
101 |
+
<div class="section">
|
102 |
+
<p>This demo utilizes a comprehensive text preprocessing pipeline using Spark NLP. The pipeline includes several stages such as document assembly, tokenization, sentence detection, normalization, stopword cleaning, stemming, and lemmatization. These steps are essential for preparing text data for downstream NLP tasks, ensuring the text is clean and standardized for effective model training and evaluation.</p>
|
103 |
+
</div>
|
104 |
+
""", unsafe_allow_html=True)
|
105 |
|
106 |
model_name = "SentenceDetector|Tokenizer|Stemmer|Lemmatizer|Normalizer|Stop Words Remover"
|
107 |
#model = st.sidebar.selectbox("Choose the pretrained model", model_name, help="For more info about the models visit: https://sparknlp.org/models",)
|
|
|
134 |
|
135 |
]
|
136 |
|
137 |
+
st.write("Split and clean text")
|
138 |
|
139 |
selected_text = st.selectbox("Select an example", examples)
|
140 |
|
|
|
145 |
elif selected_text:
|
146 |
selected_text = selected_text
|
147 |
|
148 |
+
st.write('Selected Text')
|
149 |
st.write(selected_text)
|
150 |
|
151 |
spark = init_spark()
|
|
|
180 |
|
181 |
if data_dict:
|
182 |
df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data_dict.items()]))
|
183 |
+
st.write("Annotation Results:")
|
184 |
st.dataframe(df)
|