abdullahmubeen10 commited on
Commit
bb10779
·
verified ·
1 Parent(s): 4c52abe

Create Sequence Classification.py

Browse files
Files changed (1) hide show
  1. pages/Sequence Classification.py +137 -0
pages/Sequence Classification.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sparknlp
3
+
4
+ from sparknlp.base import *
5
+ from sparknlp.annotator import *
6
+ from pyspark.ml import Pipeline
7
+ from annotated_text import annotated_text
8
+
9
# Streamlit page setup: wide layout, let Streamlit decide the sidebar state.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)

# Page-wide CSS for the title banner and description sections.
PAGE_CSS = """
    <style>
        .main-title {
            font-size: 36px;
            color: #4A90E2;
            font-weight: bold;
            text-align: center;
        }
        .section {
            background-color: #f9f9f9;
            padding: 10px;
            border-radius: 10px;
            margin-top: 10px;
        }
        .section p, .section ul {
            color: #666666;
        }
    </style>
"""
st.markdown(PAGE_CSS, unsafe_allow_html=True)
35
+
36
@st.cache_resource
def init_spark():
    """Start the Spark NLP session once and reuse it across reruns."""
    session = sparknlp.start()
    return session
39
+
40
@st.cache_resource
def create_pipeline():
    """Build and cache the annotation pipeline: raw text -> tokens -> class.

    Stages:
      1. DocumentAssembler: wraps the input string as a Spark NLP document.
      2. Tokenizer: splits the document into tokens.
      3. XlmRoBertaForSequenceClassification: IMDb sentiment classifier.
    """
    assembler = DocumentAssembler() \
        .setInputCol('text') \
        .setOutputCol('document')

    token_splitter = Tokenizer() \
        .setInputCols(['document']) \
        .setOutputCol('token')

    classifier = XlmRoBertaForSequenceClassification \
        .pretrained('xlm_roberta_base_sequence_classifier_imdb', 'en') \
        .setInputCols(["document", "token"]) \
        .setOutputCol("class")

    return Pipeline(stages=[assembler, token_splitter, classifier])
56
+
57
def fit_data(pipeline, data):
    """Fit *pipeline* on an empty frame and annotate *data*.

    NOTE(review): reads the module-level `spark` session, which is assigned
    further down the script — confirm this is only called after init_spark().
    """
    blank_df = spark.createDataFrame([['']]).toDF('text')
    fitted_model = pipeline.fit(blank_df)
    light_pipeline = LightPipeline(fitted_model)
    return light_pipeline.fullAnnotate(data)
63
+
64
+ def annotate(data):
65
+ document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"]
66
+ annotated_words = []
67
+ for chunk, label in zip(chunks, labels):
68
+ parts = document.split(chunk, 1)
69
+ if parts[0]:
70
+ annotated_words.append(parts[0])
71
+ annotated_words.append((chunk, label))
72
+ document = parts[1]
73
+ if document:
74
+ annotated_words.append(document)
75
+ annotated_text(*annotated_words)
76
+
77
# Task registry: maps each selectable task to its pretrained model names and a
# user-facing description shown on the page. Currently a single task/model.
tasks_models_descriptions = {
    "Sequence Classification": {
        "models": ["xlm_roberta_base_sequence_classifier_imdb"],
        "description": "The 'xlm_roberta_base_sequence_classifier_imdb' model is specialized for sentiment analysis of movie reviews. It accurately classifies IMDb reviews as positive or negative, leveraging the multilingual capabilities of XLM-RoBERTa to analyze text content and sentiment across different languages."
    }
}
83
+
84
# Sidebar: task and model pickers (task/model are read later in the script).
available_tasks = list(tasks_models_descriptions.keys())
task = st.sidebar.selectbox("Choose the task", available_tasks)
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    tasks_models_descriptions[task]["models"],
    help="For more info about the models visit: https://sparknlp.org/models"
)

# Sidebar: "Open in Colab" badge linking to the reference notebook.
colab_badge = """
<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/357691d18373d6e8f13b5b1015137a398fd0a45f/Spark_NLP_Udemy_MOOC/Open_Source/17.01.Transformers-based_Embeddings.ipynb#L103">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(colab_badge, unsafe_allow_html=True)
96
+
97
# Page content: title banner plus the model description.
# FIX: the title previously hardcoded "DeBERTa", but the pipeline actually
# uses XLM-RoBERTa (XlmRoBertaForSequenceClassification) — label corrected.
title = f'XLM-RoBERTa for {task}'
sub_title = tasks_models_descriptions[task]["description"]
st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
container = st.container(border=True)
container.write(sub_title)
102
+
103
# Canned example sentences per task: a mix of clearly positive, clearly
# negative, and neutral IMDb-style reviews to exercise the classifier.
examples_mapping = {
    "Sequence Classification": [
        "This movie was absolutely fantastic! The storyline was gripping, the characters were well-developed, and the cinematography was stunning. I was on the edge of my seat the entire time.",
        "A heartwarming and beautiful film. The performances were top-notch, and the direction was flawless. This is easily one of the best movies I've seen this year.",
        "What a delightful surprise! The humor was spot on, and the plot was refreshingly original. The cast did an amazing job bringing the characters to life. Highly recommended!",
        "This was one of the worst movies I’ve ever seen. The plot was predictable, the acting was wooden, and the pacing was painfully slow. I couldn’t wait for it to end.",
        "A complete waste of time. The movie lacked any real substance or direction, and the dialogue was cringe-worthy. I wouldn’t recommend this to anyone.",
        "I had high hopes for this film, but it turned out to be a huge disappointment. The story was disjointed, and the special effects were laughably bad. Don’t bother watching this one.",
        "The movie was okay, but nothing special. It had a few good moments, but overall, it felt pretty average. Not something I would watch again, but it wasn’t terrible either.",
        "An average film with a decent plot. The acting was passable, but it didn't leave much of an impression on me. It's a movie you might watch once and forget about.",
        "This movie was neither good nor bad, just kind of there. It had some interesting ideas, but they weren’t executed very well. It’s a film you could take or leave."
    ]
}

# Example picker plus a free-text field for the user's own sentence.
examples = examples_mapping[task]
selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own Sentence!")
121
+
122
# Prefer the user's custom sentence; fall back to the selected example.
# FIX: the bare `except:` also caught SystemExit/KeyboardInterrupt and
# Streamlit's internal control-flow exceptions — narrowed to Exception,
# keeping the original best-effort fallback behavior.
try:
    text_to_analyze = custom_input if custom_input else selected_text
    st.subheader('Full example text')
    HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
    st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)
except Exception:
    text_to_analyze = selected_text
129
+
130
# Spin up Spark, build the cached pipeline, and run inference on the text.
# NOTE: `spark` must keep this name — fit_data() reads it as a global.
spark = init_spark()
nlp_pipeline = create_pipeline()
results = fit_data(nlp_pipeline, text_to_analyze)

# Show the predicted sentiment class for the analyzed text.
st.subheader("Prediction:")
predicted_label = results[0]['class'][0].result
st.markdown(f"Classified as : **{predicted_label}**")