Manolis Fragkidakis committed on
Commit
ab990a6
1 Parent(s): c166965
Files changed (2) hide show
  1. app.py +212 -0
  2. requirements.txt +14 -0
app.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from deep_translator import GoogleTranslator
4
+ from bertopic import BERTopic
5
+ from sentence_transformers import SentenceTransformer
6
+ from sklearn.feature_extraction.text import CountVectorizer
7
+ import base64
8
+ from io import BytesIO
9
+ import plotly.graph_objects as go
10
+ import plotly.subplots as sp
11
+ import plotly.express as px
12
+
13
+
14
def translate_feedback(feedback_df, column_name):
    """Translate one column of a feedback table into English.

    Every value of ``feedback_df[column_name]`` is passed to a
    ``GoogleTranslator(source='auto', target='en')``.  The results are stored
    in a new ``"translated"`` column.  Rows whose translation raised an error,
    or whose translation equals the placeholder ``"-"``, are dropped.

    Args:
        feedback_df: DataFrame holding the feedback.  It is not modified.
        column_name: Name of the column whose values are translated.

    Returns:
        A new DataFrame with the original columns plus ``"translated"``,
        limited to the rows that were translated successfully.
    """
    import logging  # local import keeps the block self-contained

    failed = "-"  # placeholder marking rows that are discarded below

    # A single translator is reused for all rows instead of building one per row.
    translator = GoogleTranslator(source='auto', target='en')

    translations = []
    for position, feedback in enumerate(feedback_df[column_name]):
        try:
            translations.append(translator.translate(feedback))
        except Exception:
            # Best effort: an untranslatable row is skipped rather than
            # aborting the whole batch, but the failure is still recorded.
            logging.getLogger(__name__).warning(
                "Could not translate row at position %d", position, exc_info=True
            )
            translations.append(failed)

    # Assigning a list is positional, so the result is correct for any index
    # (label-based ``.loc[i]`` is only right for a 0..n-1 index).
    translated_df = feedback_df.copy()
    translated_df["translated"] = translations
    return translated_df[translated_df["translated"] != failed]
26
+
27
@st.cache_data
def convert_df(df):
    """Return ``df`` serialised as UTF-8 encoded CSV bytes.

    The result is cached with ``st.cache_data`` (the replacement for the
    deprecated ``st.cache``) so the conversion is not repeated on every
    Streamlit rerun for the same DataFrame.

    Args:
        df: DataFrame to serialise.  The index is written as the first column
            because ``to_csv`` is called with its defaults.

    Returns:
        The CSV text encoded as ``bytes``.
    """
    return df.to_csv().encode('utf-8')
31
+
32
def download_csv(df):
    """Build an HTML download link that embeds ``df`` as a base64 CSV.

    Args:
        df: DataFrame to export; the index is not written.

    Returns:
        An ``<a>`` tag string whose ``href`` is a base64 data URI holding the
        CSV and whose ``download`` attribute is ``translated_feedback.csv``.
    """
    csv_bytes = df.to_csv(index=False).encode()
    # The data URI needs text, so the base64 bytes are decoded back to str.
    payload = base64.b64encode(csv_bytes).decode()
    return f'<a href="data:file/csv;base64,{payload}" download="translated_feedback.csv">Download CSV file</a>'
37
+
38
def topics_over_time(topic_model, dataframe, training_column, timestamp_column="day"):
    """Build the topics-over-time figure for a fitted topic model.

    Args:
        topic_model: Object exposing ``topics_over_time`` and
            ``visualize_topics_over_time`` (a fitted BERTopic model in this app).
        dataframe: DataFrame holding the documents and their timestamps.
        training_column: Name of the column with the document texts.
        timestamp_column: Name of the column with one timestamp per document.
            Defaults to ``"day"``, the column that was previously hard-coded.

    Returns:
        The figure returned by ``visualize_topics_over_time``, resized to
        800x500.
    """
    timestamps = list(dataframe[timestamp_column].values)
    documents = list(dataframe[training_column])

    # Named differently from the function so the function is not shadowed.
    topic_timeline = topic_model.topics_over_time(
        documents, timestamps, global_tuning=True, evolution_tuning=True
    )
    figure = topic_model.visualize_topics_over_time(topic_timeline, custom_labels=True)
    figure.update_layout(width=800, height=500)
    return figure
45
+
46
def area_over_time(topic_model, df, training_column, datetime_column):
    """Plot each topic's monthly share of documents as an area chart.

    Args:
        topic_model: Object exposing ``get_document_info`` whose result has a
            ``"Name"`` column with one entry per document (a fitted BERTopic
            model in this app).
        df: DataFrame with the documents and their dates.  It is not modified.
        training_column: Name of the column with the document texts.
        datetime_column: Name of the column parsed with ``pd.to_datetime``.

    Returns:
        A Plotly Express area figure with month on the x axis, the percentage
        of that month's documents per topic on the y axis, and one facet per
        year.
    """
    # Work on a copy so the helper columns added below ("Topic", "year",
    # "month") and the datetime conversion do not leak into the caller's frame.
    docs = df.copy()
    docs['Topic'] = topic_model.get_document_info(docs[training_column])["Name"].values

    docs[datetime_column] = pd.to_datetime(docs[datetime_column])
    docs['year'] = docs[datetime_column].dt.year
    docs['month'] = docs[datetime_column].dt.month

    # Count the documents per year, month, and topic
    grouped = docs.groupby(['year', 'month', 'Topic'])[training_column].count().reset_index()

    # Express each topic's count as a percentage of all documents in that month
    grouped['total_count'] = grouped.groupby(['year', 'month'])[training_column].transform('sum')
    grouped['document_pct'] = grouped[training_column] / grouped['total_count'] * 100

    # Pivot so that topics missing from a month get an explicit 0 instead of no row
    pivoted = pd.pivot_table(grouped, index=['year', 'month'], columns='Topic', values='document_pct', fill_value=0)
    pivoted = pivoted.reset_index()

    # Melt back to long format: one row per (year, month, topic)
    melted = pd.melt(pivoted, id_vars=['year', 'month'], var_name='Topic', value_name='document_pct')

    # Create the interactive plot using Plotly Express
    fig = px.area(melted, x='month', y='document_pct', color='Topic', facet_col='year', facet_col_wrap=3,
                  title='Distribution of Documents by Topic and Month (Relative to 100%)',
                  labels={'month': 'Month', 'document_pct': 'Document Percentage', 'Topic': 'Topic', 'year': 'Year'},
                  hover_data={'month': False, 'document_pct': ':.2f'})

    return fig
74
+
75
# Sidebar configuration: the sidebar select box chooses which of the two
# pages ("tabs") of the app is rendered on this run.
st.sidebar.title("Translation and Analysis App")
tab = st.sidebar.selectbox("Select Tab", ("Translate", "Analyse Feedback"))


if tab == "Translate":
    st.title("Translate Feedback")
    file = st.file_uploader("Upload CSV or Excel file", type=["csv", "xlsx"], accept_multiple_files=False)

    if file is not None:
        file.seek(0)
        # Compare the extension case-insensitively so "DATA.CSV" is not
        # handed to the Excel reader.
        if file.name.lower().endswith(".csv"):
            feedback_df = pd.read_csv(file, low_memory=False, on_bad_lines='skip', engine='c')
        else:
            feedback_df = pd.read_excel(file)
        st.write('**Data Head:**')
        st.write(feedback_df.head())
        column_name = st.selectbox("Select Column", feedback_df.columns)
        # Rows without feedback cannot be translated; drop them up front.
        feedback_df = feedback_df.dropna(subset=[column_name])
        feedback_df = feedback_df.reset_index(drop=True)
        if st.button("Translate"):
            translated_df = translate_feedback(feedback_df, column_name)
            csv = convert_df(translated_df)

            st.write('**Translated Data Head:**')
            st.write(translated_df.head())

            st.download_button(
                label="Download data as CSV",
                data=csv,
                file_name='translated_data.csv',
                mime='text/csv',
            )


elif tab == "Analyse Feedback":
    # Analyse Feedback tab code
    st.title("Analyse Feedback")

    file = st.file_uploader("Upload CSV or Excel file", type=["csv", "xlsx"])

    if file is not None:
        if file.name.lower().endswith(".csv"):
            df = pd.read_csv(file, on_bad_lines='skip')
        else:
            df = pd.read_excel(file)
        st.write('**Data Head:**')
        st.write(df.head())
        column_names = df.columns.tolist()

        # "None" is offered as an extra choice for data without a date column.
        datetime_column = st.selectbox("Select Datetime Column", column_names + ["None"])
        feedback_column = st.selectbox("Select Feedback Column", column_names)

        model_select = st.selectbox(
            "Select model to train:",
            [
                'all-mpnet-base-v2',
                'all-distilroberta-v1',
                'distiluse-base-multilingual-cased-v2',
                'multi-qa-mpnet-base-dot-v1',
                'multi-qa-distilbert-cos-v1',
                'paraphrase-multilingual-mpnet-base-v2'
            ]
        )

        if st.button("Train Model"):
            if model_select is not None:
                new_df = df.copy()
                if datetime_column != "None":
                    new_df[datetime_column] = pd.to_datetime(new_df[datetime_column])
                sentence_model = SentenceTransformer(model_select)

                vectorizer_model = CountVectorizer(stop_words="english")

                # Initialize a BERTopic model with the SentenceTransformer embeddings
                my_model = BERTopic(
                    language="en",
                    calculate_probabilities=True,
                    verbose=True,
                    n_gram_range=(1, 3),
                    embedding_model=sentence_model,
                    vectorizer_model=vectorizer_model,
                    nr_topics=15
                )

                # Preprocess the data by replacing missing values with empty strings
                new_df[feedback_column] = new_df[feedback_column].fillna('')
                new_df.reset_index(inplace=True, drop=True)

                # Fit the BERTopic model on the dataframe
                my_model.fit(new_df[feedback_column])
                st.success("Model trained successfully")

                # Store the trained model together with the column choices it
                # was trained with, so later reruns stay consistent with it.
                st.session_state.trained_model = my_model
                st.session_state.new_df = new_df
                st.session_state.feedback_column = feedback_column
                st.session_state.datetime_column = datetime_column

        if "trained_model" in st.session_state:
            trained_model = st.session_state.trained_model
            new_df = st.session_state.new_df
            # Use the columns recorded at training time, not the current widget
            # values: the user may have changed the select boxes since then.
            trained_feedback_column = st.session_state.feedback_column
            trained_datetime_column = st.session_state.datetime_column

            visualization_options = [
                "Visualize documents",
                "Topic Hierarchy",
                "Barchart",
                "Topics over time",
                "Representative docs per topic"
            ]
            selected_visualization = st.selectbox("Select Visualization", visualization_options)

            if selected_visualization == "Barchart":
                barchart_fig = trained_model.visualize_barchart(n_words=5)
                st.plotly_chart(barchart_fig)
            elif selected_visualization == "Visualize documents":
                viz_doc = trained_model.visualize_documents(new_df[trained_feedback_column])
                st.plotly_chart(viz_doc)
            elif selected_visualization == "Topic Hierarchy":
                hierarchy_fig = trained_model.visualize_hierarchy(top_n_topics=20)
                st.plotly_chart(hierarchy_fig)
            elif selected_visualization == "Topics over time":
                if trained_datetime_column == "None":
                    # Without a date column there is nothing to plot over time.
                    st.warning("Select a datetime column and train the model again to see topics over time.")
                else:
                    time_fig = area_over_time(trained_model, new_df, trained_feedback_column, trained_datetime_column)
                    st.plotly_chart(time_fig)
            elif selected_visualization == "Representative docs per topic":
                st.write(trained_model.get_representative_docs())

            # The document info is used row-for-row with the documents passed
            # in (area_over_time relies on the same alignment).  Attaching the
            # feedback column by position avoids the row multiplication a
            # text-keyed merge produces for repeated feedback.
            documents = new_df[trained_feedback_column]
            result = trained_model.get_document_info(documents)
            if trained_feedback_column not in result.columns:
                result.insert(0, trained_feedback_column, documents.values)

            feedback_and_docs = convert_df(result)
            st.download_button(
                label="Download documents and topics",
                data=feedback_and_docs,
                file_name='document_info.csv',
                mime='text/csv',
            )
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bertopic==0.15.0
2
+ deep_translator==1.11.4
3
+ langdetect==1.0.9
4
+ matplotlib==3.7.2
5
+ numpy==1.24.4
6
+ openpyxl==3.0.9
7
+ pandas==2.0.3
8
+ plotly==5.15.0
9
+ scikit_learn==1.2.2
10
+ seaborn==0.12.2
11
+ sentence_transformers==2.2.2
12
+ streamlit==1.24.1
13
+ streamlit_scrollable_textbox==0.0.3
14
+ transformers==4.31.0