import base64
from io import BytesIO

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp
import streamlit as st
from bertopic import BERTopic
from deep_translator import GoogleTranslator
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer


def translate_feedback(feedback_df, column_name):
    """Translate one column of a feedback table into English.

    Every value of ``column_name`` is passed to ``GoogleTranslator``
    (``source='auto'``, ``target='en'``) and the result is stored in a new
    ``"translated"`` column. Values whose translation raises get the
    placeholder ``"-"`` instead, and rows holding that placeholder are
    removed from the returned frame.

    Args:
        feedback_df: DataFrame containing the feedback texts. It is left
            untouched; the function works on a copy.
        column_name: Name of the column whose values are translated.

    Returns:
        A new DataFrame with the original columns plus ``"translated"``,
        restricted to rows whose ``"translated"`` value is not ``"-"``.
    """
    result = feedback_df.copy()
    if result.empty:
        # Nothing to translate: return the expected output schema without
        # building a translator.
        result["translated"] = "-"
        return result

    # One translator instance is enough for the whole column.
    translator = GoogleTranslator(source='auto', target='en')

    translations = []
    for feedback in result[column_name]:
        try:
            translations.append(translator.translate(feedback))
        except Exception:
            # Deliberate best effort: a value that cannot be translated is
            # marked with the placeholder and filtered out below instead of
            # aborting the whole run.
            translations.append("-")

    # Assigning a list is positional, so this is correct for any index
    # (non-consecutive, duplicated, non-integer), unlike writing through
    # ``.loc`` with an ``enumerate`` counter.
    result["translated"] = translations
    return result[result["translated"] != "-"]
def convert_df(df):
    """Serialise ``df`` to CSV text and return it as UTF-8 encoded bytes.

    The CSV is produced by ``DataFrame.to_csv()`` with its default options.
    """
    # NOTE(review): an earlier comment here asked for this conversion to be
    # cached across reruns, but no caching decorator is applied to the
    # function, so the CSV is rebuilt on every call.
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
def download_csv(df):
    """Return an HTML anchor that downloads ``df`` as a CSV file.

    The frame is written as CSV without its index, base64-encoded and
    embedded in a ``data:`` URI. The link text is "Download CSV file" and
    the suggested file name is ``translated_feedback.csv``.
    """
    raw_csv = df.to_csv(index=False).encode()
    # base64 keeps the payload safe to embed inside the href attribute.
    payload = base64.b64encode(raw_csv).decode()
    return f'<a href="data:file/csv;base64,{payload}" download="translated_feedback.csv">Download CSV file</a>'
def topics_over_time(topic_model, dataframe, training_column, timestamp_column="date"):
    """Build the topic model's topics-over-time figure for ``dataframe``.

    Args:
        topic_model: Fitted topic model providing ``topics_over_time`` and
            ``visualize_topics_over_time``.
        dataframe: DataFrame holding the documents and their timestamps.
        training_column: Name of the column with the document texts.
        timestamp_column: Name of the column with one timestamp per
            document.

    Returns:
        The figure returned by ``visualize_topics_over_time`` (custom labels
        enabled), resized to 800x500.
    """
    # NOTE(review): the original timestamp expression was truncated in this
    # file ("timestamps = list("). It is reconstructed here as reading the
    # "date" column; confirm against the intended source column.
    timestamps = list(dataframe[timestamp_column])
    documents = list(dataframe[training_column])

    # Named differently from the function so the local does not shadow it.
    topic_evolution = topic_model.topics_over_time(
        documents,
        timestamps,
        global_tuning=True,
        evolution_tuning=True,
    )
    figure = topic_model.visualize_topics_over_time(topic_evolution, custom_labels=True)
    figure.update_layout(width=800, height=500)
    return figure
def area_over_time(topic_model, df, training_column, datetime_column):
    """Plot each topic's monthly share of documents as stacked area charts.

    Args:
        topic_model: Fitted topic model; ``get_document_info`` is used to
            look up the topic name of every document.
        df: DataFrame with the documents and their timestamps. It is not
            modified; all helper columns are added to a copy.
        training_column: Name of the column with the document texts.
        datetime_column: Name of the column with the document timestamps;
            it is parsed with ``pd.to_datetime``.

    Returns:
        A Plotly Express area figure with month on the x axis, document
        percentage on the y axis, one colour per topic and one facet per
        year (three facets per row).
    """
    # Work on a copy so the caller's frame (kept in Streamlit session state
    # by the app) does not gain 'Topic', 'year' and 'month' columns.
    frame = df.copy()
    # ``.values`` assigns positionally: one topic name per document row.
    frame['Topic'] = topic_model.get_document_info(frame[training_column])["Name"].values
    frame[datetime_column] = pd.to_datetime(frame[datetime_column])
    frame['year'] = frame[datetime_column].dt.year
    frame['month'] = frame[datetime_column].dt.month

    # Count documents per (year, month, topic).
    grouped = frame.groupby(['year', 'month', 'Topic'])[training_column].count().reset_index()

    # Express each count as a percentage of all documents in that year/month.
    grouped['total_count'] = grouped.groupby(['year', 'month'])[training_column].transform('sum')
    grouped['document_pct'] = grouped[training_column] / grouped['total_count'] * 100

    # Pivot to months x topics so topics absent in a month are filled with 0,
    # then melt back to long format for plotting.
    pivoted = pd.pivot_table(grouped, index=['year', 'month'], columns='Topic', values='document_pct', fill_value=0)
    pivoted = pivoted.reset_index()
    melted = pd.melt(pivoted, id_vars=['year', 'month'], var_name='Topic', value_name='document_pct')

    # Create the interactive plot using Plotly Express.
    fig = px.area(melted, x='month', y='document_pct', color='Topic', facet_col='year', facet_col_wrap=3,
                  title='Distribution of Documents by Topic and Month (Relative to 100%)',
                  labels={'month': 'Month', 'document_pct': 'Document Percentage', 'Topic': 'Topic', 'year': 'Year'},
                  hover_data={'month': False, 'document_pct': ':.2f'})
    return fig
# Sidebar configuration: the selected entry decides which page is rendered.
st.sidebar.title("Translation and Analysis App")
tab = st.sidebar.selectbox("Select Tab", ("Translate", "Analyse Feedback"))

if tab == "Translate":
    # Translate tab: upload a table, pick a column, translate it to English
    # and offer the result as a CSV download.
    st.title("Translate Feedback")
    file = st.file_uploader("Upload CSV or Excel file", type=["csv", "xlsx"], accept_multiple_files=False)
    if file is not None:
        # The uploader only accepts csv/xlsx, so anything that is not CSV is
        # read as Excel. The extension is compared case-insensitively so a
        # file named "DATA.CSV" is not handed to the Excel reader.
        if file.name.lower().endswith(".csv"):
            feedback_df = pd.read_csv(file, low_memory=False, on_bad_lines='skip', engine='c')
        else:
            feedback_df = pd.read_excel(file)
        st.write('**Data Head:**')
        st.write(feedback_df.head())
        column_name = st.selectbox("Select Column", feedback_df.columns)
        # Rows without feedback are dropped; the index is then renumbered.
        feedback_df = feedback_df.dropna(subset=[column_name])
        feedback_df = feedback_df.reset_index(drop=True)
        if st.button("Translate"):
            translated_df = translate_feedback(feedback_df, column_name)
            csv = convert_df(translated_df)
            st.write('**Translated Data Head:**')
            st.write(translated_df.head())
            st.download_button(
                label="Download data as CSV",
                data=csv,
                file_name='translated_data.csv',
                mime='text/csv',
            )
elif tab == "Analyse Feedback":
    # Analyse Feedback tab: train a BERTopic model on one column and explore
    # the resulting topics.
    st.title("Analyse Feedback")
    file = st.file_uploader("Upload CSV or Excel file", type=["csv", "xlsx"])
    if file is not None:
        if file.name.lower().endswith(".csv"):
            df = pd.read_csv(file, on_bad_lines='skip')
        else:
            df = pd.read_excel(file)
        st.write('**Data Head:**')
        st.write(df.head())
        column_names = df.columns.tolist()
        # "None" is offered as an extra choice for data without timestamps.
        datetime_column = st.selectbox("Select Datetime Column", column_names + ["None"])
        feedback_column = st.selectbox("Select Feedback Column", column_names)
        model_select = st.selectbox(
            "Select model to train:",
            [
                'all-mpnet-base-v2',
                'all-distilroberta-v1',
                'distiluse-base-multilingual-cased-v2',
                'multi-qa-mpnet-base-dot-v1',
                'multi-qa-distilbert-cos-v1',
                'paraphrase-multilingual-mpnet-base-v2',
                'BAAI/bge-small-en-v1.5',
                'Cohere/Cohere-embed-english-v3.0'
            ]
        )
        if st.button("Train Model"):
            if model_select is not None:
                new_df = df.copy()
                if datetime_column != "None":
                    new_df[datetime_column] = pd.to_datetime(new_df[datetime_column])
                sentence_model = SentenceTransformer(model_select)
                vectorizer_model = CountVectorizer(stop_words="english")
                # Initialize a BERTopic model with the SentenceTransformer embeddings
                my_model = BERTopic(
                    language="en",
                    calculate_probabilities=True,
                    verbose=True,
                    n_gram_range=(1, 3),
                    embedding_model=sentence_model,
                    vectorizer_model=vectorizer_model,
                    nr_topics=15
                )
                # Preprocess the data by replacing missing values with empty strings
                new_df[feedback_column] = new_df[feedback_column].fillna('')
                new_df.reset_index(inplace=True, drop=True)
                # Fit the BERTopic model on the dataframe.
                # NOTE(review): this call was truncated in the original file
                # (only "[feedback_column])" survived); reconstructed as
                # fitting on the feedback column. Confirm.
                my_model.fit(new_df[feedback_column])
                st.success("Model trained successfully")
                # Store the trained model and the column choices it was
                # trained with, so later reruns stay consistent with it.
                st.session_state.trained_model = my_model
                st.session_state.new_df = new_df
                # Key spelling ("colomn") is kept as-is in case other code
                # reads this session entry.
                st.session_state.feedback_colomn = feedback_column
                st.session_state.datetime_column = datetime_column
        if "trained_model" in st.session_state:
            trained_model = st.session_state.trained_model
            new_df = st.session_state.new_df
            # Always use the columns captured at training time: the select
            # boxes above may have been changed since the model was trained.
            new_feedback_column = st.session_state.feedback_colomn
            trained_datetime_column = st.session_state.datetime_column
            visualization_options = [
                "Visualize documents",
                "Topic Hierarchy",
                "Barchart",
                "Topics over time",
                "Representative docs per topic"
            ]
            selected_visualization = st.selectbox("Select Visualization", visualization_options)
            if selected_visualization == "Barchart":
                barchart_fig = trained_model.visualize_barchart(n_words=5)
                st.plotly_chart(barchart_fig)
            elif selected_visualization == "Visualize documents":
                viz_doc = trained_model.visualize_documents(new_df[new_feedback_column])
                st.plotly_chart(viz_doc)
            elif selected_visualization == "Topic Hierarchy":
                hierarchy_fig = trained_model.visualize_hierarchy(top_n_topics=20)
                st.plotly_chart(hierarchy_fig)
            elif selected_visualization == "Topics over time":
                if trained_datetime_column == "None":
                    # Without a datetime column there is nothing to plot
                    # over time; area_over_time would fail on the lookup.
                    st.warning("Select a datetime column and train the model again to view topics over time.")
                else:
                    time_fig = area_over_time(trained_model, new_df, new_feedback_column, trained_datetime_column)
                    st.plotly_chart(time_fig)
            elif selected_visualization == "Representative docs per topic":
                st.write(trained_model.get_representative_docs())
            # NOTE(review): the original indentation was lost, so it is not
            # certain whether this download belonged to the last branch
            # above; it is offered for every visualization here.
            # get_document_info yields one row per document in input order
            # (area_over_time relies on the same alignment), so the feedback
            # column is attached positionally. Merging on the document text
            # instead multiplies rows whenever texts repeat, e.g. the empty
            # strings produced by fillna('') during training.
            result = trained_model.get_document_info(new_df[new_feedback_column]).copy()
            if new_feedback_column not in result.columns:
                result.insert(0, new_feedback_column, new_df[new_feedback_column].values)
            feedback_and_docs = convert_df(result)
            st.download_button(
                label="Download documents and topics",
                data=feedback_and_docs,
                file_name='document_info.csv',
                mime='text/csv',
            )