import streamlit as st
import pandas as pd
from deep_translator import GoogleTranslator
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import base64
from io import BytesIO
import plotly.graph_objects as go
import plotly.subplots as sp
import plotly.express as px


def translate_feedback(feedback_df, column_name):
    # Add a new column "translated" and initialize all rows with "-"
    feedback_df["translated"] = "-"
    for i, feedback in enumerate(feedback_df[column_name]):
        try:
            translation = GoogleTranslator(source='auto', target='en').translate(feedback)
            feedback_df.loc[i, "translated"] = translation  # Store the translation in the "translated" column
        except Exception:
            feedback_df.loc[i, "translated"] = "-"  # Keep "-" in the "translated" column if an error occurs
    feedback_df = feedback_df[feedback_df["translated"] != "-"]  # Remove rows that could not be translated
    return feedback_df


@st.cache_data
def convert_df(df):
    # IMPORTANT: cache the conversion to prevent recomputation on every rerun
    return df.to_csv().encode('utf-8')


def download_csv(df):
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()  # Encode the CSV as base64
    # Build an HTML link that embeds the CSV as a base64 data URI
    href = f'<a href="data:file/csv;base64,{b64}" download="data.csv">Download CSV file</a>'
    return href


def topics_over_time(topic_model, dataframe, training_column):
    timestamps = list(dataframe.day.values)
    feedback_list = list(dataframe[training_column])
    topics_over_time = topic_model.topics_over_time(feedback_list, timestamps,
                                                    global_tuning=True, evolution_tuning=True)
    f = topic_model.visualize_topics_over_time(topics_over_time, custom_labels=True)
    f.update_layout(width=800, height=500)
    return f


def area_over_time(topic_model, df, training_column, datetime_column):
    df['Topic'] = topic_model.get_document_info(df[training_column])["Name"].values
    df[datetime_column] = pd.to_datetime(df[datetime_column])
    df['year'] = df[datetime_column].dt.year
    df['month'] = df[datetime_column].dt.month

    # Group the data by year, month, and topic
    grouped = df.groupby(['year', 'month', 'Topic'])[training_column].count().reset_index()

    # Normalize the document counts by the total document count for each month and topic
    grouped['total_count'] = grouped.groupby(['year', 'month'])[training_column].transform('sum')
    grouped['document_pct'] = grouped[training_column] / grouped['total_count'] * 100

    # Pivot the data to create a table with months as rows, topics as columns,
    # and document percentages as values
    pivoted = pd.pivot_table(grouped, index=['year', 'month'], columns='Topic',
                             values='document_pct', fill_value=0)
    pivoted = pivoted.reset_index()

    # Melt the data to create a long format with separate rows for each topic
    melted = pd.melt(pivoted, id_vars=['year', 'month'], var_name='Topic', value_name='document_pct')

    # Create the interactive plot using Plotly Express
    fig = px.area(melted, x='month', y='document_pct', color='Topic',
                  facet_col='year', facet_col_wrap=3,
                  title='Distribution of Documents by Topic and Month (Relative to 100%)',
                  labels={'month': 'Month', 'document_pct': 'Document Percentage',
                          'Topic': 'Topic', 'year': 'Year'},
                  hover_data={'month': False, 'document_pct': ':.2f'})
    return fig


# Sidebar configuration
st.sidebar.title("Translation and Analysis App")
tab = st.sidebar.selectbox("Select Tab", ("Translate", "Analyse Feedback"))

if tab == "Translate":
    st.title("Translate Feedback")
    file = st.file_uploader("Upload CSV or Excel file", type=["csv", "xlsx"], accept_multiple_files=False)
    if file is not None:
        file.seek(0)
        feedback_df = (pd.read_csv(file, low_memory=False, on_bad_lines='skip', engine='c')
                       if file.name.endswith(".csv") else pd.read_excel(file))
        st.write('**Data Head:**')
        st.write(feedback_df.head())
        column_name = st.selectbox("Select Column", feedback_df.columns)
        feedback_df = feedback_df.dropna(subset=[column_name])
        feedback_df = feedback_df.reset_index(drop=True)
        if st.button("Translate"):
            translated_df = translate_feedback(feedback_df, column_name)
            csv = convert_df(translated_df)
            st.write('**Translated Data Head:**')
            st.write(translated_df.head())
            st.download_button(
                label="Download data as CSV",
                data=csv,
                file_name='translated_data.csv',
                mime='text/csv',
            )

elif tab == "Analyse Feedback":
    # Analyse Feedback tab code
    st.title("Analyse Feedback")
    file = st.file_uploader("Upload CSV or Excel file", type=["csv", "xlsx"])
    if file is not None:
        df = pd.read_csv(file, on_bad_lines='skip') if file.name.endswith(".csv") else pd.read_excel(file)
        st.write('**Data Head:**')
        st.write(df.head())
        column_names = df.columns.tolist()
        datetime_column = st.selectbox("Select Datetime Column", column_names + ["None"])
        feedback_column = st.selectbox("Select Feedback Column", column_names)
        model_select = st.selectbox(
            "Select model to train:",
            [
                'all-mpnet-base-v2',
                'all-distilroberta-v1',
                'distiluse-base-multilingual-cased-v2',
                'multi-qa-mpnet-base-dot-v1',
                'multi-qa-distilbert-cos-v1',
                'paraphrase-multilingual-mpnet-base-v2',
                'BAAI/bge-small-en-v1.5',
                'Cohere/Cohere-embed-english-v3.0'
            ]
        )
        if st.button("Train Model"):
            if model_select is not None:
                new_df = df.copy()
                if datetime_column != "None":
                    new_df[datetime_column] = pd.to_datetime(new_df[datetime_column])
                sentence_model = SentenceTransformer(model_select)
                vectorizer_model = CountVectorizer(stop_words="english")

                # Initialize a BERTopic model with the SentenceTransformer embeddings
                my_model = BERTopic(
                    language="en",
                    calculate_probabilities=True,
                    verbose=True,
                    n_gram_range=(1, 3),
                    embedding_model=sentence_model,
                    vectorizer_model=vectorizer_model,
                    nr_topics=15
                )

                # Preprocess the data by replacing missing values with empty strings
                new_df[feedback_column] = new_df[feedback_column].fillna('')
                new_df.reset_index(inplace=True, drop=True)

                # Fit the BERTopic model on the dataframe
                my_model.fit(new_df[feedback_column])
                st.success("Model trained successfully")

                # Store the trained model and its inputs in session state
                st.session_state.trained_model = my_model
                st.session_state.new_df = new_df
                st.session_state.feedback_column = feedback_column
                st.session_state.datetime_column = datetime_column

        if "trained_model" in st.session_state:
            trained_model = st.session_state.trained_model
            new_df = st.session_state.new_df
            new_feedback_column = st.session_state.feedback_column

            visualization_options = [
                "Visualize documents",
                "Topic Hierarchy",
                "Barchart",
                "Topics over time",
                "Representative docs per topic"
            ]
            selected_visualization = st.selectbox("Select Visualization", visualization_options)

            if selected_visualization == "Barchart":
                umap_fig = trained_model.visualize_barchart(n_words=5)
                st.plotly_chart(umap_fig)
            elif selected_visualization == "Visualize documents":
                viz_doc = trained_model.visualize_documents(new_df[new_feedback_column])
                st.plotly_chart(viz_doc)
            elif selected_visualization == "Topic Hierarchy":
                tsne_fig = trained_model.visualize_hierarchy(top_n_topics=20)
                st.plotly_chart(tsne_fig)
            elif selected_visualization == "Topics over time":
                time_fig = area_over_time(trained_model, new_df, new_feedback_column, datetime_column)
                st.plotly_chart(time_fig)
            elif selected_visualization == "Representative docs per topic":
                st.write(trained_model.get_representative_docs())

            # Merge each document with its assigned topic info and offer it as a CSV download
            result = pd.merge(new_df[new_feedback_column],
                              trained_model.get_document_info(new_df[new_feedback_column]),
                              left_on=new_feedback_column, right_on='Document', how='left')
            feedback_and_docs = convert_df(result)
            st.download_button(
                label="Download documents and topics",
                data=feedback_and_docs,
                file_name='document_info.csv',
                mime='text/csv',
            )
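
# Usage note (a sketch, not part of the app logic): assuming this script is saved as
# app.py, it can typically be launched with Streamlit's CLI after installing the
# imported packages (openpyxl is only needed for Excel uploads). The package list below
# reflects the imports above; pinned versions are not specified here.
#
#   pip install streamlit pandas deep-translator bertopic sentence-transformers scikit-learn plotly openpyxl
#   streamlit run app.py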