|
from dash import Dash, dcc, html, Input, Output, State |
|
import dash.dependencies as dd |
|
import plotly.express as px |
|
import pandas as pd |
|
|
|
|
|
# Locations of the two review CSVs (named "..._with_cluster_labels_...").
# NOTE(review): reading ``hf://`` paths presumably relies on an fsspec
# filesystem for the Hugging Face Hub being installed -- confirm in deployment.
BAD_REVIEWS_CSV = "hf://datasets/Jbddai/customer_reviews/bad_distil_2_with_cluster_labels_cleaned_company.csv"
GOOD_REVIEWS_CSV = "hf://datasets/Jbddai/customer_reviews/good_distil_2_with_cluster_labels_cleaned_company.csv"

# Both frames are loaded once at import time and shared by every callback.
df_result_bad_distil_2 = pd.read_csv(BAD_REVIEWS_CSV)
# ``df`` stays available as a module-level alias of the good-review frame.
df_result_good_distil_2 = df = pd.read_csv(GOOD_REVIEWS_CSV)
|
|
|
|
|
def preprocess_data_for_slider_marks(df, mark_step=10):
    """Compute range-slider bounds and tick marks for a review DataFrame.

    Args:
        df: DataFrame providing numeric ``labels`` and ``cluster_rank`` columns.
        mark_step: Distance between two consecutive slider marks. Defaults to
            10, the spacing that was previously hard-coded.

    Returns:
        A 6-tuple ``(label_marks, min_label, max_label, cluster_rank_marks,
        min_cluster_rank, max_cluster_rank)``. Each ``*_marks`` value is a dict
        mapping a slider position to its caption; the caption is the position
        plus one, rendered as a string. The bounds are built-in ``int`` values.

    Raises:
        ValueError: If a column minimum/maximum is NaN (e.g. an empty frame),
            because NaN cannot be converted to ``int``.
    """

    def _bounds_and_marks(column):
        # pandas hands back NumPy scalars; converting to built-in ints keeps
        # the bounds plainly JSON-serialisable and lets ``range`` accept
        # columns that were parsed as floats.
        # NOTE(review): assumes the column holds whole numbers -- fractional
        # values would be truncated here.
        low = int(df[column].min())
        high = int(df[column].max())
        marks = {pos: str(pos + 1) for pos in range(low, high + 1, mark_step)}
        return marks, low, high

    label_marks, min_label, max_label = _bounds_and_marks("labels")
    cluster_rank_marks, min_cluster_rank, max_cluster_rank = _bounds_and_marks("cluster_rank")

    return label_marks, min_label, max_label, cluster_rank_marks, min_cluster_rank, max_cluster_rank
|
|
|
|
|
# Initial slider bounds and marks are taken from the "good" reviews, matching
# the dropdown's default sentiment ("gut").
(
    label_marks,
    min_label,
    max_label,
    cluster_rank_marks,
    min_cluster_rank,
    max_cluster_rank,
) = preprocess_data_for_slider_marks(df_result_good_distil_2)

# Choices offered by the sentiment dropdown ("gut" = good, "schlecht" = bad).
sentiment_options = [
    {"label": "gut", "value": "gut"},
    {"label": "schlecht", "value": "schlecht"},
]

# Style shared by every control container: half the page width, centred.
_HALF_WIDTH_CENTERED = {"width": "50%", "margin": "auto"}

app = Dash(__name__)

app.layout = html.Div(
    [
        # Control panel; its zIndex is higher than the graph's.
        html.Div(
            [
                html.H4("Interactive Plot of Customer Reviews"),
                html.Div(
                    [
                        html.P("Select sentiment:"),
                        dcc.Dropdown(
                            id="sentiment-dropdown",
                            options=sentiment_options,
                            value="gut",
                            style=_HALF_WIDTH_CENTERED,
                            clearable=False,
                            multi=False,
                        ),
                    ],
                    style=_HALF_WIDTH_CENTERED,
                ),
                html.Div(
                    [
                        html.P("Select range of labels:"),
                        dcc.RangeSlider(
                            id="label-range-slider",
                            # Reuse the values computed above instead of
                            # recomputing min/max/marks inline.
                            min=min_label,
                            max=max_label,
                            step=1,
                            value=[min_label, max_label],
                            marks=label_marks,
                            tooltip={"always_visible": True, "placement": "bottom"},
                        ),
                        html.Button("Reset Range", id="reset-button", n_clicks=0),
                    ],
                    style=_HALF_WIDTH_CENTERED,
                ),
                html.Div(
                    [
                        html.P("Select range of cluster by rank/popularity (by number of reviews descending):"),
                        dcc.RangeSlider(
                            id="cluster-rank-slider",
                            min=min_cluster_rank,
                            max=max_cluster_rank,
                            step=1,
                            value=[min_cluster_rank, max_cluster_rank],
                            marks=cluster_rank_marks,
                            tooltip={"always_visible": True, "placement": "bottom"},
                        ),
                        html.Button("Reset Cluster Rank", id="reset-cluster-button", n_clicks=0),
                    ],
                    style=_HALF_WIDTH_CENTERED,
                ),
                html.Div(
                    [
                        html.P("Show Cluster Labels:"),
                        dcc.Checklist(
                            id="show-cluster-labels",
                            options=[{"label": "Show", "value": "on"}],
                            # Start unticked. An empty list is the proper
                            # "nothing selected" value; the previous ["off"]
                            # was not one of the declared options.
                            value=[],
                        ),
                    ],
                    style=_HALF_WIDTH_CENTERED,
                ),
            ],
            style={"position": "relative", "zIndex": "1001", "marginBottom": "20px"},
        ),
        dcc.Graph(
            id="scatter-plot",
            style={
                "height": "80vh",
                "width": "90vw",
                "position": "relative",
                "zIndex": "999",
            },
        ),
        html.Div(
            [html.Button("Generate LLM Prompt from current selection", id="generate-cluster-button", n_clicks=0)],
            style=_HALF_WIDTH_CENTERED,
        ),
        html.Div(
            [
                html.P("Prompt for LLM:"),
                # Hidden until the generate button has been clicked; a
                # callback switches "display" to "block".
                dcc.Textarea(
                    id="cluster-text-output",
                    style={"width": "100%", "height": "200px", "display": "none"},
                    value="",
                ),
            ],
            style=_HALF_WIDTH_CENTERED,
        ),
    ]
)
|
|
|
|
|
@app.callback(
    Output("scatter-plot", "figure"),
    [
        Input("label-range-slider", "value"),
        Input("cluster-rank-slider", "value"),
        Input("sentiment-dropdown", "value"),
        Input("show-cluster-labels", "value"),
    ],
)
def update_scatter_plot(label_range, cluster_rank_range, selected_sentiment, show_cluster_labels):
    """Rebuild the review scatter plot for the current control settings.

    Args:
        label_range: ``[low, high]`` inclusive bounds on the ``labels`` column.
        cluster_rank_range: ``[low, high]`` inclusive bounds on ``cluster_rank``.
        selected_sentiment: ``"gut"`` selects the good-review frame; any other
            value selects the bad-review frame.
        show_cluster_labels: Ticked checklist values; centroid annotations are
            drawn when it contains ``"on"``. ``None`` is treated as "nothing
            ticked".

    Returns:
        A Plotly figure with the clustered points coloured by label and the
        rows labelled ``-1`` drawn as a separate light-gray "No cluster" trace.
    """
    # Guard against a missing checklist value so the membership test cannot
    # raise TypeError.
    show_labels = "on" in (show_cluster_labels or [])

    reviews = df_result_good_distil_2 if selected_sentiment == "gut" else df_result_bad_distil_2

    in_label_range = reviews["labels"].between(label_range[0], label_range[1])
    in_rank_range = reviews["cluster_rank"].between(cluster_rank_range[0], cluster_rank_range[1])
    df_filtered = reviews[in_label_range & in_rank_range]

    # Label -1 marks rows without a cluster; they are plotted separately below.
    outliers = df_filtered[df_filtered.labels == -1]
    clustered = df_filtered[df_filtered.labels != -1]

    fig = px.scatter(
        clustered,
        x="x",
        y="y",
        hover_data=[
            "summary_good_bad",
            "sentiment",
            "cluster_rank",
            "cluster_count",
            "clean_review_br",
        ],
        hover_name="cluster_label",
        color="labels",
        color_continuous_scale="rainbow",
        opacity=0.7,
    )

    if show_labels:
        # One annotation per cluster, placed at the mean position of its
        # currently visible points. "cluster_count" is aggregated with
        # "count", i.e. the number of rows of that cluster in the selection.
        centroids = clustered.groupby("labels", sort=False).agg(
            {
                "x": "mean",
                "y": "mean",
                "cluster_label": "first",
                "cluster_count": "count",
            }
        )
        for row in centroids.itertuples():
            fig.add_annotation(
                x=row.x,
                y=row.y,
                text=f"{row.cluster_label}, #reviews: {row.cluster_count}",
                showarrow=False,
            )

    fig.add_scatter(
        x=outliers["x"],
        y=outliers["y"],
        mode="markers",
        marker=dict(color="lightgray", opacity=0.5, size=5.0),
        name="No cluster",
        selectedpoints=False,
        hoverinfo="skip",
    )

    fig.update_layout(coloraxis_colorbar=dict(len=0.9, x=1.0), height=600)
    # NOTE(review): the selector matches every trace whose mode is "markers",
    # which includes the outlier trace added above, so its size of 5.0 ends up
    # replaced by 3 -- confirm whether that is intended.
    fig.update_traces(marker=dict(size=3), selector=dict(mode="markers"))

    return fig
|
|
|
|
|
@app.callback(
    [
        Output("label-range-slider", "marks"),
        Output("label-range-slider", "min"),
        Output("label-range-slider", "max"),
        Output("cluster-rank-slider", "marks"),
        Output("cluster-rank-slider", "min"),
        Output("cluster-rank-slider", "max"),
    ],
    [Input("sentiment-dropdown", "value")],
)
def update_slider_marks(selected_sentiment):
    """Refresh both range sliders' marks and bounds for the chosen sentiment.

    Args:
        selected_sentiment: ``"gut"`` selects the good-review frame; any other
            value selects the bad-review frame.

    Returns:
        A 6-tuple ``(label_marks, min_label, max_label, cluster_rank_marks,
        min_cluster_rank, max_cluster_rank)``, in the order of the declared
        outputs.
    """
    source = df_result_good_distil_2 if selected_sentiment == "gut" else df_result_bad_distil_2

    # The helper already returns its six values in the declared output order,
    # so its tuple is passed through unchanged.
    return preprocess_data_for_slider_marks(source)
|
|
|
|
|
@app.callback(
    Output("label-range-slider", "value"),
    [Input("reset-button", "n_clicks")],
    [State("label-range-slider", "min"), State("label-range-slider", "max")],
)
def reset_label_slider(n_clicks, min_val, max_val):
    """Set the label slider back to its full range.

    Args:
        n_clicks: Click count of the reset button; it only triggers the
            callback and its value is not inspected.
        min_val: Current lower bound of the label slider.
        max_val: Current upper bound of the label slider.

    Returns:
        ``[min_val, max_val]``, i.e. a selection spanning the whole slider.
    """
    full_range = [min_val, max_val]
    return full_range
|
|
|
|
|
@app.callback(
    Output("cluster-rank-slider", "value"),
    [Input("reset-cluster-button", "n_clicks")],
    [State("cluster-rank-slider", "min"), State("cluster-rank-slider", "max")],
)
def reset_cluster_slider(n_clicks, min_val, max_val):
    """Set the cluster-rank slider back to its full range.

    Args:
        n_clicks: Click count of the reset button; it only triggers the
            callback and its value is not inspected.
        min_val: Current lower bound of the cluster-rank slider.
        max_val: Current upper bound of the cluster-rank slider.

    Returns:
        ``[min_val, max_val]``, i.e. a selection spanning the whole slider.
    """
    full_range = [min_val, max_val]
    return full_range
|
|
|
|
|
@app.callback(
    Output("cluster-text-output", "style"),
    [Input("generate-cluster-button", "n_clicks")],
)
def show_cluster_text_output(n_clicks):
    """Reveal the prompt textarea once the generate button has been clicked.

    Args:
        n_clicks: Click count of the "generate" button.

    Returns:
        The textarea style dict; ``display`` is ``"block"`` after at least one
        click and ``"none"`` otherwise.
    """
    # Only the visibility changes; width and height are the same either way.
    visibility = "block" if n_clicks > 0 else "none"
    return {"width": "100%", "height": "200px", "display": visibility}
|
|
|
|
|
@app.callback(
    Output("cluster-text-output", "value"),
    [Input("generate-cluster-button", "n_clicks")],
    [State("cluster-rank-slider", "value"), State("sentiment-dropdown", "value")],
)
def update_cluster_text_output(n_clicks, cluster_rank_range, selected_sentiment):
    """Build the LLM prompt text for the clusters in the selected rank range.

    Args:
        n_clicks: Click count of the "generate" button; nothing is produced
            until it is greater than zero.
        cluster_rank_range: ``[low, high]`` inclusive bounds on ``cluster_rank``.
        selected_sentiment: ``"gut"`` selects the good-review frame; any other
            value selects the bad-review frame.

    Returns:
        The prompt instruction followed by one ``CLUSTER - <label>`` section
        per cluster, each listing the sampled ``summary_good_bad`` texts on
        separate lines; an empty string before the first click.
    """
    if n_clicks is None or n_clicks <= 0:
        return ""

    reviews = df_result_good_distil_2 if selected_sentiment == "gut" else df_result_bad_distil_2

    in_rank_range = reviews["cluster_rank"].between(cluster_rank_range[0], cluster_rank_range[1])

    # ``assign`` returns a new frame, so the cleaned column is never written
    # into a filtered slice of the shared module-level DataFrame (the earlier
    # in-place assignment triggered pandas' SettingWithCopyWarning).
    selection = reviews[in_rank_range].assign(
        summary_good_bad=lambda frame: frame["summary_good_bad"].fillna("").astype(str)
    )

    # A fixed seed keeps the 10 % sample reproducible between clicks.
    sampled_data = selection.sample(frac=0.1, random_state=42)

    # Series indexed by cluster label, holding that cluster's summaries joined
    # by newlines; sort=False keeps first-appearance order.
    summaries_by_cluster = sampled_data.groupby("cluster_label", sort=False)["summary_good_bad"].agg("\n".join)

    prompt_instruction = """Analysiere die nach ### folgenden CLUSTER "Clustertitel", die einzelene Bestandteile von Bewertungen erhalten\nund leite pro Cluster eine Business Massnahme ab um das Hauptproblem des Clusters zu lösen oder zu Verbessern.\nGib das Cluster mit seinem "Clustertitel" sowie die dazugehörige Maßnahme zurück.\n###"""

    cluster_texts = prompt_instruction + "\n\n".join(
        f"\nCLUSTER - {cluster_label}\n{summaries}" for cluster_label, summaries in summaries_by_cluster.items()
    )
    return cluster_texts
|
|
|
|
|
if __name__ == "__main__":
    # ``run_server`` is the legacy entry point and is obsolete in newer Dash
    # releases; prefer ``app.run`` and fall back only when it is unavailable.
    # NOTE(review): debug=True while listening on all interfaces (0.0.0.0)
    # exposes the development tooling to every client -- confirm this is
    # intended for the deployment.
    run = getattr(app, "run", None) or app.run_server
    run(debug=True, host="0.0.0.0", port=7860)
|
|