initial commit
Browse files- Dockerfile +22 -0
- README.md +4 -3
- app.py +322 -0
- requirements.txt +5 -0
Dockerfile
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.10.14-slim

# Install dependencies before copying the application so this layer stays
# cached until requirements.txt changes.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

# Run the application as an unprivileged user (UID 1000) instead of root.
RUN useradd -m -u 1000 user

USER user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Copy the application exactly once, owned by the runtime user. An additional
# plain `COPY . /app` would duplicate the build context into a root-owned
# directory that neither WORKDIR nor CMD uses.
COPY --chown=user . $HOME/app

EXPOSE 7860

CMD ["python", "app.py"]
|
README.md
CHANGED
@@ -6,6 +6,7 @@ colorTo: indigo
|
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
short_description: Interactive web application to get insights from reviews
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
short_description: Interactive web application to get insights from reviews
|
9 |
+
datasets:
|
10 |
+
- Jbddai/customer_reviews
|
11 |
+
app_port: 7860
|
12 |
+
---
|
app.py
ADDED
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dash import Dash, dcc, html, Input, Output, State
|
2 |
+
import dash.dependencies as dd
|
3 |
+
import plotly.express as px
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
# Load the clustered review exports (one CSV per sentiment) from the
# Hugging Face dataset repository.
df_result_bad_distil_2 = pd.read_csv(
    "hf://datasets/Jbddai/customer_reviews/bad_distil_2_with_cluster_labels_cleaned_company.csv"
)
df_result_good_distil_2 = pd.read_csv(
    "hf://datasets/Jbddai/customer_reviews/good_distil_2_with_cluster_labels_cleaned_company.csv"
)

# Module-level alias; it ends up bound to the "good" frame.
df = df_result_good_distil_2
|
13 |
+
|
14 |
+
|
15 |
+
def preprocess_data_for_slider_marks(df):
    """Derive range-slider bounds and tick marks from a review DataFrame.

    Args:
        df: DataFrame with numeric ``labels`` and ``cluster_rank`` columns.

    Returns:
        A 6-tuple ``(label_marks, min_label, max_label, cluster_rank_marks,
        min_cluster_rank, max_cluster_rank)``. The bounds are built-in
        ``int`` values. Each marks dict has one entry for every 10th
        position starting at the minimum; the key is the position and the
        value is the string of ``position + 1``.
    """

    def _marks_and_bounds(column):
        # Coerce to built-in int: pandas returns NumPy scalars, and a
        # float-typed column would make ``range`` raise TypeError.
        lowest = int(df[column].min())
        highest = int(df[column].max())
        marks = {tick: str(tick + 1) for tick in range(lowest, highest + 1, 10)}
        return marks, lowest, highest

    label_marks, min_label, max_label = _marks_and_bounds("labels")
    cluster_rank_marks, min_cluster_rank, max_cluster_rank = _marks_and_bounds("cluster_rank")

    return label_marks, min_label, max_label, cluster_rank_marks, min_cluster_rank, max_cluster_rank
|
25 |
+
|
26 |
+
|
27 |
+
# Slider bounds and marks computed from the "good" reviews frame.
(
    label_marks,
    min_label,
    max_label,
    cluster_rank_marks,
    min_cluster_rank,
    max_cluster_rank,
) = preprocess_data_for_slider_marks(df_result_good_distil_2)

# Dropdown entries; each option's label and value are the same string.
sentiment_options = [{"label": sentiment, "value": sentiment} for sentiment in ("gut", "schlecht")]

app = Dash(__name__)
|
37 |
+
|
38 |
+
# Style shared by every control container: half width, horizontally centred.
_CENTERED_STYLE = {"width": "50%", "margin": "auto"}
# Tooltip configuration shared by both range sliders.
_SLIDER_TOOLTIP = {"always_visible": True, "placement": "bottom"}


def _centered(children):
    """Wrap *children* in a half-width, centred ``html.Div``."""
    return html.Div(children, style=_CENTERED_STYLE)


# Page layout: a control panel (sentiment, label range, cluster-rank range,
# label toggle), the scatter plot, and the LLM prompt generator.
# Slider bounds and marks reuse the module-level values returned by
# preprocess_data_for_slider_marks for the "good" frame instead of
# recomputing them inline.
app.layout = html.Div(
    [
        html.Div(
            [
                html.H4("Interactive Plot of Customer Reviews"),
                _centered(
                    [
                        html.P("Select sentiment:"),
                        dcc.Dropdown(
                            id="sentiment-dropdown",
                            options=sentiment_options,
                            value="gut",
                            style=_CENTERED_STYLE,
                            clearable=False,
                            multi=False,
                        ),
                    ]
                ),
                _centered(
                    [
                        html.P("Select range of labels:"),
                        dcc.RangeSlider(
                            id="label-range-slider",
                            min=min_label,
                            max=max_label,
                            step=1,
                            value=[min_label, max_label],
                            marks=label_marks,
                            tooltip=_SLIDER_TOOLTIP,
                        ),
                        html.Button("Reset Range", id="reset-button", n_clicks=0),
                    ]
                ),
                _centered(
                    [
                        html.P("Select range of cluster by rank/popularity (by number of reviews descending):"),
                        dcc.RangeSlider(
                            id="cluster-rank-slider",
                            min=min_cluster_rank,
                            max=max_cluster_rank,
                            step=1,
                            value=[min_cluster_rank, max_cluster_rank],
                            marks=cluster_rank_marks,
                            tooltip=_SLIDER_TOOLTIP,
                        ),
                        html.Button("Reset Cluster Rank", id="reset-cluster-button", n_clicks=0),
                    ]
                ),
                _centered(
                    [
                        html.P("Show Cluster Labels:"),
                        dcc.Checklist(
                            id="show-cluster-labels",
                            options=[{"label": "Show", "value": "on"}],
                            # Start unchecked. An empty list is the proper
                            # "nothing selected" value; a placeholder such as
                            # "off" is not one of the declared options.
                            value=[],
                        ),
                    ]
                ),
            ],
            style={"position": "relative", "zIndex": "1001", "marginBottom": "20px"},
        ),
        dcc.Graph(
            id="scatter-plot",
            style={
                "height": "80vh",
                "width": "90vw",
                "position": "relative",
                "zIndex": "999",
            },
        ),
        _centered(
            [html.Button("Generate LLM Prompt from current selection", id="generate-cluster-button", n_clicks=0)]
        ),
        _centered(
            [
                html.P("Prompt for LLM:"),
                # Hidden until the generate button has been clicked; a
                # callback switches "display" to "block".
                dcc.Textarea(
                    id="cluster-text-output",
                    style={"width": "100%", "height": "200px", "display": "none"},
                    value="",
                ),
            ]
        ),
    ]
)
|
149 |
+
|
150 |
+
|
151 |
+
@app.callback(
    Output("scatter-plot", "figure"),
    [
        Input("label-range-slider", "value"),
        Input("cluster-rank-slider", "value"),
        Input("sentiment-dropdown", "value"),
        Input("show-cluster-labels", "value"),
    ],
)
def update_scatter_plot(label_range, cluster_rank_range, selected_sentiment, show_cluster_labels):
    """Build the scatter figure for the current control selection.

    Args:
        label_range: ``[low, high]`` inclusive bounds on the ``labels`` column.
        cluster_rank_range: ``[low, high]`` inclusive bounds on ``cluster_rank``.
        selected_sentiment: ``"gut"`` selects the good-reviews frame; any
            other value selects the bad-reviews frame.
        show_cluster_labels: Checklist value; annotations are drawn when it
            contains ``"on"``.

    Returns:
        A Plotly figure with clustered points coloured by label and points
        labelled ``-1`` drawn as a separate light-gray "No cluster" trace.
    """
    annotate = "on" in show_cluster_labels

    source = df_result_good_distil_2 if selected_sentiment == "gut" else df_result_bad_distil_2

    label_ok = source["labels"].between(label_range[0], label_range[1])
    rank_ok = source["cluster_rank"].between(cluster_rank_range[0], cluster_rank_range[1])
    visible = source[label_ok & rank_ok]

    # Label -1 marks points that belong to no cluster.
    is_outlier = visible.labels == -1
    outliers = visible[is_outlier]
    clustered = visible[~is_outlier]

    hover_columns = [
        "summary_good_bad",
        "sentiment",
        "cluster_rank",
        "cluster_count",
        "clean_review_br",
    ]
    fig = px.scatter(
        clustered,
        x="x",
        y="y",
        hover_data=hover_columns,
        hover_name="cluster_label",
        color="labels",
        color_continuous_scale="rainbow",
        opacity=0.7,
    )

    if annotate:
        # One annotation per cluster, placed at the mean x/y of its points.
        centroids = clustered.groupby("labels", sort=False).agg(
            {
                "x": "mean",
                "y": "mean",
                "cluster_label": "first",
                "cluster_count": "count",
            }
        )
        for cx, cy, title, n_reviews in zip(
            centroids["x"], centroids["y"], centroids["cluster_label"], centroids["cluster_count"]
        ):
            fig.add_annotation(
                x=cx,
                y=cy,
                text=f"{title}, #reviews: {n_reviews}",
                showarrow=False,
            )

    fig.add_scatter(
        x=outliers["x"],
        y=outliers["y"],
        mode="markers",
        marker=dict(color="lightgray", opacity=0.5, size=5.0),
        name="No cluster",
        selectedpoints=False,
        hoverinfo="skip",
    )

    fig.update_layout(coloraxis_colorbar=dict(len=0.9, x=1.0), height=600)
    # NOTE(review): the outlier trace also has mode="markers", so this
    # presumably overrides its size=5.0 as well -- confirm this is intended.
    fig.update_traces(marker=dict(size=3), selector=dict(mode="markers"))

    return fig
|
224 |
+
|
225 |
+
|
226 |
+
@app.callback(
    [
        Output("label-range-slider", "marks"),
        Output("label-range-slider", "min"),
        Output("label-range-slider", "max"),
        Output("cluster-rank-slider", "marks"),
        Output("cluster-rank-slider", "min"),
        Output("cluster-rank-slider", "max"),
    ],
    [Input("sentiment-dropdown", "value")],
)
def update_slider_marks(selected_sentiment):
    """Recompute marks and bounds of both sliders for the chosen sentiment.

    ``"gut"`` selects the good-reviews frame; any other value selects the
    bad-reviews frame.
    """
    source = df_result_good_distil_2 if selected_sentiment == "gut" else df_result_bad_distil_2

    # The helper returns its six values in the same order as the Outputs.
    return preprocess_data_for_slider_marks(source)
|
255 |
+
|
256 |
+
|
257 |
+
@app.callback(
    Output("label-range-slider", "value"),
    Input("reset-button", "n_clicks"),
    State("label-range-slider", "min"),
    State("label-range-slider", "max"),
)
def reset_label_slider(n_clicks, min_val, max_val):
    """Set the label slider's value to its current ``[min, max]``.

    ``n_clicks`` is only the trigger; its value is not used.
    """
    full_range = [min_val, max_val]
    return full_range
|
264 |
+
|
265 |
+
|
266 |
+
@app.callback(
    Output("cluster-rank-slider", "value"),
    Input("reset-cluster-button", "n_clicks"),
    State("cluster-rank-slider", "min"),
    State("cluster-rank-slider", "max"),
)
def reset_cluster_slider(n_clicks, min_val, max_val):
    """Set the cluster-rank slider's value to its current ``[min, max]``.

    ``n_clicks`` is only the trigger; its value is not used.
    """
    full_range = [min_val, max_val]
    return full_range
|
273 |
+
|
274 |
+
|
275 |
+
@app.callback(
    Output("cluster-text-output", "style"),
    Input("generate-cluster-button", "n_clicks"),
)
def show_cluster_text_output(n_clicks):
    """Return the prompt textarea style: shown once the button was clicked."""
    display_mode = "block" if n_clicks > 0 else "none"
    return {"width": "100%", "height": "200px", "display": display_mode}
|
284 |
+
|
285 |
+
|
286 |
+
@app.callback(
    Output("cluster-text-output", "value"),
    [Input("generate-cluster-button", "n_clicks")],
    [State("cluster-rank-slider", "value"), State("sentiment-dropdown", "value")],
)
def update_cluster_text_output(n_clicks, cluster_rank_range, selected_sentiment):
    """Build an LLM prompt from a sample of the currently selected clusters.

    Args:
        n_clicks: Click count of the generate button; no prompt is produced
            until it is positive.
        cluster_rank_range: ``[low, high]`` inclusive bounds on ``cluster_rank``.
        selected_sentiment: ``"gut"`` selects the good-reviews frame; any
            other value selects the bad-reviews frame.

    Returns:
        The prompt text: a fixed instruction followed by one ``CLUSTER``
        section per cluster label found in a 10% sample (fixed seed 42) of
        the selected rows, or ``""`` before the first click.
    """
    # Guard clause; also tolerates n_clicks being None.
    if n_clicks is None or n_clicks <= 0:
        return ""

    source = df_result_good_distil_2 if selected_sentiment == "gut" else df_result_bad_distil_2

    in_rank_range = (source["cluster_rank"] <= cluster_rank_range[1]) & (
        source["cluster_rank"] >= cluster_rank_range[0]
    )
    # ``assign`` returns a new frame, so the shared module-level DataFrame is
    # never written through a filtered slice (avoids pandas'
    # SettingWithCopyWarning and accidental mutation).
    selection = source[in_rank_range].assign(
        summary_good_bad=lambda frame: frame["summary_good_bad"].fillna("").astype(str)
    )

    # Fixed random_state keeps the generated prompt reproducible.
    sampled_data = selection.sample(frac=0.1, random_state=42)

    grouped_data = (
        sampled_data.groupby("cluster_label", sort=False)["summary_good_bad"].agg("\n".join).reset_index()
    )

    # NOTE(review): the German prompt appears to contain typos ("einzelene",
    # "erhalten" likely meant "enthalten"); it is kept verbatim because it is
    # runtime text sent to the model -- confirm before changing.
    prompt_instruction = """Analysiere die nach ### folgenden CLUSTER "Clustertitel", die einzelene Bestandteile von Bewertungen erhalten\nund leite pro Cluster eine Business Massnahme ab um das Hauptproblem des Clusters zu lösen oder zu Verbessern.\nGib das Cluster mit seinem "Clustertitel" sowie die dazugehörige Maßnahme zurück.\n###"""

    cluster_sections = (
        f"\nCLUSTER - {title}\n{summaries}"
        for title, summaries in zip(grouped_data["cluster_label"], grouped_data["summary_good_bad"])
    )
    return prompt_instruction + "\n\n".join(cluster_sections)
|
319 |
+
|
320 |
+
|
321 |
+
if __name__ == "__main__":
    # Listen on all interfaces on port 7860. Debug mode stays off because the
    # server is reachable from outside the process: Dash's debug mode turns on
    # the interactive debugger and hot reloading, which must not be exposed on
    # a public interface.
    app.run_server(debug=False, host="0.0.0.0", port=7860)
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dash==2.15.0
|
2 |
+
fsspec==2024.2.0
|
3 |
+
huggingface-hub==0.20.3
|
4 |
+
pandas==2.2.0
|
5 |
+
plotly==5.18.0
|