Steven Lu @ MBP-M1-Max committed on
Commit 7a410cd · 1 parent: b17a2c7

- old bugs fixed

Files changed (2)
  1. app.py +548 -0
  2. requirements.txt +82 -0
app.py ADDED
@@ -0,0 +1,548 @@
+ #!/usr/bin/env python3
+
+ # from googleapiclient.discovery import build
+ # from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+ # import pandas as pd
+ # import streamlit as st
+ # import re
+
+ # # Set up the YouTube API client
+ # api_key = 'YOUR_YOUTUBE_API_KEY'  # Replace with your API key
+ # youtube = build('youtube', 'v3', developerKey=api_key)
+
+
+
+ # # Set up the Reddit API client (PRAW)
+ # reddit = praw.Reddit(
+ #     client_id='YOUR_REDDIT_CLIENT_ID',  # Replace with your Reddit client_id
+ #     client_secret='YOUR_REDDIT_CLIENT_SECRET',  # Replace with your Reddit client_secret
+ #     user_agent='YOUR_USER_AGENT'  # Replace with your user_agent
+ # )
+
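+ # NOTE: the commented-out YouTube/Reddit setup above is kept for reference only;
+ # the running app builds its Reddit client from st.secrets further below.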
+ import streamlit as st
+ st.set_page_config(page_title="Reddit Comment Analyzer", layout="wide")
+
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+ from wordcloud import WordCloud
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import re
+ import numpy as np
+ from nltk.corpus import stopwords
+ from collections import Counter
+ import praw
+ import nltk
+ from datetime import datetime
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+ from transformers import pipeline
+
+ # Load sentiment analysis pipeline (downloads the default model on first run)
+ sentiment_analyzer = pipeline('sentiment-analysis')
+
+ # Load summarization pipeline
+ summarizer = pipeline('summarization')
+
+ def analyze_sentiment_transformer(comment):
+     if len(comment) <= 500:
+         return sentiment_analyzer(comment)[0]['label']
+     else:
+         return 'neutral'  # Skip long comments
+
+ def summarize_text(text):
+     if len(text) <= 500:
+         return summarizer(text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
+     else:
+         return text[:500] + '...'  # Skip summarization for long text and return a truncated version
+
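+ # NOTE: the two helpers above use Hugging Face pipelines and measure length in characters,
+ # not tokens. They are not called by the active analysis path below, which uses the VADER
+ # analyzer; summarize_text is only referenced in the commented-out legacy code at the end
+ # of this file.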
+
+
+ # Access secrets using st.secrets
+ reddit_client_id = st.secrets.default["client_id"]
+ reddit_client_secret = st.secrets.default["client_secret"]
+ reddit_user_agent = st.secrets.default["user_agent"]
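+ # st.secrets.default[...] expects a [default] table in the app's secrets
+ # (e.g. .streamlit/secrets.toml), with placeholder values such as:
+ #     [default]
+ #     client_id = "YOUR_REDDIT_CLIENT_ID"
+ #     client_secret = "YOUR_REDDIT_CLIENT_SECRET"
+ #     user_agent = "YOUR_APP_NAME/0.1 by YOUR_REDDIT_USERNAME"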
+
+
+ # Initialize Reddit API client using the API keys
+ reddit = praw.Reddit(
+     client_id=reddit_client_id,
+     client_secret=reddit_client_secret,
+     user_agent=reddit_user_agent
+ )
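+ # With only a client id, secret and user agent (no username/password), PRAW runs in
+ # read-only mode, which is enough for the searching and comment reading done below.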
+
+ # VADER sentiment analyzer setup
+ analyzer = SentimentIntensityAnalyzer()
+
+ # Download stopwords
+ nltk.download('stopwords')
+ stop_words_set = ['at', 'how', 'do', 'm', 'during', 'again', 'been', 'dont', 'itself', 'from', 'in',
+                   'myself', "wouldn't", 'which', 'than', 'yourselves', 'her', 's', 'further', 'won', 'my',
+                   'more', 'would', 'no', 'some', 'yours', "weren't", "haven't", 'over', 'couldn', 'against',
+                   "mustn't", 'same', 'was', 'himself', "aren't", 'through', 'shan', 'he', "mightn't", 'only',
+                   'on', 't', 'ourselves', 'these', 'other', 'up', 'about', 'hers', 'hasn', 'it', "doesn't",
+                   'for', 'wouldn', 'doing', 'not', 'his', 'll', 'you', "couldn't", 'too', 'haven', 'those',
+                   'our', 'because', 'im', 'know', 'until', 'to', 'mightn', 'such', 'very', 'needn', 'they',
+                   'or', 'as', 'having', 'isn', 'here', 'didn', "isn't", "i'm", 'most', 'did', 'have',
+                   "it's", "hadn't", 'by', 'has', 'into', 'there', 'yourself', 'had', 'am', 'y', 'just',
+                   'don', 'are', 'does', 'like', 'whom', 'should', 'after', 'mustn', 'once', 'below',
+                   'him', 'who', "you're", 'them', 'why', 'your', "you've", "you'll", 'is', "don't",
+                   'aren', 'when', 'so', 'can', 'being', 'and', "should've", 'that', 'above',
+                   "didn't", 'hadn', 'doesn', 've', 'ma', 'before', 'out', 'the', 'if', 'where',
+                   "shan't", 'under', 'each', 'ain', 'what', "shouldn't", 'down', 'now', 'weren',
+                   'youre', 'a', 'with', "hasn't", 'herself', 'get', 're', "she's", 'of', 'we',
+                   "wasn't", 'their', 'theirs', 'but', 'o', "that'll", 'its', 'own', 'wasn',
+                   'all', 'nor', "you'd", 'shouldn', 'both', 'me', 'd', 'between', 'be', 'an',
+                   'any', 'i', 'she', 'this', 'then', "won't", 'were', 'will', "needn't", 'off',
+                   'few', 'themselves', 'ours', 'while']
+
+ # Combine custom stopwords with NLTK stopwords
+ stop_words = list(set(stopwords.words('english')).union(stop_words_set))
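+ # The custom list mostly adds informal spellings ('im', 'dont', 'youre') and filler words
+ # ('like', 'know', 'get') on top of NLTK's standard English stopwords; the combined list
+ # is reused by the word cloud and the TF-IDF keyword charts below.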
+
+ # Set up the TfidfVectorizer using the combined stop words
+ # vectorizer = TfidfVectorizer(stop_words=list(stop_words))
+
+ # Convert the set to a list before passing to TfidfVectorizer
+ # stop_words_list = list(stop_words)
+
+ # Verify that stop_words_list is a list of strings
+ # st.write(stop_words_list[:10]) # Print first 10 stop words for verification
+
+ # Use the vectorizer and pass the stop words list
+ try:
+     vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=5)
+     print("Stop words applied:", vectorizer.get_stop_words())
+     st.write("TfidfVectorizer initialized successfully!")
+ except Exception as e:
+     st.error(f"Error initializing TfidfVectorizer: {e}")
+
+
+ # Streamlit app structure
+
+ st.title("Reddit Keyword-Based Comment Analyzer")
+
+ @st.cache
+ def fetch_reddit_data(query, max_results=50, min_score=10):
+     posts = reddit.subreddit('all').search(query, limit=max_results)
+     comments, timestamps, scores = [], [], []
+     for post in posts:
+         post.comments.replace_more(limit=0)
+         for comment in post.comments.list():
+             if isinstance(comment, praw.models.Comment) and comment.body and comment.created_utc:
+                 if comment.score >= min_score:  # Filter comments by minimum score (upvotes)
+                     comments.append(comment.body)
+                     timestamps.append(pd.to_datetime(comment.created_utc, unit='s'))
+                     scores.append(comment.score)  # Store the comment score for reference
+     return comments, timestamps, scores
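+ # Note: replace_more(limit=0) drops the "load more comments" placeholders instead of
+ # expanding them, so deeply nested replies are skipped; raising the limit fetches more
+ # of each thread at the cost of extra API calls.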
+
+
+ def preprocess_text(text):
+     # Lowercase and strip punctuation (keeps letters, digits and whitespace)
+     text = re.sub(r"[^\w\s]", '', text.lower())
+     text = re.sub(r'\s+', ' ', text)  # Collapse excess whitespace
+     return text
+
+ def analyze_sentiment(comments):
+     # VADER compound score per comment, ranging from -1 (negative) to +1 (positive)
+     return [analyzer.polarity_scores(comment)['compound'] for comment in comments]
+
+ def generate_wordcloud(comments):
+     filtered_words = ' '.join([word for word in ' '.join(comments).split() if word not in stop_words])
+     return WordCloud(width=400, height=400, background_color='white').generate(filtered_words)
+
+ # Extract features for keywords
+ def extract_features(comments):
+     vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50)
+     X = vectorizer.fit_transform(comments)
+     return vectorizer.get_feature_names_out(), X.sum(axis=0).A1
+
+ query = st.text_input("Enter a keyword to search for Reddit comments", value="data analyst bootcamp online course")
+ start_date = st.date_input("Start Date", value=pd.to_datetime("2024-01-01").date())
+ end_date = st.date_input("End Date", value=pd.to_datetime("today").date())
+
+ if st.button("Analyze"):
+     comments, timestamps, scores = fetch_reddit_data(query, max_results=50, min_score=10)
+     print(f'Fetched {len(comments)} comments.')
+     if not comments:
+         st.warning("No comments found for this search query.")
+     else:
+         # Clean the comments before passing them to TfidfVectorizer
+         cleaned_comments = [preprocess_text(comment) for comment in comments]
+         print("Sample of cleaned comments:")
+         print(cleaned_comments[:5])
+         sentiment_scores = analyze_sentiment(cleaned_comments)
+         df = pd.DataFrame({
+             'comment': cleaned_comments,
+             'sentiment': sentiment_scores,
+             'created_at': timestamps
+         })
+
+         # Ensure created_at is in datetime format
+         df['created_at'] = pd.to_datetime(df['created_at'])
+
+         # Set the datetime index for resampling
+         df.set_index('created_at', inplace=True)
+
+         # Filter by date range
+         df = df[(df.index >= pd.Timestamp(start_date)) & (df.index <= pd.Timestamp(end_date))]
+
+         df['sentiment_category'] = df['sentiment'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')
+
+         # Save results in session state
+         st.session_state.df = df
+         st.session_state.cleaned_comments = cleaned_comments
+
+ # If results are in session state, retrieve them
+ if 'df' in st.session_state:
+     df = st.session_state.df
+     cleaned_comments = st.session_state.cleaned_comments
+
+
+     st.subheader("Key Metrics")
+     col1, col2, col3, col4 = st.columns(4)
+     col1.metric("Total Comments", len(df))
+     col2.metric("Positive", len(df[df['sentiment_category'] == 'positive']))
+     col3.metric("Neutral", len(df[df['sentiment_category'] == 'neutral']))
+     col4.metric("Negative", len(df[df['sentiment_category'] == 'negative']))
+
+     # Sentiment Distribution
+     st.subheader("Sentiment Distribution")
+     sentiment_counts = df['sentiment_category'].value_counts()
+     fig1, ax1 = plt.subplots()
+     ax1.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90)
+     ax1.axis('equal')
+
+     # Keywords Affecting Sentiment
+     st.subheader("Top Keywords Affecting Sentiment")
+     vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50)
+     X = vectorizer.fit_transform(df['comment'])
+     features = vectorizer.get_feature_names_out()
+     scores = np.asarray(X.mean(axis=0)).flatten()
+
+     keywords_df = pd.DataFrame({'Keyword': features, 'Score': scores})
+     top_keywords = keywords_df.sort_values(by='Score', ascending=False).head(10)
+
+     fig2, ax2 = plt.subplots(figsize=(10, 5))
+     top_keywords.plot(kind='bar', x='Keyword', y='Score', ax=ax2, color='darkorange')
+     ax2.set_xlabel("Keyword")
+     ax2.set_ylabel("TF-IDF Score")
+     ax2.set_title("Top Keywords Affecting Sentiment")
+     plt.xticks(rotation=45, ha='right')
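+     # The "Score" above is each term's mean TF-IDF weight across all comments, i.e. a
+     # prominence measure over the top-50 vocabulary rather than a direct estimate of how
+     # much a keyword shifts sentiment.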
+
+     # Display the charts side by side
+     col1, col2 = st.columns(2)
+     with col1:
+         st.pyplot(fig1)
+     with col2:
+         st.pyplot(fig2)
+
+     # Visualizations
+     # Word cloud and common words bar chart side by side
+     st.subheader("Word Cloud and Feature Importance Analysis")
+     col1, col2 = st.columns(2)
+
+     with col1:
+         filtered_words = ' '.join([word for word in ' '.join(cleaned_comments).split() if word.lower() not in stop_words])
+         wordcloud = WordCloud(width=400, height=400, background_color='white').generate(filtered_words)
+         plt.figure(figsize=(5, 4))
+         plt.imshow(wordcloud, interpolation='bilinear')
+         plt.axis('off')
+         st.pyplot(plt)
+
+     with col2:
+         # Feature Importance Analysis
+         feature_names, feature_counts = extract_features(cleaned_comments)
+         feature_df = pd.DataFrame({'Feature': feature_names, 'Count': feature_counts})
+         feature_df = feature_df.sort_values(by='Count', ascending=False).head(10)
+
+         fig3, ax3 = plt.subplots(figsize=(10, 5))
+         feature_plot = feature_df.plot(kind='bar', x='Feature', y='Count', ax=ax3, color='salmon')
+         ax3.set_xlabel("Feature")
+         ax3.set_ylabel("Frequency")
+         ax3.set_title("Top Keywords Impacting Sentiment")
+         plt.xticks(rotation=45, ha='right')
+         st.pyplot(fig3)
+
+     st.subheader("Sentiment Over Time")
+     sentiment_over_time = df['sentiment'].resample('W').mean()  # Resample by week
+     fig2, ax2 = plt.subplots(figsize=(10, 5))
+     ax2.plot(sentiment_over_time.index, sentiment_over_time.values, marker='o')
+     ax2.set_xlabel("Date")
+     ax2.set_ylabel("Average Sentiment")
+     st.pyplot(fig2)
+
+
+     # Sentiment Distribution by Hour of Day
+     st.subheader("Sentiment Distribution by Hour of Day")
+     df['hour'] = df.index.hour
+     sentiment_by_hour = df.groupby('hour')['sentiment'].mean()
+     fig5, ax5 = plt.subplots(figsize=(10, 5))
+     ax5.bar(sentiment_by_hour.index, sentiment_by_hour.values, color='skyblue')
+     ax5.set_xlabel("Hour of Day")
+     ax5.set_ylabel("Average Sentiment")
+     ax5.set_title("Average Sentiment by Hour of Day")
+     st.pyplot(fig5)
+
+     # Sentiment Heatmap
+     st.subheader("Sentiment Heatmap by Hour and Day")
+     df['day_of_week'] = df.index.day_name()
+     df['hour'] = df.index.hour
+     heatmap_data = df.groupby(['day_of_week', 'hour'])['sentiment'].mean().unstack()
+     fig4, ax4 = plt.subplots(figsize=(12, 6))
+     sns.heatmap(heatmap_data, cmap='coolwarm', annot=True, fmt='.2f', ax=ax4)
+     ax4.set_xlabel("Hour of Day")
+     ax4.set_ylabel("Day of Week")
+     ax4.set_title("Sentiment Heatmap by Hour and Day")
+     st.pyplot(fig4)
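+     # Note: groupby/unstack leaves the heatmap rows in alphabetical day order; reindexing
+     # the rows in calendar order (Monday through Sunday) before plotting would make the
+     # chart easier to read.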
+
+     # Filter comments by sentiment
+     st.subheader("Filter Comments by Sentiment")
+     sentiment_option = st.selectbox("Choose Sentiment", ['positive', 'neutral', 'negative'])
+     filtered_comments = df[df['sentiment_category'] == sentiment_option]
+     if not filtered_comments.empty:
+         st.write(filtered_comments[['comment', 'sentiment_category']].head())
+     else:
+         st.write("No comments found for the selected sentiment.")
+
+     # Display raw data
+     st.subheader("Raw Data")
+     st.write(df.head())
+
+     # Enhanced Top Sentiment-Related Keywords
+     st.subheader("Top Sentiment-Related Keywords")
+     keywords = ['excellent', 'good', 'great', 'bad', 'terrible']
+     filtered_comments_with_keywords = [comment for comment in cleaned_comments if any(keyword in comment for keyword in keywords)]
+     if filtered_comments_with_keywords:
+         st.write(f"Found {len(filtered_comments_with_keywords)} comments containing sentiment-related keywords.")
+         for i, comment in enumerate(filtered_comments_with_keywords[:10]):
+             st.write(f"**Comment {i+1}:** {comment}")
+     else:
+         st.write("No comments with sentiment-related keywords found.")
+
+
+
+
+ # import streamlit as st
+ # from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+ # from wordcloud import WordCloud
+ # import pandas as pd
+ # import matplotlib.pyplot as plt
+ # import seaborn as sns
+ # import re
+ # import numpy as np
+ # from nltk.corpus import stopwords
+ # from collections import Counter
+ # import praw
+ # import nltk
+ # from datetime import datetime
+ # from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+ # from transformers import pipeline
+
+ # # Load sentiment analysis and summarization pipelines
+ # sentiment_analyzer = pipeline('sentiment-analysis')
+ # summarizer = pipeline('summarization')
+
+ # def analyze_sentiment(comment):
+ # if len(comment) <= 500: # Skip long comments
+ # return sentiment_analyzer(comment)[0]['label']
+ # else:
+ # return 'neutral'
+
+ # def summarize_text(text):
+ # if len(text) <= 500: # Skip long text for summarization
+ # return summarizer(text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
+ # else:
+ # return text[:500] + '...' # Return truncated version
+
+ # # Initialize Reddit API client
+ # reddit = praw.Reddit(
+ #     client_id='YOUR_REDDIT_CLIENT_ID',
+ #     client_secret='YOUR_REDDIT_CLIENT_SECRET',
+ #     user_agent='YOUR_USER_AGENT'
+ # )
+
+ # # VADER sentiment analyzer setup
+ # analyzer = SentimentIntensityAnalyzer()
+
+ # # Download stopwords
+ # nltk.download('stopwords')
+ # stop_words_set = set(stopwords.words('english'))
+ # stop_words_list = list(stop_words_set)
+
+ # vectorizer = TfidfVectorizer(stop_words=stop_words_list)
+
+ # # Streamlit app structure
+ # st.set_page_config(page_title="Reddit Comment Analyzer", layout="wide")
+ # st.title("Reddit Keyword-Based Comment Analyzer")
+
+ # # @st.cache
+ # # def fetch_reddit_data(query, max_results=50):
+ # # posts = reddit.subreddit('all').search(query, limit=max_results)
+ # # comments, timestamps = [], []
+ # # for post in posts:
+ # # post.comments.replace_more(limit=0)
+ # # for comment in post.comments.list():
+ # # if isinstance(comment, praw.models.Comment) and comment.body and comment.created_utc:
+ # # comments.append(comment.body)
+ # # timestamps.append(pd.to_datetime(comment.created_utc, unit='s'))
+ # # return comments, timestamps
+
+ # @st.cache
+ # def fetch_reddit_data(query, max_posts=50, max_comments_per_post=10):
+ # posts = reddit.subreddit('all').search(query, limit=max_posts)
+ # comments, timestamps = [], []
+
+ # for post in posts:
+ # post.comments.replace_more(limit=0) # Replace MoreComments with actual comments
+ # comment_count = 0
+ # for comment in post.comments.list():
+ # if comment_count >= max_comments_per_post: # Stop after max_comments_per_post comments
+ # break
+ # if isinstance(comment, praw.models.Comment) and comment.body and comment.created_utc:
+ # comments.append(comment.body)
+ # timestamps.append(pd.to_datetime(comment.created_utc, unit='s'))
+ # comment_count += 1
+
+ # # Optional: Stop after reaching max_comments total
+ # if len(comments) >= max_posts * max_comments_per_post:
+ # break
+
+ # return comments, timestamps
+
+
+ # def preprocess_text(text):
+ # text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
+ # text = re.sub(r'\s+', ' ', text)
+ # return text.strip()
+
+ # def analyze_sentiment_vader(comments):
+ # return [analyzer.polarity_scores(comment)['compound'] for comment in comments]
+
+ # def generate_wordcloud(comments):
+ # filtered_words = ' '.join([word for word in ' '.join(comments).split() if word not in stop_words_list])
+ # return WordCloud(width=400, height=400, background_color='white').generate(filtered_words)
+
+ # def extract_features(comments):
+ # vectorizer = CountVectorizer(stop_words=stop_words_list, max_features=50)
+ # X = vectorizer.fit_transform(comments)
+ # return vectorizer.get_feature_names_out(), X.sum(axis=0).A1
+
+ # query = st.text_input("Enter a keyword to search for Reddit comments", value="data analyst bootcamp online course")
+ # start_date = st.date_input("Start Date", value=pd.to_datetime("2024-01-01").date())
+ # end_date = st.date_input("End Date", value=pd.to_datetime("today").date())
+
+ # if st.button("Analyze"):
+ # comments, timestamps = fetch_reddit_data(query)
+
+ # print('done fetching', len(comments))
+
+ # if not comments:
+ # st.warning("No comments found for this search query.")
+ # else:
+ # cleaned_comments = [preprocess_text(comment) for comment in comments]
+ # print('preprocessed')
+ # sentiment_scores = analyze_sentiment(cleaned_comments)
+ # df = pd.DataFrame({
+ # 'comment': cleaned_comments,
+ # 'sentiment': sentiment_scores,
+ # 'created_at': timestamps
+ # })
+ # print('ANALYZED')
+
+ # df['created_at'] = pd.to_datetime(df['created_at'])
+ # df.set_index('created_at', inplace=True)
+ # df = df[(df.index >= pd.Timestamp(start_date)) & (df.index <= pd.Timestamp(end_date))]
+
+ # # Ensure the 'sentiment' column contains numeric values (floats)
+ # df['sentiment'] = pd.to_numeric(df['sentiment'], errors='coerce')
+
+ # # Apply sentiment categorization
+ # df['sentiment_category'] = df['sentiment'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')
+
+ # #df['sentiment_category'] = df['sentiment'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')
+
+ # st.session_state.df = df
+ # st.session_state.cleaned_comments = cleaned_comments
+
+ # if 'df' in st.session_state:
+ # df = st.session_state.df
+ # cleaned_comments = st.session_state.cleaned_comments
+
+ # st.subheader("Key Metrics")
+ # col1, col2, col3, col4 = st.columns(4)
+ # col1.metric("Total Comments", len(df))
+ # col2.metric("Positive", len(df[df['sentiment_category'] == 'positive']))
+ # col3.metric("Neutral", len(df[df['sentiment_category'] == 'neutral']))
+ # col4.metric("Negative", len(df[df['sentiment_category'] == 'negative']))
+
+ # st.subheader("Sentiment Distribution")
+ # sentiment_counts = df['sentiment_category'].value_counts()
+ # fig1, ax1 = plt.subplots()
+ # ax1.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90)
+ # ax1.axis('equal')
+
+ # st.subheader("Top Keywords Affecting Sentiment")
+ # vectorizer = TfidfVectorizer(stop_words=stop_words_list, max_features=50)
+ # X = vectorizer.fit_transform(df['comment'])
+ # features = vectorizer.get_feature_names_out()
+ # scores = np.asarray(X.mean(axis=0)).flatten()
+
+ # keywords_df = pd.DataFrame({'Keyword': features, 'Score': scores})
+ # top_keywords = keywords_df.sort_values(by='Score', ascending=False).head(10)
+
+ # fig2, ax2 = plt.subplots(figsize=(10, 5))
+ # top_keywords.plot(kind='bar', x='Keyword', y='Score', ax=ax2, color='darkorange')
+ # ax2.set_xlabel("Keyword")
+ # ax2.set_ylabel("TF-IDF Score")
+ # ax2.set_title("Top Keywords Affecting Sentiment")
+ # plt.xticks(rotation=45, ha='right')
+
+ # col1, col2 = st.columns(2)
+ # with col1:
+ # st.pyplot(fig1)
+ # with col2:
+ # st.pyplot(fig2)
+
+ # st.subheader("Word Cloud and Feature Importance Analysis")
+ # col1, col2 = st.columns(2)
+
+ # with col1:
+ # filtered_words = ' '.join([word for word in ' '.join(cleaned_comments).split() if word.lower() not in stop_words_list])
+ # wordcloud = WordCloud(width=400, height=400, background_color='white').generate(filtered_words)
+ # plt.figure(figsize=(5, 4))
+ # plt.imshow(wordcloud, interpolation='bilinear')
+ # plt.axis('off')
+ # st.pyplot(plt)
+
+ # with col2:
+ # feature_names, feature_counts = extract_features(cleaned_comments)
+ # feature_df = pd.DataFrame({'Feature': feature_names, 'Count': feature_counts})
+ # feature_df = feature_df.sort_values(by='Count', ascending=False).head(10)
+
+ # fig3, ax3 = plt.subplots(figsize=(10, 5))
+ # feature_df.plot(kind='bar', x='Feature', y='Count', ax=ax3, color='salmon')
+ # ax3.set_xlabel("Feature")
+ # ax3.set_ylabel("Frequency")
+ # ax3.set_title("Top Keywords Impacting Sentiment")
+ # plt.xticks(rotation=45, ha='right')
+ # st.pyplot(fig3)
+
+ # st.subheader("Sentiment Over Time")
+ # sentiment_over_time = df['sentiment'].resample('W').mean()
+ # fig4, ax4 = plt.subplots(figsize=(10, 5))
+ # ax4.plot(sentiment_over_time.index, sentiment_over_time.values, marker='o')
+ # ax4.set_xlabel("Date")
+ # ax4.set_ylabel("Average Sentiment")
+ # st.pyplot(fig4)
+
+ # st.subheader("Sentiment Distribution by Hour of Day")
+ # df['hour'] = df.index.hour
+ # sentiment_by_hour = df.groupby('hour')['sentiment'].mean()
+ # fig5, ax5 = plt.subplots(figsize=(10, 5))
+ # ax5.plot(sentiment_by_hour.index, sentiment_by_hour.values, marker='o')
+ # ax5.set_xlabel("Hour of Day")
+ # ax5.set_ylabel("Average Sentiment")
+ # st.pyplot(fig5)
+
+ # st.subheader("Comment Summaries")
+ # for comment in cleaned_comments:
+ # st.write(summarize_text(comment))
requirements.txt ADDED
@@ -0,0 +1,82 @@
+ altair==4.2.0
+ attrs==24.2.0
+ blinker==1.8.2
+ cachetools==5.4.0
+ certifi==2024.7.4
+ charset-normalizer==3.3.2
+ click==8.1.7
+ contourpy==1.2.1
+ cycler==0.12.1
+ entrypoints==0.4
+ filelock==3.15.4
+ fonttools==4.53.1
+ fsspec==2024.6.1
+ gitdb==4.0.11
+ GitPython==3.1.43
+ huggingface-hub==0.24.5
+ idna==3.7
+ importlib_metadata==8.2.0
+ importlib_resources==6.4.3
+ Jinja2==3.1.4
+ joblib==1.4.2
+ jsonschema==4.23.0
+ jsonschema-specifications==2023.12.1
+ kiwisolver==1.4.5
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.2
+ mdurl==0.1.2
+ mpmath==1.3.0
+ narwhals==1.4.2
+ networkx==3.2.1
+ nltk==3.8.1
+ numpy==1.26.4
+ packaging==24.1
+ pandas==2.2.2
+ pillow==10.4.0
+ praw==7.7.1
+ prawcore==2.4.0
+ protobuf==3.20.3
+ pyarrow==17.0.0
+ pydeck==0.9.1
+ Pygments==2.18.0
+ Pympler==1.1
+ pyparsing==3.1.2
+ python-dateutil==2.9.0.post0
+ pytz==2024.1
+ PyYAML==6.0.2
+ referencing==0.35.1
+ regex==2024.7.24
+ requests==2.32.3
+ rich==13.7.1
+ rpds-py==0.20.0
+ safetensors==0.4.4
+ scikit-learn==1.5.1
+ scipy==1.13.1
+ seaborn==0.13.2
+ semver==3.0.2
+ six==1.16.0
+ smmap==5.0.1
+ streamlit==1.12.0
+ sympy==1.13.2
+ threadpoolctl==3.5.0
+ tokenizers==0.19.1
+ toml==0.10.2
+ toolz==0.12.1
+ torch==2.2.2
+ torchaudio==2.2.2
+ torchvision==0.17.2
+ tornado==6.4.1
+ tqdm==4.66.5
+ transformers==4.44.0
+ typing_extensions==4.12.2
+ tzdata==2024.1
+ tzlocal==5.2
+ update-checker==0.18.0
+ urllib3==2.2.2
+ vaderSentiment==3.3.2
+ validators==0.33.0
+ websocket-client==1.8.0
+ wordcloud==1.9.3
+ zipp==3.20.0