Spaces: Runtime error
Steven Lu @ MBP-M1-Max committed · Commit 7a410cd
Parent(s): b17a2c7
old bugs fixed

- app.py +548 -0
- requirements.txt +82 -0
app.py
ADDED
@@ -0,0 +1,548 @@
#!/usr/bin/env python3

# from googleapiclient.discovery import build
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# import pandas as pd
# import streamlit as st
# import re

# # Set up the YouTube API client
# api_key = 'AIzaSyAtaMM03J79pb2vhBOvsIYMlQ84sx9Fb2U'  # Replace with your API key
# youtube = build('youtube', 'v3', developerKey=api_key)



# # Set up the Reddit API client (PRAW)
# reddit = praw.Reddit(
#     client_id='EhlUF9EavT4rAx42jQshKQ',  # Replace with your Reddit client_id
#     client_secret='Zwc8iLJN8saS3B6booPKjabXw63cZQ',  # Replace with your Reddit client_secret
#     user_agent='FondantOk6255'  # Replace with your user_agent
# )

import streamlit as st
st.set_page_config(page_title="Reddit Comment Analyzer", layout="wide")

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from nltk.corpus import stopwords
from collections import Counter
import praw
import nltk
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from transformers import pipeline

# Load sentiment analysis pipeline
sentiment_analyzer = pipeline('sentiment-analysis')

# Load summarization pipeline
summarizer = pipeline('summarization')

def analyze_sentiment(comment):
    if len(comment) <= 500:  # Only run the transformer on short comments
        return sentiment_analyzer(comment)[0]['label']
    else:
        return 'neutral'  # Treat overly long comments as neutral

def summarize_text(text):
    if len(text) <= 500:  # Only summarize reasonably short text
        return summarizer(text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
    else:
        return text[:500] + '...'  # Return a truncated version instead


# Access secrets using st.secrets
reddit_client_id = st.secrets.default["client_id"]
reddit_client_secret = st.secrets.default["client_secret"]
reddit_user_agent = st.secrets.default["user_agent"]

# Initialize Reddit API client using the API keys
reddit = praw.Reddit(
    client_id=reddit_client_id,
    client_secret=reddit_client_secret,
    user_agent=reddit_user_agent
)

# VADER sentiment analyzer setup
analyzer = SentimentIntensityAnalyzer()

# Download stopwords
nltk.download('stopwords')
stop_words_set = ['at', 'how', 'do', 'm', 'during', 'again', 'been', 'dont', 'itself', 'from', 'in',
                  'myself', "wouldn't", 'which', 'than', 'yourselves', 'her', 's', 'further', 'won', 'my',
                  'more', 'would', 'no', 'some', 'yours', "weren't", "haven't", 'over', 'couldn', 'against',
                  "mustn't", 'same', 'was', 'himself', "aren't", 'through', 'shan', 'he', "mightn't", 'only',
                  'on', 't', 'ourselves', 'these', 'other', 'up', 'about', 'hers', 'hasn', 'it', "doesn't",
                  'for', 'wouldn', 'doing', 'not', 'his', 'll', 'you', "couldn't", 'too', 'haven', 'those',
                  'our', 'because', 'im', 'know', 'until', 'to', 'mightn', 'such', 'very', 'needn', 'they',
                  'or', 'as', 'having', 'isn', 'here', 'didn', "isn't", "i'm", 'most', 'did', 'have',
                  "it's", "hadn't", 'by', 'has', 'into', 'there', 'yourself', 'had', 'am', 'y', 'just',
                  'don', 'are', 'does', 'like', 'whom', 'should', 'after', 'mustn', 'once', 'below',
                  'him', 'who', "you're", 'them', 'why', 'your', "you've", "you'll", 'is', "don't",
                  'aren', 'when', 'so', 'can', 'being', 'and', "should've", 'that', 'above',
                  "didn't", 'hadn', 'doesn', 've', 'ma', 'before', 'out', 'the', 'if', 'where',
                  "shan't", 'under', 'each', 'ain', 'what', "shouldn't", 'down', 'now', 'weren',
                  'youre', 'a', 'with', "hasn't", 'herself', 'get', 're', "she's", 'of', 'we',
                  "wasn't", 'their', 'theirs', 'but', 'o', "that'll", 'its', 'own', 'wasn',
                  'all', 'nor', "you'd", 'shouldn', 'both', 'me', 'd', 'between', 'be', 'an',
                  'any', 'i', 'she', 'this', 'then', "won't", 'were', 'will', "needn't", 'off',
                  'few', 'themselves', 'ours', 'while']

# Combine custom stopwords with NLTK stopwords
stop_words = list(set(stopwords.words('english')).union(stop_words_set))

# Set up the TfidfVectorizer using the combined stop words
# vectorizer = TfidfVectorizer(stop_words=list(stop_words))

# Convert the set to a list before passing to TfidfVectorizer
# stop_words_list = list(stop_words)

# Verify that stop_words_list is a list of strings
# st.write(stop_words_list[:10])  # Print first 10 stop words for verification

# Use the vectorizer and pass the stop words list
try:
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=5)
    print("Stop words applied:", vectorizer.get_stop_words())
    st.write("TfidfVectorizer initialized successfully!")
except Exception as e:
    st.error(f"Error initializing TfidfVectorizer: {e}")


# Streamlit app structure

st.title("Reddit Keyword-Based Comment Analyzer")

@st.cache
def fetch_reddit_data(query, max_results=50, min_score=10):
    posts = reddit.subreddit('all').search(query, limit=max_results)
    comments, timestamps, scores = [], [], []
    for post in posts:
        post.comments.replace_more(limit=0)
        for comment in post.comments.list():
            if isinstance(comment, praw.models.Comment) and comment.body and comment.created_utc:
                if comment.score >= min_score:  # Filter comments by minimum score (upvotes)
                    comments.append(comment.body)
                    timestamps.append(pd.to_datetime(comment.created_utc, unit='s'))
                    scores.append(comment.score)  # Store the comment score for reference
    return comments, timestamps, scores


def preprocess_text(text):
    # Normalize contractions and remove non-alphabetic characters
    text = re.sub(r"[^\w\s]", '', text.lower())  # Removes punctuation and converts to lower case
    text = re.sub(r'\s+', ' ', text)  # Removes excess whitespace
    return text

# Note: this redefines analyze_sentiment above; the app actually uses the VADER compound score.
def analyze_sentiment(comments):
    return [analyzer.polarity_scores(comment)['compound'] for comment in comments]

def generate_wordcloud(comments):
    filtered_words = ' '.join([word for word in ' '.join(comments).split() if word not in stop_words])
    return WordCloud(width=400, height=400, background_color='white').generate(filtered_words)

# Extract features for keywords
def extract_features(comments):
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50)
    X = vectorizer.fit_transform(comments)
    return vectorizer.get_feature_names_out(), X.sum(axis=0).A1

query = st.text_input("Enter a keyword to search for Reddit comments", value="data analyst bootcamp online course")
start_date = st.date_input("Start Date", value=pd.to_datetime("2024-01-01").date())
end_date = st.date_input("End Date", value=pd.to_datetime("today").date())

if st.button("Analyze"):
    comments, timestamps, score = fetch_reddit_data(query, max_results=50, min_score=10)
    print(f'Fetched {len(comments)} comments.')
    if not comments:
        st.warning("No comments found for this search query.")
    else:
        # Clean the comments before passing them to TfidfVectorizer
        cleaned_comments = [preprocess_text(comment) for comment in comments]
        print("Sample of cleaned comments:")
        print(cleaned_comments[:5])
        sentiment_scores = analyze_sentiment(cleaned_comments)
        df = pd.DataFrame({
            'comment': cleaned_comments,
            'sentiment': sentiment_scores,
            'created_at': timestamps
        })

        # Ensure created_at is in datetime format
        df['created_at'] = pd.to_datetime(df['created_at'])

        # Set the datetime index for resampling
        df.set_index('created_at', inplace=True)

        # Filter by date range
        df = df[(df.index >= pd.Timestamp(start_date)) & (df.index <= pd.Timestamp(end_date))]

        df['sentiment_category'] = df['sentiment'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')

        # Save results in session state
        st.session_state.df = df
        st.session_state.cleaned_comments = cleaned_comments

# If results are in session state, retrieve them
if 'df' in st.session_state:
    df = st.session_state.df
    cleaned_comments = st.session_state.cleaned_comments

    st.subheader("Key Metrics")
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Total Comments", len(df))
    col2.metric("Positive", len(df[df['sentiment_category'] == 'positive']))
    col3.metric("Neutral", len(df[df['sentiment_category'] == 'neutral']))
    col4.metric("Negative", len(df[df['sentiment_category'] == 'negative']))

    # Sentiment Distribution
    st.subheader("Sentiment Distribution")
    sentiment_counts = df['sentiment_category'].value_counts()
    fig1, ax1 = plt.subplots()
    ax1.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90)
    ax1.axis('equal')

    # Top Comments Distribution
    # Keywords Affecting Sentiment
    st.subheader("Top Keywords Affecting Sentiment")
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50)
    X = vectorizer.fit_transform(df['comment'])
    features = vectorizer.get_feature_names_out()
    scores = np.asarray(X.mean(axis=0)).flatten()

    keywords_df = pd.DataFrame({'Keyword': features, 'Score': scores})
    top_keywords = keywords_df.sort_values(by='Score', ascending=False).head(10)

    fig2, ax2 = plt.subplots(figsize=(10, 5))
    top_keywords.plot(kind='bar', x='Keyword', y='Score', ax=ax2, color='darkorange')
    ax2.set_xlabel("Keyword")
    ax2.set_ylabel("TF-IDF Score")
    ax2.set_title("Top Keywords Affecting Sentiment")
    plt.xticks(rotation=45, ha='right')

    # Display the charts side by side
    col1, col2 = st.columns(2)
    with col1:
        st.pyplot(fig1)
    with col2:
        st.pyplot(fig2)

    # Visualizations
    # Word cloud and common words bar chart side by side
    st.subheader("Word Cloud and Feature Importance Analysis")
    col1, col2 = st.columns(2)

    with col1:
        filtered_words = ' '.join([word for word in ' '.join(cleaned_comments).split() if word.lower() not in stop_words])
        wordcloud = WordCloud(width=400, height=400, background_color='white').generate(filtered_words)
        plt.figure(figsize=(5, 4))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        st.pyplot(plt)

    with col2:
        # Feature Importance Analysis
        feature_names, feature_counts = extract_features(cleaned_comments)
        feature_df = pd.DataFrame({'Feature': feature_names, 'Count': feature_counts})
        feature_df = feature_df.sort_values(by='Count', ascending=False).head(10)

        fig3, ax3 = plt.subplots(figsize=(10, 5))
        feature_plot = feature_df.plot(kind='bar', x='Feature', y='Count', ax=ax3, color='salmon')
        ax3.set_xlabel("Feature")
        ax3.set_ylabel("Frequency")
        ax3.set_title("Top Keywords Impacting Sentiment")
        plt.xticks(rotation=45, ha='right')
        st.pyplot(fig3)

    st.subheader("Sentiment Over Time")
    sentiment_over_time = df['sentiment'].resample('W').mean()  # Resample by week
    fig2, ax2 = plt.subplots(figsize=(10, 5))
    ax2.plot(sentiment_over_time.index, sentiment_over_time.values, marker='o')
    ax2.set_xlabel("Date")
    ax2.set_ylabel("Average Sentiment")
    st.pyplot(fig2)


    # Sentiment Distribution by Hour of Day
    st.subheader("Sentiment Distribution by Hour of Day")
    df['hour'] = df.index.hour
    sentiment_by_hour = df.groupby('hour')['sentiment'].mean()
    fig5, ax5 = plt.subplots(figsize=(10, 5))
    ax5.bar(sentiment_by_hour.index, sentiment_by_hour.values, color='skyblue')
    ax5.set_xlabel("Hour of Day")
    ax5.set_ylabel("Average Sentiment")
    ax5.set_title("Average Sentiment by Hour of Day")
    st.pyplot(fig5)

    # Sentiment Heatmap
    st.subheader("Sentiment Heatmap by Hour and Day")
    df['day_of_week'] = df.index.day_name()
    df['hour'] = df.index.hour
    heatmap_data = df.groupby(['day_of_week', 'hour'])['sentiment'].mean().unstack()
    fig4, ax4 = plt.subplots(figsize=(12, 6))
    sns.heatmap(heatmap_data, cmap='coolwarm', annot=True, fmt='.2f', ax=ax4)
    ax4.set_xlabel("Hour of Day")
    ax4.set_ylabel("Day of Week")
    ax4.set_title("Sentiment Heatmap by Hour and Day")
    st.pyplot(fig4)

    # Filter comments by sentiment
    st.subheader("Filter Comments by Sentiment")
    sentiment_option = st.selectbox("Choose Sentiment", ['positive', 'neutral', 'negative'])
    filtered_comments = df[df['sentiment_category'] == sentiment_option]
    if not filtered_comments.empty:
        st.write(filtered_comments[['comment', 'sentiment_category']].head())
    else:
        st.write("No comments found for the selected sentiment.")

    # Display raw data
    st.subheader("Raw Data")
    st.write(df.head())

    # Enhanced Top Sentiment-Related Keywords
    st.subheader("Top Sentiment-Related Keywords")
    keywords = ['excellent', 'good', 'great', 'bad', 'terrible']
    filtered_comments_with_keywords = [comment for comment in cleaned_comments if any(keyword in comment for keyword in keywords)]
    if filtered_comments_with_keywords:
        st.write(f"Found {len(filtered_comments_with_keywords)} comments containing sentiment-related keywords.")
        for i, comment in enumerate(filtered_comments_with_keywords[:10]):
            st.write(f"**Comment {i+1}:** {comment}")
    else:
        st.write("No comments with sentiment-related keywords found.")



# import streamlit as st
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# from wordcloud import WordCloud
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import re
# import numpy as np
# from nltk.corpus import stopwords
# from collections import Counter
# import praw
# import nltk
# from datetime import datetime
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from transformers import pipeline

# # Load sentiment analysis and summarization pipelines
# sentiment_analyzer = pipeline('sentiment-analysis')
# summarizer = pipeline('summarization')

# def analyze_sentiment(comment):
#     if len(comment) <= 500:  # Skip long comments
#         return sentiment_analyzer(comment)[0]['label']
#     else:
#         return 'neutral'

# def summarize_text(text):
#     if len(text) <= 500:  # Skip long text for summarization
#         return summarizer(text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
#     else:
#         return text[:500] + '...'  # Return truncated version

# # Initialize Reddit API client
# reddit = praw.Reddit(
#     client_id='EhlUF9EavT4rAx42jQshKQ',
#     client_secret='Zwc8iLJN8saS3B6booPKjabXw63cZQ',
#     user_agent='FondantOk6255'
# )

# # VADER sentiment analyzer setup
# analyzer = SentimentIntensityAnalyzer()

# # Download stopwords
# nltk.download('stopwords')
# stop_words_set = set(stopwords.words('english'))
# stop_words_list = list(stop_words_set)

# vectorizer = TfidfVectorizer(stop_words=stop_words_list)

# # Streamlit app structure
# st.set_page_config(page_title="Reddit Comment Analyzer", layout="wide")
# st.title("Reddit Keyword-Based Comment Analyzer")

# # @st.cache
# # def fetch_reddit_data(query, max_results=50):
# #     posts = reddit.subreddit('all').search(query, limit=max_results)
# #     comments, timestamps = [], []
# #     for post in posts:
# #         post.comments.replace_more(limit=0)
# #         for comment in post.comments.list():
# #             if isinstance(comment, praw.models.Comment) and comment.body and comment.created_utc:
# #                 comments.append(comment.body)
# #                 timestamps.append(pd.to_datetime(comment.created_utc, unit='s'))
# #     return comments, timestamps

# @st.cache
# def fetch_reddit_data(query, max_posts=50, max_comments_per_post=10):
#     posts = reddit.subreddit('all').search(query, limit=max_posts)
#     comments, timestamps = [], []

#     for post in posts:
#         post.comments.replace_more(limit=0)  # Replace MoreComments with actual comments
#         comment_count = 0
#         for comment in post.comments.list():
#             if comment_count >= max_comments_per_post:  # Stop after max_comments_per_post comments
#                 break
#             if isinstance(comment, praw.models.Comment) and comment.body and comment.created_utc:
#                 comments.append(comment.body)
#                 timestamps.append(pd.to_datetime(comment.created_utc, unit='s'))
#                 comment_count += 1

#         # Optional: Stop after reaching max_comments total
#         if len(comments) >= max_posts * max_comments_per_post:
#             break

#     return comments, timestamps


# def preprocess_text(text):
#     text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
#     text = re.sub(r'\s+', ' ', text)
#     return text.strip()

# def analyze_sentiment_vader(comments):
#     return [analyzer.polarity_scores(comment)['compound'] for comment in comments]

# def generate_wordcloud(comments):
#     filtered_words = ' '.join([word for word in ' '.join(comments).split() if word not in stop_words_list])
#     return WordCloud(width=400, height=400, background_color='white').generate(filtered_words)

# def extract_features(comments):
#     vectorizer = CountVectorizer(stop_words=stop_words_list, max_features=50)
#     X = vectorizer.fit_transform(comments)
#     return vectorizer.get_feature_names_out(), X.sum(axis=0).A1

# query = st.text_input("Enter a keyword to search for Reddit comments", value="data analyst bootcamp online course")
# start_date = st.date_input("Start Date", value=pd.to_datetime("2024-01-01").date())
# end_date = st.date_input("End Date", value=pd.to_datetime("today").date())

# if st.button("Analyze"):
#     comments, timestamps = fetch_reddit_data(query)

#     print('done fetching', len(comments))

#     if not comments:
#         st.warning("No comments found for this search query.")
#     else:
#         cleaned_comments = [preprocess_text(comment) for comment in comments]
#         print('preprocessed')
#         sentiment_scores = analyze_sentiment(cleaned_comments)
#         df = pd.DataFrame({
#             'comment': cleaned_comments,
#             'sentiment': sentiment_scores,
#             'created_at': timestamps
#         })
#         print('ANALYZED')

#         df['created_at'] = pd.to_datetime(df['created_at'])
#         df.set_index('created_at', inplace=True)
#         df = df[(df.index >= pd.Timestamp(start_date)) & (df.index <= pd.Timestamp(end_date))]

#         # Ensure the 'sentiment' column contains numeric values (floats)
#         df['sentiment'] = pd.to_numeric(df['sentiment'], errors='coerce')

#         # Apply sentiment categorization
#         df['sentiment_category'] = df['sentiment'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')

#         # df['sentiment_category'] = df['sentiment'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')

#         st.session_state.df = df
#         st.session_state.cleaned_comments = cleaned_comments

# if 'df' in st.session_state:
#     df = st.session_state.df
#     cleaned_comments = st.session_state.cleaned_comments

#     st.subheader("Key Metrics")
#     col1, col2, col3, col4 = st.columns(4)
#     col1.metric("Total Comments", len(df))
#     col2.metric("Positive", len(df[df['sentiment_category'] == 'positive']))
#     col3.metric("Neutral", len(df[df['sentiment_category'] == 'neutral']))
#     col4.metric("Negative", len(df[df['sentiment_category'] == 'negative']))

#     st.subheader("Sentiment Distribution")
#     sentiment_counts = df['sentiment_category'].value_counts()
#     fig1, ax1 = plt.subplots()
#     ax1.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90)
#     ax1.axis('equal')

#     st.subheader("Top Keywords Affecting Sentiment")
#     vectorizer = TfidfVectorizer(stop_words=stop_words_list, max_features=50)
#     X = vectorizer.fit_transform(df['comment'])
#     features = vectorizer.get_feature_names_out()
#     scores = np.asarray(X.mean(axis=0)).flatten()

#     keywords_df = pd.DataFrame({'Keyword': features, 'Score': scores})
#     top_keywords = keywords_df.sort_values(by='Score', ascending=False).head(10)

#     fig2, ax2 = plt.subplots(figsize=(10, 5))
#     top_keywords.plot(kind='bar', x='Keyword', y='Score', ax=ax2, color='darkorange')
#     ax2.set_xlabel("Keyword")
#     ax2.set_ylabel("TF-IDF Score")
#     ax2.set_title("Top Keywords Affecting Sentiment")
#     plt.xticks(rotation=45, ha='right')

#     col1, col2 = st.columns(2)
#     with col1:
#         st.pyplot(fig1)
#     with col2:
#         st.pyplot(fig2)

#     st.subheader("Word Cloud and Feature Importance Analysis")
#     col1, col2 = st.columns(2)

#     with col1:
#         filtered_words = ' '.join([word for word in ' '.join(cleaned_comments).split() if word.lower() not in stop_words_list])
#         wordcloud = WordCloud(width=400, height=400, background_color='white').generate(filtered_words)
#         plt.figure(figsize=(5, 4))
#         plt.imshow(wordcloud, interpolation='bilinear')
#         plt.axis('off')
#         st.pyplot(plt)

#     with col2:
#         feature_names, feature_counts = extract_features(cleaned_comments)
#         feature_df = pd.DataFrame({'Feature': feature_names, 'Count': feature_counts})
#         feature_df = feature_df.sort_values(by='Count', ascending=False).head(10)

#         fig3, ax3 = plt.subplots(figsize=(10, 5))
#         feature_df.plot(kind='bar', x='Feature', y='Count', ax=ax3, color='salmon')
#         ax3.set_xlabel("Feature")
#         ax3.set_ylabel("Frequency")
#         ax3.set_title("Top Keywords Impacting Sentiment")
#         plt.xticks(rotation=45, ha='right')
#         st.pyplot(fig3)

#     st.subheader("Sentiment Over Time")
#     sentiment_over_time = df['sentiment'].resample('W').mean()
#     fig4, ax4 = plt.subplots(figsize=(10, 5))
#     ax4.plot(sentiment_over_time.index, sentiment_over_time.values, marker='o')
#     ax4.set_xlabel("Date")
#     ax4.set_ylabel("Average Sentiment")
#     st.pyplot(fig4)

#     st.subheader("Sentiment Distribution by Hour of Day")
#     df['hour'] = df.index.hour
#     sentiment_by_hour = df.groupby('hour')['sentiment'].mean()
#     fig5, ax5 = plt.subplots(figsize=(10, 5))
#     ax5.plot(sentiment_by_hour.index, sentiment_by_hour.values, marker='o')
#     ax5.set_xlabel("Hour of Day")
#     ax5.set_ylabel("Average Sentiment")
#     st.pyplot(fig5)

#     st.subheader("Comment Summaries")
#     for comment in cleaned_comments:
#         st.write(summarize_text(comment))
requirements.txt
ADDED
@@ -0,0 +1,82 @@
altair==4.2.0
attrs==24.2.0
blinker==1.8.2
cachetools==5.4.0
certifi==2024.7.4
charset-normalizer==3.3.2
click==8.1.7
contourpy==1.2.1
cycler==0.12.1
entrypoints==0.4
filelock==3.15.4
fonttools==4.53.1
fsspec==2024.6.1
gitdb==4.0.11
GitPython==3.1.43
huggingface-hub==0.24.5
idna==3.7
importlib_metadata==8.2.0
importlib_resources==6.4.3
Jinja2==3.1.4
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
kiwisolver==1.4.5
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.2
mdurl==0.1.2
mpmath==1.3.0
narwhals==1.4.2
networkx==3.2.1
nltk==3.8.1
numpy==1.26.4
packaging==24.1
pandas==2.2.2
pillow==10.4.0
praw==7.7.1
prawcore==2.4.0
protobuf==3.20.3
pyarrow==17.0.0
pydeck==0.9.1
Pygments==2.18.0
Pympler==1.1
pyparsing==3.1.2
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.2
referencing==0.35.1
regex==2024.7.24
requests==2.32.3
rich==13.7.1
rpds-py==0.20.0
safetensors==0.4.4
scikit-learn==1.5.1
scipy==1.13.1
seaborn==0.13.2
semver==3.0.2
six==1.16.0
sklearn==0.0
smmap==5.0.1
streamlit==1.12.0
sympy==1.13.2
threadpoolctl==3.5.0
tokenizers==0.19.1
toml==0.10.2
toolz==0.12.1
torch==2.2.2
torchaudio==2.2.2
torchvision==0.17.2
tornado==6.4.1
tqdm==4.66.5
transformers==4.44.0
typing_extensions==4.12.2
tzdata==2024.1
tzlocal==5.2
update-checker==0.18.0
urllib3==2.2.2
vaderSentiment==3.3.2
validators==0.33.0
websocket-client==1.8.0
wordcloud==1.9.3
zipp==3.20.0