Karthikeyen92 committed
Commit
fd33638
1 Parent(s): 42df7a2

Update py/data_fetch.py

Files changed (1)
  1. py/data_fetch.py +237 -236
py/data_fetch.py CHANGED
@@ -1,236 +1,237 @@
 import os
 from datetime import datetime, timedelta
 from typing import Dict, List
 import pandas as pd
 import tweepy
 import praw
 import googleapiclient.discovery
 import pytumblr
 from gnews import GNews
 import requests
 from bs4 import BeautifulSoup
 import time
 import math


 class DataFetch:
     def __init__(self):
         # Load company list and set date range
         self.end_date = datetime.now()
         self.start_date = self.end_date - timedelta(days=1)

         # Initialize API clients
         self.tumblr_client = pytumblr.TumblrRestClient(
             os.getenv("TUMBLR_CONSUMER_KEY"),
             os.getenv("TUMBLR_CONSUMER_SECRET"),
             os.getenv("TUMBLR_OAUTH_TOKEN"),
             os.getenv("TUMBLR_OAUTH_SECRET")
         )

         twitter_auth = tweepy.OAuthHandler(os.getenv("TWITTER_API_KEY"), os.getenv("TWITTER_API_SECRET"))
         twitter_auth.set_access_token(os.getenv("TWITTER_ACCESS_TOKEN"), os.getenv("TWITTER_ACCESS_TOKEN_SECRET"))
         self.twitter_api = tweepy.API(twitter_auth)

         self.reddit = praw.Reddit(
             client_id=os.getenv("REDDIT_CLIENT_ID"),
             client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
             user_agent="Sentiment Analysis Bot 1.0"
         )

         self.youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=os.getenv("YOUTUBE_API_KEY"))

     def load_company_list(self, file_path: str) -> List[str]:
         self.company_list = pd.read_csv(file_path)['company_ticker'].tolist()

     def collect_data(self) -> List[Dict]:
         all_data = []

         for company in self.company_list:
             print(f"{company}:")
             all_data.extend(self._collect_social_media_data(company))
             all_data.extend(self._collect_news_data(company))

         return all_data

     def _collect_social_media_data(self, query: str) -> List[Dict]:
         social_data = []

         print("Collecting Reddit Data")
         social_data.extend(self.collect_reddit_data(query))

         print("Collecting YouTube Data")
         social_data.extend(self.collect_youtube_data(query))

         print("Collecting Tumblr Data")
         social_data.extend(self.collect_tumblr_data(query))

         return social_data

     def _collect_news_data(self, query: str) -> List[Dict]:
         news_data = []

         print("Collecting Google News Data")
         news_data.extend(self.collect_google_news(query))

         print("Collecting Financial Times Data")
         news_data.extend(self.collect_financial_times(query))

         print("Collecting Bloomberg Data")
         news_data.extend(self.collect_bloomberg(query))

         print("Collecting Reuters Data")
         news_data.extend(self.collect_reuters(query))

         print("Collecting WSJ Data")
         # news_data.extend(self.collect_wsj(query))

         print("Collecting Serper Data - StockNews, Yahoo Finance, Insider Monkey, Investor's Business Daily, etc.")
         news_data.extend(self.search_news(query))

         return news_data

     def collect_tumblr_data(self, query: str) -> List[Dict]:
         posts = self.tumblr_client.tagged(query)
         return [{"platform": "Tumblr", "company": query, "page_content": {
             "title": post["blog"]["title"], "content": post["blog"]["description"]}} for post in posts]

     def collect_twitter_data(self, query: str) -> List[Dict]:
         tweets = []
         for tweet in tweepy.Cursor(self.twitter_api.search_tweets, q=query, lang="en",
                                    since=self.start_date, until=self.end_date).items(100):
             tweets.append(tweet._json)
         return [{"platform": "Twitter", "company": query, "page_content": tweet} for tweet in tweets]

     def collect_reddit_data(self, query: str) -> List[Dict]:
         posts = []
         subreddit = self.reddit.subreddit("all")
         for post in subreddit.search(query, sort="new", time_filter="day"):
             post_date = datetime.fromtimestamp(post.created_utc)
             if self.start_date <= post_date <= self.end_date:
                 posts.append({"platform": "Reddit", "company": query, "page_content": {
                     "title": post.title, "content": post.selftext}})
         return posts

     def collect_youtube_data(self, query: str) -> List[Dict]:
         request = self.youtube.search().list(
             q=query, type="video", part="id,snippet", maxResults=50,
             publishedAfter=self.start_date.isoformat() + "Z", publishedBefore=self.end_date.isoformat() + "Z"
         )
         response = request.execute()
         return [{"platform": "YouTube", "company": query, "page_content": {
             "title": item["snippet"]["title"], "content": item["snippet"]["description"]}} for item in response['items']]

     def collect_google_news(self, query: str) -> List[Dict]:
         google_news = GNews(language='en', country='US', start_date=self.start_date, end_date=self.end_date)
         articles = google_news.get_news(query)
         return [{"platform": "Google News", "company": query, "page_content": {
             "title": article["title"], "content": article["description"]}} for article in articles]

     def collect_financial_times(self, query: str) -> List[Dict]:
         url = f"https://www.ft.com/search?q={query}&dateTo={self.end_date.strftime('%Y-%m-%d')}&dateFrom={self.start_date.strftime('%Y-%m-%d')}"
         response = requests.get(url)
         soup = BeautifulSoup(response.content, 'html.parser')
         articles = soup.find_all('div', class_='o-teaser__content')
         return [{"platform": "Financial Times", "company": query, "page_content": {
             "title": a.find('div', class_='o-teaser__heading').text.strip(),
             "content": a.find('p', class_='o-teaser__standfirst').text.strip() if a.find('p', class_='o-teaser__standfirst') else ''
         }} for a in articles]

     def collect_bloomberg(self, query: str) -> List[Dict]:
         url = f"https://www.bloomberg.com/search?query={query}"
         response = requests.get(url)
         soup = BeautifulSoup(response.content, 'html.parser')
         articles = soup.find_all('div', class_='storyItem__aaf871c1')
         return [{"platform": "Bloomberg", "company": query, "page_content": {
             "title": a.find('a', class_='headline__3a97424d').text.strip(),
             "content": a.find('p', class_='summary__483358e1').text.strip() if a.find('p', class_='summary__483358e1') else ''
         }} for a in articles]

     def collect_reuters(self, query: str) -> List[Dict]:
         articles = []
         base_url = "https://www.reuters.com/site-search/"
         page = 1
         while True:
             url = f"{base_url}?blob={query}&page={page}"
             response = requests.get(url)
             soup = BeautifulSoup(response.content, 'html.parser')
             results = soup.find_all('li', class_='search-result__item')
             if not results:
                 break
             for result in results:
                 date_elem = result.find('time', class_='search-result__timestamp')
                 if date_elem:
                     date = datetime.strptime(date_elem['datetime'], "%Y-%m-%dT%H:%M:%SZ")
                     if self.start_date <= date <= self.end_date:
                         articles.append({"platform": "Reuters", "company": query, "page_content": {
                             "title": result.find('h3', class_='search-result__headline').text.strip(),
                             "content": result.find('p', class_='search-result__excerpt').text.strip()
                         }})
                     elif date < self.start_date:
                         return articles
             page += 1
             time.sleep(1)
         return articles

     def collect_wsj(self, query: str) -> List[Dict]:
         articles = []
         base_url = "https://www.wsj.com/search"
         page = 1
         while True:
             params = {
                 'query': query, 'isToggleOn': 'true', 'operator': 'AND', 'sort': 'date-desc',
                 'duration': 'custom', 'startDate': self.start_date.strftime('%Y/%m/%d'),
                 'endDate': self.end_date.strftime('%Y/%m/%d'), 'page': page
             }
             response = requests.get(base_url, params=params)
             soup = BeautifulSoup(response.content, 'html.parser')
             results = soup.find_all('article', class_='WSJTheme--story--XB4V2mLz')
             if not results:
                 break
             for result in results:
                 date_elem = result.find('p', class_='WSJTheme--timestamp--22sfkNDv')
                 if date_elem:
                     date = datetime.strptime(date_elem.text.strip(), "%B %d, %Y")
                     if self.start_date <= date <= self.end_date:
                         articles.append({"platform": "Wall Street Journal", "company": query, "page_content": {
                             "title": result.find('h3', class_='WSJTheme--headline--unZqjb45').text.strip(),
                             "content": result.find('p', class_='WSJTheme--summary--lmOXEsbN').text.strip()
                         }})
                     elif date < self.start_date:
                         return articles
             page += 1
             time.sleep(1)
         return articles

     def search_news(self, query: str, cnt=300) -> List[Dict]:
         articles = []
         num_results = cnt

         headers = {
             "X-API-KEY": os.getenv("SERP_API_KEY"),
             "Content-Type": "application/json"
         }
         payload = {"q": f"{query} company news",
                    "num": num_results,
                    "dateRestrict": 14
                    }
         response = requests.post(
             "https://google.serper.dev/news",
             headers=headers,
             json=payload
         )
         # print(response)
         if response.status_code == 200:
             results = response.json().get("news", [])
             for result in results:
                 articles.append({"platform": result["source"], "company": query, "page_content": {
                     "title": result["title"],
-                    "content": result["snippet"]
+                    "content": result["snippet"],
+                    "link": result["link"]
                 }})
         return articles

 # Usage Example
 if __name__ == "__main__":
     analyzer = DataFetch("company_list.csv")
     data = analyzer.collect_data()
     # Here, data would contain all collected sentiment data for the given companies
 
 
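One caveat on the request body in search_news(): "dateRestrict" comes from Google's Programmable Search API and is not a documented parameter of Serper's /news endpoint, so the intended 14-day restriction may be silently ignored, and Serper caps "num" well below the 300 requested here. A payload sketch using Serper's documented recency filter (an assumption about intent; verify against the current Serper docs):

    # Sketch: "tbs" is Serper's documented time filter ("qdr:d" past day,
    # "qdr:w" past week). "num" is capped by the API (100 at the time of
    # writing), so requesting 300 will not return 300 items.
    payload = {
        "q": f"{query} company news",
        "num": min(num_results, 100),
        "tbs": "qdr:w",
    }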
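The Financial Times, Bloomberg, Reuters, and WSJ collectors call requests.get() with no headers or timeout and then parse hard-coded CSS classes (o-teaser__content, storyItem__aaf871c1, WSJTheme--story--XB4V2mLz, and so on). Those hashed class names are build artifacts that change without notice, and most of these sites turn away clients that do not identify as a browser, so the usual failure mode is an empty result list rather than an error. A minimal hardening sketch for the fetch step (not part of this commit; the bot User-Agent string is a placeholder):

    # Sketch: identify the client, bound the request time, and fail loudly
    # on HTTP errors instead of silently parsing an error page.
    HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; SentimentBot/1.0)"}
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")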
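In collect_youtube_data(), publishedAfter and publishedBefore must be RFC 3339 timestamps. self.start_date comes from datetime.now(), which is naive local time, so appending "Z" mislabels it as UTC and carries microseconds along with it. A sketch of a safer formulation, assuming UTC is the intended reference:

    from datetime import timezone

    # Sketch: anchor the one-day window in UTC and emit plain RFC 3339.
    end_date = datetime.now(timezone.utc)
    start_date = end_date - timedelta(days=1)
    published_after = start_date.strftime("%Y-%m-%dT%H:%M:%SZ")
    published_before = end_date.strftime("%Y-%m-%dT%H:%M:%SZ")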
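collect_twitter_data() is defined but never called by _collect_social_media_data(), and as written it hands datetime objects to search_tweets, whose until parameter expects a YYYY-MM-DD string (the v1.1 standard search API has no since parameter and only reaches back about seven days). If the method is re-enabled, a sketch under those assumptions:

    # Sketch, assuming standard v1.1 search access: "until" takes a date
    # string, and the search index only covers roughly the last week.
    for tweet in tweepy.Cursor(self.twitter_api.search_tweets, q=query, lang="en",
                               until=self.end_date.strftime("%Y-%m-%d")).items(100):
        tweets.append(tweet._json)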
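Finally, the usage example does not run as committed: __init__() takes no arguments, so DataFetch("company_list.csv") raises a TypeError, and load_company_list() (annotated -> List[str] but returning None) is never invoked. A sketch of the presumably intended entry point:

    # Sketch: construct without arguments, then load the tickers explicitly.
    if __name__ == "__main__":
        analyzer = DataFetch()
        analyzer.load_company_list("company_list.csv")  # populates self.company_list
        data = analyzer.collect_data()
        # data now holds every record collected for the listed companies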