Spaces:
Running
Running
claytonsamples
committed on
Commit
•
7a70225
1
Parent(s):
ebd602f
Create scraper.py
Browse files- scraper.py +19 -0
scraper.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Import necessary libraries
|
2 |
+
import requests
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
class WebScraper:
    """Collect the paragraph text of a set of web pages into a DataFrame.

    Attributes:
        urls: The URLs that ``scrape`` will fetch, in iteration order.
        data: A ``pandas.DataFrame`` holding one row per scraped page, with
            the columns ``url`` and ``content``. Empty until ``scrape`` has
            fetched at least one page.
    """

    def __init__(self, urls):
        """Store the URLs to scrape and start with an empty result frame.

        Args:
            urls: Iterable of URL strings to fetch.
        """
        self.urls = urls
        self.data = pd.DataFrame()

    def scrape(self, timeout=10.0):
        """Fetch every URL and append its paragraph text to ``self.data``.

        For each page the text of all ``<p>`` elements is joined with a
        single space and stored next to the URL it came from.

        Args:
            timeout: Seconds to wait for each HTTP request before giving up.
                Without a timeout a single unresponsive host would block
                the whole scrape indefinitely.

        Raises:
            requests.RequestException: If a request fails or times out.
        """
        rows = []
        for url in self.urls:
            response = requests.get(url, timeout=timeout)
            soup = BeautifulSoup(response.text, 'html.parser')
            text = ' '.join(p.text for p in soup.find_all('p'))
            rows.append({'url': url, 'content': text})

        if not rows:
            # Nothing was fetched; leave any previously scraped data as is.
            return

        scraped = pd.DataFrame(rows, columns=['url', 'content'])
        # DataFrame.append was removed in pandas 2.0, and calling it once per
        # row copied the whole frame each time. Build the new rows in one go
        # and concatenate only when there is earlier data to keep.
        if self.data.empty:
            self.data = scraped
        else:
            self.data = pd.concat([self.data, scraped], ignore_index=True)

    def get_data(self):
        """Return the DataFrame of scraped results (``url``, ``content``)."""
        return self.data
|