Spaces:
Running
Running
Charbel Malo
committed on
Commit
•
46f7221
1
Parent(s):
fb2ad23
Scraper+
Browse files
app.py
CHANGED
@@ -2,50 +2,97 @@ import streamlit as st
|
|
2 |
import requests
|
3 |
from bs4 import BeautifulSoup
|
4 |
import re
|
|
|
|
|
|
|
5 |
|
6 |
-
def scrape_visible_text_from_url(url):
|
7 |
try:
|
8 |
-
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
soup = BeautifulSoup(response.content, 'html.parser')
|
11 |
|
12 |
-
|
13 |
for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
|
14 |
tag.extract()
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
paragraph_text = " ".join([p.get_text() for p in paragraph_content])
|
23 |
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
-
|
28 |
-
visible_text = re.sub(r'\s+', ' ', visible_text)
|
29 |
-
return visible_text.strip()
|
30 |
except Exception as e:
|
31 |
st.error(f"Error occurred while scraping the data: {e}")
|
32 |
return None
|
33 |
|
34 |
-
#ST
|
35 |
-
|
36 |
def main():
|
37 |
st.title("Web Data Scraper")
|
38 |
|
39 |
-
|
40 |
url_input = st.text_input("Enter the URL 👉✏️:", "")
|
41 |
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
if url_input:
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
46 |
if data:
|
47 |
st.success("Data text successfully scraped!")
|
48 |
-
st.subheader("Scraped Text
|
49 |
st.write(data)
|
50 |
else:
|
51 |
st.warning("Failed to load data from the URL.")
|
@@ -53,4 +100,4 @@ def main():
|
|
53 |
st.warning("Please enter a valid URL.")
|
54 |
|
55 |
if __name__ == "__main__":
|
56 |
-
main()
|
|
|
2 |
import requests
|
3 |
from bs4 import BeautifulSoup
|
4 |
import re
|
5 |
+
from requests.sessions import Session
|
6 |
+
from langdetect import detect
|
7 |
+
from googletrans import Translator
|
8 |
|
9 |
+
# Seconds to wait on any single HTTP request before giving up, so that a
# stalled server cannot hang the Streamlit script run indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30


def scrape_visible_text_from_url(url, query_selector=None, email=None, password=None, login_url=None):
    """Fetch ``url`` and return its visible text, translated to English.

    Args:
        url: Address of the page to scrape.
        query_selector: Optional CSS selector. When given, only the text of
            the matching elements is returned; otherwise the text of the
            page's ``<header>`` element followed by all ``<p>`` elements.
        email: Optional login e-mail, posted as the ``email`` form field.
        password: Optional login password, posted as the ``password`` field.
        login_url: Optional URL the credentials are POSTed to. Login is only
            attempted when ``email``, ``password`` and ``login_url`` are all
            provided.

    Returns:
        The whitespace-normalised text with sentences not detected as
        English passed through the translator, or ``None`` if any step
        raised (the error is reported through ``st.error``).
    """
    try:
        # The context manager closes the session's connections on exit.
        with Session() as session:
            # Handle authentication if credentials are provided
            if email and password and login_url:
                login_data = {
                    'email': email,
                    'password': password,
                    # Include other necessary fields as required by the website
                }
                login_response = session.post(
                    login_url, data=login_data, timeout=_REQUEST_TIMEOUT_SECONDS
                )
                login_response.raise_for_status()

            # Always fetch the requested page. When a login happened above,
            # the same session is reused so any cookies it set are sent along;
            # parsing the login response itself would ignore ``url`` entirely.
            response = session.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove unwanted tags. <header> is deliberately handled separately
        # below: stripping it here would make the header lookup always fail.
        for tag in soup(["script", "style", "meta", "link", "noscript", "footer", "aside", "nav", "img"]):
            tag.extract()

        # Capture the header text first, then drop every <header> so that the
        # selector / paragraph extraction below never sees header markup.
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""
        for tag in soup("header"):
            tag.extract()

        # Use query selector if provided
        if query_selector:
            elements = soup.select(query_selector)
            text_content = " ".join(element.get_text() for element in elements)
        else:
            paragraph_text = " ".join(p.get_text() for p in soup.find_all("p"))
            text_content = f"{header_text}\n\n{paragraph_text}"

        # Clean up whitespace
        visible_text = re.sub(r'\s+', ' ', text_content).strip()
        if not visible_text:
            # Nothing to detect or translate.
            return visible_text

        # Translate non-English text, one sentence at a time.
        translator = Translator()
        sentences = re.split(r'(?<=[.!?]) +', visible_text)
        translated_sentences = []
        for sentence in sentences:
            try:
                if detect(sentence) != 'en':
                    translated_sentences.append(translator.translate(sentence, dest='en').text)
                else:
                    translated_sentences.append(sentence)
            except Exception:
                # Best effort: if detection or translation fails for this
                # sentence, keep the original text instead of failing the
                # whole scrape.
                translated_sentences.append(sentence)

        return ' '.join(translated_sentences)
    except Exception as e:
        # Top-level boundary for the UI: report the problem and signal failure.
        st.error(f"Error occurred while scraping the data: {e}")
        return None
|
70 |
|
|
|
|
|
71 |
def main():
    """Render the scraper form and show the scraped text on request.

    Collects the target URL, an optional CSS selector and optional login
    details, then calls ``scrape_visible_text_from_url`` when the
    "Load Data" button is pressed and displays either the text or a warning.
    """
    st.title("Web Data Scraper")

    # Surrounding whitespace (common with pasted values) is stripped so that a
    # blank-looking field counts as empty. The password is left untouched.
    url_input = st.text_input("Enter the URL 👉✏️:", "").strip()

    query_selector = st.text_input("Enter a query selector (optional):", "").strip()

    email = st.text_input("Email (if authentication required):", "").strip()

    password = st.text_input("Password (if authentication required):", "", type="password")

    login_url = st.text_input("Enter the login URL (if authentication required):", "").strip()

    if st.button("Load Data 🧈"):
        if url_input:
            # The scraper only logs in when all three values are present, so
            # tell the user when a partial set is about to be ignored.
            auth_fields = (email, password, login_url)
            if any(auth_fields) and not all(auth_fields):
                st.warning(
                    "Email, password and login URL are all required for "
                    "authentication; continuing without logging in."
                )

            data = scrape_visible_text_from_url(
                url=url_input,
                query_selector=query_selector or None,
                email=email or None,
                password=password or None,
                login_url=login_url or None,
            )
            if data:
                st.success("Data text successfully scraped!")
                st.subheader("Scraped Text:")
                st.write(data)
            else:
                st.warning("Failed to load data from the URL.")
        else:
            st.warning("Please enter a valid URL.")


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()
|