Charbel Malo committed
Commit 46f7221
1 Parent(s): fb2ad23
Files changed (1)
  1. app.py +70 -23
app.py CHANGED
@@ -2,50 +2,97 @@ import streamlit as st
 import requests
 from bs4 import BeautifulSoup
 import re
+from requests.sessions import Session
+from langdetect import detect
+from googletrans import Translator
 
-def scrape_visible_text_from_url(url):
+def scrape_visible_text_from_url(url, query_selector=None, email=None, password=None, login_url=None):
     try:
-        response = requests.get(url)
-        response.raise_for_status()
+        session = Session()
+
+        # Handle authentication if credentials are provided
+        if email and password and login_url:
+            login_data = {
+                'email': email,
+                'password': password
+                # Include other necessary fields as required by the website
+            }
+            response = session.post(login_url, data=login_data)
+            response.raise_for_status()
+        else:
+            response = session.get(url)
+            response.raise_for_status()
+
         soup = BeautifulSoup(response.content, 'html.parser')
 
+        # Remove unwanted tags
         for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
             tag.extract()
 
-        header_content = soup.find("header")
-        header_text = header_content.get_text() if header_content else ""
-
-        paragraph_content = soup.find_all("p")
-        paragraph_text = " ".join([p.get_text() for p in paragraph_content])
-
-        visible_text = f"{header_text}\n\n{paragraph_text}"
-
-        visible_text = re.sub(r'\s+', ' ', visible_text)
-        return visible_text.strip()
+        # Use query selector if provided
+        if query_selector:
+            elements = soup.select(query_selector)
+            text_content = " ".join([element.get_text() for element in elements])
+        else:
+            # Extract header content
+            header_content = soup.find("header")
+            header_text = header_content.get_text() if header_content else ""
+
+            # Extract paragraph content
+            paragraph_content = soup.find_all("p")
+            paragraph_text = " ".join([p.get_text() for p in paragraph_content])
+
+            text_content = f"{header_text}\n\n{paragraph_text}"
+
+        # Clean up whitespace
+        visible_text = re.sub(r'\s+', ' ', text_content).strip()
+
+        # Translate non-English text
+        translator = Translator()
+        sentences = re.split(r'(?<=[.!?]) +', visible_text)
+        translated_sentences = []
+        for sentence in sentences:
+            try:
+                lang = detect(sentence)
+                if lang != 'en':
+                    translation = translator.translate(sentence, dest='en').text
+                    translated_sentences.append(translation)
+                else:
+                    translated_sentences.append(sentence)
+            except Exception:
+                translated_sentences.append(sentence)
+        translated_text = ' '.join(translated_sentences)
+
+        return translated_text
     except Exception as e:
         st.error(f"Error occurred while scraping the data: {e}")
         return None
 
-#ST
-
 def main():
     st.title("Web Data Scraper")
 
     url_input = st.text_input("Enter the URL 👉✏️:", "")
 
-    if st.button("Load Datum 🧈"):
+    query_selector = st.text_input("Enter a query selector (optional):", "")
+
+    email = st.text_input("Email (if authentication required):", "")
+
+    password = st.text_input("Password (if authentication required):", "", type="password")
+
+    login_url = st.text_input("Enter the login URL (if authentication required):", "")
+
+    if st.button("Load Data 🧈"):
         if url_input:
-            data = scrape_visible_text_from_url(url_input)
+            data = scrape_visible_text_from_url(
+                url=url_input,
+                query_selector=query_selector if query_selector else None,
+                email=email if email else None,
+                password=password if password else None,
+                login_url=login_url if login_url else None
+            )
             if data:
                 st.success("Data text successfully scraped!")
-                st.subheader("Scraped Text :")
+                st.subheader("Scraped Text:")
                 st.write(data)
             else:
                 st.warning("Failed to load data from the URL.")
@@ -53,4 +100,4 @@ def main():
            st.warning("Please enter a valid URL.")
 
 if __name__ == "__main__":
-    main()
+    main()
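
The new imports make langdetect and googletrans runtime dependencies of this Space, and the widened signature keeps the old single-argument behaviour, so the function can still be smoke-tested outside Streamlit. A minimal sketch of calling the updated signature directly; the URL and selector below are placeholders, not values used by this repo:

# smoke_test.py - illustrative only; assumes langdetect and googletrans are installed
from app import scrape_visible_text_from_url

# Old behaviour: just a URL; header + paragraph text, translated to English
print(scrape_visible_text_from_url("https://example.com"))

# New behaviour: restrict extraction to a CSS selector, no authentication
print(scrape_visible_text_from_url(
    url="https://example.com",
    query_selector="article p",
))

Note that the st.error call in the except branch only renders inside a running Streamlit app; in a plain Python process recent Streamlit versions typically just log a missing-ScriptRunContext warning.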
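
The placeholder comment in the login branch ("Include other necessary fields as required by the website") often means a hidden CSRF token. A hedged sketch of how login_data could be extended for such a site, kept outside the diff because it is purely illustrative; the csrf_token field name, URL, and credentials are assumptions, not part of this app:

import requests
from bs4 import BeautifulSoup

session = requests.Session()
login_url = "https://example.com/login"  # placeholder login form

# Load the login page first and scrape the hidden token, if the form carries one
login_page = session.get(login_url)
token_field = BeautifulSoup(login_page.content, "html.parser").find(
    "input", {"name": "csrf_token"}  # the field name varies per site
)

login_data = {
    "email": "user@example.com",   # placeholder credentials
    "password": "secret",
}
if token_field:
    login_data["csrf_token"] = token_field.get("value", "")

response = session.post(login_url, data=login_data)
response.raise_for_status()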