prithivMLmods committed on
Commit
53dbd29
1 Parent(s): e2248c0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -56
app.py CHANGED
@@ -1,56 +1,56 @@
1
- import streamlit as st
2
- import requests
3
- from bs4 import BeautifulSoup
4
- import re
5
-
6
- # Function to scrape only visible text from the given URL
7
def scrape_visible_text_from_url(url, timeout=10):
    """Download *url* and return its header and paragraph text as one line.

    The page is parsed with BeautifulSoup's ``html.parser``. Script, style
    and other non-content tags are discarded, the text of the first
    ``<header>`` element is captured, and the text of every remaining
    ``<p>`` element is appended. All runs of whitespace are collapsed to a
    single space.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The normalised text, or ``None`` if fetching or parsing raised. In
        that case the error is shown to the user through ``st.error``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove script, style, and other non-visible tags. "header" is
        # deliberately NOT in this list: stripping it here made the lookup
        # below always come back empty.
        for tag in soup(["script", "style", "meta", "link", "noscript", "footer", "aside", "nav", "img"]):
            tag.extract()

        # Get the header content (a separator keeps adjacent elements from
        # being glued together; whitespace is normalised further down).
        header_content = soup.find("header")
        header_text = header_content.get_text(" ") if header_content else ""

        # Now drop every header so paragraphs inside them are not repeated
        # in the paragraph text.
        for header in soup("header"):
            header.extract()

        # Get the paragraph content
        paragraph_content = soup.find_all("p")
        paragraph_text = " ".join([p.get_text() for p in paragraph_content])

        # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"

        # Remove multiple whitespaces and newlines
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    except Exception as e:
        # UI boundary: report any failure to the user instead of crashing.
        st.error(f"Error occurred while scraping the data: {e}")
        return None
34
-
35
- # Streamlit UI
36
def main():
    """Render the scraper page: a URL box, a button, and the result area.

    Nothing happens until the button is pressed. An empty URL produces a
    warning; otherwise the page is scraped and either the text or a failure
    warning is displayed.
    """
    st.title("Web Data Scraper")

    # Ask the user which page to scrape
    entered_url = st.text_input("Enter the URL of the web page:", "")

    clicked = st.button("Scrape Visible Text")
    if not clicked:
        return

    if not entered_url:
        st.warning("Please enter a valid URL.")
        return

    # Pull the visible text out of the requested page
    scraped = scrape_visible_text_from_url(entered_url)
    if not scraped:
        st.warning("Failed to scrape visible text from the URL.")
        return

    st.success("Visible text successfully scraped!")
    st.subheader("Scraped Text:")
    st.write(scraped)


if __name__ == "__main__":
    main()
 
1
+ import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import re
5
+
6
def scrape_visible_text_from_url(url, timeout=10):
    """Download *url* and return its header and paragraph text as one line.

    The page is parsed with BeautifulSoup's ``html.parser``. Script, style
    and other non-content tags are discarded, the text of the first
    ``<header>`` element is captured, and the text of every remaining
    ``<p>`` element is appended. All runs of whitespace are collapsed to a
    single space.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The normalised text, or ``None`` if fetching or parsing raised. In
        that case the error is shown to the user through ``st.error``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Strip non-content tags. "header" is intentionally left out of this
        # list: removing it here made the header lookup below always empty.
        for tag in soup(["script", "style", "meta", "link", "noscript", "footer", "aside", "nav", "img"]):
            tag.extract()

        # Capture the first header's text; the separator keeps neighbouring
        # elements from running together before whitespace is normalised.
        header_content = soup.find("header")
        header_text = header_content.get_text(" ") if header_content else ""

        # Remove all headers so their paragraphs are not counted again.
        for header in soup("header"):
            header.extract()

        paragraph_content = soup.find_all("p")
        paragraph_text = " ".join([p.get_text() for p in paragraph_content])

        visible_text = f"{header_text}\n\n{paragraph_text}"

        # Collapse every run of whitespace (including newlines) to one space.
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    except Exception as e:
        # UI boundary: report any failure to the user instead of crashing.
        st.error(f"Error occurred while scraping the data: {e}")
        return None
33
+
34
+ # Streamlit UI
35
+
36
def main():
    """Render the scraper page: a URL box, a button, and the result area.

    Nothing happens until the button is pressed. An empty URL produces a
    warning; otherwise the page is scraped and either the text or a failure
    warning is displayed.
    """
    st.title("Web Data Scraper")

    # Ask the user which page to load
    entered_url = st.text_input("Enter the URL 👉✏️:", "")

    clicked = st.button("Load Datum 🧈")
    if not clicked:
        return

    if not entered_url:
        st.warning("Please enter a valid URL.")
        return

    # Pull the visible text out of the requested page
    scraped = scrape_visible_text_from_url(entered_url)
    if not scraped:
        st.warning("Failed to load data from the URL.")
        return

    st.success("Data text successfully scraped!")
    st.subheader("Scraped Text :")
    st.write(scraped)


if __name__ == "__main__":
    main()