Nymbo shreyasiv committed on
Commit
36dbdcd
·
0 Parent(s):

Duplicate from Insightly/web_scraper

Browse files

Co-authored-by: Shreya Sivakumar <shreyasiv@users.noreply.huggingface.co>

Files changed (4) hide show
  1. .gitattributes +35 -0
  2. README.md +13 -0
  3. app.py +56 -0
  4. requirements.txt +60 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Web Scraper
3
+ emoji: 🏃
4
+ colorFrom: pink
5
+ colorTo: red
6
+ sdk: streamlit
7
+ sdk_version: 1.25.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: Insightly/web_scraper
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import re
5
+
6
+ # Function to scrape only visible text from the given URL
7
def scrape_visible_text_from_url(url, *, timeout=10):
    """Fetch ``url`` and return its header and paragraph text as one string.

    The page is downloaded with ``requests``, parsed with BeautifulSoup, and
    stripped of tags that carry no readable prose. The text of the first
    ``<header>`` element (if any) is followed by the text of every ``<p>``
    element outside a header, and all whitespace runs are collapsed to a
    single space.

    Args:
        url: Address of the web page to scrape.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a dead host.

    Returns:
        The cleaned text (possibly an empty string when the page has neither
        a header nor paragraphs), or ``None`` if fetching or parsing failed.
        On failure the error is also reported to the user via ``st.error``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove tags that never contribute readable prose. <header> is
        # intentionally NOT removed here: it has to survive until its text
        # has been read below, otherwise the header lookup always finds
        # nothing and the header text is silently lost.
        for tag in soup(["script", "style", "meta", "link", "noscript", "footer", "aside", "nav", "img"]):
            tag.extract()

        # Get the header content. A separator keeps adjacent child elements
        # (e.g. a title followed by a tagline) from being glued together.
        header_content = soup.find("header")
        header_text = header_content.get_text(" ") if header_content else ""

        # Now drop every <header> so paragraphs nested inside one are not
        # repeated in the paragraph text.
        for tag in soup("header"):
            tag.extract()

        # Get the paragraph content
        paragraph_text = " ".join(p.get_text() for p in soup.find_all("p"))

        # Combine header and paragraph text, then collapse whitespace runs
        # (including newlines) into single spaces.
        visible_text = f"{header_text}\n\n{paragraph_text}"
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    except Exception as e:
        # UI boundary: surface any network/parsing failure to the user
        # instead of crashing the Streamlit script.
        st.error(f"Error occurred while scraping the data: {e}")
        return None
34
+
35
+ # Streamlit UI
36
def main():
    """Render the Streamlit page: URL input, scrape button, and result area.

    When the button is pressed with a non-empty URL, the page is scraped via
    ``scrape_visible_text_from_url`` and the text is displayed; an empty URL
    or an empty/failed scrape produces a warning instead.
    """
    st.title("Web Data Scraper")

    # Ask the user which page to scrape.
    page_url = st.text_input("Enter the URL of the web page:", "")

    # Nothing more to do until the user presses the button.
    if not st.button("Scrape Visible Text"):
        return

    if not page_url:
        st.warning("Please enter a valid URL.")
        return

    # Pull the visible text out of the requested page.
    scraped_text = scrape_visible_text_from_url(page_url)
    if not scraped_text:
        st.warning("Failed to scrape visible text from the URL.")
        return

    st.success("Visible text successfully scraped!")
    st.subheader("Scraped Text:")
    st.write(scraped_text)


# Run the UI only when this file is executed as a script.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.5
2
+ aiosignal==1.3.1
3
+ altair==5.0.1
4
+ async-timeout==4.0.2
5
+ attrs==23.1.0
6
+ beautifulsoup4==4.12.2
7
+ blinker==1.6.2
8
+ bs4==0.0.1
9
+ cachetools==5.3.1
10
+ certifi==2023.7.22
11
+ charset-normalizer==3.2.0
12
+ click==8.1.6
13
+ decorator==5.1.1
14
+ frozenlist==1.4.0
15
+ gitdb==4.0.10
16
+ GitPython==3.1.32
17
+ idna==3.4
18
+ importlib-metadata==6.8.0
19
+ Jinja2==3.1.2
20
+ jsonschema==4.18.4
21
+ jsonschema-specifications==2023.7.1
22
+ markdown-it-py==3.0.0
23
+ MarkupSafe==2.1.3
24
+ mdurl==0.1.2
25
+ multidict==6.0.4
26
+ numpy==1.25.2
27
+ openai==0.27.8
28
+ packaging==23.1
29
+ pandas==2.0.3
30
+ Pillow==9.5.0
31
+ protobuf==4.23.4
32
+ pyarrow==12.0.1
33
+ pydeck==0.8.0
34
+ Pygments==2.15.1
35
+ Pympler==1.0.1
36
+ python-dateutil==2.8.2
37
+ python-dotenv==1.0.0
38
+ pytz==2023.3
39
+ pytz-deprecation-shim==0.1.0.post0
40
+ referencing==0.30.0
41
+ requests==2.31.0
42
+ rich==13.5.2
43
+ rpds-py==0.9.2
44
+ six==1.16.0
45
+ smmap==5.0.0
46
+ soupsieve==2.4.1
47
+ streamlit==1.25.0
48
+ tenacity==8.2.2
49
+ toml==0.10.2
50
+ toolz==0.12.0
51
+ tornado==6.3.2
52
+ tqdm==4.65.0
53
+ typing_extensions==4.7.1
54
+ tzdata==2023.3
55
+ tzlocal==4.3.1
56
+ urllib3==2.0.4
57
+ validators==0.20.0
58
+ watchdog==3.0.0
59
+ yarl==1.9.2
60
+ zipp==3.16.2