Marcepelaez committed on
Commit
befec25
1 Parent(s): 15ee190
Files changed (1)
  1. app.py +126 -18
app.py CHANGED
@@ -2,55 +2,163 @@ import streamlit as st
 import requests
 from bs4 import BeautifulSoup
 import re
+import os
+from urllib.parse import urljoin
 
-# Function to scrape only visible text from the given URL
-def scrape_visible_text_from_url(url):
+def download_images(url, max_images=10):
+    """
+    Download images from the given URL
+
+    Args:
+        url (str): URL of the webpage
+        max_images (int): Maximum number of images to download
+
+    Returns:
+        tuple: List of downloaded image paths and total image count
+    """
     try:
+        # Create a directory to save images if it doesn't exist
+        os.makedirs('downloaded_images', exist_ok=True)
+
+        # Send a request to the URL
         response = requests.get(url)
         response.raise_for_status()
+
+        # Parse the HTML content
         soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Find all image tags
+        img_tags = soup.find_all('img', src=True)
+
+        # List to store downloaded image paths
+        downloaded_images = []
+
+        # Download images
+        for i, img in enumerate(img_tags[:max_images], 1):
+            # Get the image source URL
+            img_url = img['src']
+
+            # Handle relative URLs
+            if not img_url.startswith(('http://', 'https://')):
+                img_url = urljoin(url, img_url)
+
+            try:
+                # Download the image
+                img_response = requests.get(img_url)
+                img_response.raise_for_status()
+
+                # Generate a unique filename
+                filename = f'downloaded_images/image_{i}.{img_url.split(".")[-1].split("?")[0]}'
+
+                # Save the image
+                with open(filename, 'wb') as f:
+                    f.write(img_response.content)
+
+                downloaded_images.append(filename)
+
+            except Exception as img_error:
+                st.warning(f"Could not download image {i}: {img_error}")
+
+        return downloaded_images, len(img_tags)
+
+    except Exception as e:
+        st.error(f"Error occurred while downloading images: {e}")
+        return [], 0
 
+def scrape_visible_text_from_url(url):
+    """
+    Scrape visible text from the given URL
+
+    Args:
+        url (str): URL of the webpage
+
+    Returns:
+        str: Extracted visible text
+    """
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, 'html.parser')
+
         # Remove script, style, and other non-visible tags
-        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
+        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav"]):
             tag.extract()
-
+
         # Get the header content
         header_content = soup.find("header")
         header_text = header_content.get_text() if header_content else ""
-
+
         # Get the paragraph content
         paragraph_content = soup.find_all("p")
         paragraph_text = " ".join([p.get_text() for p in paragraph_content])
-
+
         # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"
-
+
         # Remove multiple whitespaces and newlines
         visible_text = re.sub(r'\s+', ' ', visible_text)
         return visible_text.strip()
+
     except Exception as e:
         st.error(f"Error occurred while scraping the data: {e}")
         return None
 
-# Streamlit UI
 def main():
-    st.title("Web Data Scraper")
-
+    """
+    Main Streamlit application
+    """
+    st.title("Web Data Scraper with Image Downloader")
+
     # Get the URL from the user
     url_input = st.text_input("Enter the URL of the web page:", "")
-
-    if st.button("Scrape Visible Text"):
+
+    # Maximum images to download slider
+    max_images = st.slider("Maximum number of images to download", 1, 20, 10)
+
+    if st.button("Scrape Content"):
         if url_input:
             # Extract visible text from the URL
-            data = scrape_visible_text_from_url(url_input)
-            if data:
-                st.success("Visible text successfully scraped!")
+            text_data = scrape_visible_text_from_url(url_input)
+
+            # Download images
+            downloaded_images, total_images = download_images(url_input, max_images)
+
+            if text_data:
+                st.success("Content successfully scraped!")
+
+                # Display text
                 st.subheader("Scraped Text:")
-                st.write(data)
+                st.write(text_data)
+
+                # Display and manage images
+                st.subheader(f"Images (Downloaded {len(downloaded_images)} out of {total_images} total)")
+
+                # Create columns for images
+                if downloaded_images:
+                    cols = st.columns(min(len(downloaded_images), 3))
+                    for i, img_path in enumerate(downloaded_images):
+                        with cols[i % 3]:
+                            st.image(img_path, use_column_width=True)
+
+                    # Provide download option for all images
+                    with open('downloaded_images.zip', 'wb') as zipf:
+                        import zipfile
+                        with zipfile.ZipFile(zipf, 'w') as zip_file:
+                            for img_path in downloaded_images:
+                                zip_file.write(img_path, os.path.basename(img_path))
+
+                    st.download_button(
+                        label="Download All Images",
+                        data=open('downloaded_images.zip', 'rb').read(),
+                        file_name='downloaded_images.zip',
+                        mime='application/zip'
+                    )
+                else:
+                    st.warning("No images found on the page.")
             else:
-                st.warning("Failed to scrape visible text from the URL.")
+                st.warning("Failed to scrape content from the URL.")
         else:
             st.warning("Please enter a valid URL.")
 
 if __name__ == "__main__":
-    main()
+    main()
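
A note on the new download flow: the zip step writes downloaded_images.zip to disk inside the button handler and then re-opens it with an unclosed handle (open('downloaded_images.zip', 'rb').read()). A minimal sketch of an in-memory alternative using io.BytesIO; zip_images_in_memory is a hypothetical helper, not part of this commit:

import io
import os
import zipfile

def zip_images_in_memory(image_paths):
    # Hypothetical helper (not in the commit): pack the saved image
    # files into a zip archive held in memory, so no temporary zip
    # file is written to disk and no file handle is left open.
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, 'w') as zip_file:
        for img_path in image_paths:
            zip_file.write(img_path, os.path.basename(img_path))
    return buffer.getvalue()

The returned bytes could be passed directly as the data argument of st.download_button, replacing the open(...).read() call above.

Similarly, the extension guess img_url.split(".")[-1].split("?")[0] misfires on extension-less image URLs. A hedged sketch that reads the extension from the URL path and falls back on the response's Content-Type header; guess_image_extension is likewise hypothetical:

import mimetypes
import os
from urllib.parse import urlparse

def guess_image_extension(img_url, content_type=None):
    # Hypothetical helper (not in the commit): prefer the extension
    # from the URL path; otherwise map the Content-Type header
    # (e.g. 'image/png' -> '.png'); default to '.jpg'.
    ext = os.path.splitext(urlparse(img_url).path)[1]
    if not ext and content_type:
        ext = mimetypes.guess_extension(content_type.split(';')[0].strip()) or ''
    return ext or '.jpg'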