Marcepelaez committed
Commit 2df3230
1 Parent(s): befec25
Files changed (1)
  1. app.py +65 -79
app.py CHANGED
@@ -5,21 +5,17 @@ import re
 import os
 from urllib.parse import urljoin
 
-def download_images(url, max_images=10):
+def scrape_web_content(url):
     """
-    Download images from the given URL
+    Scrape the web content while preserving its original formatting
 
     Args:
         url (str): URL of the webpage
-        max_images (int): Maximum number of images to download
 
     Returns:
-        tuple: List of downloaded image paths and total image count
+        dict: Extracted content with text, HTML, and images
     """
     try:
-        # Create a directory to save images if it doesn't exist
-        os.makedirs('downloaded_images', exist_ok=True)
-
         # Send a request to the URL
         response = requests.get(url)
         response.raise_for_status()
@@ -27,22 +23,21 @@ def download_images(url, max_images=10):
         # Parse the HTML content
         soup = BeautifulSoup(response.content, 'html.parser')
 
-        # Find all image tags
-        img_tags = soup.find_all('img', src=True)
-
-        # List to store downloaded image paths
-        downloaded_images = []
+        # Create a directory to save images if it doesn't exist
+        os.makedirs('downloaded_images', exist_ok=True)
 
         # Download images
-        for i, img in enumerate(img_tags[:max_images], 1):
-            # Get the image source URL
-            img_url = img['src']
-
-            # Handle relative URLs
-            if not img_url.startswith(('http://', 'https://')):
-                img_url = urljoin(url, img_url)
-
+        downloaded_images = []
+        img_tags = soup.find_all('img', src=True)
+        for i, img in enumerate(img_tags[:10], 1):
             try:
+                # Get the image source URL
+                img_url = img['src']
+
+                # Handle relative URLs
+                if not img_url.startswith(('http://', 'https://')):
+                    img_url = urljoin(url, img_url)
+
                 # Download the image
                 img_response = requests.get(img_url)
                 img_response.raise_for_status()
@@ -54,97 +49,90 @@ def download_images(url, max_images=10):
                 with open(filename, 'wb') as f:
                     f.write(img_response.content)
 
+                # Update the image tag in the soup to point to local file
+                img['src'] = filename
                 downloaded_images.append(filename)
 
             except Exception as img_error:
                 st.warning(f"Could not download image {i}: {img_error}")
 
-        return downloaded_images, len(img_tags)
-
-    except Exception as e:
-        st.error(f"Error occurred while downloading images: {e}")
-        return [], 0
-
-def scrape_visible_text_from_url(url):
-    """
-    Scrape visible text from the given URL
-
-    Args:
-        url (str): URL of the webpage
-
-    Returns:
-        str: Extracted visible text
-    """
-    try:
-        response = requests.get(url)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        # Remove script, style, and other non-visible tags
-        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav"]):
-            tag.extract()
-
-        # Get the header content
-        header_content = soup.find("header")
-        header_text = header_content.get_text() if header_content else ""
+        # Remove unwanted tags
+        for tag in soup(["script", "style", "meta", "link", "noscript"]):
+            tag.decompose()
 
-        # Get the paragraph content
-        paragraph_content = soup.find_all("p")
-        paragraph_text = " ".join([p.get_text() for p in paragraph_content])
+        # Convert remaining soup to HTML string
+        formatted_html = str(soup)
 
-        # Combine header and paragraph text
-        visible_text = f"{header_text}\n\n{paragraph_text}"
+        # Extract plain text for preview
+        plain_text = soup.get_text(separator='\n', strip=True)
 
-        # Remove multiple whitespaces and newlines
-        visible_text = re.sub(r'\s+', ' ', visible_text)
-        return visible_text.strip()
+        return {
+            'html': formatted_html,
+            'plain_text': plain_text,
+            'images': downloaded_images
+        }
 
     except Exception as e:
-        st.error(f"Error occurred while scraping the data: {e}")
+        st.error(f"Error occurred while scraping the content: {e}")
         return None
 
 def main():
     """
     Main Streamlit application
     """
-    st.title("Web Data Scraper with Image Downloader")
+    st.title("Web Content Scraper with Preserved Formatting")
 
     # Get the URL from the user
     url_input = st.text_input("Enter the URL of the web page:", "")
 
-    # Maximum images to download slider
-    max_images = st.slider("Maximum number of images to download", 1, 20, 10)
+    # Option to choose display mode
+    display_mode = st.radio("Display Mode:",
+                            ["Full HTML", "Plain Text", "Side-by-Side"])
 
     if st.button("Scrape Content"):
         if url_input:
-            # Extract visible text from the URL
-            text_data = scrape_visible_text_from_url(url_input)
+            # Scrape the content
+            scraped_content = scrape_web_content(url_input)
 
-            # Download images
-            downloaded_images, total_images = download_images(url_input, max_images)
-
-            if text_data:
+            if scraped_content:
                 st.success("Content successfully scraped!")
 
-                # Display text
-                st.subheader("Scraped Text:")
-                st.write(text_data)
+                # Display content based on selected mode
+                if display_mode == "Full HTML":
+                    # Display full HTML with preserved formatting
+                    st.markdown("### Formatted Web Content")
+                    st.components.v1.html(scraped_content['html'], height=600, scrolling=True)
+
+                elif display_mode == "Plain Text":
+                    # Display plain text
+                    st.markdown("### Plain Text Content")
+                    st.text_area("Scraped Text:", scraped_content['plain_text'], height=400)
 
-                # Display and manage images
-                st.subheader(f"Images (Downloaded {len(downloaded_images)} out of {total_images} total)")
+                else:  # Side-by-Side
+                    # Split the screen to show HTML and plain text
+                    col1, col2 = st.columns(2)
+
+                    with col1:
+                        st.markdown("### Formatted HTML")
+                        st.components.v1.html(scraped_content['html'], height=600, scrolling=True)
+
+                    with col2:
+                        st.markdown("### Plain Text")
+                        st.text_area("Scraped Text:", scraped_content['plain_text'], height=600)
 
-                # Create columns for images
-                if downloaded_images:
-                    cols = st.columns(min(len(downloaded_images), 3))
-                    for i, img_path in enumerate(downloaded_images):
+                # Display images
+                if scraped_content['images']:
+                    st.subheader("Downloaded Images")
+                    cols = st.columns(min(len(scraped_content['images']), 3))
+                    for i, img_path in enumerate(scraped_content['images']):
                         with cols[i % 3]:
                             st.image(img_path, use_column_width=True)
 
-                    # Provide download option for all images
+                    # Zip and download option for images
                     with open('downloaded_images.zip', 'wb') as zipf:
                         import zipfile
                         with zipfile.ZipFile(zipf, 'w') as zip_file:
-                            for img_path in downloaded_images:
+                            for img_path in scraped_content['images']:
                                 zip_file.write(img_path, os.path.basename(img_path))
 
                     st.download_button(
@@ -153,8 +141,6 @@ def main():
                         file_name='downloaded_images.zip',
                         mime='application/zip'
                     )
-                else:
-                    st.warning("No images found on the page.")
             else:
                 st.warning("Failed to scrape content from the URL.")
         else:
 
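For reviewers who want to exercise the new scraper without launching the UI, here is a minimal sketch. It assumes this commit's app.py is importable (i.e., any top-level main() call is guarded) and uses a placeholder URL; note that the function's st.warning/st.error calls expect a Streamlit session and will only log to the console when run as a plain script.

# Minimal sketch: drive scrape_web_content() directly, assuming app.py
# from this commit is on the import path. The URL below is a placeholder.
from app import scrape_web_content

result = scrape_web_content("https://example.com")  # hypothetical target URL
if result:
    # Per the commit, the function returns a dict with 'html', 'plain_text', and 'images'.
    print(result['plain_text'][:300])  # preview of the extracted text
    print(f"{len(result['images'])} image(s) saved under downloaded_images/")
else:
    print("Scrape failed; see the logged warnings/errors for details.")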