Marcepelaez committed on
Commit
4680d28
1 Parent(s): 1bf2dc7
Files changed (1) hide show
  1. app.py +94 -3
app.py CHANGED
@@ -66,39 +66,67 @@ def apply_theme(theme):
66
  def scrape_web_content(url, max_images, theme):
67
  """
68
  Scrape the web content while preserving its original formatting
 
 
 
 
 
 
 
 
69
  """
70
  try:
 
71
  response = requests.get(url)
72
  response.raise_for_status()
73
 
 
74
  soup = BeautifulSoup(response.content, 'html.parser')
 
 
75
  os.makedirs('downloaded_images', exist_ok=True)
76
 
 
77
  downloaded_images = []
78
  img_tags = soup.find_all('img', src=True)
79
  for i, img in enumerate(img_tags[:max_images], 1):
80
  try:
 
81
  img_url = img['src']
 
 
82
  if not img_url.startswith(('http://', 'https://')):
83
  img_url = urljoin(url, img_url)
84
 
 
85
  img_response = requests.get(img_url)
86
  img_response.raise_for_status()
87
 
 
88
  filename = f'downloaded_images/image_{i}.{img_url.split(".")[-1].split("?")[0]}'
 
 
89
  with open(filename, 'wb') as f:
90
  f.write(img_response.content)
91
 
 
92
  img['src'] = filename
93
  downloaded_images.append(filename)
 
94
  except Exception as img_error:
95
  st.warning(f"Could not download image {i}: {img_error}")
96
 
 
97
  for tag in soup(["script", "style", "meta", "link", "noscript"]):
98
  tag.decompose()
99
 
 
100
  theme_prefix = apply_theme(theme) if theme == "Claro" else ""
 
 
101
  formatted_html = theme_prefix + str(soup)
 
 
102
  plain_text = soup.get_text(separator='\n', strip=True)
103
 
104
  return {
@@ -115,22 +143,85 @@ def main():
115
  """
116
  Main Streamlit application
117
  """
 
118
  st.set_page_config(page_title="Web Content Scraper", page_icon=":mag:", layout="wide")
 
 
119
  theme = st.sidebar.radio("Seleccionar Tema:", ["Oscuro", "Claro"])
 
 
120
  apply_theme(theme)
 
121
  st.title("Web Content Scraper")
 
 
122
  url_input = st.text_input("Enter the URL of the web page:", "")
123
- display_mode = st.radio("Display Mode:", ["Full HTML", "Plain Text", "Side-by-Side"])
 
 
 
 
 
124
  max_images = st.slider("Maximum number of images to download", 1, 40, 10)
125
 
126
  if st.button("Scrape Content"):
127
  if url_input:
 
128
  scraped_content = scrape_web_content(url_input, max_images, theme)
 
129
  if scraped_content:
130
  st.success("Content successfully scraped!")
131
- # Resto del código para mostrar contenido...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  else:
133
  st.warning("Please enter a valid URL.")
134
 
135
  if __name__ == "__main__":
136
- main()
 
66
  def scrape_web_content(url, max_images, theme):
67
  """
68
  Scrape the web content while preserving its original formatting
69
+
70
+ Args:
71
+ url (str): URL of the webpage
72
+ max_images (int): Maximum number of images to download
73
+ theme (str): Selected theme (Claro/Oscuro)
74
+
75
+ Returns:
76
+ dict: Extracted content with text, HTML, and images
77
  """
78
  try:
79
+ # Send a request to the URL
80
  response = requests.get(url)
81
  response.raise_for_status()
82
 
83
+ # Parse the HTML content
84
  soup = BeautifulSoup(response.content, 'html.parser')
85
+
86
+ # Create a directory to save images if it doesn't exist
87
  os.makedirs('downloaded_images', exist_ok=True)
88
 
89
+ # Download images
90
  downloaded_images = []
91
  img_tags = soup.find_all('img', src=True)
92
  for i, img in enumerate(img_tags[:max_images], 1):
93
  try:
94
+ # Get the image source URL
95
  img_url = img['src']
96
+
97
+ # Handle relative URLs
98
  if not img_url.startswith(('http://', 'https://')):
99
  img_url = urljoin(url, img_url)
100
 
101
+ # Download the image
102
  img_response = requests.get(img_url)
103
  img_response.raise_for_status()
104
 
105
+ # Generate a unique filename
106
  filename = f'downloaded_images/image_{i}.{img_url.split(".")[-1].split("?")[0]}'
107
+
108
+ # Save the image
109
  with open(filename, 'wb') as f:
110
  f.write(img_response.content)
111
 
112
+ # Update the image tag in the soup to point to local file
113
  img['src'] = filename
114
  downloaded_images.append(filename)
115
+
116
  except Exception as img_error:
117
  st.warning(f"Could not download image {i}: {img_error}")
118
 
119
+ # Remove unwanted tags
120
  for tag in soup(["script", "style", "meta", "link", "noscript"]):
121
  tag.decompose()
122
 
123
+ # Apply light theme styling if selected
124
  theme_prefix = apply_theme(theme) if theme == "Claro" else ""
125
+
126
+ # Convert remaining soup to HTML string with theme prefix
127
  formatted_html = theme_prefix + str(soup)
128
+
129
+ # Extract plain text for preview
130
  plain_text = soup.get_text(separator='\n', strip=True)
131
 
132
  return {
 
143
  """
144
  Main Streamlit application
145
  """
146
+ # Set page config
147
  st.set_page_config(page_title="Web Content Scraper", page_icon=":mag:", layout="wide")
148
+
149
+ # Theme selector
150
  theme = st.sidebar.radio("Seleccionar Tema:", ["Oscuro", "Claro"])
151
+
152
+ # Apply selected theme
153
  apply_theme(theme)
154
+
155
  st.title("Web Content Scraper")
156
+
157
+ # Get the URL from the user
158
  url_input = st.text_input("Enter the URL of the web page:", "")
159
+
160
+ # Option to choose display mode
161
+ display_mode = st.radio("Display Mode:",
162
+ ["Full HTML", "Plain Text", "Side-by-Side"])
163
+
164
+ # Slider for maximum images (1-40)
165
  max_images = st.slider("Maximum number of images to download", 1, 40, 10)
166
 
167
  if st.button("Scrape Content"):
168
  if url_input:
169
+ # Scrape the content
170
  scraped_content = scrape_web_content(url_input, max_images, theme)
171
+
172
  if scraped_content:
173
  st.success("Content successfully scraped!")
174
+
175
+ # Display content based on selected mode
176
+ if display_mode == "Full HTML":
177
+ # Display full HTML with preserved formatting
178
+ st.markdown("### Formatted Web Content")
179
+ st.components.v1.html(scraped_content['html'], height=600, scrolling=True)
180
+
181
+ elif display_mode == "Plain Text":
182
+ # Display plain text
183
+ st.markdown("### Plain Text Content")
184
+ st.text_area("Scraped Text:", scraped_content['plain_text'], height=400)
185
+
186
+ else: # Side-by-Side
187
+ # Split the screen to show HTML and plain text
188
+ col1, col2 = st.columns(2)
189
+
190
+ with col1:
191
+ st.markdown("### Formatted HTML")
192
+ st.components.v1.html(scraped_content['html'], height=600, scrolling=True)
193
+
194
+ with col2:
195
+ st.markdown("### Plain Text")
196
+ st.text_area("Scraped Text:", scraped_content['plain_text'], height=600)
197
+
198
+ # Display images
199
+ if scraped_content['images']:
200
+ st.subheader(f"Downloaded Images ({len(scraped_content['images'])} of {max_images})")
201
+
202
+ # Create a grid of image columns
203
+ cols = st.columns(4) # 4 columns for better layout with more images
204
+ for i, img_path in enumerate(scraped_content['images']):
205
+ with cols[i % 4]:
206
+ st.image(img_path, use_column_width=True)
207
+
208
+ # Zip and download option for images
209
+ with open('downloaded_images.zip', 'wb') as zipf:
210
+ import zipfile
211
+ with zipfile.ZipFile(zipf, 'w') as zip_file:
212
+ for img_path in scraped_content['images']:
213
+ zip_file.write(img_path, os.path.basename(img_path))
214
+
215
+ st.download_button(
216
+ label="Download All Images",
217
+ data=open('downloaded_images.zip', 'rb').read(),
218
+ file_name='downloaded_images.zip',
219
+ mime='application/zip'
220
+ )
221
+ else:
222
+ st.warning("Failed to scrape content from the URL.")
223
  else:
224
  st.warning("Please enter a valid URL.")
225
 
226
  if __name__ == "__main__":
227
+ main()