Marcepelaez committed on
Commit
0f3572b
1 Parent(s): 2df3230
Files changed (1) hide show
  1. app.py +59 -7
app.py CHANGED
@@ -5,12 +5,50 @@ import re
5
  import os
6
  from urllib.parse import urljoin
7
 
8
- def scrape_web_content(url):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  """
10
  Scrape the web content while preserving its original formatting
11
 
12
  Args:
13
  url (str): URL of the webpage
 
14
 
15
  Returns:
16
  dict: Extracted content with text, HTML, and images
@@ -29,7 +67,7 @@ def scrape_web_content(url):
29
  # Download images
30
  downloaded_images = []
31
  img_tags = soup.find_all('img', src=True)
32
- for i, img in enumerate(img_tags[:10], 1):
33
  try:
34
  # Get the image source URL
35
  img_url = img['src']
@@ -80,7 +118,16 @@ def main():
80
  """
81
  Main Streamlit application
82
  """
83
- st.title("Web Content Scraper with Preserved Formatting")
 
 
 
 
 
 
 
 
 
84
 
85
  # Get the URL from the user
86
  url_input = st.text_input("Enter the URL of the web page:", "")
@@ -89,10 +136,13 @@ def main():
89
  display_mode = st.radio("Display Mode:",
90
  ["Full HTML", "Plain Text", "Side-by-Side"])
91
 
 
 
 
92
  if st.button("Scrape Content"):
93
  if url_input:
94
  # Scrape the content
95
- scraped_content = scrape_web_content(url_input)
96
 
97
  if scraped_content:
98
  st.success("Content successfully scraped!")
@@ -122,10 +172,12 @@ def main():
122
 
123
  # Display images
124
  if scraped_content['images']:
125
- st.subheader("Downloaded Images")
126
- cols = st.columns(min(len(scraped_content['images']), 3))
 
 
127
  for i, img_path in enumerate(scraped_content['images']):
128
- with cols[i % 3]:
129
  st.image(img_path, use_column_width=True)
130
 
131
  # Zip and download option for images
 
5
  import os
6
  from urllib.parse import urljoin
7
 
8
+ def apply_theme(theme):
9
+ """
10
+ Apply custom CSS based on the selected theme
11
+ """
12
+ if theme == "Claro":
13
+ st.markdown("""
14
+ <style>
15
+ body {
16
+ color: black;
17
+ background-color: white;
18
+ }
19
+ .stTextInput > div > div > input {
20
+ color: black;
21
+ background-color: white;
22
+ }
23
+ .stMarkdown {
24
+ color: black;
25
+ }
26
+ </style>
27
+ """, unsafe_allow_html=True)
28
+ else:
29
+ st.markdown("""
30
+ <style>
31
+ body {
32
+ color: white;
33
+ background-color: #0E1117;
34
+ }
35
+ .stTextInput > div > div > input {
36
+ color: white;
37
+ background-color: #262730;
38
+ }
39
+ .stMarkdown {
40
+ color: white;
41
+ }
42
+ </style>
43
+ """, unsafe_allow_html=True)
44
+
45
+ def scrape_web_content(url, max_images):
46
  """
47
  Scrape the web content while preserving its original formatting
48
 
49
  Args:
50
  url (str): URL of the webpage
51
+ max_images (int): Maximum number of images to download
52
 
53
  Returns:
54
  dict: Extracted content with text, HTML, and images
 
67
  # Download images
68
  downloaded_images = []
69
  img_tags = soup.find_all('img', src=True)
70
+ for i, img in enumerate(img_tags[:max_images], 1):
71
  try:
72
  # Get the image source URL
73
  img_url = img['src']
 
118
  """
119
  Main Streamlit application
120
  """
121
+ # Set page config
122
+ st.set_page_config(page_title="Web Content Scraper", page_icon=":mag:", layout="wide")
123
+
124
+ # Theme selector
125
+ theme = st.sidebar.radio("Seleccionar Tema:", ["Oscuro", "Claro"])
126
+
127
+ # Apply selected theme
128
+ apply_theme(theme)
129
+
130
+ st.title("Web Content Scraper")
131
 
132
  # Get the URL from the user
133
  url_input = st.text_input("Enter the URL of the web page:", "")
 
136
  display_mode = st.radio("Display Mode:",
137
  ["Full HTML", "Plain Text", "Side-by-Side"])
138
 
139
+ # Slider for maximum images (1-40)
140
+ max_images = st.slider("Maximum number of images to download", 1, 40, 10)
141
+
142
  if st.button("Scrape Content"):
143
  if url_input:
144
  # Scrape the content
145
+ scraped_content = scrape_web_content(url_input, max_images)
146
 
147
  if scraped_content:
148
  st.success("Content successfully scraped!")
 
172
 
173
  # Display images
174
  if scraped_content['images']:
175
+ st.subheader(f"Downloaded Images ({len(scraped_content['images'])} of {max_images})")
176
+
177
+ # Create a grid of image columns
178
+ cols = st.columns(4) # 4 columns for better layout with more images
179
  for i, img_path in enumerate(scraped_content['images']):
180
+ with cols[i % 4]:
181
  st.image(img_path, use_column_width=True)
182
 
183
  # Zip and download option for images