Spaces:
Running
Running
Marcepelaez
committed on
Commit
•
0f3572b
1
Parent(s):
2df3230
app
Browse files
app.py
CHANGED
@@ -5,12 +5,50 @@ import re
|
|
5 |
import os
|
6 |
from urllib.parse import urljoin
|
7 |
|
8 |
-
def scrape_web_content(url):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
"""
|
10 |
Scrape the web content while preserving its original formatting
|
11 |
|
12 |
Args:
|
13 |
url (str): URL of the webpage
|
|
|
14 |
|
15 |
Returns:
|
16 |
dict: Extracted content with text, HTML, and images
|
@@ -29,7 +67,7 @@ def scrape_web_content(url):
|
|
29 |
# Download images
|
30 |
downloaded_images = []
|
31 |
img_tags = soup.find_all('img', src=True)
|
32 |
-
for i, img in enumerate(img_tags[:
|
33 |
try:
|
34 |
# Get the image source URL
|
35 |
img_url = img['src']
|
@@ -80,7 +118,16 @@ def main():
|
|
80 |
"""
|
81 |
Main Streamlit application
|
82 |
"""
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
# Get the URL from the user
|
86 |
url_input = st.text_input("Enter the URL of the web page:", "")
|
@@ -89,10 +136,13 @@ def main():
|
|
89 |
display_mode = st.radio("Display Mode:",
|
90 |
["Full HTML", "Plain Text", "Side-by-Side"])
|
91 |
|
|
|
|
|
|
|
92 |
if st.button("Scrape Content"):
|
93 |
if url_input:
|
94 |
# Scrape the content
|
95 |
-
scraped_content = scrape_web_content(url_input)
|
96 |
|
97 |
if scraped_content:
|
98 |
st.success("Content successfully scraped!")
|
@@ -122,10 +172,12 @@ def main():
|
|
122 |
|
123 |
# Display images
|
124 |
if scraped_content['images']:
|
125 |
-
st.subheader("Downloaded Images")
|
126 |
-
|
|
|
|
|
127 |
for i, img_path in enumerate(scraped_content['images']):
|
128 |
-
with cols[i %
|
129 |
st.image(img_path, use_column_width=True)
|
130 |
|
131 |
# Zip and download option for images
|
|
|
5 |
import os
|
6 |
from urllib.parse import urljoin
|
7 |
|
8 |
+
def apply_theme(theme):
|
9 |
+
"""
|
10 |
+
Apply custom CSS based on the selected theme
|
11 |
+
"""
|
12 |
+
if theme == "Claro":
|
13 |
+
st.markdown("""
|
14 |
+
<style>
|
15 |
+
body {
|
16 |
+
color: black;
|
17 |
+
background-color: white;
|
18 |
+
}
|
19 |
+
.stTextInput > div > div > input {
|
20 |
+
color: black;
|
21 |
+
background-color: white;
|
22 |
+
}
|
23 |
+
.stMarkdown {
|
24 |
+
color: black;
|
25 |
+
}
|
26 |
+
</style>
|
27 |
+
""", unsafe_allow_html=True)
|
28 |
+
else:
|
29 |
+
st.markdown("""
|
30 |
+
<style>
|
31 |
+
body {
|
32 |
+
color: white;
|
33 |
+
background-color: #0E1117;
|
34 |
+
}
|
35 |
+
.stTextInput > div > div > input {
|
36 |
+
color: white;
|
37 |
+
background-color: #262730;
|
38 |
+
}
|
39 |
+
.stMarkdown {
|
40 |
+
color: white;
|
41 |
+
}
|
42 |
+
</style>
|
43 |
+
""", unsafe_allow_html=True)
|
44 |
+
|
45 |
+
def scrape_web_content(url, max_images):
|
46 |
"""
|
47 |
Scrape the web content while preserving its original formatting
|
48 |
|
49 |
Args:
|
50 |
url (str): URL of the webpage
|
51 |
+
max_images (int): Maximum number of images to download
|
52 |
|
53 |
Returns:
|
54 |
dict: Extracted content with text, HTML, and images
|
|
|
67 |
# Download images
|
68 |
downloaded_images = []
|
69 |
img_tags = soup.find_all('img', src=True)
|
70 |
+
for i, img in enumerate(img_tags[:max_images], 1):
|
71 |
try:
|
72 |
# Get the image source URL
|
73 |
img_url = img['src']
|
|
|
118 |
"""
|
119 |
Main Streamlit application
|
120 |
"""
|
121 |
+
# Set page config
|
122 |
+
st.set_page_config(page_title="Web Content Scraper", page_icon=":mag:", layout="wide")
|
123 |
+
|
124 |
+
# Theme selector
|
125 |
+
theme = st.sidebar.radio("Seleccionar Tema:", ["Oscuro", "Claro"])
|
126 |
+
|
127 |
+
# Apply selected theme
|
128 |
+
apply_theme(theme)
|
129 |
+
|
130 |
+
st.title("Web Content Scraper")
|
131 |
|
132 |
# Get the URL from the user
|
133 |
url_input = st.text_input("Enter the URL of the web page:", "")
|
|
|
136 |
display_mode = st.radio("Display Mode:",
|
137 |
["Full HTML", "Plain Text", "Side-by-Side"])
|
138 |
|
139 |
+
# Slider for maximum images (1-40)
|
140 |
+
max_images = st.slider("Maximum number of images to download", 1, 40, 10)
|
141 |
+
|
142 |
if st.button("Scrape Content"):
|
143 |
if url_input:
|
144 |
# Scrape the content
|
145 |
+
scraped_content = scrape_web_content(url_input, max_images)
|
146 |
|
147 |
if scraped_content:
|
148 |
st.success("Content successfully scraped!")
|
|
|
172 |
|
173 |
# Display images
|
174 |
if scraped_content['images']:
|
175 |
+
st.subheader(f"Downloaded Images ({len(scraped_content['images'])} of {max_images})")
|
176 |
+
|
177 |
+
# Create a grid of image columns
|
178 |
+
cols = st.columns(4) # 4 columns for better layout with more images
|
179 |
for i, img_path in enumerate(scraped_content['images']):
|
180 |
+
with cols[i % 4]:
|
181 |
st.image(img_path, use_column_width=True)
|
182 |
|
183 |
# Zip and download option for images
|