Spaces:
Running
Running
Marcepelaez
committed on
Commit
•
6ef7dfb
1
Parent(s):
0f3572b
app
Browse files
app.py
CHANGED
@@ -23,8 +23,28 @@ def apply_theme(theme):
|
|
23 |
.stMarkdown {
|
24 |
color: black;
|
25 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
</style>
|
27 |
""", unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
else:
|
29 |
st.markdown("""
|
30 |
<style>
|
@@ -41,14 +61,16 @@ def apply_theme(theme):
|
|
41 |
}
|
42 |
</style>
|
43 |
""", unsafe_allow_html=True)
|
|
|
44 |
|
45 |
-
def scrape_web_content(url, max_images):
|
46 |
"""
|
47 |
Scrape the web content while preserving its original formatting
|
48 |
|
49 |
Args:
|
50 |
url (str): URL of the webpage
|
51 |
max_images (int): Maximum number of images to download
|
|
|
52 |
|
53 |
Returns:
|
54 |
dict: Extracted content with text, HTML, and images
|
@@ -98,8 +120,11 @@ def scrape_web_content(url, max_images):
|
|
98 |
for tag in soup(["script", "style", "meta", "link", "noscript"]):
|
99 |
tag.decompose()
|
100 |
|
101 |
-
#
|
102 |
-
|
|
|
|
|
|
|
103 |
|
104 |
# Extract plain text for preview
|
105 |
plain_text = soup.get_text(separator='\n', strip=True)
|
@@ -142,7 +167,7 @@ def main():
|
|
142 |
if st.button("Scrape Content"):
|
143 |
if url_input:
|
144 |
# Scrape the content
|
145 |
-
scraped_content = scrape_web_content(url_input, max_images)
|
146 |
|
147 |
if scraped_content:
|
148 |
st.success("Content successfully scraped!")
|
|
|
23 |
.stMarkdown {
|
24 |
color: black;
|
25 |
}
|
26 |
+
/* Light theme for HTML content */
|
27 |
+
.light-theme {
|
28 |
+
background-color: white !important;
|
29 |
+
color: black !important;
|
30 |
+
}
|
31 |
+
.light-theme a {
|
32 |
+
color: #0066cc !important;
|
33 |
+
}
|
34 |
+
.light-theme h1, .light-theme h2, .light-theme h3,
|
35 |
+
.light-theme h4, .light-theme h5, .light-theme h6 {
|
36 |
+
color: #333 !important;
|
37 |
+
}
|
38 |
</style>
|
39 |
""", unsafe_allow_html=True)
|
40 |
+
return """
|
41 |
+
<div style="background-color: white; color: black; padding: 20px;">
|
42 |
+
<style>
|
43 |
+
body { background-color: white !important; color: black !important; }
|
44 |
+
a { color: #0066cc; }
|
45 |
+
h1, h2, h3, h4, h5, h6 { color: #333; }
|
46 |
+
</style>
|
47 |
+
"""
|
48 |
else:
|
49 |
st.markdown("""
|
50 |
<style>
|
|
|
61 |
}
|
62 |
</style>
|
63 |
""", unsafe_allow_html=True)
|
64 |
+
return ""
|
65 |
|
66 |
+
def scrape_web_content(url, max_images, theme):
|
67 |
"""
|
68 |
Scrape the web content while preserving its original formatting
|
69 |
|
70 |
Args:
|
71 |
url (str): URL of the webpage
|
72 |
max_images (int): Maximum number of images to download
|
73 |
+
theme (str): Selected theme (Claro/Oscuro)
|
74 |
|
75 |
Returns:
|
76 |
dict: Extracted content with text, HTML, and images
|
|
|
120 |
for tag in soup(["script", "style", "meta", "link", "noscript"]):
|
121 |
tag.decompose()
|
122 |
|
123 |
+
# Apply light theme styling if selected
|
124 |
+
theme_prefix = apply_theme(theme) if theme == "Claro" else ""
|
125 |
+
|
126 |
+
# Convert remaining soup to HTML string with theme prefix
|
127 |
+
formatted_html = theme_prefix + str(soup)
|
128 |
|
129 |
# Extract plain text for preview
|
130 |
plain_text = soup.get_text(separator='\n', strip=True)
|
|
|
167 |
if st.button("Scrape Content"):
|
168 |
if url_input:
|
169 |
# Scrape the content
|
170 |
+
scraped_content = scrape_web_content(url_input, max_images, theme)
|
171 |
|
172 |
if scraped_content:
|
173 |
st.success("Content successfully scraped!")
|