web_scraper

Running

App Files Files Community

Marcepelaez committed 26 days ago

Commit

0f3572b

•

1 Parent(s): 2df3230

app

Browse files

Files changed (1) hide show

app.py +59 -7

app.py CHANGED Viewed

@@ -5,12 +5,50 @@ import re
 import os
 from urllib.parse import urljoin
-def scrape_web_content(url):
     """
     Scrape the web content while preserving its original formatting
     Args:
         url (str): URL of the webpage
     Returns:
         dict: Extracted content with text, HTML, and images
@@ -29,7 +67,7 @@ def scrape_web_content(url):
         # Download images
         downloaded_images = []
         img_tags = soup.find_all('img', src=True)
-        for i, img in enumerate(img_tags[:10], 1):
             try:
                 # Get the image source URL
                 img_url = img['src']
@@ -80,7 +118,16 @@ def main():
     """
     Main Streamlit application
     """
-    st.title("Web Content Scraper with Preserved Formatting")
     # Get the URL from the user
     url_input = st.text_input("Enter the URL of the web page:", "")
@@ -89,10 +136,13 @@ def main():
     display_mode = st.radio("Display Mode:",
                              ["Full HTML", "Plain Text", "Side-by-Side"])
     if st.button("Scrape Content"):
         if url_input:
             # Scrape the content
-            scraped_content = scrape_web_content(url_input)
             if scraped_content:
                 st.success("Content successfully scraped!")
@@ -122,10 +172,12 @@ def main():
                 # Display images
                 if scraped_content['images']:
-                    st.subheader("Downloaded Images")
-                    cols = st.columns(min(len(scraped_content['images']), 3))
                     for i, img_path in enumerate(scraped_content['images']):
-                        with cols[i % 3]:
                             st.image(img_path, use_column_width=True)
                     # Zip and download option for images

 import os
 from urllib.parse import urljoin
+def apply_theme(theme):
+    """
+    Apply custom CSS based on the selected theme
+    """
+    if theme == "Claro":
+        st.markdown("""
+        <style>
+        body {
+            color: black;
+            background-color: white;
+        }
+        .stTextInput > div > div > input {
+            color: black;
+            background-color: white;
+        }
+        .stMarkdown {
+            color: black;
+        }
+        </style>
+        """, unsafe_allow_html=True)
+    else:
+        st.markdown("""
+        <style>
+        body {
+            color: white;
+            background-color: #0E1117;
+        }
+        .stTextInput > div > div > input {
+            color: white;
+            background-color: #262730;
+        }
+        .stMarkdown {
+            color: white;
+        }
+        </style>
+        """, unsafe_allow_html=True)
+def scrape_web_content(url, max_images):
     """
     Scrape the web content while preserving its original formatting
     Args:
         url (str): URL of the webpage
+        max_images (int): Maximum number of images to download
     Returns:
         dict: Extracted content with text, HTML, and images
         # Download images
         downloaded_images = []
         img_tags = soup.find_all('img', src=True)
+        for i, img in enumerate(img_tags[:max_images], 1):
             try:
                 # Get the image source URL
                 img_url = img['src']
     """
     Main Streamlit application
     """
+    # Set page config
+    st.set_page_config(page_title="Web Content Scraper", page_icon=":mag:", layout="wide")
+    # Theme selector
+    theme = st.sidebar.radio("Seleccionar Tema:", ["Oscuro", "Claro"])
+    # Apply selected theme
+    apply_theme(theme)
+    st.title("Web Content Scraper")
     # Get the URL from the user
     url_input = st.text_input("Enter the URL of the web page:", "")
     display_mode = st.radio("Display Mode:",
                              ["Full HTML", "Plain Text", "Side-by-Side"])
+    # Slider for maximum images (1-40)
+    max_images = st.slider("Maximum number of images to download", 1, 40, 10)
     if st.button("Scrape Content"):
         if url_input:
             # Scrape the content
+            scraped_content = scrape_web_content(url_input, max_images)
             if scraped_content:
                 st.success("Content successfully scraped!")
                 # Display images
                 if scraped_content['images']:
+                    st.subheader(f"Downloaded Images ({len(scraped_content['images'])} of {max_images})")
+                    # Create a grid of image columns
+                    cols = st.columns(4)  # 4 columns for better layout with more images
                     for i, img_path in enumerate(scraped_content['images']):
+                        with cols[i % 4]:
                             st.image(img_path, use_column_width=True)
                     # Zip and download option for images