"""Streamlit app that downloads a web page, saves its images, and shows the result."""

import os
import re
from typing import Optional
from urllib.parse import urljoin, urlparse

import requests
import streamlit as st
from bs4 import BeautifulSoup

# Seconds to wait for any single HTTP request; without a timeout a stalled
# server would block the Streamlit script run forever.
REQUEST_TIMEOUT = 10

# Directory (relative to the working directory) where images are saved.
IMAGE_DIR = 'downloaded_images'

# Extension used when the image URL's path does not carry a usable one.
DEFAULT_IMAGE_EXTENSION = 'img'

# Theme labels as shown in the sidebar (user-facing, kept in Spanish).
DARK_THEME = "Oscuro"
LIGHT_THEME = "Claro"

# Tags removed from the page before it is rendered or converted to text.
NON_CONTENT_TAGS = ["script", "style", "meta", "link", "noscript"]


def _theme_prefix(theme: str) -> str:
    """Return the string prepended to scraped HTML for *theme* (no side effects)."""
    if theme == LIGHT_THEME:
        return """
"""
    return ""


def apply_theme(theme: str) -> str:
    """Inject the theme's markup into the page and return its HTML prefix.

    Args:
        theme: Theme label selected in the sidebar ("Claro" or anything else,
            which is treated as the dark theme).

    Returns:
        The prefix string for the theme: a newline for "Claro", otherwise an
        empty string.
    """
    # NOTE(review): both markdown payloads are blank in this file; presumably
    # the per-theme <style> blocks were lost - confirm against the original.
    if theme == LIGHT_THEME:
        st.markdown(""" """, unsafe_allow_html=True)
    else:
        st.markdown(""" """, unsafe_allow_html=True)
    return _theme_prefix(theme)


def _image_extension(img_url: str, default: str = DEFAULT_IMAGE_EXTENSION) -> str:
    """Return the file extension (without dot) from the URL's path, or *default*.

    Only the path component is inspected, so dots in the host name or in the
    query string cannot leak into the file name.
    """
    suffix = os.path.splitext(urlparse(img_url).path)[1].lstrip(".")
    return suffix if suffix.isalnum() else default


def _download_images(soup, base_url: str, max_images: int) -> list:
    """Download up to *max_images* images from *soup* and point their src at the local copy.

    Failures are reported with ``st.warning`` and skipped, so one bad image
    does not abort the scrape. Returns the list of local file paths written.
    """
    os.makedirs(IMAGE_DIR, exist_ok=True)
    saved_files = []
    for i, img in enumerate(soup.find_all('img', src=True)[:max_images], 1):
        try:
            # urljoin leaves absolute URLs untouched and resolves relative
            # and protocol-relative ones against the page URL.
            img_url = urljoin(base_url, img['src'])
            img_response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
            img_response.raise_for_status()
            filename = f'{IMAGE_DIR}/image_{i}.{_image_extension(img_url)}'
            with open(filename, 'wb') as f:
                f.write(img_response.content)
        except (requests.RequestException, OSError, ValueError) as img_error:
            st.warning(f"Could not download image {i}: {img_error}")
        else:
            img['src'] = filename
            saved_files.append(filename)
    return saved_files


def scrape_web_content(url: str, max_images: int, theme: str) -> Optional[dict]:
    """Scrape the web content while preserving its original formatting.

    Args:
        url: Address of the page to fetch.
        max_images: Maximum number of ``<img>`` tags to download.
        theme: Theme label; selects the prefix prepended to the returned HTML.

    Returns:
        A dict with keys ``'html'`` (cleaned page markup with the theme
        prefix), ``'plain_text'`` (visible text, one fragment per line) and
        ``'images'`` (local paths of downloaded images), or ``None`` if the
        scrape failed; the failure is reported via ``st.error``.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        downloaded_images = _download_images(soup, url, max_images)

        for tag in soup(NON_CONTENT_TAGS):
            tag.decompose()

        # Use the side-effect-free prefix: main() has already applied the
        # theme, so calling apply_theme() here would inject it a second time.
        formatted_html = _theme_prefix(theme) + str(soup)
        plain_text = soup.get_text(separator='\n', strip=True)
        return {
            'html': formatted_html,
            'plain_text': plain_text,
            'images': downloaded_images,
        }
    except Exception as e:
        # UI boundary: any failure is shown to the user instead of crashing
        # the Streamlit script run.
        st.error(f"Error occurred while scraping the content: {e}")
        return None


def main():
    """Main Streamlit application."""
    st.set_page_config(page_title="Web Content Scraper", page_icon=":mag:", layout="wide")

    theme = st.sidebar.radio("Seleccionar Tema:", [DARK_THEME, LIGHT_THEME])
    apply_theme(theme)

    st.title("Web Content Scraper")
    url_input = st.text_input("Enter the URL of the web page:", "")
    # NOTE(review): display_mode is not read in the visible code; presumably
    # the elided display section below consumes it.
    display_mode = st.radio("Display Mode:", ["Full HTML", "Plain Text", "Side-by-Side"])
    max_images = st.slider("Maximum number of images to download", 1, 40, 10)

    if st.button("Scrape Content"):
        # Treat whitespace-only input the same as an empty field.
        url = url_input.strip()
        if url:
            scraped_content = scrape_web_content(url, max_images, theme)
            if scraped_content:
                st.success("Content successfully scraped!")
                # Rest of the code to display the content...
        else:
            st.warning("Please enter a valid URL.")


if __name__ == "__main__":
    main()