import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
import os
from urllib.parse import urljoin
def apply_theme(theme):
    """Emit the markup for the selected theme and return its HTML prefix.

    The markup is pushed to the page through ``st.markdown`` with
    ``unsafe_allow_html=True``. In the current code both theme markup
    strings consist of a single newline (no CSS rules are present).

    Args:
        theme: Name of the selected theme; ``"Claro"`` selects the light
            branch, any other value falls through to the default branch.

    Returns:
        ``"\\n"`` when ``theme`` is ``"Claro"``, otherwise an empty string.
    """
    light_markup = "\n"
    default_markup = "\n"

    # Guard clause: everything that is not the light theme takes the
    # default branch and contributes no prefix.
    if theme != "Claro":
        st.markdown(default_markup, unsafe_allow_html=True)
        return ""

    st.markdown(light_markup, unsafe_allow_html=True)
    return "\n"
# Seconds to wait on any single HTTP request. Without an explicit timeout,
# requests may block indefinitely on an unresponsive server.
_REQUEST_TIMEOUT = 15


def _image_extension(img_url, default="img"):
    """Return a filesystem-safe extension (no leading dot) for ``img_url``.

    Only the path component of the URL is inspected, so query strings and
    fragments never leak into the result. When the path has no usable
    extension, ``default`` is returned instead.
    """
    # Local import: the file-level import only brings in ``urljoin``.
    from urllib.parse import urlparse

    suffix = os.path.splitext(urlparse(img_url).path)[1].lstrip(".")
    # Reject anything that is not a short alphanumeric token so the value
    # can never contain path separators or other unsafe characters.
    if suffix.isalnum() and len(suffix) <= 5:
        return suffix
    return default


def scrape_web_content(url, max_images, theme):
    """Scrape a web page, download its images and return its content.

    Up to ``max_images`` ``<img>`` tags that carry a ``src`` attribute are
    downloaded into the ``downloaded_images`` directory, and each tag's
    ``src`` is rewritten to point at the local copy. ``script``, ``style``,
    ``meta``, ``link`` and ``noscript`` tags are then removed from the
    document.

    Args:
        url: Address of the page to scrape; also the base used to resolve
            relative image URLs.
        max_images: Maximum number of images to download.
        theme: Selected theme name. When it is ``"Claro"`` the value
            returned by ``apply_theme`` is prepended to the HTML.

    Returns:
        A dict with the keys ``'html'`` (the cleaned document as a string),
        ``'plain_text'`` (the document text, one fragment per line) and
        ``'images'`` (list of saved file paths), or ``None`` if scraping
        failed. Failures are reported through ``st.error``; a failure on an
        individual image is reported through ``st.warning`` and skipped.
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        os.makedirs('downloaded_images', exist_ok=True)
        downloaded_images = []
        img_tags = soup.find_all('img', src=True)
        for i, img in enumerate(img_tags[:max_images], 1):
            # Best effort: one broken image must not abort the whole scrape.
            try:
                img_url = img['src']
                if not img_url.startswith(('http://', 'https://')):
                    img_url = urljoin(url, img_url)
                img_response = requests.get(img_url, timeout=_REQUEST_TIMEOUT)
                img_response.raise_for_status()
                filename = f'downloaded_images/image_{i}.{_image_extension(img_url)}'
                with open(filename, 'wb') as f:
                    f.write(img_response.content)
                # Point the tag at the local copy so the returned HTML
                # references the downloaded file.
                img['src'] = filename
                downloaded_images.append(filename)
            except Exception as img_error:
                st.warning(f"Could not download image {i}: {img_error}")
        for tag in soup(["script", "style", "meta", "link", "noscript"]):
            tag.decompose()
        theme_prefix = apply_theme(theme) if theme == "Claro" else ""
        formatted_html = theme_prefix + str(soup)
        plain_text = soup.get_text(separator='\n', strip=True)
        return {
            'html': formatted_html,
            'plain_text': plain_text,
            'images': downloaded_images
        }
    except Exception as e:
        st.error(f"Error occurred while scraping the content: {e}")
        return None
def main():
    """Build the Streamlit page and run a scrape when the button is pressed.

    Configures the page, renders the theme selector, URL field, display
    mode selector and image-count slider, then scrapes the given URL once
    "Scrape Content" is clicked. An empty URL produces a warning instead.
    """
    st.set_page_config(page_title="Web Content Scraper", page_icon=":mag:", layout="wide")

    selected_theme = st.sidebar.radio("Seleccionar Tema:", ["Oscuro", "Claro"])
    apply_theme(selected_theme)

    st.title("Web Content Scraper")
    page_url = st.text_input("Enter the URL of the web page:", "")
    # The selection is not consumed yet in the visible code; the widget is
    # still rendered so the control appears on the page.
    display_mode = st.radio("Display Mode:", ["Full HTML", "Plain Text", "Side-by-Side"])
    image_limit = st.slider("Maximum number of images to download", 1, 40, 10)

    # Nothing more to do until the user presses the button.
    if not st.button("Scrape Content"):
        return

    if not page_url:
        st.warning("Please enter a valid URL.")
        return

    result = scrape_web_content(page_url, image_limit, selected_theme)
    if result:
        st.success("Content successfully scraped!")
        # Remaining code to display the content...
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()