web_scraper

Running

App Files Community

web_scraper / app.py

Marcepelaez

app

4680d28 verified 19 days ago

raw

history blame

8.11 kB

	import streamlit as st
	import requests
	from bs4 import BeautifulSoup
	import re
	import os
	from urllib.parse import urljoin

	def apply_theme(theme):
	    """Inject theme CSS into the Streamlit page and return an HTML prefix.

	    Args:
	        theme (str): Theme name. "Claro" selects the light palette; any
	            other value gets the dark palette.

	    Returns:
	        str: For "Claro", an HTML fragment (an opening ``<div>`` followed
	        by a ``<style>`` element) intended to be prepended to scraped HTML;
	        for every other theme, an empty string.
	    """
	    # Anything that is not the light theme is rendered with the dark palette.
	    if theme != "Claro":
	        dark_css = """
	<style>
	body {
	color: white;
	background-color: #0E1117;
	}
	.stTextInput > div > div > input {
	color: white;
	background-color: #262730;
	}
	.stMarkdown {
	color: white;
	}
	</style>
	"""
	        st.markdown(dark_css, unsafe_allow_html=True)
	        return ""

	    # Light palette: page-level CSS first, then the prefix for embedded HTML.
	    light_css = """
	<style>
	body {
	color: black;
	background-color: white;
	}
	.stTextInput > div > div > input {
	color: black;
	background-color: white;
	}
	.stMarkdown {
	color: black;
	}
	/* Light theme for HTML content */
	.light-theme {
	background-color: white !important;
	color: black !important;
	}
	.light-theme a {
	color: #0066cc !important;
	}
	.light-theme h1, .light-theme h2, .light-theme h3,
	.light-theme h4, .light-theme h5, .light-theme h6 {
	color: #333 !important;
	}
	</style>
	"""
	    st.markdown(light_css, unsafe_allow_html=True)

	    # The <div> opened in this fragment is not closed by this function.
	    light_prefix = """
	<div style="background-color: white; color: black; padding: 20px;">
	<style>
	body { background-color: white !important; color: black !important; }
	a { color: #0066cc; }
	h1, h2, h3, h4, h5, h6 { color: #333; }
	</style>
	"""
	    return light_prefix

	# Seconds to wait on a remote server before abandoning an HTTP request.
	_REQUEST_TIMEOUT_SECONDS = 10


	def _image_extension(img_url, content_type=None):
	    """Return a filesystem-safe, lowercase file extension for an image.

	    The extension is taken from the path component of ``img_url`` (query
	    string and fragment are ignored). When the path has no usable suffix,
	    it is guessed from ``content_type``; ``"bin"`` is the last resort.

	    Args:
	        img_url (str): Absolute URL the image was fetched from.
	        content_type (str | None): Value of the response's Content-Type
	            header, if any.

	    Returns:
	        str: Extension without the leading dot; never contains a path
	        separator.
	    """
	    import mimetypes
	    from urllib.parse import urlparse

	    suffix = os.path.splitext(urlparse(img_url).path)[1].lstrip(".").lower()
	    # Only plain ASCII alphanumerics are accepted so the result can be
	    # embedded in a file name without creating unexpected path components.
	    if suffix.isascii() and suffix.isalnum():
	        return suffix

	    if content_type:
	        # Drop parameters such as "; charset=..." before the lookup.
	        media_type = content_type.split(";", 1)[0].strip()
	        guessed = mimetypes.guess_extension(media_type)
	        if guessed:
	            return guessed.lstrip(".")

	    return "bin"


	def scrape_web_content(url, max_images, theme):
	    """Scrape a web page, download its images, and return its content.

	    Up to ``max_images`` ``<img>`` elements that have a ``src`` attribute
	    are downloaded into the ``downloaded_images`` directory, and each
	    downloaded tag's ``src`` is rewritten to the local file path. Script,
	    style, meta, link and noscript tags are then removed from the document.

	    Args:
	        url (str): URL of the webpage.
	        max_images (int): Maximum number of images to download.
	        theme (str): Selected theme (Claro/Oscuro).

	    Returns:
	        dict | None: ``{'html': str, 'plain_text': str, 'images': list}``
	        on success. On any failure fetching or parsing the page, the error
	        is reported with ``st.error`` and ``None`` is returned. A failure
	        on a single image is reported with ``st.warning`` and that image
	        is skipped.
	    """
	    try:
	        # A timeout keeps a stalled server from hanging the app forever.
	        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
	        response.raise_for_status()

	        soup = BeautifulSoup(response.content, 'html.parser')

	        os.makedirs('downloaded_images', exist_ok=True)

	        downloaded_images = []
	        img_tags = soup.find_all('img', src=True)
	        for i, img in enumerate(img_tags[:max_images], 1):
	            try:
	                img_url = img['src']

	                # Resolve relative sources against the page URL.
	                if not img_url.startswith(('http://', 'https://')):
	                    img_url = urljoin(url, img_url)

	                img_response = requests.get(
	                    img_url, timeout=_REQUEST_TIMEOUT_SECONDS
	                )
	                img_response.raise_for_status()

	                # Derive the extension from the URL path (not the raw URL
	                # string) so extension-less URLs cannot inject "/" into
	                # the file name.
	                extension = _image_extension(
	                    img_url, img_response.headers.get('Content-Type')
	                )
	                filename = f'downloaded_images/image_{i}.{extension}'

	                with open(filename, 'wb') as f:
	                    f.write(img_response.content)

	                # Point the tag at the local copy.
	                img['src'] = filename
	                downloaded_images.append(filename)

	            except Exception as img_error:
	                # Best effort: one bad image must not abort the whole scrape.
	                st.warning(f"Could not download image {i}: {img_error}")

	        # Remove unwanted tags
	        for tag in soup(["script", "style", "meta", "link", "noscript"]):
	            tag.decompose()

	        # Only the light theme contributes an HTML prefix.
	        theme_prefix = apply_theme(theme) if theme == "Claro" else ""

	        formatted_html = theme_prefix + str(soup)
	        plain_text = soup.get_text(separator='\n', strip=True)

	        return {
	            'html': formatted_html,
	            'plain_text': plain_text,
	            'images': downloaded_images,
	        }

	    except Exception as e:
	        # UI boundary: surface the problem to the user instead of crashing.
	        st.error(f"Error occurred while scraping the content: {e}")
	        return None

	def _zip_images(image_paths):
	    """Return the bytes of a ZIP archive containing ``image_paths``.

	    Each file is stored under its base name. The archive is assembled in
	    memory, so nothing is written to disk and no file handle is left open.

	    Args:
	        image_paths (list[str]): Paths of the files to include.

	    Returns:
	        bytes: The complete ZIP archive.
	    """
	    import io
	    import zipfile

	    buffer = io.BytesIO()
	    with zipfile.ZipFile(buffer, 'w') as archive:
	        for img_path in image_paths:
	            archive.write(img_path, os.path.basename(img_path))
	    return buffer.getvalue()


	def main():
	    """Run the Streamlit web-scraper UI.

	    Renders the theme selector, URL input, display-mode selector and
	    image-count slider. When "Scrape Content" is pressed, the URL is
	    scraped and shown as HTML, plain text, or both side by side, followed
	    by a grid of the downloaded images and a button that downloads them as
	    a ZIP archive.
	    """
	    st.set_page_config(page_title="Web Content Scraper", page_icon=":mag:", layout="wide")

	    # Theme selector; the chosen theme's CSS is injected immediately.
	    theme = st.sidebar.radio("Seleccionar Tema:", ["Oscuro", "Claro"])
	    apply_theme(theme)

	    st.title("Web Content Scraper")

	    url_input = st.text_input("Enter the URL of the web page:", "")
	    display_mode = st.radio("Display Mode:", ["Full HTML", "Plain Text", "Side-by-Side"])

	    # Slider for maximum images (1-40)
	    max_images = st.slider("Maximum number of images to download", 1, 40, 10)

	    if not st.button("Scrape Content"):
	        return

	    # Whitespace-only input is treated the same as an empty field.
	    url = url_input.strip()
	    if not url:
	        st.warning("Please enter a valid URL.")
	        return

	    scraped_content = scrape_web_content(url, max_images, theme)
	    if not scraped_content:
	        st.warning("Failed to scrape content from the URL.")
	        return

	    st.success("Content successfully scraped!")

	    html = scraped_content['html']
	    plain_text = scraped_content['plain_text']
	    images = scraped_content['images']

	    if display_mode == "Full HTML":
	        st.markdown("### Formatted Web Content")
	        st.components.v1.html(html, height=600, scrolling=True)
	    elif display_mode == "Plain Text":
	        st.markdown("### Plain Text Content")
	        st.text_area("Scraped Text:", plain_text, height=400)
	    else:  # Side-by-Side
	        col1, col2 = st.columns(2)
	        with col1:
	            st.markdown("### Formatted HTML")
	            st.components.v1.html(html, height=600, scrolling=True)
	        with col2:
	            st.markdown("### Plain Text")
	            st.text_area("Scraped Text:", plain_text, height=600)

	    if not images:
	        return

	    st.subheader(f"Downloaded Images ({len(images)} of {max_images})")

	    # Lay the images out round-robin across four columns.
	    cols = st.columns(4)
	    for i, img_path in enumerate(images):
	        with cols[i % 4]:
	            # NOTE(review): `use_column_width` appears to be deprecated in
	            # newer Streamlit releases - confirm against the pinned version.
	            st.image(img_path, use_column_width=True)

	    # NOTE(review): this button lives inside the "Scrape Content" branch, so
	    # the rerun triggered by clicking it presumably hides the results -
	    # confirm and consider keeping the scrape result in st.session_state.
	    st.download_button(
	        label="Download All Images",
	        data=_zip_images(images),
	        file_name='downloaded_images.zip',
	        mime='application/zip',
	    )

	# Script entry point: launch the app only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()