# web_scraper / app.py
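"""Streamlit app that scrapes the visible text from a web page and downloads
the images it references.

Run with: streamlit run app.py
"""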

import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
import os
import zipfile
from urllib.parse import urljoin

def download_images(url, max_images=10):
    """
    Download images from the given URL.

    Args:
        url (str): URL of the webpage
        max_images (int): Maximum number of images to download

    Returns:
        tuple: List of downloaded image paths and total image count
    """
    try:
        # Create a directory to save images if it doesn't exist
        os.makedirs('downloaded_images', exist_ok=True)

        # Send a request to the URL, with a timeout so a slow site can't hang the app
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all image tags that have a src attribute
        img_tags = soup.find_all('img', src=True)

        # List to store downloaded image paths
        downloaded_images = []

        # Download images
        for i, img in enumerate(img_tags[:max_images], 1):
            # Get the image source URL
            img_url = img['src']

            # Handle relative URLs
            if not img_url.startswith(('http://', 'https://')):
                img_url = urljoin(url, img_url)

            try:
                # Download the image
                img_response = requests.get(img_url, timeout=10)
                img_response.raise_for_status()

                # Derive the extension from the URL path (query string stripped);
                # fall back to .jpg when the URL carries no extension
                ext = os.path.splitext(img_url.split('?')[0])[1] or '.jpg'
                filename = f'downloaded_images/image_{i}{ext}'

                # Save the image
                with open(filename, 'wb') as f:
                    f.write(img_response.content)
                downloaded_images.append(filename)
            except Exception as img_error:
                st.warning(f"Could not download image {i}: {img_error}")

        return downloaded_images, len(img_tags)
    except Exception as e:
        st.error(f"Error occurred while downloading images: {e}")
        return [], 0
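
# Note: some sites reject requests that lack a browser-like User-Agent and
# return 403 for the default python-requests one. If that happens, a headers
# dict can be passed to both requests.get calls above, e.g. (example value):
#   headers = {"User-Agent": "Mozilla/5.0"}
#   response = requests.get(url, headers=headers, timeout=10)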

def scrape_visible_text_from_url(url):
    """
    Scrape visible text from the given URL.

    Args:
        url (str): URL of the webpage

    Returns:
        str: Extracted visible text
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get the header content first, since the <header> tag is stripped below
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""

        # Remove script, style, and other non-visible tags
        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav"]):
            tag.extract()

        # Get the paragraph content
        paragraph_content = soup.find_all("p")
        paragraph_text = " ".join(p.get_text() for p in paragraph_content)

        # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"

        # Collapse runs of whitespace and newlines into single spaces
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    except Exception as e:
        st.error(f"Error occurred while scraping the data: {e}")
        return None
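
# Example usage (hypothetical URL):
#   text = scrape_visible_text_from_url("https://example.com")
#   if text:
#       print(text[:200])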

def main():
    """
    Main Streamlit application.
    """
    st.title("Web Data Scraper with Image Downloader")

    # Get the URL from the user
    url_input = st.text_input("Enter the URL of the web page:", "")

    # Maximum images to download slider
    max_images = st.slider("Maximum number of images to download", 1, 20, 10)

    if st.button("Scrape Content"):
        if url_input:
            # Extract visible text from the URL
            text_data = scrape_visible_text_from_url(url_input)

            # Download images
            downloaded_images, total_images = download_images(url_input, max_images)

            if text_data:
                st.success("Content successfully scraped!")

                # Display text
                st.subheader("Scraped Text:")
                st.write(text_data)

                # Display and manage images
                st.subheader(f"Images (Downloaded {len(downloaded_images)} out of {total_images} total)")

                if downloaded_images:
                    # Lay the images out in up to three columns
                    cols = st.columns(min(len(downloaded_images), 3))
                    for i, img_path in enumerate(downloaded_images):
                        with cols[i % len(cols)]:
                            st.image(img_path, use_column_width=True)

                    # Bundle all downloaded images into a zip archive
                    with zipfile.ZipFile('downloaded_images.zip', 'w') as zip_file:
                        for img_path in downloaded_images:
                            zip_file.write(img_path, os.path.basename(img_path))

                    # Provide a download option for the archive
                    with open('downloaded_images.zip', 'rb') as zf:
                        st.download_button(
                            label="Download All Images",
                            data=zf.read(),
                            file_name='downloaded_images.zip',
                            mime='application/zip'
                        )
                else:
                    st.warning("No images found on the page.")
            else:
                st.warning("Failed to scrape content from the URL.")
        else:
            st.warning("Please enter a valid URL.")

if __name__ == "__main__":
    main()