# web_scraper / app.py
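"""Streamlit app that scrapes the visible text from a web page and downloads
the images it references.

Run with: streamlit run app.py
"""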

import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
import os
import zipfile
from urllib.parse import urljoin

def download_images(url, max_images=10):
    """
    Download images from the given URL.

    Args:
        url (str): URL of the webpage
        max_images (int): Maximum number of images to download

    Returns:
        tuple: List of downloaded image paths and total image count
    """
    try:
        # Create a directory to save images if it doesn't exist
        os.makedirs('downloaded_images', exist_ok=True)

        # Send a request to the URL, with a timeout so a slow site can't hang the app
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all image tags that have a src attribute
        img_tags = soup.find_all('img', src=True)

        # List to store downloaded image paths
        downloaded_images = []

        # Download images
        for i, img in enumerate(img_tags[:max_images], 1):
            # Get the image source URL
            img_url = img['src']

            # Handle relative URLs
            if not img_url.startswith(('http://', 'https://')):
                img_url = urljoin(url, img_url)

            try:
                # Download the image
                img_response = requests.get(img_url, timeout=10)
                img_response.raise_for_status()

                # Derive the extension from the URL path (query string stripped);
                # fall back to .jpg when the URL carries no extension
                ext = os.path.splitext(img_url.split('?')[0])[1] or '.jpg'
                filename = f'downloaded_images/image_{i}{ext}'

                # Save the image
                with open(filename, 'wb') as f:
                    f.write(img_response.content)
                downloaded_images.append(filename)
            except Exception as img_error:
                st.warning(f"Could not download image {i}: {img_error}")

        return downloaded_images, len(img_tags)
    except Exception as e:
        st.error(f"Error occurred while downloading images: {e}")
        return [], 0
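
# Note: some sites reject requests that lack a browser-like User-Agent and
# return 403 for the default python-requests one. If that happens, a headers
# dict can be passed to both requests.get calls above, e.g. (example value):
#   headers = {"User-Agent": "Mozilla/5.0"}
#   response = requests.get(url, headers=headers, timeout=10)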

def scrape_visible_text_from_url(url):
    """
    Scrape visible text from the given URL.

    Args:
        url (str): URL of the webpage

    Returns:
        str: Extracted visible text
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get the header content first, since the <header> tag is stripped below
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""

        # Remove script, style, and other non-visible tags
        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav"]):
            tag.extract()

        # Get the paragraph content
        paragraph_content = soup.find_all("p")
        paragraph_text = " ".join(p.get_text() for p in paragraph_content)

        # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"

        # Collapse runs of whitespace and newlines into single spaces
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    except Exception as e:
        st.error(f"Error occurred while scraping the data: {e}")
        return None
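
# Example usage (hypothetical URL):
#   text = scrape_visible_text_from_url("https://example.com")
#   if text:
#       print(text[:200])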

def main():
    """
    Main Streamlit application.
    """
    st.title("Web Data Scraper with Image Downloader")

    # Get the URL from the user
    url_input = st.text_input("Enter the URL of the web page:", "")

    # Maximum images to download slider
    max_images = st.slider("Maximum number of images to download", 1, 20, 10)

    if st.button("Scrape Content"):
        if url_input:
            # Extract visible text from the URL
            text_data = scrape_visible_text_from_url(url_input)

            # Download images
            downloaded_images, total_images = download_images(url_input, max_images)

            if text_data:
                st.success("Content successfully scraped!")

                # Display text
                st.subheader("Scraped Text:")
                st.write(text_data)

                # Display and manage images
                st.subheader(f"Images (Downloaded {len(downloaded_images)} out of {total_images} total)")

                if downloaded_images:
                    # Lay the images out in up to three columns
                    cols = st.columns(min(len(downloaded_images), 3))
                    for i, img_path in enumerate(downloaded_images):
                        with cols[i % len(cols)]:
                            st.image(img_path, use_column_width=True)

                    # Bundle all downloaded images into a zip archive
                    with zipfile.ZipFile('downloaded_images.zip', 'w') as zip_file:
                        for img_path in downloaded_images:
                            zip_file.write(img_path, os.path.basename(img_path))

                    # Provide a download option for the archive
                    with open('downloaded_images.zip', 'rb') as zf:
                        st.download_button(
                            label="Download All Images",
                            data=zf.read(),
                            file_name='downloaded_images.zip',
                            mime='application/zip'
                        )
                else:
                    st.warning("No images found on the page.")
            else:
                st.warning("Failed to scrape content from the URL.")
        else:
            st.warning("Please enter a valid URL.")

if __name__ == "__main__":
    main()