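"""Streamlit app that scrapes visible text from a web page and downloads the images it finds."""
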
import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
import os
import zipfile
from urllib.parse import urljoin


def download_images(url, max_images=10):
    """
    Download images from the given URL.

    Args:
        url (str): URL of the webpage
        max_images (int): Maximum number of images to download

    Returns:
        tuple: List of downloaded image paths and total image count
    """
    try:
        # Create a directory to save images if it doesn't exist
        os.makedirs('downloaded_images', exist_ok=True)

        # Send a request to the URL
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all image tags that carry a src attribute
        img_tags = soup.find_all('img', src=True)

        # List to store downloaded image paths
        downloaded_images = []

        # Download up to max_images images
        for i, img in enumerate(img_tags[:max_images], 1):
            # Get the image source URL
            img_url = img['src']

            # Resolve relative URLs against the page URL
            if not img_url.startswith(('http://', 'https://')):
                img_url = urljoin(url, img_url)

            try:
                # Download the image
                img_response = requests.get(img_url, timeout=10)
                img_response.raise_for_status()

                # Derive a file extension from the URL, falling back to
                # 'jpg' when the URL has no recognizable image extension
                ext = img_url.split('.')[-1].split('?')[0].lower()
                if ext not in ('jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'svg'):
                    ext = 'jpg'
                filename = f'downloaded_images/image_{i}.{ext}'

                # Save the image
                with open(filename, 'wb') as f:
                    f.write(img_response.content)
                downloaded_images.append(filename)
            except Exception as img_error:
                st.warning(f"Could not download image {i}: {img_error}")

        return downloaded_images, len(img_tags)
    except Exception as e:
        st.error(f"Error occurred while downloading images: {e}")
        return [], 0
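
# Example usage (hypothetical URL; files are written under downloaded_images/):
#   paths, total = download_images("https://example.com", max_images=5)
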
def scrape_visible_text_from_url(url):
    """
    Scrape visible text from the given URL.

    Args:
        url (str): URL of the webpage

    Returns:
        str: Extracted visible text
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Capture the header text first, since <header> is among the
        # non-visible tags stripped from the tree below
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""

        # Remove script, style, and other non-visible tags
        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav"]):
            tag.extract()

        # Get the paragraph content
        paragraph_content = soup.find_all("p")
        paragraph_text = " ".join(p.get_text() for p in paragraph_content)

        # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"

        # Collapse runs of whitespace and newlines into single spaces
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    except Exception as e:
        st.error(f"Error occurred while scraping the data: {e}")
        return None
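
# Example usage (hypothetical URL):
#   text = scrape_visible_text_from_url("https://example.com")
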
def main():
    """
    Main Streamlit application.
    """
    st.title("Web Data Scraper with Image Downloader")

    # Get the URL from the user
    url_input = st.text_input("Enter the URL of the web page:", "")

    # Maximum images to download slider
    max_images = st.slider("Maximum number of images to download", 1, 20, 10)

    if st.button("Scrape Content"):
        if url_input:
            # Extract visible text from the URL
            text_data = scrape_visible_text_from_url(url_input)

            # Download images
            downloaded_images, total_images = download_images(url_input, max_images)

            if text_data:
                st.success("Content successfully scraped!")

                # Display text
                st.subheader("Scraped Text:")
                st.write(text_data)

                # Display and manage images
                st.subheader(f"Images (Downloaded {len(downloaded_images)} out of {total_images} total)")

                if downloaded_images:
                    # Lay the images out in up to three columns
                    cols = st.columns(min(len(downloaded_images), 3))
                    for i, img_path in enumerate(downloaded_images):
                        with cols[i % len(cols)]:
                            st.image(img_path, use_column_width=True)

                    # Bundle the downloaded images into a zip archive
                    with zipfile.ZipFile('downloaded_images.zip', 'w') as zip_file:
                        for img_path in downloaded_images:
                            zip_file.write(img_path, os.path.basename(img_path))

                    # Provide a download option for all images
                    with open('downloaded_images.zip', 'rb') as zipf:
                        st.download_button(
                            label="Download All Images",
                            data=zipf.read(),
                            file_name='downloaded_images.zip',
                            mime='application/zip'
                        )
                else:
                    st.warning("No images found on the page.")
            else:
                st.warning("Failed to scrape content from the URL.")
        else:
            st.warning("Please enter a valid URL.")


if __name__ == "__main__":
    main()
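
# Launch with the Streamlit CLI, e.g. (assuming this file is saved as app.py):
#   streamlit run app.py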