"""Streamlit app that scrapes visible text from a web page and downloads its images.

The user supplies a URL; the app extracts header/paragraph text, saves up to
``max_images`` images under ``downloaded_images/``, previews them in a grid,
and offers a single zip download of everything fetched.
"""

import os
import re
import zipfile
from urllib.parse import urljoin

import requests
import streamlit as st
from bs4 import BeautifulSoup

# Seconds before an HTTP request is abandoned; prevents the app from hanging
# forever on an unresponsive host.
REQUEST_TIMEOUT = 10


def download_images(url, max_images=10):
    """
    Download images from the given URL.

    Args:
        url (str): URL of the webpage.
        max_images (int): Maximum number of images to download.

    Returns:
        tuple: List of downloaded image paths and total image count.
    """
    try:
        # Create a directory to save images if it doesn't exist
        os.makedirs('downloaded_images', exist_ok=True)

        # Fetch and parse the page
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Only consider <img> tags that actually carry a src attribute
        img_tags = soup.find_all('img', src=True)

        downloaded_images = []

        for i, img in enumerate(img_tags[:max_images], 1):
            img_url = img['src']

            # Resolve relative URLs (e.g. "/static/logo.png") against the page URL
            if not img_url.startswith(('http://', 'https://')):
                img_url = urljoin(url, img_url)

            try:
                img_response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
                img_response.raise_for_status()

                # Derive a file extension from the URL; fall back to "jpg" when
                # the URL has no usable extension (no dot, query residue, etc.)
                ext = img_url.split(".")[-1].split("?")[0]
                if not ext or len(ext) > 5 or not ext.isalnum():
                    ext = "jpg"
                filename = f'downloaded_images/image_{i}.{ext}'

                with open(filename, 'wb') as f:
                    f.write(img_response.content)

                downloaded_images.append(filename)
            except Exception as img_error:
                # Best-effort: a single failed image should not abort the batch
                st.warning(f"Could not download image {i}: {img_error}")

        return downloaded_images, len(img_tags)

    except Exception as e:
        st.error(f"Error occurred while downloading images: {e}")
        return [], 0


def scrape_visible_text_from_url(url):
    """
    Scrape visible text from the given URL.

    Args:
        url (str): URL of the webpage.

    Returns:
        str: Extracted visible text, or None on failure.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Strip tags whose content is never rendered as visible text
        for tag in soup(["script", "style", "meta", "link", "noscript"]):
            tag.extract()

        # Capture the header text BEFORE removing structural chrome below —
        # the original code stripped <header> first, so find("header") was
        # always None and header_text was silently always empty.
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""

        # Now drop structural chrome so paragraphs inside it are excluded
        for tag in soup(["header", "footer", "aside", "nav"]):
            tag.extract()

        # Collect all paragraph text from the remaining document
        paragraph_content = soup.find_all("p")
        paragraph_text = " ".join([p.get_text() for p in paragraph_content])

        # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"

        # Collapse runs of whitespace/newlines into single spaces
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()

    except Exception as e:
        st.error(f"Error occurred while scraping the data: {e}")
        return None


def main():
    """
    Main Streamlit application.
    """
    st.title("Web Data Scraper with Image Downloader")

    # Get the URL from the user
    url_input = st.text_input("Enter the URL of the web page:", "")

    # Maximum images to download slider
    max_images = st.slider("Maximum number of images to download", 1, 20, 10)

    if st.button("Scrape Content"):
        if url_input:
            # Extract visible text from the URL
            text_data = scrape_visible_text_from_url(url_input)

            # Download images
            downloaded_images, total_images = download_images(url_input, max_images)

            if text_data:
                st.success("Content successfully scraped!")

                # Display text
                st.subheader("Scraped Text:")
                st.write(text_data)

                # Display and manage images
                st.subheader(
                    f"Images (Downloaded {len(downloaded_images)} out of {total_images} total)"
                )

                if downloaded_images:
                    # Lay images out in up to 3 columns
                    cols = st.columns(min(len(downloaded_images), 3))
                    for i, img_path in enumerate(downloaded_images):
                        with cols[i % 3]:
                            # NOTE(review): use_column_width is deprecated in
                            # newer Streamlit in favor of use_container_width
                            st.image(img_path, use_column_width=True)

                    # Bundle every downloaded image into one zip archive.
                    # ZipFile manages the file handle itself; the previous
                    # version leaked a second handle via open(...).read().
                    zip_path = 'downloaded_images.zip'
                    with zipfile.ZipFile(zip_path, 'w') as zip_file:
                        for img_path in downloaded_images:
                            zip_file.write(img_path, os.path.basename(img_path))

                    with open(zip_path, 'rb') as zf:
                        st.download_button(
                            label="Download All Images",
                            data=zf.read(),
                            file_name='downloaded_images.zip',
                            mime='application/zip'
                        )
                else:
                    st.warning("No images found on the page.")
            else:
                st.warning("Failed to scrape content from the URL.")
        else:
            st.warning("Please enter a valid URL.")


if __name__ == "__main__":
    main()