import io
import mimetypes
import os
import re
import zipfile
from urllib.parse import urljoin, urlparse

import requests
import streamlit as st
from bs4 import BeautifulSoup

# Directory (relative to the working directory) where scraped images are saved.
IMAGE_DIR = 'downloaded_images'

# Tags stripped from the page before it is rendered or converted to text.
UNWANTED_TAGS = ["script", "style", "meta", "link", "noscript"]


def _image_extension(img_url, content_type=None):
    """
    Pick a file extension (without the leading dot) for a downloaded image.

    The extension is taken from the path component of the URL, so query
    strings and dots in the host name cannot leak into it. If the path has
    no usable extension, the response Content-Type is consulted; if that
    also fails, 'bin' is returned.

    Args:
        img_url (str): Absolute URL the image was fetched from
        content_type (str | None): Value of the Content-Type response header

    Returns:
        str: Extension such as 'png' or 'jpg'
    """
    suffix = os.path.splitext(urlparse(img_url).path)[1].lstrip('.')
    # Reject empty or odd suffixes (e.g. containing '/') that would produce
    # an invalid file name.
    if suffix.isalnum() and len(suffix) <= 5:
        return suffix

    mime_type = (content_type or '').split(';')[0].strip()
    guessed = mimetypes.guess_extension(mime_type) if mime_type else None
    if guessed:
        return guessed.lstrip('.')
    return 'bin'


def _download_images(soup, page_url, max_images, timeout):
    """
    Download the first images of the page and repoint their tags locally.

    Each successfully saved image has its ``src`` attribute in ``soup``
    rewritten to the local file path. A failure on one image is reported
    with ``st.warning`` and does not stop the remaining downloads.

    Args:
        soup (BeautifulSoup): Parsed page; modified in place
        page_url (str): URL of the page, used to resolve relative sources
        max_images (int): Maximum number of <img> tags to process
        timeout (float): Per-request timeout in seconds

    Returns:
        list[str]: Paths of the files that were written
    """
    os.makedirs(IMAGE_DIR, exist_ok=True)

    saved = []
    img_tags = soup.find_all('img', src=True)
    for i, img in enumerate(img_tags[:max_images], 1):
        try:
            # urljoin leaves absolute URLs untouched and resolves relative
            # and protocol-relative ones against the page URL.
            img_url = urljoin(page_url, img['src'])

            img_response = requests.get(img_url, timeout=timeout)
            img_response.raise_for_status()

            extension = _image_extension(
                img_url, img_response.headers.get('Content-Type')
            )
            filename = f'{IMAGE_DIR}/image_{i}.{extension}'

            with open(filename, 'wb') as f:
                f.write(img_response.content)

            # Update the image tag in the soup to point to local file
            img['src'] = filename
            saved.append(filename)
        except Exception as img_error:
            # Best effort: one broken image must not abort the whole scrape.
            st.warning(f"Could not download image {i}: {img_error}")
    return saved


def scrape_web_content(url, max_images=10, timeout=10):
    """
    Scrape the web content while preserving its original formatting

    Args:
        url (str): URL of the webpage
        max_images (int): Maximum number of images to download
        timeout (float): Timeout in seconds applied to every HTTP request

    Returns:
        dict | None: Extracted content with keys 'html', 'plain_text' and
        'images', or None when scraping failed (the error is shown with
        ``st.error``)
    """
    try:
        # A timeout keeps an unresponsive host from blocking the app forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        downloaded_images = _download_images(soup, url, max_images, timeout)

        # Remove unwanted tags
        for tag in soup(UNWANTED_TAGS):
            tag.decompose()

        return {
            'html': str(soup),
            'plain_text': soup.get_text(separator='\n', strip=True),
            'images': downloaded_images,
        }
    except Exception as e:
        # UI boundary: report the problem to the user instead of crashing.
        st.error(f"Error occurred while scraping the content: {e}")
        return None


def _build_zip(image_paths):
    """
    Pack the given files into an in-memory zip archive.

    Args:
        image_paths (list[str]): Paths of the files to include

    Returns:
        bytes: Content of the zip archive; entries are stored under their
        base names
    """
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, 'w') as zip_file:
        for img_path in image_paths:
            zip_file.write(img_path, os.path.basename(img_path))
    return buffer.getvalue()


def _render_content(scraped_content, display_mode):
    """
    Show the scraped HTML and/or plain text according to the display mode.

    Args:
        scraped_content (dict): Result of ``scrape_web_content``
        display_mode (str): "Full HTML", "Plain Text" or "Side-by-Side"
    """
    if display_mode == "Full HTML":
        st.markdown("### Formatted Web Content")
        st.components.v1.html(scraped_content['html'], height=600, scrolling=True)
    elif display_mode == "Plain Text":
        st.markdown("### Plain Text Content")
        st.text_area("Scraped Text:", scraped_content['plain_text'], height=400)
    else:  # Side-by-Side
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("### Formatted HTML")
            st.components.v1.html(scraped_content['html'], height=600, scrolling=True)
        with col2:
            st.markdown("### Plain Text")
            st.text_area("Scraped Text:", scraped_content['plain_text'], height=600)


def _render_images(image_paths):
    """
    Show the downloaded images in up to three columns plus a zip download.

    Args:
        image_paths (list[str]): Non-empty list of local image paths
    """
    st.subheader("Downloaded Images")
    cols = st.columns(min(len(image_paths), 3))
    for i, img_path in enumerate(image_paths):
        with cols[i % 3]:
            st.image(img_path, use_column_width=True)

    # The archive is built in memory, so no file handle is left open and no
    # zip file is written to the working directory.
    st.download_button(
        label="Download All Images",
        data=_build_zip(image_paths),
        file_name='downloaded_images.zip',
        mime='application/zip'
    )


def main():
    """
    Main Streamlit application
    """
    st.title("Web Content Scraper with Preserved Formatting")

    # Surrounding whitespace from copy/paste would make the request fail.
    url_input = st.text_input("Enter the URL of the web page:", "").strip()

    display_mode = st.radio(
        "Display Mode:", ["Full HTML", "Plain Text", "Side-by-Side"]
    )

    if not st.button("Scrape Content"):
        return

    if not url_input:
        st.warning("Please enter a valid URL.")
        return

    scraped_content = scrape_web_content(url_input)
    if not scraped_content:
        st.warning("Failed to scrape content from the URL.")
        return

    st.success("Content successfully scraped!")
    _render_content(scraped_content, display_mode)

    if scraped_content['images']:
        _render_images(scraped_content['images'])


if __name__ == "__main__":
    main()