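# Streamlit web scraper with image downloader.
#
# A minimal way to run this app locally (assuming the file is saved as
# app.py and the dependencies are installed):
#
#   pip install streamlit requests beautifulsoup4
#   streamlit run app.py
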
import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
import os
import zipfile
from urllib.parse import urljoin


def download_images(url, max_images=10):
    """
    Download images from the given URL

    Args:
        url (str): URL of the webpage
        max_images (int): Maximum number of images to download

    Returns:
        tuple: List of downloaded image paths and total image count
    """
    try:
        # Create a directory to save images if it doesn't exist
        os.makedirs('downloaded_images', exist_ok=True)

        # Send a request to the URL
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all image tags that carry a src attribute
        img_tags = soup.find_all('img', src=True)

        # List to store downloaded image paths
        downloaded_images = []

        # Download up to max_images images
        for i, img in enumerate(img_tags[:max_images], 1):
            # Get the image source URL
            img_url = img['src']

            # Resolve relative URLs against the page URL
            if not img_url.startswith(('http://', 'https://')):
                img_url = urljoin(url, img_url)

            try:
                # Download the image
                img_response = requests.get(img_url, timeout=10)
                img_response.raise_for_status()

                # Build a unique filename from the URL's extension,
                # dropping any query string
                ext = img_url.split(".")[-1].split("?")[0]
                filename = f'downloaded_images/image_{i}.{ext}'

                # Save the image
                with open(filename, 'wb') as f:
                    f.write(img_response.content)
                downloaded_images.append(filename)
            except Exception as img_error:
                st.warning(f"Could not download image {i}: {img_error}")

        return downloaded_images, len(img_tags)
    except Exception as e:
        st.error(f"Error occurred while downloading images: {e}")
        return [], 0
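
# A rough standalone sketch of exercising download_images outside the
# Streamlit UI (the URL below is a placeholder, not part of the original
# app); note that the st.warning/st.error calls inside the function
# assume a Streamlit script context:
#
#   paths, total = download_images("https://example.com", max_images=3)
#   print(f"saved {len(paths)} of {total} images found on the page")
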
def scrape_visible_text_from_url(url):
    """
    Scrape visible text from the given URL

    Args:
        url (str): URL of the webpage

    Returns:
        str: Extracted visible text
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get the header content first: it must be captured before the
        # cleanup loop below removes <header> tags from the tree
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""

        # Remove script, style, and other non-visible tags
        for tag in soup(["script", "style", "meta", "link", "noscript",
                         "header", "footer", "aside", "nav"]):
            tag.extract()

        # Get the paragraph content
        paragraph_content = soup.find_all("p")
        paragraph_text = " ".join([p.get_text() for p in paragraph_content])

        # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"

        # Collapse runs of whitespace and newlines into single spaces
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    except Exception as e:
        st.error(f"Error occurred while scraping the data: {e}")
        return None
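
# A similar sketch for the text scraper (placeholder URL again); the
# result is a single whitespace-normalized string, since the re.sub call
# collapses every run of spaces and newlines to one space:
#
#   text = scrape_visible_text_from_url("https://example.com")
#   print(text[:200] if text else "scrape failed")
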
def main():
    """
    Main Streamlit application
    """
    st.title("Web Data Scraper with Image Downloader")

    # Get the URL from the user
    url_input = st.text_input("Enter the URL of the web page:", "")

    # Maximum images to download slider
    max_images = st.slider("Maximum number of images to download", 1, 20, 10)

    if st.button("Scrape Content"):
        if url_input:
            # Extract visible text from the URL
            text_data = scrape_visible_text_from_url(url_input)

            # Download images
            downloaded_images, total_images = download_images(url_input, max_images)

            if text_data:
                st.success("Content successfully scraped!")

                # Display text
                st.subheader("Scraped Text:")
                st.write(text_data)

                # Display and manage images
                st.subheader(f"Images (Downloaded {len(downloaded_images)} out of {total_images} total)")

                if downloaded_images:
                    # Lay the downloaded images out in up to three columns
                    cols = st.columns(min(len(downloaded_images), 3))
                    for i, img_path in enumerate(downloaded_images):
                        with cols[i % 3]:
                            st.image(img_path, use_column_width=True)

                    # Bundle all downloaded images into a zip archive
                    with zipfile.ZipFile('downloaded_images.zip', 'w') as zip_file:
                        for img_path in downloaded_images:
                            zip_file.write(img_path, os.path.basename(img_path))

                    # Provide a download option for all images
                    with open('downloaded_images.zip', 'rb') as zf:
                        st.download_button(
                            label="Download All Images",
                            data=zf.read(),
                            file_name='downloaded_images.zip',
                            mime='application/zip'
                        )
                else:
                    st.warning("No images found on the page.")
            else:
                st.warning("Failed to scrape content from the URL.")
        else:
            st.warning("Please enter a valid URL.")


if __name__ == "__main__":
    main()