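"""
Web Data Scraper with Image Downloader.

A Streamlit app that fetches a web page, extracts its visible text, and
downloads up to a user-chosen number of the images it references.

Run with (the filename here is an assumption; substitute this file's
actual name):

    streamlit run scraper_app.py
"""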
import io
import os
import re
import zipfile
from urllib.parse import urljoin, urlparse

import requests
import streamlit as st
from bs4 import BeautifulSoup

def download_images(url, max_images=10):
    """
    Download images from the given URL
    
    Args:
        url (str): URL of the webpage
        max_images (int): Maximum number of images to download
    
    Returns:
        tuple: List of downloaded image paths and total image count
    """
    try:
        # Create a directory to save images if it doesn't exist
        os.makedirs('downloaded_images', exist_ok=True)
        
        # Send a request to the URL; a timeout keeps a dead server from hanging the app
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        
        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all image tags
        img_tags = soup.find_all('img', src=True)
        
        # List to store downloaded image paths
        downloaded_images = []
        
        # Download images
        for i, img in enumerate(img_tags[:max_images], 1):
            # Get the image source URL
            img_url = img['src']
            
            # Handle relative URLs
            if not img_url.startswith(('http://', 'https://')):
                img_url = urljoin(url, img_url)
            
            try:
                # Download the image (timeout so one slow host can't stall the loop)
                img_response = requests.get(img_url, timeout=10)
                img_response.raise_for_status()
                
                # Derive the extension from the URL path; splitting the raw URL
                # on "." can yield garbage like "com/image" for extension-less
                # URLs. ".jpg" is a fallback assumption for those cases.
                ext = os.path.splitext(urlparse(img_url).path)[1] or '.jpg'
                filename = f'downloaded_images/image_{i}{ext}'
                
                # Save the image
                with open(filename, 'wb') as f:
                    f.write(img_response.content)
                
                downloaded_images.append(filename)
            
            except Exception as img_error:
                st.warning(f"Could not download image {i}: {img_error}")
        
        return downloaded_images, len(img_tags)
    
    except Exception as e:
        st.error(f"Error occurred while downloading images: {e}")
        return [], 0

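# Illustrative standalone usage (assumes a reachable URL; errors and warnings
# are surfaced via st.error/st.warning, so call this from a running app):
#
#     paths, total = download_images("https://example.com", max_images=5)
#     st.write(f"Saved {len(paths)} of {total} images found")
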
def scrape_visible_text_from_url(url):
    """
    Scrape visible text from the given URL
    
    Args:
        url (str): URL of the webpage
    
    Returns:
        str: Extracted visible text
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Grab the header text first: the loop below strips <header> tags,
        # so looking them up afterwards would always come back empty
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""
        
        # Remove script, style, and other non-visible tags
        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav"]):
            tag.extract()
        
        # Get the paragraph content
        paragraph_content = soup.find_all("p")
        paragraph_text = " ".join([p.get_text() for p in paragraph_content])
        
        # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"
        
        # Remove multiple whitespaces and newlines
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    
    except Exception as e:
        st.error(f"Error occurred while scraping the data: {e}")
        return None

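# Illustrative standalone usage (assumes a reachable URL):
#
#     text = scrape_visible_text_from_url("https://example.com")
#     if text:
#         st.write(text[:500])
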
def main():
    """
    Main Streamlit application
    """
    st.title("Web Data Scraper with Image Downloader")
    
    # Get the URL from the user
    url_input = st.text_input("Enter the URL of the web page:", "")
    
    # Maximum images to download slider
    max_images = st.slider("Maximum number of images to download", 1, 20, 10)
    
    if st.button("Scrape Content"):
        if url_input:
            # Extract visible text from the URL
            text_data = scrape_visible_text_from_url(url_input)
            
            # Download images
            downloaded_images, total_images = download_images(url_input, max_images)
            
            if text_data:
                st.success("Content successfully scraped!")
                
                # Display text
                st.subheader("Scraped Text:")
                st.write(text_data)
                
                # Display and manage images
                st.subheader(f"Images (Downloaded {len(downloaded_images)} out of {total_images} total)")
                
                # Create columns for images
                if downloaded_images:
                    cols = st.columns(min(len(downloaded_images), 3))
                    for i, img_path in enumerate(downloaded_images):
                        with cols[i % 3]:
                            # use_container_width supersedes the deprecated use_column_width
                            st.image(img_path, use_container_width=True)
                    
                    # Bundle the images into an in-memory zip for the download
                    # button; this avoids writing a temp file to disk and then
                    # reopening it without ever closing the handle
                    zip_buffer = io.BytesIO()
                    with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
                        for img_path in downloaded_images:
                            zip_file.write(img_path, os.path.basename(img_path))
                    
                    st.download_button(
                        label="Download All Images",
                        data=zip_buffer.getvalue(),
                        file_name='downloaded_images.zip',
                        mime='application/zip'
                    )
                else:
                    st.warning("No images found on the page.")
            else:
                st.warning("Failed to scrape content from the URL.")
        else:
            st.warning("Please enter a valid URL.")

if __name__ == "__main__":
    main()