Marcepelaez committed on
Commit
1bf2dc7
1 Parent(s): 38f7ded
Files changed (1) hide show
  1. app.py +107 -82
app.py CHANGED
@@ -5,105 +5,130 @@ import re
5
import io
import os
import zipfile
from urllib.parse import urljoin, urlparse
7
 
8
- # The rest of the code stays the same...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
def main():
    """Run the Web Content Scraper Streamlit application.

    Renders the theme selector, URL input, display-mode choice and an
    image-count slider, then scrapes the requested page and displays the
    result in the selected mode, with per-image and zipped downloads.
    """
    st.set_page_config(page_title="Web Content Scraper", page_icon=":mag:", layout="wide")

    # Theme labels are in Spanish: "Oscuro" = dark, "Claro" = light.
    theme = st.sidebar.radio("Seleccionar Tema:", ["Oscuro", "Claro"])
    apply_theme(theme)

    st.title("Web Content Scraper")

    url_input = st.text_input("Enter the URL of the web page:", "")
    display_mode = st.radio("Display Mode:",
                            ["Full HTML", "Plain Text", "Side-by-Side"])
    max_images = st.slider("Maximum number of images to download", 1, 40, 10)

    if st.button("Scrape Content"):
        if url_input:
            scraped_content = scrape_web_content(url_input, max_images, theme)
            if scraped_content:
                st.success("Content successfully scraped!")

                if display_mode == "Full HTML":
                    # Render the scraped page with its formatting preserved.
                    st.markdown("### Formatted Web Content")
                    st.components.v1.html(scraped_content['html'], height=600, scrolling=True)
                elif display_mode == "Plain Text":
                    st.markdown("### Plain Text Content")
                    st.text_area("Scraped Text:", scraped_content['plain_text'], height=400)
                    st.download_button(
                        label="Copy All Text",
                        data=scraped_content['plain_text'],
                        file_name="scraped_text.txt",
                        mime="text/plain",
                        help="Click to copy all extracted text"
                    )
                else:  # Side-by-Side
                    col1, col2 = st.columns(2)
                    with col1:
                        st.markdown("### Formatted HTML")
                        st.components.v1.html(scraped_content['html'], height=600, scrolling=True)
                    with col2:
                        st.markdown("### Plain Text")
                        st.text_area("Scraped Text:", scraped_content['plain_text'], height=600)
                        st.download_button(
                            label="Copy All Text",
                            data=scraped_content['plain_text'],
                            file_name="scraped_text.txt",
                            mime="text/plain",
                            help="Click to copy all extracted text"
                        )

                if scraped_content['images']:
                    st.subheader(f"Downloaded Images ({len(scraped_content['images'])} of {max_images})")

                    # 4 columns for better layout with more images.
                    cols = st.columns(4)
                    for i, img_path in enumerate(scraped_content['images']):
                        with cols[i % 4]:
                            st.image(img_path, use_column_width=True)

                    # Write the archive, then read it back with a context
                    # manager; the original `open(...).read()` leaked the
                    # read handle and imported zipfile mid-function.
                    with open('downloaded_images.zip', 'wb') as zipf:
                        with zipfile.ZipFile(zipf, 'w') as zip_file:
                            for img_path in scraped_content['images']:
                                zip_file.write(img_path, os.path.basename(img_path))
                    with open('downloaded_images.zip', 'rb') as zf:
                        zip_bytes = zf.read()
                    st.download_button(
                        label="Download All Images",
                        data=zip_bytes,
                        file_name='downloaded_images.zip',
                        mime='application/zip'
                    )
            else:
                st.warning("Failed to scrape content from the URL.")
        else:
            st.warning("Please enter a valid URL.")
109
 
 
5
  import os
6
  from urllib.parse import urljoin
7
 
8
def apply_theme(theme):
    """Inject theme CSS into the page and return an HTML prefix.

    Emits light or dark CSS via ``st.markdown`` depending on *theme*
    ("Claro" = light, anything else = dark). Returns a light-theme HTML
    prefix meant to be prepended to scraped content, or "" for dark.
    """
    is_light = theme == "Claro"

    if is_light:
        css = """
        <style>
        body {
            color: black;
            background-color: white;
        }
        .stTextInput > div > div > input {
            color: black;
            background-color: white;
        }
        .stMarkdown {
            color: black;
        }
        /* Light theme for HTML content */
        .light-theme {
            background-color: white !important;
            color: black !important;
        }
        .light-theme a {
            color: #0066cc !important;
        }
        .light-theme h1, .light-theme h2, .light-theme h3,
        .light-theme h4, .light-theme h5, .light-theme h6 {
            color: #333 !important;
        }
        </style>
        """
        prefix = """
        <div style="background-color: white; color: black; padding: 20px;">
        <style>
        body { background-color: white !important; color: black !important; }
        a { color: #0066cc; }
        h1, h2, h3, h4, h5, h6 { color: #333; }
        </style>
        """
    else:
        css = """
        <style>
        body {
            color: white;
            background-color: #0E1117;
        }
        .stTextInput > div > div > input {
            color: white;
            background-color: #262730;
        }
        .stMarkdown {
            color: white;
        }
        </style>
        """
        prefix = ""

    st.markdown(css, unsafe_allow_html=True)
    return prefix
65
+
66
def scrape_web_content(url, max_images, theme):
    """Scrape a web page, preserving its formatting, and download images.

    Args:
        url: URL of the page to fetch.
        max_images: Maximum number of <img> tags to download.
        theme: "Claro" prepends the light-theme HTML prefix; anything else
            leaves the scraped HTML unprefixed.

    Returns:
        dict with keys 'html' (formatted HTML string), 'plain_text'
        (extracted text) and 'images' (list of local file paths), or
        None when the page could not be fetched/parsed.
    """
    try:
        # Timeout keeps the Streamlit app from hanging on a dead host.
        response = requests.get(url, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        os.makedirs('downloaded_images', exist_ok=True)

        downloaded_images = []
        img_tags = soup.find_all('img', src=True)
        for i, img in enumerate(img_tags[:max_images], 1):
            try:
                img_url = img['src']
                # Resolve relative image URLs against the page URL.
                if not img_url.startswith(('http://', 'https://')):
                    img_url = urljoin(url, img_url)

                img_response = requests.get(img_url, timeout=15)
                img_response.raise_for_status()

                # Derive the extension from the URL *path* only; the old
                # `split(".")[-1]` produced invalid filenames (containing
                # '/') for extension-less URLs such as https://x.com/image.
                ext = os.path.splitext(urlparse(img_url).path)[1].lstrip('.') or 'jpg'
                filename = f'downloaded_images/image_{i}.{ext}'
                with open(filename, 'wb') as f:
                    f.write(img_response.content)

                # Rewrite the tag so the formatted HTML shows local copies.
                img['src'] = filename
                downloaded_images.append(filename)
            except Exception as img_error:
                # Best-effort: one bad image must not abort the scrape.
                st.warning(f"Could not download image {i}: {img_error}")

        # Drop tags that would execute or restyle when re-rendered.
        for tag in soup(["script", "style", "meta", "link", "noscript"]):
            tag.decompose()

        # NOTE(review): this re-invokes apply_theme, which also re-injects
        # page CSS as a side effect — presumably intentional; confirm.
        theme_prefix = apply_theme(theme) if theme == "Claro" else ""
        formatted_html = theme_prefix + str(soup)
        plain_text = soup.get_text(separator='\n', strip=True)

        return {
            'html': formatted_html,
            'plain_text': plain_text,
            'images': downloaded_images
        }

    except Exception as e:
        st.error(f"Error occurred while scraping the content: {e}")
        return None
113
 
114
def _display_content(scraped_content, display_mode):
    """Render scraped HTML/text according to the chosen display mode."""
    if display_mode == "Full HTML":
        st.markdown("### Formatted Web Content")
        st.components.v1.html(scraped_content['html'], height=600, scrolling=True)
    elif display_mode == "Plain Text":
        st.markdown("### Plain Text Content")
        st.text_area("Scraped Text:", scraped_content['plain_text'], height=400)
        st.download_button(
            label="Copy All Text",
            data=scraped_content['plain_text'],
            file_name="scraped_text.txt",
            mime="text/plain",
            help="Click to copy all extracted text"
        )
    else:  # Side-by-Side
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("### Formatted HTML")
            st.components.v1.html(scraped_content['html'], height=600, scrolling=True)
        with col2:
            st.markdown("### Plain Text")
            st.text_area("Scraped Text:", scraped_content['plain_text'], height=600)
            st.download_button(
                label="Copy All Text",
                data=scraped_content['plain_text'],
                file_name="scraped_text.txt",
                mime="text/plain",
                help="Click to copy all extracted text"
            )


def _display_images(images, max_images):
    """Show downloaded images in a 4-column grid and offer a zip download."""
    if not images:
        return
    st.subheader(f"Downloaded Images ({len(images)} of {max_images})")
    cols = st.columns(4)
    for i, img_path in enumerate(images):
        with cols[i % 4]:
            st.image(img_path, use_column_width=True)

    # Build the archive in memory — no temp file on disk and no leaked
    # file handle (the earlier version read the zip back without closing).
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, 'w') as zip_file:
        for img_path in images:
            zip_file.write(img_path, os.path.basename(img_path))
    st.download_button(
        label="Download All Images",
        data=buffer.getvalue(),
        file_name='downloaded_images.zip',
        mime='application/zip'
    )


def main():
    """Run the Web Content Scraper Streamlit application.

    This version replaces the '# Resto del código...' placeholder with
    the actual display logic (full HTML, plain text, or side-by-side,
    plus the downloaded-image grid and zip download).
    """
    st.set_page_config(page_title="Web Content Scraper", page_icon=":mag:", layout="wide")

    # Theme labels are in Spanish: "Oscuro" = dark, "Claro" = light.
    theme = st.sidebar.radio("Seleccionar Tema:", ["Oscuro", "Claro"])
    apply_theme(theme)

    st.title("Web Content Scraper")

    url_input = st.text_input("Enter the URL of the web page:", "")
    display_mode = st.radio("Display Mode:", ["Full HTML", "Plain Text", "Side-by-Side"])
    max_images = st.slider("Maximum number of images to download", 1, 40, 10)

    if st.button("Scrape Content"):
        if url_input:
            scraped_content = scrape_web_content(url_input, max_images, theme)
            if scraped_content:
                st.success("Content successfully scraped!")
                _display_content(scraped_content, display_mode)
                _display_images(scraped_content['images'], max_images)
            else:
                st.warning("Failed to scrape content from the URL.")
        else:
            st.warning("Please enter a valid URL.")
134