KingNish committed
Commit fab1175
1 Parent(s): 61bd6f0

Update app.py: move the web-page extraction helpers ahead of the document readers, drop split_content and the standalone page-fetch wrappers, and switch HTML parsing from html.parser to lxml.

Files changed (1): app.py (+40 -79)
app.py CHANGED
@@ -34,12 +34,45 @@ def clean_text(content):
     content = re.sub(r'\s+', ' ', content)
     return content
 
-def split_content(content, chunk_size=CHUNK_SIZE):
-    """Splits content into chunks of a specified size."""
-    chunks = []
-    for i in range(0, len(content), chunk_size):
-        chunks.append(content[i:i + chunk_size])
-    return chunks
+def extract_texts(soup):
+    """Extracts all text content from the soup."""
+    return [text for text in soup.stripped_strings]
+
+def extract_links(soup, base_url):
+    """Extracts all valid links from the soup."""
+    links = []
+    for link in soup.find_all('a', href=True):
+        href = link['href']
+        full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
+        link_text = link.get_text(strip=True) or "No Text"
+        links.append({"Text": link_text, "URL": full_url})
+    return links
+
+def extract_images(soup, base_url):
+    """Extracts all valid image URLs and their alt text from the soup."""
+    images = []
+    for img in soup.find_all('img', src=True):
+        img_url = img['src']
+        full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
+        alt_text = img.get('alt', 'No Alt Text')
+        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
+    return images
+
+def format_detailed_output(structured_data):
+    """Formats the structured data into a Markdown string."""
+    result = "### Structured Page Content\n\n"
+    result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
+    result += "**Links:**\n"
+    if structured_data["Links"]:
+        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n"
+    else:
+        result += "No links found.\n"
+    result += "**Images:**\n"
+    if structured_data["Images"]:
+        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n"
+    else:
+        result += "No images found.\n"
+    return result
 
 # --- Document Reading Functions ---
 
@@ -156,7 +189,7 @@ def read_document(file_path, clean=True, url=""):
             return f"Error reading PPTX: {e}", 0
     elif mime == "text/html": # Handle HTML content
         try:
-            soup = BeautifulSoup(file_content, 'html.parser')
+            soup = BeautifulSoup(file_content, 'lxml')
             structured_data = {
                 "Texts": extract_texts(soup),
                 "Links": extract_links(soup, url),
@@ -181,15 +214,10 @@ def download_and_process_file(url, clean=True):
 
     try:
         response = requests.get(url, stream=True, timeout=10)
-        response.raise_for_status() # Raise an exception for bad status codes
-
-        # Generate a safe and unique temporary filename
         original_filename = os.path.basename(url)
-        # Remove invalid characters from filename
         safe_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename)
         temp_filename = f"{safe_filename}"
 
-        # Infer file extension from content type
         content_type = response.headers['content-type']
         ext = mimetypes.guess_extension(content_type)
         if ext and not temp_filename.endswith(ext): # Append extension if not already present
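Note that this hunk also drops response.raise_for_status(), so a 4xx/5xx response no longer raises here and its body would be written to disk like any other file. For reference, the removed call raises requests.exceptions.HTTPError on error statuses; a minimal sketch with an illustrative URL:

import requests

response = requests.get("https://example.com/missing", timeout=10)
response.raise_for_status()  # raises requests.exceptions.HTTPError for 4xx/5xx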
@@ -199,7 +227,6 @@ def download_and_process_file(url, clean=True):
             for chunk in response.iter_content(chunk_size=8192000):
                 f.write(chunk)
 
-        # Check if it's an image type
         kind = filetype.guess(temp_filename)
         if kind and kind.mime.startswith('image/'):
             return f"![]({url})", 0 # Return markdown image syntax if it's an image
@@ -215,72 +242,6 @@ def download_and_process_file(url, clean=True):
     except requests.exceptions.RequestException as e:
         return f"Error downloading file: {e}", 0
 
-# --- Web Page Content Extraction Functions (from previous code) ---
-
-def extract_texts(soup):
-    """Extracts all text content from the soup."""
-    return [text for text in soup.stripped_strings]
-
-def extract_links(soup, base_url):
-    """Extracts all valid links from the soup."""
-    links = []
-    for link in soup.find_all('a', href=True):
-        href = link['href']
-        # Use urljoin to create an absolute URL
-        full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
-        link_text = link.get_text(strip=True) or "No Text"
-        links.append({"Text": link_text, "URL": full_url})
-    return links
-
-def extract_images(soup, base_url):
-    """Extracts all valid image URLs and their alt text from the soup."""
-    images = []
-    for img in soup.find_all('img', src=True):
-        img_url = img['src']
-        # Use urljoin to create an absolute URL
-        full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
-        alt_text = img.get('alt', 'No Alt Text')
-        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
-    return images
-
-def fetch_page_content(url):
-    """Fetches the content of the page at the given URL."""
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        return response.text
-    except requests.exceptions.RequestException as e:
-        return f"Error fetching the URL: {e}"
-
-def format_detailed_output(structured_data):
-    """Formats the structured data into a Markdown string."""
-    result = "### Structured Page Content\n\n"
-    result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
-    result += "**Links:**\n"
-    if structured_data["Links"]:
-        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n"
-    else:
-        result += "No links found.\n"
-    result += "**Images:**\n"
-    if structured_data["Images"]:
-        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n"
-    else:
-        result += "No images found.\n"
-    return result
-
-def extract_page_content(url):
-    """Extracts and formats the content of the page at the given URL."""
-    page_content = fetch_page_content(url)
-    if "Error" in page_content:
-        return page_content
-    soup = BeautifulSoup(page_content, 'html.parser')
-    structured_data = {
-        "Texts": extract_texts(soup),
-        "Links": extract_links(soup, url), # Pass the base URL
-        "Images": extract_images(soup, url) # Pass the base URL
-    }
-    return format_detailed_output(structured_data)
-
 # --- Gradio Interface ---
 
 iface = gr.Interface(
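The deleted extract_page_content wrapper amounted to fetch, parse, and format; after this commit, page URLs appear to flow through download_and_process_file and read_document instead. A sketch reproducing the old behavior inline, assuming the helpers kept above are in scope (URL illustrative):

import requests
from bs4 import BeautifulSoup

url = "https://example.com"  # illustrative
html = requests.get(url, timeout=10).text
soup = BeautifulSoup(html, 'lxml')  # the parser this commit standardizes on
print(format_detailed_output({
    "Texts": extract_texts(soup),
    "Links": extract_links(soup, url),
    "Images": extract_images(soup, url),
}))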
 