KingNish committed
Commit fab1175
1 Parent(s): 61bd6f0

Update app.py: move the web-page extraction helpers ahead of the document readers, drop split_content and the standalone page-fetch wrappers, and switch HTML parsing from html.parser to lxml.

Files changed (1): app.py (+40 -79)
app.py CHANGED
@@ -34,12 +34,45 @@ def clean_text(content):
     content = re.sub(r'\s+', ' ', content)
     return content
 
-def split_content(content, chunk_size=CHUNK_SIZE):
-    """Splits content into chunks of a specified size."""
-    chunks = []
-    for i in range(0, len(content), chunk_size):
-        chunks.append(content[i:i + chunk_size])
-    return chunks
+def extract_texts(soup):
+    """Extracts all text content from the soup."""
+    return [text for text in soup.stripped_strings]
+
+def extract_links(soup, base_url):
+    """Extracts all valid links from the soup."""
+    links = []
+    for link in soup.find_all('a', href=True):
+        href = link['href']
+        full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
+        link_text = link.get_text(strip=True) or "No Text"
+        links.append({"Text": link_text, "URL": full_url})
+    return links
+
+def extract_images(soup, base_url):
+    """Extracts all valid image URLs and their alt text from the soup."""
+    images = []
+    for img in soup.find_all('img', src=True):
+        img_url = img['src']
+        full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
+        alt_text = img.get('alt', 'No Alt Text')
+        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
+    return images
+
+def format_detailed_output(structured_data):
+    """Formats the structured data into a Markdown string."""
+    result = "### Structured Page Content\n\n"
+    result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
+    result += "**Links:**\n"
+    if structured_data["Links"]:
+        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n"
+    else:
+        result += "No links found.\n"
+    result += "**Images:**\n"
+    if structured_data["Images"]:
+        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n"
+    else:
+        result += "No images found.\n"
+    return result
 
 # --- Document Reading Functions ---
 
@@ -156,7 +189,7 @@ def read_document(file_path, clean=True, url=""):
             return f"Error reading PPTX: {e}", 0
     elif mime == "text/html": # Handle HTML content
         try:
-            soup = BeautifulSoup(file_content, 'html.parser')
+            soup = BeautifulSoup(file_content, 'lxml')
             structured_data = {
                 "Texts": extract_texts(soup),
                 "Links": extract_links(soup, url),
@@ -181,15 +214,10 @@ def download_and_process_file(url, clean=True):
 
     try:
         response = requests.get(url, stream=True, timeout=10)
-        response.raise_for_status() # Raise an exception for bad status codes
-
-        # Generate a safe and unique temporary filename
         original_filename = os.path.basename(url)
-        # Remove invalid characters from filename
         safe_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename)
         temp_filename = f"{safe_filename}"
 
-        # Infer file extension from content type
         content_type = response.headers['content-type']
         ext = mimetypes.guess_extension(content_type)
         if ext and not temp_filename.endswith(ext): # Append extension if not already present
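Note that this hunk also drops response.raise_for_status(), so a 4xx/5xx response no longer raises here and its body would be written to disk like any other file. For reference, the removed call raises requests.exceptions.HTTPError on error statuses; a minimal sketch with an illustrative URL:

import requests

response = requests.get("https://example.com/missing", timeout=10)
response.raise_for_status()  # raises requests.exceptions.HTTPError for 4xx/5xx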
@@ -199,7 +227,6 @@ def download_and_process_file(url, clean=True):
             for chunk in response.iter_content(chunk_size=8192000):
                 f.write(chunk)
 
-        # Check if it's an image type
         kind = filetype.guess(temp_filename)
         if kind and kind.mime.startswith('image/'):
             return f"![]({url})", 0 # Return markdown image syntax if it's an image
@@ -215,72 +242,6 @@ def download_and_process_file(url, clean=True):
     except requests.exceptions.RequestException as e:
         return f"Error downloading file: {e}", 0
 
-# --- Web Page Content Extraction Functions (from previous code) ---
-
-def extract_texts(soup):
-    """Extracts all text content from the soup."""
-    return [text for text in soup.stripped_strings]
-
-def extract_links(soup, base_url):
-    """Extracts all valid links from the soup."""
-    links = []
-    for link in soup.find_all('a', href=True):
-        href = link['href']
-        # Use urljoin to create an absolute URL
-        full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
-        link_text = link.get_text(strip=True) or "No Text"
-        links.append({"Text": link_text, "URL": full_url})
-    return links
-
-def extract_images(soup, base_url):
-    """Extracts all valid image URLs and their alt text from the soup."""
-    images = []
-    for img in soup.find_all('img', src=True):
-        img_url = img['src']
-        # Use urljoin to create an absolute URL
-        full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
-        alt_text = img.get('alt', 'No Alt Text')
-        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
-    return images
-
-def fetch_page_content(url):
-    """Fetches the content of the page at the given URL."""
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        return response.text
-    except requests.exceptions.RequestException as e:
-        return f"Error fetching the URL: {e}"
-
-def format_detailed_output(structured_data):
-    """Formats the structured data into a Markdown string."""
-    result = "### Structured Page Content\n\n"
-    result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
-    result += "**Links:**\n"
-    if structured_data["Links"]:
-        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n"
-    else:
-        result += "No links found.\n"
-    result += "**Images:**\n"
-    if structured_data["Images"]:
-        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n"
-    else:
-        result += "No images found.\n"
-    return result
-
-def extract_page_content(url):
-    """Extracts and formats the content of the page at the given URL."""
-    page_content = fetch_page_content(url)
-    if "Error" in page_content:
-        return page_content
-    soup = BeautifulSoup(page_content, 'html.parser')
-    structured_data = {
-        "Texts": extract_texts(soup),
-        "Links": extract_links(soup, url), # Pass the base URL
-        "Images": extract_images(soup, url) # Pass the base URL
-    }
-    return format_detailed_output(structured_data)
-
 # --- Gradio Interface ---
 
 iface = gr.Interface(
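The deleted extract_page_content wrapper amounted to fetch, parse, and format; after this commit, page URLs appear to flow through download_and_process_file and read_document instead. A sketch reproducing the old behavior inline, assuming the helpers kept above are in scope (URL illustrative):

import requests
from bs4 import BeautifulSoup

url = "https://example.com"  # illustrative
html = requests.get(url, timeout=10).text
soup = BeautifulSoup(html, 'lxml')  # the parser this commit standardizes on
print(format_detailed_output({
    "Texts": extract_texts(soup),
    "Links": extract_links(soup, url),
    "Images": extract_images(soup, url),
}))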
 