Fetch-Content

Sleeping

App Files Files Community

KingNish committed on Sep 24, 2024

Commit

413592b

verified ·

1 Parent(s): b79a457

Update app.py

Browse files

Files changed (1) hide show

app.py +186 -11

app.py CHANGED Viewed

@@ -1,10 +1,170 @@
 import gradio as gr
 import requests
 import os
-import re
 import mimetypes
-def download_file(url):
   """Downloads a file from a URL and returns the local file path."""
   if not url.startswith("http://") and not url.startswith("https://"):
     url = "http://" + url  # Prepend "http://" if not present
@@ -28,20 +188,35 @@ def download_file(url):
     with open(temp_filename, 'wb') as f:
       for chunk in response.iter_content(chunk_size=8192000):
         f.write(chunk)
-    return temp_filename
   except requests.exceptions.MissingSchema:
-    return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid."
   except requests.exceptions.ConnectionError:
-    return "Error: Could not connect to the server. Please check your internet connection."
   except requests.exceptions.RequestException as e:
-    return f"Error downloading file: {e}"
 iface = gr.Interface(
-    fn=download_file,
-    inputs=gr.Textbox(lines=1, placeholder="Enter URL of the file"),
-    outputs=gr.File(),
-    title="File Downloader for Hugging Face Chat Tools",
-    description="Enter the URL of an image, video, document, etc. to download it. "
                 "This tool is designed for use with Hugging Face Chat Tools: "
                 "[https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)",
     concurrency_limit=None

+import PyPDF2
+from openpyxl import load_workbook
+from pptx import Presentation
 import gradio as gr
+import io
+import re
+import zipfile
+import xml.etree.ElementTree as ET
+import filetype
 import requests
 import os
 import mimetypes
+# Constants
+CHUNK_SIZE = 32000
+# --- Utility Functions ---
def xml2text(xml):
    """Return the text of every element in an XML document, space-separated.

    Args:
        xml: XML document as ``str`` or ``bytes``.

    Returns:
        The ``.text`` of each element (document order, root included), each
        followed by a single space. Elements without text contribute nothing.
        Only ``.text`` is collected; ``.tail`` text that follows a child
        element is not included.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml* is not well-formed.
    """
    # NOTE(review): ElementTree is not hardened against maliciously crafted
    # XML. The visible callers feed it members of downloaded archives, so
    # consider a hardened parser if that input is untrusted.
    root = ET.fromstring(xml)
    # Build with join rather than repeated ``+=`` to avoid quadratic copying
    # on large documents.
    return ''.join(
        node.text + ' ' for node in root.iter() if node.text is not None
    )
def clean_text(content):
    """Collapse every run of whitespace in *content* into a single space.

    Newlines, carriage returns, tabs and any other whitespace matched by
    ``\\s`` are normalised in one pass. Leading and trailing whitespace is
    reduced to a single space, not stripped.

    Args:
        content: The text to normalise.

    Returns:
        The normalised string.
    """
    # ``\s+`` already covers '\n', '\r' and '\t', so no separate replace
    # passes are needed before it.
    return re.sub(r'\s+', ' ', content)
def split_content(content, chunk_size=CHUNK_SIZE):
    """Split *content* into consecutive slices of at most *chunk_size* items.

    Args:
        content: A sliceable sequence (typically the extracted text).
        chunk_size: Maximum length of each chunk; must be a positive integer.

    Returns:
        A list of slices in their original order; the last one may be
        shorter than *chunk_size*. Empty content yields an empty list.

    Raises:
        ValueError: If *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        # range() rejects a zero step with an unrelated-looking message, and
        # a negative step would silently produce no chunks at all.
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    return [
        content[start:start + chunk_size]
        for start in range(0, len(content), chunk_size)
    ]
+# --- Document Reading Functions ---
def extract_text_from_docx(docx_data, clean=True):
    """Extract the text of a DOCX file given its raw bytes.

    Header parts are read first, then the main document body, then footer
    parts; within headers and footers the order is the archive's own
    member order.

    Args:
        docx_data: The complete DOCX file content as bytes.
        clean: When true, pass the result through ``clean_text``.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If *docx_data* is not a valid ZIP archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # Escape the dot and require a full match so only real header/footer
    # parts are picked up, not names that merely start with the pattern.
    header_re = re.compile(r'word/header[0-9]*\.xml')
    footer_re = re.compile(r'word/footer[0-9]*\.xml')

    # The context manager guarantees the archive is closed even when a
    # member read or XML parse raises.
    with zipfile.ZipFile(io.BytesIO(docx_data)) as archive:
        names = archive.namelist()
        parts = [name for name in names if header_re.fullmatch(name)]
        parts.append('word/document.xml')
        parts.extend(name for name in names if footer_re.fullmatch(name))
        text = ''.join(xml2text(archive.read(name)) for name in parts)

    if clean:
        text = clean_text(text)
    return text, len(text)
def extract_text_from_pptx(pptx_data, clean=True):
    """Extract the text of a PPTX file given its raw bytes.

    Speaker-notes parts are read first, then the slide parts. Within each
    group the order is the archive's own member order, which is not
    necessarily the presentation's slide order.

    Args:
        pptx_data: The complete PPTX file content as bytes.
        clean: When true, pass the result through ``clean_text``.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If *pptx_data* is not a valid ZIP archive.
    """
    # Escape the dot and require a full match so only the slide/notes XML
    # parts themselves are selected.
    notes_re = re.compile(r'ppt/notesSlides/notesSlide[0-9]*\.xml')
    slide_re = re.compile(r'ppt/slides/slide[0-9]*\.xml')

    # The context manager guarantees the archive is closed even when a
    # member read or XML parse raises.
    with zipfile.ZipFile(io.BytesIO(pptx_data)) as archive:
        names = archive.namelist()
        parts = [name for name in names if notes_re.fullmatch(name)]
        parts.extend(name for name in names if slide_re.fullmatch(name))
        text = ''.join(xml2text(archive.read(name)) for name in parts)

    if clean:
        text = clean_text(text)
    return text, len(text)
def _finalize_text(content, clean):
    """Optionally clean *content* and pair it with its character count."""
    if clean:
        content = clean_text(content)
    return content, len(content)


def read_document(file_path, clean=True):
    """Read a local file and return its text content and length.

    The file type is detected from the file's bytes via ``filetype.guess``.
    PDF, XLSX, DOCX and PPTX get dedicated extraction; anything else
    (including unrecognised content) is decoded as UTF-8 text.

    Args:
        file_path: Path of the file to read.
        clean: When true, normalise whitespace with ``clean_text``.

    Returns:
        A ``(content, length)`` tuple, where ``length`` is the number of
        characters in ``content``. If extraction fails, returns
        ``(error_message, 0)`` instead of raising.

    Raises:
        OSError: If the file itself cannot be opened or read.
    """
    with open(file_path, "rb") as f:
        file_content = f.read()

    kind = filetype.guess(file_content)
    # No recognised type: fall through to the generic UTF-8 text branch.
    mime = "text" if kind is None else kind.mime

    if mime == "application/pdf":
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
            content = ''.join(
                page.extract_text() for page in pdf_reader.pages
            )
            return _finalize_text(content, clean)
        except Exception as e:
            return f"Error reading PDF: {e}", 0

    if mime == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        try:
            wb = load_workbook(io.BytesIO(file_content))
            # Every non-empty cell, in sheet/row/column order, each
            # followed by a single space.
            content = ''.join(
                str(cell.value) + ' '
                for sheet in wb.worksheets
                for row in sheet.rows
                for cell in row
                if cell.value is not None
            )
            return _finalize_text(content, clean)
        except Exception as e:
            return f"Error reading XLSX: {e}", 0

    if mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        try:
            return extract_text_from_docx(file_content, clean)
        except Exception as e:
            return f"Error reading DOCX: {e}", 0

    if mime == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
        try:
            return extract_text_from_pptx(file_content, clean)
        except Exception as e:
            return f"Error reading PPTX: {e}", 0

    # Plain text, CSV and every other type share the same UTF-8 decoding;
    # only the wording of the error message differs.
    label = {"text/plain": "TXT file", "text/csv": "CSV file"}.get(mime, "file")
    try:
        return _finalize_text(file_content.decode('utf-8'), clean)
    except Exception as e:
        return f"Error reading {label}: {e}", 0
+def download_and_process_file(url, clean=True):
   """Downloads a file from a URL and returns the local file path."""
   if not url.startswith("http://") and not url.startswith("https://"):
     url = "http://" + url  # Prepend "http://" if not present
     with open(temp_filename, 'wb') as f:
       for chunk in response.iter_content(chunk_size=8192000):
         f.write(chunk)
+    # Check if it's an image type
+    kind = filetype.guess(temp_filename)
+    if kind and kind.mime.startswith('image/'):
+      return f"![]({url})", 0 # Return markdown image syntax if it's an image
+    else:
+      return read_document(temp_filename, clean) # Otherwise, process as a document
   except requests.exceptions.MissingSchema:
+    return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0
   except requests.exceptions.ConnectionError:
+    return "Error: Could not connect to the server. Please check your internet connection.", 0
   except requests.exceptions.RequestException as e:
+    return f"Error downloading file: {e}", 0
+# --- Gradio Interface ---
 iface = gr.Interface(
+    fn=download_and_process_file,
+    inputs=[
+        gr.Textbox(lines=1, placeholder="Enter URL of the file"),
+        gr.Checkbox(label="Clean Text", value=True),
+    ],
+    outputs=[
+        gr.Textbox(label="Document Content/Image Markdown"),
+        gr.Number(label="Document Length (characters)"),
+    ],
+    title="Enhanced File Processor for Hugging Face Chat Tools",
+    description="Enter the URL of site and extract its content"
                 "This tool is designed for use with Hugging Face Chat Tools: "
                 "[https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)",
     concurrency_limit=None