Spaces:

PatronusAI
/

LynxDemo

Sleeping

Allen Park committed on Jul 29, 2024

Commit

6efea88

1 Parent(s): 901a87e

feat(docx text extraction): extract all the text from the uploaded docx file

* feat: add python-docx text extraction from pdf helper function
---------
Co-authored-by: Allen Park <parknella19@gmail.com>

Files changed (2) hide show

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ from pathlib import Path
 import gradio as gr
 import openai
 import pymupdf
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
@@ -143,6 +144,13 @@ def extract_text_pymupdf(file):
             text += page.get_text()
     return text
 def upload_file(filepath):
     extracted_file_text = ""
     if filepath is not None:
@@ -153,8 +161,8 @@ def upload_file(filepath):
         # conditionals for filetype and function call
         if filetype == "pdf" or filetype == "txt":
             extracted_file_text = extract_text_pymupdf(filepath)
-        elif filetype == "docx" or filetype == "doc":
-            extracted_file_text = filepath.read().decode("utf-8")
         return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
     else:
         return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown(""), extracted_file_text]

 import gradio as gr
 import openai
 import pymupdf
+from docx import Document
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
             text += page.get_text()
     return text
+def extract_text_python_docx(file):
+    doc = Document(io.BytesIO(file))
+    text = ""
+    for paragraph in doc.paragraphs:
+        text += paragraph.text + '\n'
+    return text.strip()
 def upload_file(filepath):
     extracted_file_text = ""
     if filepath is not None:
         # conditionals for filetype and function call
         if filetype == "pdf" or filetype == "txt":
             extracted_file_text = extract_text_pymupdf(filepath)
+        elif filetype == "docx":
+            extracted_file_text = extract_text_python_docx(filepath)
         return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
     else:
         return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown(""), extracted_file_text]

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 openai
-PyMuPDF

 openai
+PyMuPDF
+python-docx