import gradio as gr
import os
from llama_parse import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document as LangchainDocument
# Initialize global variables: maps collection name -> Chroma vector store
vs_dict = {}

# Helper function to load and parse the input data with LlamaParse
# (requires the LLAMA_API_KEY environment variable to be set)
def mariela_parse(files):
    parser = LlamaParse(
        api_key=os.getenv("LLAMA_API_KEY"),
        result_type="markdown",
        verbose=True
    )
    parsed_documents = []
    for file in files:
        parsed_documents.extend(parser.load_data(file.name))
    return parsed_documents
# Create a Chroma vector database from the parsed documents
def mariela_create_vector_database(parsed_documents, collection_name):
    # Convert LlamaParse output into LangChain documents
    langchain_docs = [
        LangchainDocument(page_content=doc.text, metadata=doc.metadata)
        for doc in parsed_documents
    ]

    # Split the documents into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=100)
    docs = text_splitter.split_documents(langchain_docs)

    # Embed the chunks and persist them in a local Chroma collection
    embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")
    vs = Chroma.from_documents(
        documents=docs,
        embedding=embed_model,
        persist_directory="chroma_db",
        collection_name=collection_name
    )
    return vs
# Function to handle file upload and parsing
def mariela_upload_and_parse(files, collection_name):
    global vs_dict
    if not files:
        return "Please upload at least one file."
    parsed_documents = mariela_parse(files)
    vs = mariela_create_vector_database(parsed_documents, collection_name)
    vs_dict[collection_name] = vs
    return f"Files uploaded, parsed, and stored successfully in collection: {collection_name}"
# Function to handle retrieval
def mariela_retrieve(question, collection_name):
    global vs_dict
    if collection_name not in vs_dict:
        return f"Collection '{collection_name}' not found. Please upload and parse files for this collection first."
    vs = vs_dict[collection_name]
    results = vs.similarity_search(question, k=4)
    formatted_results = []
    for i, doc in enumerate(results, 1):
        formatted_results.append(f"Result {i}:\n{doc.page_content}\n\nMetadata: {doc.metadata}\n")
    return "\n\n".join(formatted_results)
# Supported file types list
supported_file_types = """
Supported Document Types:
- Base types: pdf
- Documents and presentations: 602, abw, cgm, cwk, doc, docx, docm, dot, dotm, hwp, key, lwp, mw, mcw, pages, pbd, ppt, pptm, pptx, pot, potm, potx, rtf, sda, sdd, sdp, sdw, sgl, sti, sxi, sxw, stw, sxg, txt, uof, uop, uot, vor, wpd, wps, xml, zabw, epub
- Images: jpg, jpeg, png, gif, bmp, svg, tiff, webp, web, htm, html
- Spreadsheets: xlsx, xls, xlsm, xlsb, xlw, csv, dif, sylk, slk, prn, numbers, et, ods, fods, uos1, uos2, dbf, wk1, wk2, wk3, wk4, wks, 123, wq1, wq2, wb1, wb2, wb3, qpw, xlr, eth, tsv
"""
# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Mariela: Multi-Action Retrieval and Intelligent Extraction Learning Assistant")
    gr.Markdown("This application allows you to upload documents, parse them, and then ask questions to retrieve relevant information.")

    with gr.Tab("Upload and Parse Files"):
        gr.Markdown("## Upload and Parse Files")
        gr.Markdown("Upload your documents here to create a searchable knowledge base.")
        gr.Markdown("""
### API Documentation

1. **Confirm that you have cURL installed on your system.**

```bash
$ curl --version
```

2. **Find the API endpoint below corresponding to your desired function in the app.**

**API Name: `/mariela_upload`**

```bash
curl -X POST {url_of_gradio_app}/call/mariela_upload -s -H "Content-Type: application/json" -d '{
  "data": [
    [handle_file('https://github.com/gradio-app/gradio/raw/main/test/test_files/sample_file.pdf')],
    "Hello!!"
  ]}' \
  | awk -F'"' '{ print $4}' \
  | read EVENT_ID; curl -N {url_of_gradio_app}/call/mariela_upload/$EVENT_ID
```

**Accepts 2 parameters:**

- **[0] any (Required):** The input value that is provided in the "Upload Files" File component.
- **[1] string (Required):** The input value that is provided in the "Collection Name" Textbox component.

**Returns 1 element:**

- **string:** The output value that appears in the "Status" Textbox component.

A Python `gradio_client` equivalent is sketched below.
""")
        file_input = gr.File(label="Upload Files", file_count="multiple")
        collection_name_input = gr.Textbox(label="Collection Name")
        upload_button = gr.Button("Upload and Parse")
        upload_output = gr.Textbox(label="Status")
        upload_button.click(
            mariela_upload_and_parse,
            inputs=[file_input, collection_name_input],
            outputs=upload_output
        )
    with gr.Tab("Retrieval"):
        gr.Markdown("## Retrieval")
        gr.Markdown("Ask questions about your uploaded documents here.")
        gr.Markdown("""
### API Documentation

1. **Confirm that you have cURL installed on your system.**

```bash
$ curl --version
```

2. **Find the API endpoint below corresponding to your desired function in the app.**

**API Name: `/mariela_retrieve`**

```bash
curl -X POST {url_of_gradio_app}/call/mariela_retrieve -s -H "Content-Type: application/json" -d '{
  "data": [
    "Hello!!",
    "Hello!!"
  ]}' \
  | awk -F'"' '{ print $4}' \
  | read EVENT_ID; curl -N {url_of_gradio_app}/call/mariela_retrieve/$EVENT_ID
```

**Accepts 2 parameters:**

- **[0] string (Required):** The input value that is provided in the "Enter a query to retrieve relevant passages" Textbox component.
- **[1] string (Required):** The input value that is provided in the "Collection Name" Textbox component.

**Returns 1 element:**

- **string:** The output value that appears in the "Retrieved Passages" Textbox component.

A Python `gradio_client` equivalent is sketched below.
""")
        collection_name_retrieval = gr.Textbox(label="Collection Name")
        question_input = gr.Textbox(label="Enter a query to retrieve relevant passages")
        retrieval_output = gr.Textbox(label="Retrieved Passages")
        retrieval_button = gr.Button("Retrieve")
        retrieval_button.click(
            mariela_retrieve,
            inputs=[question_input, collection_name_retrieval],
            outputs=retrieval_output
        )
    with gr.Tab("Supported Document Types"):
        gr.Markdown("## Supported Document Types")
        gr.Markdown(supported_file_types)

demo.launch(debug=True)