Spaces:
Running
Running
Patrick Walukagga
committed on
Commit
·
14a4318
1
Parent(s):
d3abbf7
Add linting
Browse files
- .flake8 +5 -0
- .isort.cfg +7 -0
- Makefile +12 -0
- api.py +30 -28
- app.py +38 -27
- docs.py +1 -1
- pyproject.toml +19 -0
- rag/rag_pipeline.py +5 -10
- rag/rag_pipeline_backup.py +4 -5
- requirements-dev.txt +3 -0
- utils/db.py +2 -2
- utils/helpers.py +9 -9
- utils/pdf_processor.py +7 -7
- utils/prompts.py +3 -2
.flake8
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[flake8]
|
2 |
+
ignore = D203, E402, F403, F405, W503, W605
|
3 |
+
exclude = .git,env,__pycache__,docs/source/conf.py,old,build,dist, *migrations*,env,venv,alembic
|
4 |
+
max-complexity = 10
|
5 |
+
max-line-length = 119
|
.isort.cfg
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[settings]
|
2 |
+
multi_line_output=3
|
3 |
+
include_trailing_comma=True
|
4 |
+
force_grid_wrap=0
|
5 |
+
use_parentheses=True
|
6 |
+
line_length=88
|
7 |
+
skip=env,migrations,alembic,venv
|
Makefile
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.PHONY: lint-apply lint-check
|
2 |
+
|
3 |
+
lint-check:
|
4 |
+
@echo "Checking for lint errors..."
|
5 |
+
flake8 .
|
6 |
+
black --check .
|
7 |
+
isort --check-only .
|
8 |
+
|
9 |
+
lint-apply:
|
10 |
+
@echo "apply linting ..."
|
11 |
+
black .
|
12 |
+
isort .
|
api.py
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
-
import os
|
2 |
import logging
|
3 |
-
|
4 |
-
from fastapi import FastAPI, HTTPException
|
5 |
-
from gradio_client import Client
|
6 |
from enum import Enum
|
7 |
from typing import List, Optional
|
8 |
-
|
9 |
-
from fastapi.responses import FileResponse
|
10 |
from dotenv import load_dotenv
|
|
|
|
|
|
|
|
|
11 |
|
12 |
from docs import description, tags_metadata
|
13 |
|
@@ -21,9 +21,10 @@ app = FastAPI(
|
|
21 |
openapi_tags=tags_metadata,
|
22 |
)
|
23 |
GRADIO_URL = os.getenv("GRADIO_URL", "http://localhost:7860/")
|
24 |
-
logger.info(f"GRADIO_URL:
|
25 |
client = Client(GRADIO_URL)
|
26 |
|
|
|
27 |
class StudyVariables(str, Enum):
|
28 |
ebola_virus = "Ebola Virus"
|
29 |
vaccine_coverage = "Vaccine coverage"
|
@@ -35,6 +36,7 @@ class PromptType(str, Enum):
|
|
35 |
highlight = "Highlight"
|
36 |
evidence_based = "Evidence-based"
|
37 |
|
|
|
38 |
class StudyVariableRequest(BaseModel):
|
39 |
study_variable: StudyVariables
|
40 |
prompt_type: PromptType
|
@@ -42,6 +44,7 @@ class StudyVariableRequest(BaseModel):
|
|
42 |
|
43 |
model_config = ConfigDict(from_attributes=True)
|
44 |
|
|
|
45 |
class DownloadCSV(BaseModel):
|
46 |
text: constr(min_length=1, strip_whitespace=True) # type: ignore
|
47 |
|
@@ -64,44 +67,41 @@ class ZoteroCredentials(BaseModel):
|
|
64 |
@app.post("/process_zotero_library_items", tags=["zotero"])
|
65 |
def process_zotero_library_items(zotero_credentials: ZoteroCredentials):
|
66 |
result = client.predict(
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
)
|
71 |
-
return {"result":result}
|
72 |
-
|
73 |
|
74 |
|
75 |
@app.post("/get_study_info", tags=["zotero"])
|
76 |
def get_study_info(study: Study):
|
77 |
-
result = client.predict(
|
78 |
-
study_name=study.study_name,
|
79 |
-
api_name="/get_study_info"
|
80 |
-
)
|
81 |
# print(result)
|
82 |
-
return {"result":result}
|
83 |
|
84 |
|
85 |
@app.post("/study_variables", tags=["zotero"])
|
86 |
-
def process_study_variables(
|
|
|
|
|
87 |
result = client.predict(
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
)
|
93 |
print(type(result))
|
94 |
-
return {"result":result[0]}
|
95 |
|
96 |
|
97 |
@app.post("/download_csv", tags=["zotero"])
|
98 |
def download_csv(download_request: DownloadCSV):
|
99 |
result = client.predict(
|
100 |
-
|
101 |
-
api_name="/download_as_csv"
|
102 |
)
|
103 |
print(result)
|
104 |
-
|
105 |
file_path = result
|
106 |
if not file_path or not os.path.exists(file_path):
|
107 |
raise HTTPException(status_code=404, detail="File not found")
|
@@ -110,5 +110,7 @@ def download_csv(download_request: DownloadCSV):
|
|
110 |
return FileResponse(
|
111 |
file_path,
|
112 |
media_type="text/csv", # Specify the correct MIME type for CSV
|
113 |
-
filename=os.path.basename(
|
114 |
-
|
|
|
|
|
|
|
|
1 |
import logging
|
2 |
+
import os
|
|
|
|
|
3 |
from enum import Enum
|
4 |
from typing import List, Optional
|
5 |
+
|
|
|
6 |
from dotenv import load_dotenv
|
7 |
+
from fastapi import FastAPI, HTTPException
|
8 |
+
from fastapi.responses import FileResponse
|
9 |
+
from gradio_client import Client
|
10 |
+
from pydantic import BaseModel, ConfigDict, Field, constr
|
11 |
|
12 |
from docs import description, tags_metadata
|
13 |
|
|
|
21 |
openapi_tags=tags_metadata,
|
22 |
)
|
23 |
GRADIO_URL = os.getenv("GRADIO_URL", "http://localhost:7860/")
|
24 |
+
logger.info(f"GRADIO_URL: {GRADIO_URL}")
|
25 |
client = Client(GRADIO_URL)
|
26 |
|
27 |
+
|
28 |
class StudyVariables(str, Enum):
|
29 |
ebola_virus = "Ebola Virus"
|
30 |
vaccine_coverage = "Vaccine coverage"
|
|
|
36 |
highlight = "Highlight"
|
37 |
evidence_based = "Evidence-based"
|
38 |
|
39 |
+
|
40 |
class StudyVariableRequest(BaseModel):
|
41 |
study_variable: StudyVariables
|
42 |
prompt_type: PromptType
|
|
|
44 |
|
45 |
model_config = ConfigDict(from_attributes=True)
|
46 |
|
47 |
+
|
48 |
class DownloadCSV(BaseModel):
|
49 |
text: constr(min_length=1, strip_whitespace=True) # type: ignore
|
50 |
|
|
|
67 |
@app.post("/process_zotero_library_items", tags=["zotero"])
|
68 |
def process_zotero_library_items(zotero_credentials: ZoteroCredentials):
|
69 |
result = client.predict(
|
70 |
+
zotero_library_id=zotero_credentials.library_id,
|
71 |
+
zotero_api_access_key=zotero_credentials.api_access_key,
|
72 |
+
api_name="/process_zotero_library_items",
|
73 |
)
|
74 |
+
return {"result": result}
|
|
|
75 |
|
76 |
|
77 |
@app.post("/get_study_info", tags=["zotero"])
|
78 |
def get_study_info(study: Study):
|
79 |
+
result = client.predict(study_name=study.study_name, api_name="/get_study_info")
|
|
|
|
|
|
|
80 |
# print(result)
|
81 |
+
return {"result": result}
|
82 |
|
83 |
|
84 |
@app.post("/study_variables", tags=["zotero"])
|
85 |
+
def process_study_variables(
|
86 |
+
study_request: StudyVariableRequest,
|
87 |
+
):
|
88 |
result = client.predict(
|
89 |
+
text=study_request.text, # "study id, study title, study design, study summary",
|
90 |
+
study_name=study_request.study_variable, # "Ebola Virus",
|
91 |
+
prompt_type=study_request.prompt_type, # "Default",
|
92 |
+
api_name="/process_multi_input",
|
93 |
)
|
94 |
print(type(result))
|
95 |
+
return {"result": result[0]}
|
96 |
|
97 |
|
98 |
@app.post("/download_csv", tags=["zotero"])
|
99 |
def download_csv(download_request: DownloadCSV):
|
100 |
result = client.predict(
|
101 |
+
markdown_content=download_request.text, api_name="/download_as_csv"
|
|
|
102 |
)
|
103 |
print(result)
|
104 |
+
|
105 |
file_path = result
|
106 |
if not file_path or not os.path.exists(file_path):
|
107 |
raise HTTPException(status_code=404, detail="File not found")
|
|
|
110 |
return FileResponse(
|
111 |
file_path,
|
112 |
media_type="text/csv", # Specify the correct MIME type for CSV
|
113 |
+
filename=os.path.basename(
|
114 |
+
file_path
|
115 |
+
), # Provide a default filename for the download
|
116 |
+
)
|
app.py
CHANGED
@@ -1,35 +1,38 @@
|
|
1 |
# app.py
|
2 |
|
3 |
import csv
|
4 |
-
|
5 |
import datetime
|
6 |
-
|
7 |
# from datetime import datetime
|
8 |
import io
|
9 |
import json
|
10 |
import logging
|
11 |
import os
|
12 |
-
from typing import
|
13 |
|
14 |
import gradio as gr
|
15 |
import openai
|
|
|
16 |
from dotenv import load_dotenv
|
17 |
from slugify import slugify
|
18 |
-
from cachetools import LRUCache
|
19 |
|
20 |
-
from config import
|
|
|
21 |
from rag.rag_pipeline import RAGPipeline
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
from utils.helpers import (
|
23 |
-
append_to_study_files,
|
24 |
add_study_files_to_chromadb,
|
|
|
25 |
chromadb_client,
|
26 |
)
|
27 |
-
from utils.db import create_db_and_tables, add_study_files_to_db, get_study_file_by_name, get_study_files_by_library_id, get_all_study_files
|
28 |
-
from utils.prompts import highlight_prompt, evidence_based_prompt
|
29 |
-
from utils.zotero_manager import ZoteroManager
|
30 |
-
|
31 |
-
from interface import create_chat_interface
|
32 |
from utils.pdf_processor import PDFProcessor
|
|
|
|
|
33 |
|
34 |
# Configure logging
|
35 |
logging.basicConfig(level=logging.INFO)
|
@@ -54,11 +57,13 @@ cache = LRUCache(maxsize=100)
|
|
54 |
# data_ = {}
|
55 |
# json.dump(data_, file, indent=4)
|
56 |
|
|
|
57 |
def get_cache_value(key):
|
58 |
return cache.get(key)
|
59 |
|
|
|
60 |
zotero_library_id = get_cache_value("zotero_library_id")
|
61 |
-
logger.info(f"zotero_library_id:
|
62 |
|
63 |
|
64 |
def get_rag_pipeline(study_name: str) -> RAGPipeline:
|
@@ -83,22 +88,22 @@ def get_study_info(study_name: str | list) -> str:
|
|
83 |
"""Retrieve information about the specified study."""
|
84 |
if isinstance(study_name, list):
|
85 |
study_name = study_name[0] if study_name else None
|
86 |
-
|
87 |
if not study_name:
|
88 |
return "No study selected"
|
89 |
-
|
90 |
study = get_study_file_by_name(study_name)
|
91 |
-
logger.info(f"Study:
|
92 |
|
93 |
collection = chromadb_client.get_or_create_collection("study_files_collection")
|
94 |
result = collection.get(ids=[study_name]) # Query by study name (as a list)
|
95 |
-
logger.info(f"Result:
|
96 |
|
97 |
if not result or len(result["metadatas"]) == 0:
|
98 |
raise ValueError(f"Invalid study name: {study_name}")
|
99 |
|
100 |
study_file = result["metadatas"][0].get("file_path")
|
101 |
-
logger.info(f"study_file:
|
102 |
if not study_file:
|
103 |
raise ValueError(f"File path not found for study name: {study_name}")
|
104 |
|
@@ -154,7 +159,7 @@ def chat_function(message: str, study_name: str, prompt_type: str) -> str:
|
|
154 |
return "Please enter a valid query."
|
155 |
|
156 |
rag = get_rag_pipeline(study_name)
|
157 |
-
logger.info(f"rag:
|
158 |
prompt = {
|
159 |
"Highlight": highlight_prompt,
|
160 |
"Evidence-based": evidence_based_prompt,
|
@@ -229,7 +234,9 @@ def process_zotero_library_items(
|
|
229 |
|
230 |
# Dynamically update study choices
|
231 |
global study_choices
|
232 |
-
study_choices = [
|
|
|
|
|
233 |
message = "Successfully processed items in your zotero library"
|
234 |
except Exception as e:
|
235 |
message = f"Error process your zotero library: {str(e)}"
|
@@ -240,14 +247,16 @@ def process_zotero_library_items(
|
|
240 |
def refresh_study_choices():
|
241 |
"""
|
242 |
Refresh study choices for a specific dropdown instance.
|
243 |
-
|
244 |
:return: Updated Dropdown with current study choices
|
245 |
"""
|
246 |
global study_choices
|
247 |
zotero_library_id = get_cache_value("zotero_library_id")
|
248 |
-
logger.info(f"zotero_library_id:
|
249 |
-
study_choices = [
|
250 |
-
|
|
|
|
|
251 |
return study_choices
|
252 |
|
253 |
|
@@ -255,7 +264,7 @@ def process_multi_input(text, study_name, prompt_type):
|
|
255 |
# Split input based on commas and strip any extra spaces
|
256 |
variable_list = [word.strip().upper() for word in text.split(",")]
|
257 |
user_message = f"Extract and present in a tabular format the following variables for each {study_name} study: {', '.join(variable_list)}"
|
258 |
-
logger.info(f"User message:
|
259 |
response = chat_function(user_message, study_name, prompt_type)
|
260 |
return [response, gr.update(visible=True)]
|
261 |
|
@@ -400,7 +409,9 @@ def create_gr_interface() -> gr.Blocks:
|
|
400 |
if zotero_library_id is None:
|
401 |
zotero_library_id = get_cache_value("zotero_library_id")
|
402 |
logger.info(f"zotero_library_id: =====> {zotero_library_id}")
|
403 |
-
study_choices_db = get_study_files_by_library_id(
|
|
|
|
|
404 |
logger.info(f"study_choices_db: =====> {study_choices_db}")
|
405 |
study_files = get_all_study_files()
|
406 |
logger.info(f"study_files: =====> {study_files}")
|
@@ -501,8 +512,8 @@ def create_gr_interface() -> gr.Blocks:
|
|
501 |
).then(fn=cleanup_temp_files, inputs=None, outputs=None)
|
502 |
|
503 |
refresh_button.click(
|
504 |
-
fn=refresh_study_choices,
|
505 |
-
outputs=[study_dropdown] # Update the same dropdown
|
506 |
)
|
507 |
|
508 |
# Event handlers for PDF Chat tab
|
|
|
1 |
# app.py
|
2 |
|
3 |
import csv
|
|
|
4 |
import datetime
|
|
|
5 |
# from datetime import datetime
|
6 |
import io
|
7 |
import json
|
8 |
import logging
|
9 |
import os
|
10 |
+
from typing import Any, List, Tuple
|
11 |
|
12 |
import gradio as gr
|
13 |
import openai
|
14 |
+
from cachetools import LRUCache
|
15 |
from dotenv import load_dotenv
|
16 |
from slugify import slugify
|
|
|
17 |
|
18 |
+
from config import OPENAI_API_KEY, STUDY_FILES
|
19 |
+
from interface import create_chat_interface
|
20 |
from rag.rag_pipeline import RAGPipeline
|
21 |
+
from utils.db import (
|
22 |
+
add_study_files_to_db,
|
23 |
+
create_db_and_tables,
|
24 |
+
get_all_study_files,
|
25 |
+
get_study_file_by_name,
|
26 |
+
get_study_files_by_library_id,
|
27 |
+
)
|
28 |
from utils.helpers import (
|
|
|
29 |
add_study_files_to_chromadb,
|
30 |
+
append_to_study_files,
|
31 |
chromadb_client,
|
32 |
)
|
|
|
|
|
|
|
|
|
|
|
33 |
from utils.pdf_processor import PDFProcessor
|
34 |
+
from utils.prompts import evidence_based_prompt, highlight_prompt
|
35 |
+
from utils.zotero_manager import ZoteroManager
|
36 |
|
37 |
# Configure logging
|
38 |
logging.basicConfig(level=logging.INFO)
|
|
|
57 |
# data_ = {}
|
58 |
# json.dump(data_, file, indent=4)
|
59 |
|
60 |
+
|
61 |
def get_cache_value(key):
|
62 |
return cache.get(key)
|
63 |
|
64 |
+
|
65 |
zotero_library_id = get_cache_value("zotero_library_id")
|
66 |
+
logger.info(f"zotero_library_id: {zotero_library_id}")
|
67 |
|
68 |
|
69 |
def get_rag_pipeline(study_name: str) -> RAGPipeline:
|
|
|
88 |
"""Retrieve information about the specified study."""
|
89 |
if isinstance(study_name, list):
|
90 |
study_name = study_name[0] if study_name else None
|
91 |
+
|
92 |
if not study_name:
|
93 |
return "No study selected"
|
94 |
+
|
95 |
study = get_study_file_by_name(study_name)
|
96 |
+
logger.info(f"Study: {study}")
|
97 |
|
98 |
collection = chromadb_client.get_or_create_collection("study_files_collection")
|
99 |
result = collection.get(ids=[study_name]) # Query by study name (as a list)
|
100 |
+
logger.info(f"Result: {result}")
|
101 |
|
102 |
if not result or len(result["metadatas"]) == 0:
|
103 |
raise ValueError(f"Invalid study name: {study_name}")
|
104 |
|
105 |
study_file = result["metadatas"][0].get("file_path")
|
106 |
+
logger.info(f"study_file: {study_file}")
|
107 |
if not study_file:
|
108 |
raise ValueError(f"File path not found for study name: {study_name}")
|
109 |
|
|
|
159 |
return "Please enter a valid query."
|
160 |
|
161 |
rag = get_rag_pipeline(study_name)
|
162 |
+
logger.info(f"rag: {rag}")
|
163 |
prompt = {
|
164 |
"Highlight": highlight_prompt,
|
165 |
"Evidence-based": evidence_based_prompt,
|
|
|
234 |
|
235 |
# Dynamically update study choices
|
236 |
global study_choices
|
237 |
+
study_choices = [
|
238 |
+
file.name for file in get_study_files_by_library_id([zotero_library_id])
|
239 |
+
]
|
240 |
message = "Successfully processed items in your zotero library"
|
241 |
except Exception as e:
|
242 |
message = f"Error process your zotero library: {str(e)}"
|
|
|
247 |
def refresh_study_choices():
|
248 |
"""
|
249 |
Refresh study choices for a specific dropdown instance.
|
250 |
+
|
251 |
:return: Updated Dropdown with current study choices
|
252 |
"""
|
253 |
global study_choices
|
254 |
zotero_library_id = get_cache_value("zotero_library_id")
|
255 |
+
logger.info(f"zotero_library_id: {zotero_library_id}")
|
256 |
+
study_choices = [
|
257 |
+
file.name for file in get_study_files_by_library_id([zotero_library_id])
|
258 |
+
]
|
259 |
+
logger.info(f"Study choices: {study_choices}")
|
260 |
return study_choices
|
261 |
|
262 |
|
|
|
264 |
# Split input based on commas and strip any extra spaces
|
265 |
variable_list = [word.strip().upper() for word in text.split(",")]
|
266 |
user_message = f"Extract and present in a tabular format the following variables for each {study_name} study: {', '.join(variable_list)}"
|
267 |
+
logger.info(f"User message: {user_message}")
|
268 |
response = chat_function(user_message, study_name, prompt_type)
|
269 |
return [response, gr.update(visible=True)]
|
270 |
|
|
|
409 |
if zotero_library_id is None:
|
410 |
zotero_library_id = get_cache_value("zotero_library_id")
|
411 |
logger.info(f"zotero_library_id: =====> {zotero_library_id}")
|
412 |
+
study_choices_db = get_study_files_by_library_id(
|
413 |
+
[zotero_library_id]
|
414 |
+
)
|
415 |
logger.info(f"study_choices_db: =====> {study_choices_db}")
|
416 |
study_files = get_all_study_files()
|
417 |
logger.info(f"study_files: =====> {study_files}")
|
|
|
512 |
).then(fn=cleanup_temp_files, inputs=None, outputs=None)
|
513 |
|
514 |
refresh_button.click(
|
515 |
+
fn=refresh_study_choices,
|
516 |
+
outputs=[study_dropdown], # Update the same dropdown
|
517 |
)
|
518 |
|
519 |
# Event handlers for PDF Chat tab
|
docs.py
CHANGED
@@ -10,4 +10,4 @@ Welcome to the Acres AI RAG API documentation.
|
|
10 |
|
11 |
tags_metadata = [
|
12 |
{"name": "ACRES RAG", "description": "AI RAG Application"},
|
13 |
-
]
|
|
|
10 |
|
11 |
tags_metadata = [
|
12 |
{"name": "ACRES RAG", "description": "AI RAG Application"},
|
13 |
+
]
|
pyproject.toml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.black]
|
2 |
+
include = '\.pyi?$'
|
3 |
+
exclude = '''
|
4 |
+
/(
|
5 |
+
\.git
|
6 |
+
| \.hg
|
7 |
+
| \.mypy_cache
|
8 |
+
| \.tox
|
9 |
+
| \.venv
|
10 |
+
| env
|
11 |
+
|venv
|
12 |
+
| _build
|
13 |
+
| buck-out
|
14 |
+
| build
|
15 |
+
| dist
|
16 |
+
| migrations
|
17 |
+
|alembic
|
18 |
+
)/
|
19 |
+
'''
|
rag/rag_pipeline.py
CHANGED
@@ -1,19 +1,15 @@
|
|
1 |
# rag/rag_pipeline.py
|
2 |
import json
|
3 |
import logging
|
4 |
-
|
|
|
5 |
|
6 |
-
|
7 |
-
from llama_index.core
|
8 |
-
from llama_index.core import
|
9 |
from llama_index.embeddings.openai import OpenAIEmbedding
|
10 |
from llama_index.llms.openai import OpenAI
|
11 |
from llama_index.vector_stores.chroma import ChromaVectorStore
|
12 |
-
import chromadb
|
13 |
-
from typing import Dict, Any, List, Tuple, Optional
|
14 |
-
import re
|
15 |
-
import logging
|
16 |
-
|
17 |
|
18 |
logging.basicConfig(level=logging.INFO)
|
19 |
logger = logging.getLogger(__name__)
|
@@ -172,7 +168,6 @@ class RAGPipeline:
|
|
172 |
self.extract_page_number_from_query(context) if self.is_pdf else None
|
173 |
)
|
174 |
|
175 |
-
|
176 |
# This is a hack to index all the documents in the store :)
|
177 |
n_documents = len(self.index.docstore.docs)
|
178 |
print(f"n_documents: {n_documents}")
|
|
|
1 |
# rag/rag_pipeline.py
|
2 |
import json
|
3 |
import logging
|
4 |
+
import re
|
5 |
+
from typing import Any, Dict, List, Optional, Tuple
|
6 |
|
7 |
+
import chromadb
|
8 |
+
from llama_index.core import Document, PromptTemplate, VectorStoreIndex
|
9 |
+
from llama_index.core.node_parser import SentenceSplitter, SentenceWindowNodeParser
|
10 |
from llama_index.embeddings.openai import OpenAIEmbedding
|
11 |
from llama_index.llms.openai import OpenAI
|
12 |
from llama_index.vector_stores.chroma import ChromaVectorStore
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
logging.basicConfig(level=logging.INFO)
|
15 |
logger = logging.getLogger(__name__)
|
|
|
168 |
self.extract_page_number_from_query(context) if self.is_pdf else None
|
169 |
)
|
170 |
|
|
|
171 |
# This is a hack to index all the documents in the store :)
|
172 |
n_documents = len(self.index.docstore.docs)
|
173 |
print(f"n_documents: {n_documents}")
|
rag/rag_pipeline_backup.py
CHANGED
@@ -1,9 +1,8 @@
|
|
1 |
import json
|
2 |
-
from typing import Dict,
|
3 |
-
|
4 |
-
from llama_index.core
|
5 |
-
from llama_index.core import
|
6 |
-
from typing import List
|
7 |
from llama_index.embeddings.openai import OpenAIEmbedding
|
8 |
from llama_index.llms.openai import OpenAI
|
9 |
|
|
|
1 |
import json
|
2 |
+
from typing import Any, Dict, List
|
3 |
+
|
4 |
+
from llama_index.core import Document, PromptTemplate, VectorStoreIndex
|
5 |
+
from llama_index.core.node_parser import SentenceSplitter, SentenceWindowNodeParser
|
|
|
6 |
from llama_index.embeddings.openai import OpenAIEmbedding
|
7 |
from llama_index.llms.openai import OpenAI
|
8 |
|
requirements-dev.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
black==24.10.0
|
2 |
+
isort==5.13.2
|
3 |
+
flake8==7.1.1
|
utils/db.py
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:84acae8e51383d6990cd9edb7c1684292e523e7d0af87a71531bd5f9cf2909b5
|
3 |
+
size 4907
|
utils/helpers.py
CHANGED
@@ -1,18 +1,18 @@
|
|
1 |
# utils/helpers.py
|
2 |
|
3 |
-
|
|
|
|
|
|
|
|
|
4 |
from llama_index.core import Response
|
5 |
-
|
6 |
from rag.rag_pipeline import RAGPipeline
|
7 |
from utils.prompts import (
|
8 |
-
structured_follow_up_prompt,
|
9 |
-
VaccineCoverageVariables,
|
10 |
StudyCharacteristics,
|
|
|
|
|
11 |
)
|
12 |
-
import json
|
13 |
-
import json
|
14 |
-
import chromadb
|
15 |
-
from chromadb.api.types import Document
|
16 |
|
17 |
# Initialize ChromaDB client
|
18 |
chromadb_client = chromadb.Client()
|
@@ -88,7 +88,7 @@ def append_to_study_files(file_path, new_key, new_value):
|
|
88 |
"Gene Xpert": "data/gene_xpert_zotero_items.json"
|
89 |
}
|
90 |
"""
|
91 |
-
try:
|
92 |
# Read the existing data from the file
|
93 |
with open(file_path, "r") as file:
|
94 |
data = json.load(file)
|
|
|
1 |
# utils/helpers.py
|
2 |
|
3 |
+
import json
|
4 |
+
from typing import Any, Dict, List
|
5 |
+
|
6 |
+
import chromadb
|
7 |
+
from chromadb.api.types import Document
|
8 |
from llama_index.core import Response
|
9 |
+
|
10 |
from rag.rag_pipeline import RAGPipeline
|
11 |
from utils.prompts import (
|
|
|
|
|
12 |
StudyCharacteristics,
|
13 |
+
VaccineCoverageVariables,
|
14 |
+
structured_follow_up_prompt,
|
15 |
)
|
|
|
|
|
|
|
|
|
16 |
|
17 |
# Initialize ChromaDB client
|
18 |
chromadb_client = chromadb.Client()
|
|
|
88 |
"Gene Xpert": "data/gene_xpert_zotero_items.json"
|
89 |
}
|
90 |
"""
|
91 |
+
try:
|
92 |
# Read the existing data from the file
|
93 |
with open(file_path, "r") as file:
|
94 |
data = json.load(file)
|
utils/pdf_processor.py
CHANGED
@@ -3,17 +3,17 @@ PDF processing module for ACRES RAG Platform.
|
|
3 |
Handles PDF file processing, text extraction, and page rendering.
|
4 |
"""
|
5 |
|
6 |
-
# utils/pdf_processor.py
|
7 |
-
import os
|
8 |
-
import fitz
|
9 |
-
import logging
|
10 |
-
from typing import Dict, List, Optional
|
11 |
import datetime
|
12 |
-
from slugify import slugify
|
13 |
import json
|
14 |
-
|
|
|
|
|
15 |
import re
|
|
|
16 |
|
|
|
|
|
|
|
17 |
|
18 |
logger = logging.getLogger(__name__)
|
19 |
|
|
|
3 |
Handles PDF file processing, text extraction, and page rendering.
|
4 |
"""
|
5 |
|
|
|
|
|
|
|
|
|
|
|
6 |
import datetime
|
|
|
7 |
import json
|
8 |
+
import logging
|
9 |
+
# utils/pdf_processor.py
|
10 |
+
import os
|
11 |
import re
|
12 |
+
from typing import Dict, List, Optional
|
13 |
|
14 |
+
import fitz
|
15 |
+
from PIL import Image
|
16 |
+
from slugify import slugify
|
17 |
|
18 |
logger = logging.getLogger(__name__)
|
19 |
|
utils/prompts.py
CHANGED
@@ -1,9 +1,10 @@
|
|
1 |
# utils/prompts.py
|
2 |
|
|
|
|
|
3 |
from llama_index.core import PromptTemplate
|
4 |
-
from typing import Optional, List
|
5 |
-
from pydantic import BaseModel, Field
|
6 |
from llama_index.core.prompts import PromptTemplate
|
|
|
7 |
|
8 |
|
9 |
class StudyCharacteristics(BaseModel):
|
|
|
1 |
# utils/prompts.py
|
2 |
|
3 |
+
from typing import List, Optional
|
4 |
+
|
5 |
from llama_index.core import PromptTemplate
|
|
|
|
|
6 |
from llama_index.core.prompts import PromptTemplate
|
7 |
+
from pydantic import BaseModel, Field
|
8 |
|
9 |
|
10 |
class StudyCharacteristics(BaseModel):
|