Spaces:
Build error
Build error
Commit
·
7ffe358
1
Parent(s):
b2f16d4
Implementamos modulo LLM y VectorStore.
Browse files- .gitignore +3 -1
- Pipfile +8 -0
- app.py +9 -1
- chatbot/embeddings.py +20 -0
- chatbot/llm.py +36 -0
- chatbot/vectorstore.py +25 -0
- ingest.py +25 -0
- requirements.txt +2 -1
.gitignore
CHANGED
|
@@ -1,2 +1,4 @@
|
|
| 1 |
Pipfile.lock
|
| 2 |
-
*.pdf
|
|
|
|
|
|
|
|
|
| 1 |
Pipfile.lock
|
| 2 |
+
*.pdf
|
| 3 |
+
.env
|
| 4 |
+
chroma_db/
|
Pipfile
CHANGED
|
@@ -5,6 +5,14 @@ name = "pypi"
|
|
| 5 |
|
| 6 |
[packages]
|
| 7 |
huggingface-hub = "==0.25.2"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
[dev-packages]
|
| 10 |
gradio = "==5.5.0"
|
|
|
|
|
|
| 5 |
|
| 6 |
[packages]
|
| 7 |
huggingface-hub = "==0.25.2"
|
| 8 |
+
langchain = "*"
|
| 9 |
+
langchain-community = "*"
|
| 10 |
+
langchain-huggingface = "*"
|
| 11 |
+
langchain-chroma = "*"
|
| 12 |
+
einops = "*"
|
| 13 |
+
langchain-google-genai = "*"
|
| 14 |
+
langchain-core = "*"
|
| 15 |
|
| 16 |
[dev-packages]
|
| 17 |
gradio = "==5.5.0"
|
| 18 |
+
pypdf = "==5.1.0"
|
app.py
CHANGED
|
@@ -2,13 +2,21 @@
|
|
| 2 |
Chatbot Nuevo Régimen Académico
|
| 3 |
"""
|
| 4 |
from chatbot.ui import ChatbotInterface
|
|
|
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
def respond(message, history):
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
| 12 |
ui = ChatbotInterface(respond)
|
| 13 |
|
| 14 |
ui.app.launch()
|
|
|
|
| 2 |
Chatbot Nuevo Régimen Académico
|
| 3 |
"""
|
| 4 |
from chatbot.ui import ChatbotInterface
|
| 5 |
+
from chatbot.llm import GeminiAI
|
| 6 |
+
from langchain.globals import set_verbose, set_debug
|
| 7 |
|
| 8 |
|
| 9 |
def respond(message, history):
    """Answer one chat turn with the Gemini model.

    Builds the main prompt template, pipes it into the chat model and
    invokes the resulting chain with the incoming message and the
    conversation history.

    Relies on the module-level ``llm`` object, which this file only
    creates inside its ``if __name__ == "__main__"`` section.

    Args:
        message: Value substituted into the template's ``{message}`` slot.
        history: Value substituted into the template's ``{history}``
            placeholder.

    Returns:
        The ``content`` attribute of the chain's response.
    """
    pipeline = llm.getMainTemplate() | llm.llm
    payload = {"message": message, "history": history}
    return pipeline.invoke(payload).content
|
| 14 |
|
| 15 |
|
| 16 |
if __name__ == "__main__":
    # Switch on LangChain's global debug and verbose flags before any chain runs.
    set_debug(True)
    set_verbose(True)

    # `respond` looks this object up as a module global, so the name must stay `llm`.
    llm = GeminiAI("gemini-1.5-flash")

    # Hand the chat callback to the UI wrapper and launch its app.
    interface = ChatbotInterface(respond)
    interface.app.launch()
|
chatbot/embeddings.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Modulo embeddings
|
| 3 |
+
"""
|
| 4 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 5 |
+
|
| 6 |
+
def init_embeddings(embeddings_model_name="jinaai/jina-embeddings-v3"):
    """Create and return the embedding model.

    Args:
        embeddings_model_name: Model identifier passed to
            ``HuggingFaceEmbeddings`` as ``model_name``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance built with
        ``trust_remote_code=True`` in its model kwargs,
        ``normalize_embeddings=False`` in its encode kwargs, and
        ``show_progress=True``.
    """
    return HuggingFaceEmbeddings(
        model_name=embeddings_model_name,
        model_kwargs={"trust_remote_code": True},
        encode_kwargs={"normalize_embeddings": False},
        show_progress=True,
    )
|
chatbot/llm.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
"""
|
| 3 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 4 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 5 |
+
|
| 6 |
+
class GeminiAI:
    """Thin wrapper around a Google Gemini chat model."""

    def __init__(self, llm_model_name: str) -> None:
        """Create the chat model client.

        Args:
            llm_model_name: Model name passed to ``ChatGoogleGenerativeAI``
                as ``model``.
        """
        # Public attribute: callers use the model object directly.
        self.llm = ChatGoogleGenerativeAI(model=llm_model_name)

    def getMainTemplate(self) -> ChatPromptTemplate:
        """Return the main chat prompt template.

        The template consists of a system message, a ``{history}``
        placeholder and a human message filled from ``{message}``.

        Returns:
            The ``ChatPromptTemplate`` built from those three entries.
        """
        # The system text is sent to the model verbatim, so it must be
        # well-formed Spanish with correctly encoded accented characters.
        return ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    "Eres un asesor experto en la Resolucion 1650/24 de la DGCyE de la Provincia de Buenos Aires.\n"
                    "Tu tarea es utilizar la información de la conversación y el contexto disponible para responder las consultas del usuario.\n",
                ),
                ("placeholder", "{history}"),
                ("human", "{message}"),
            ]
        )
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
if __name__ == "__main__":
    # Manual smoke check: send a one-word greeting and print the raw response.
    client = GeminiAI("gemini-1.5-flash")
    reply = client.llm.invoke("Hola")
    print(reply)
|
chatbot/vectorstore.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Modulo que permite gestionar la vector store.
|
| 3 |
+
"""
|
| 4 |
+
from langchain_chroma import Chroma
|
| 5 |
+
import requests, zipfile, io, os
|
| 6 |
+
|
| 7 |
+
class ChromaDB:
    """Manage a Chroma vector store persisted under ``./chroma_db``."""

    def __init__(self, embedding_model) -> None:
        """Open the store, downloading the prebuilt database if it is missing.

        When no ``chroma_db`` path exists in the working directory, a zip
        archive is fetched from Google Drive (file id read from the
        ``GDRIVE_ID`` environment variable) and extracted into the
        working directory.

        Args:
            embedding_model: Embedding function handed to ``Chroma``.

        Raises:
            KeyError: If a download is needed and ``GDRIVE_ID`` is not set.
            requests.RequestException: If the download fails, times out,
                or returns an HTTP error status.
            zipfile.BadZipFile: If the downloaded payload is not a zip
                archive.
        """
        if not os.path.exists("chroma_db"):
            print("Descargando base de conocimiento...")
            zip_file_url = (
                "https://drive.google.com/uc?export=download&id="
                + os.environ["GDRIVE_ID"]
            )
            # Bound the wait so a stalled connection cannot hang startup forever.
            response = requests.get(zip_file_url, timeout=60)
            # Fail on HTTP errors instead of trying to unzip an error page.
            response.raise_for_status()
            # The context manager closes the archive once extraction is done.
            # NOTE(review): presumably the archive contains a top-level
            # chroma_db/ directory -- confirm against the uploaded file.
            with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
                archive.extractall()
            print("OK")

        # Public attribute: callers reach the underlying store through it.
        self.db = Chroma(
            collection_name="res_1650",
            embedding_function=embedding_model,
            persist_directory="./chroma_db",
        )
|
ingest.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Módulo para procesar el PDF de la resolución e indexar su contenido en la DB, para su posterior utilización por parte del chatbot.
|
| 3 |
+
|
| 4 |
+
Por simplicidad, se indexó un documento por cada página completa del documento. TODO: Implementar estrategia ParentDocumentRetriever.
|
| 5 |
+
"""
|
| 6 |
+
#from langchain_community.document_loaders import PyPDFLoader
|
| 7 |
+
from chatbot.embeddings import init_embeddings
|
| 8 |
+
from chatbot.vectorstore import ChromaDB
|
| 9 |
+
|
| 10 |
+
if __name__ == "__main__":
    # NOTE: the PDF ingestion steps below are disabled and kept only as
    # commented-out reference code; what runs is the query at the end.
    # loader = PyPDFLoader("2024_DP_134.pdf")
    vector_store = ChromaDB(init_embeddings())

    # for page in loader.lazy_load():
    #     print(f"Procesando pagina {page.metadata['page']} - len: {len(page.page_content)}")
    #     vector_store.add_documents([page])

    # Sanity query against the store, asking for two results, then print them.
    matches = vector_store.db.similarity_search(
        "Cuantos anexos contiene la resolucion?",
        k=2,
    )
    print(matches)
|
requirements.txt
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
-
gradio==5.5.0
|
|
|
|
|
|
| 1 |
+
gradio==5.5.0
|
| 2 |
+
pypdf==5.1.0
|