sinafarhangdoust committed
Commit e34a2a6 · 1 Parent(s): c102038

feat: added the AKN + limited search space version for the Chat-Eurlex

Files changed (7):
  1. EurLexChat.py +121 -79
  2. app.py +59 -21
  3. chat_utils.py +33 -9
  4. config.py +13 -3
  5. config.yaml +12 -5
  6. consts.py +73 -0
  7. requirements.txt +4 -3
EurLexChat.py CHANGED
@@ -6,21 +6,26 @@ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain_core.tools import StructuredTool
 from langchain_core.utils.function_calling import convert_to_openai_tool
 from langchain_core.messages import AIMessage
-from typing import List
+from typing import List, Optional
 from chat_utils import get_init_modules, SYSTEM_PROMPT, SYSTEM_PROMPT_LOOP, ContextInput, Answer, get_vectorDB_module
 from langchain_core.documents.base import Document
-
+from langchain_core.runnables import ConfigurableField
+import qdrant_client.models as rest
 
 class EurLexChat:
     def __init__(self, config: dict):
         self.config = config
         self.max_history_messages = self.config["max_history_messages"]
+        self.vectorDB_class = self.config['vectorDB']['class']
         self.use_functions = (
-            'use_context_function' in config["llm"] and
-            config["llm"]["use_context_function"] and
+            'use_context_function' in config["llm"] and
+            config["llm"]["use_context_function"] and
             config["llm"]["class"] == "ChatOpenAI")
 
-        self.embedder, self.llm, self.chatDB_class, self.retriever = get_init_modules(config)
+        self.embedder, self.llm, self.chatDB_class, self.retriever, retriever_chain = get_init_modules(
+            config)
+
+
         self.max_context_size = config["llm"]["max_context_size"]
 
         self.prompt = ChatPromptTemplate.from_messages([
@@ -43,17 +48,26 @@ class EurLexChat:
                 name="get_context",
                 description="To be used whenever the provided context is empty or the user changes the topic of the conversation and you need the context for the topic. " +
                 "This function must be called only when is strictly necessary. " +
-                "This function must not be called if you already have the information to answer the user. ",
+                "This function must not be called if you already have in the context the information to answer the user. ",
                 args_schema=ContextInput
             )
 
-            # self.llm = self.llm.bind(tools=[convert_to_openai_tool(GET_CONTEXT_TOOL)])
-            self.llm_with_functions = self.llm.bind(tools=[convert_to_openai_tool(GET_CONTEXT_TOOL)])
-
-            chain = self.prompt | RunnableLambda(self._resize_history) | self.llm_with_functions
+            self.llm_with_functions = self.llm.bind(
+                tools=[convert_to_openai_tool(GET_CONTEXT_TOOL)]
+            )
+
+            chain = (
+                self.prompt |
+                RunnableLambda(self._resize_history) |
+                self.llm_with_functions
+            )
         else:
-            chain = self.prompt | RunnableLambda(self._resize_history) | self.llm
-
+            chain = (
+                self.prompt |
+                RunnableLambda(self._resize_history) |
+                self.llm
+            )
+
         self.chain_with_history = RunnableWithMessageHistory(
             chain,
             self.get_chat_history,
@@ -61,8 +75,7 @@
             history_messages_key="history",
         )
 
-        self.relevant_documents_pipeline = ( self.retriever | self._parse_documents )
-
+        self.relevant_documents_pipeline = (retriever_chain | self._parse_documents)
 
     def _resize_history(self, input_dict):
         """
@@ -77,11 +90,10 @@
 
         messages = input_dict.messages
         if (len(messages) - 2) > self.max_history_messages:
-            messages = [messages[0]] + messages[-(self.max_history_messages +1):]
+            messages = [messages[0]] + messages[-(self.max_history_messages + 1):]
         input_dict.messages = messages
         return input_dict
 
-
     def get_chat_history(self, session_id: str):
         """
         Retrieve chat history instance for a specific session ID.
@@ -108,7 +120,6 @@
         else:
            return self.chatDB_class(session_id=session_id, **kwargs)
 
-
     def _parse_documents(self, docs: List[Document]) -> List[dict]:
         """
         Parse a list of documents into a standardized format.
@@ -126,11 +137,11 @@
             parsed_documents.append({
                 'text': doc.page_content,
                 'source': doc.metadata["source"],
+                'celex': doc.metadata["celex"],
                 '_id': doc.metadata["_id"]
             })
         return parsed_documents
 
-
     def _format_context_docs(self, context_docs: List[dict]) -> str:
         """
         Format a list of documents into a single string.
@@ -147,37 +158,107 @@
             context_str += doc['text'] + "\n\n"
         return context_str
 
-    def get_relevant_docs(self, question:str) -> List[dict]:
+    def get_ids_from_celexes(self, celex_list: List[str]):
+        """
+        Retrieve the IDs of the documents given their CELEX numbers.
+
+        Args:
+            celex_list (List[str]): A list of CELEX numbers.
+
+        Returns:
+            List[str]: A list of document IDs corresponding to the provided CELEX numbers
+        """
+
+        if self.vectorDB_class == 'Qdrant':
+            scroll_filter = rest.Filter(
+                must=[
+                    rest.FieldCondition(
+                        key="celex",
+                        match=rest.MatchAny(any=celex_list),
+                    )
+                ])
+            offset = -1
+            ids = []
+            while not (offset is None and offset != -1):
+                if offset == -1:
+                    offset = None
+                points, offset = self.retriever.vectorstore.client.scroll(
+                    collection_name=self.retriever.vectorstore.collection_name,
+                    limit=100,
+                    offset=offset,
+                    scroll_filter=scroll_filter,
+                    with_payload=False
+                )
+                ids.extend([p.id for p in points])
+        else:
+            raise NotImplementedError(f"Not supported {self.vectorDB_class} vectorDB class")
+        return ids
+
+    def _get_qdrant_ids_filter(self, ids):
+        """
+        Returns a Qdrant filter to filter documents based on their IDs.
+
+        This function acts as a workaround due to a hidden bug in Qdrant
+        that prevents correct filtering using CELEX numbers.
+
+        Args:
+            ids (List[str]): A list of document IDs.
+
+        Returns:
+            Qdrant filter: A Qdrant filter to filter documents based on their IDs.
+        """
+
+        filter = rest.Filter(
+            must=[
+                rest.HasIdCondition(has_id=ids),
+            ],
+        )
+
+        return filter
+
+    def get_relevant_docs(self, question: str, ids_list: Optional[List[str]] = None) -> List[dict]:
         """
         Retrieve relevant documents based on a given question.
+        If ids_list is provided, the search is filtered by the given IDs.
 
         Args:
             question (str): The question for which relevant documents are retrieved.
+            ids_list (Optional[List[str]]): A list of document IDs to filter the search results.
 
         Returns:
             List[dict]: A list of relevant documents.
         """
-        docs = self.relevant_documents_pipeline.invoke(question)
+        if ids_list:
+            search_kwargs = {k: v for k, v in self.retriever.search_kwargs.items()}
+            if self.vectorDB_class == 'Qdrant':
+                filter = self._get_qdrant_ids_filter(ids_list)
+            else:
+                raise ValueError(f'Celex filter not supported for {self.vectorDB_class}')
 
+            search_kwargs.update({'filter': filter})
+            docs = self.relevant_documents_pipeline.invoke(
+                {'question': question},
+                config={"configurable": {"search_kwargs": search_kwargs}})
+        else:
+            docs = self.relevant_documents_pipeline.invoke({'question': question})
         return docs
 
-    def get_context(self, text:str) -> str:
+    def get_context(self, text: str, ids_list: Optional[List[str]] = None) -> str:
         """
         Retrieve context for a given text.
+        If ids_list is provided, the search is filtered by the given IDs.
 
         Args:
             text (str): The text for which context is retrieved.
+            ids_list (Optional[List[str]]): A list of document IDs to filter the search results.
 
         Returns:
             str: A formatted string containing the relevant documents texts.
         """
 
-        docs = self.get_relevant_docs(text)
+        docs = self.get_relevant_docs(text, ids_list=ids_list)
         return self._format_context_docs(docs)
 
     def _remove_last_messages(self, session_id:str, n:int) -> None:
         """
         Remove last n messages from the chat history of a specific session.
@@ -193,7 +274,6 @@
         for message in message_history:
             chat_history.add_message(message)
 
-
     def _format_history(self, session_id:str) -> str:
         """
         Format chat history for a specific session into a string.
@@ -211,8 +291,7 @@
             formatted_history += f"{message.type}: {message.content}\n\n"
         return formatted_history
 
-
-    def _resize_context(self, context_docs:List[dict]) -> List[dict]:
+    def _resize_context(self, context_docs: List[dict]) -> List[dict]:
         """
         Resize the dimension of the context in terms of number of tokens.
         If the concatenation of document text exceeds max_context_size,
@@ -232,16 +311,24 @@
                 resized_contexts.append(context_docs[i])
                 total_len += l
         return resized_contexts
 
-    def get_answer(self, session_id:str, question:str, context_docs:List[dict], from_tool:bool=False) -> Answer:
+    def get_answer(self,
+                   session_id: str,
+                   question: str,
+                   context_docs: List[dict],
+                   from_tool: bool = False,
+                   ids_list: List[str] = None
+                   ) -> Answer:
         """
         Get an answer to a question of a specific session, considering context documents and history messages.
+        If ids_list is provided, any search for new context documents is filtered by the given IDs.
 
         Args:
             session_id (str): The session ID for which the answer is retrieved.
             question (str): The new user message.
             context_docs (List[dict]): A list of documents used as context to answer the user message.
             from_tool (bool, optional): Whether the question originates from a tool. Defaults to False.
+            ids_list (Optional[List[str]]): A list of document IDs to filter the search results for new context documents.
 
         Returns:
             Answer: An object containing the answer along with a new list of context documents
@@ -264,63 +351,18 @@
             self.get_chat_history(session_id=session_id).add_message(AIMessage(result.content))
             return Answer(answer=result.content, status=-1)
         text = eval(result.additional_kwargs['tool_calls'][0]['function']['arguments'])['text']
-        new_docs = self.get_relevant_docs(text)
+        new_docs = self.get_relevant_docs(text, ids_list=ids_list)
         self._remove_last_messages(session_id=session_id, n=2)
 
         result = self.get_answer(
             session_id=session_id,
             question=question,
             context_docs=new_docs,
-            from_tool=True
+            from_tool=True,
+            ids_list=ids_list
         )
         if result.status == 1:
             return Answer(answer=result.answer, new_documents=new_docs)
         else:
-            return Answer(answer=result.answer)
-        return Answer(answer=result.content)
-
-
-class EurLexChatAkn(EurLexChat):
-    def _parse_documents(self, docs: List[Document]) -> List[dict]:
-        """
-        Parse a list of documents into a standardized format.
-
-        Args:
-            docs (List[Document]): A list of documents to parse.
-
-        Returns:
-            List[dict]: A list of dictionaries, each containing parsed information from the input documents.
-        """
-
-        parsed_documents = []
-
-        for doc in docs:
-            parsed_documents.append({
-                'text': doc.page_content,
-                'source': doc.metadata["uri"],
-                '_id': doc.metadata["uri"] + doc.metadata["article_id"]
-            })
-        return parsed_documents
-
-    def get_relevant_docs(self, question: str, eurovoc: str = None) -> List[dict]:
-        """
-        Retrieve relevant documents based on a given question.
-
-        Args:
-            question (str): The question for which relevant documents are retrieved.
-            eurovoc (str): The Eurovoc to be used as filter
-
-        Returns:
-            List[dict]: A list of relevant documents.
-        """
-        if eurovoc:
-            retriever = get_vectorDB_module(
-                self.config['vectorDB'], self.embedder, metadata={'filter': {'eurovoc': ''}}
-            )
-            relevant_documents_pipeline_with_filter = (retriever | self._parse_documents)
-            docs = relevant_documents_pipeline_with_filter.invoke(
-                question
-            )
-        else:
-            docs = self.relevant_documents_pipeline.invoke(question)
-        return docs
+            return Answer(answer=result.answer)
+        return Answer(answer=result.content)
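
A minimal usage sketch (illustrative, not part of the commit) of the new CELEX-restricted retrieval flow, assuming a valid CONFIG and a populated Qdrant collection; the session ID and question strings are made up for the example:

    from EurLexChat import EurLexChat
    from config import CONFIG
    from consts import JUSTICE_CELEXES

    chat = EurLexChat(config=CONFIG)

    # Resolve CELEX numbers to Qdrant point IDs once, up front.
    justice_ids = chat.get_ids_from_celexes(JUSTICE_CELEXES)

    # Any retrieval triggered while answering is restricted to those points.
    answer = chat.get_answer(
        session_id="demo-session",
        question="Which regulation governs jurisdiction in matrimonial matters?",
        context_docs=[],
        ids_list=justice_ids,
    )
    print(answer.answer)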
app.py CHANGED
@@ -3,6 +3,9 @@ from EurLexChat import EurLexChat
 import random
 import string
 from config import CONFIG, UI_USER, UI_PWD
+from consts import JUSTICE_CELEXES, POLLUTION_CELEXES
+from enum import Enum
+import regex as re
 
 def generate_random_string(length):
     # Generate a random string of the specified length
@@ -11,31 +14,59 @@
     random_string = ''.join(random.choice(characters) for _ in range(length))
     return random_string
 
-class Documents():
+class ChatBot():
     def __init__(self) -> None:
         self.documents = []
+        self.chat = EurLexChat(config=CONFIG)
 
+class Versions(Enum):
+    AKN='Akoma Ntoso'
+    JUSTICE='Organisation of the legal system (1226) eurovoc'
+    POLLUTION='Pollution (2524) eurovoc'
+    BASIC='All eurovoc'
 
-chat = EurLexChat(config=CONFIG)
-docs = Documents()
 
+bot = ChatBot()
+
+justice_ids = bot.chat.get_ids_from_celexes(JUSTICE_CELEXES)
+pollution_ids = bot.chat.get_ids_from_celexes(POLLUTION_CELEXES)
+
+
+def reinit(version):
+    bot.documents = []
+    if version == Versions.AKN.value:
+        CONFIG['vectorDB']['kwargs']['collection_name'] += "-akn"
+    else:
+        CONFIG['vectorDB']['kwargs']['collection_name'] = re.sub(r'-akn$', '', CONFIG['vectorDB']['kwargs']['collection_name'])
+    bot.chat = EurLexChat(config=CONFIG)
+    return clean_page()
 
 def remove_doc(btn):
-    docs.documents.pop(btn)
-    new_accordions, new_texts = set_new_docs_ui(docs.documents)
+    bot.documents.pop(btn)
+    new_accordions, new_texts = set_new_docs_ui(bot.documents)
     return [*new_accordions, *new_texts]
 
 
-def get_answer(message, history, session_id):
+def get_answer(message, history, session_id, celex_type):
     s = session_id
+    if celex_type == Versions.JUSTICE.value:
+        ids_list = justice_ids
+    elif celex_type == Versions.POLLUTION.value:
+        ids_list = pollution_ids
+    elif celex_type == Versions.BASIC.value or celex_type == Versions.AKN.value:
+        ids_list = None
+    else:
+        raise ValueError(f'Wrong celex_type: {celex_type}')
+
     if len(history) == 0:
-        docs.documents = chat.get_relevant_docs(question=message)
+        bot.documents = []
+        #docs.documents = chat.get_relevant_docs(question=message, ids_list=ids_list)
         s = generate_random_string(7)
-    result = chat.get_answer(s, message, docs.documents)
+    result = bot.chat.get_answer(s, message, bot.documents, ids_list=ids_list)
     history.append((message, result.answer))
     if result.new_documents:
-        docs.documents = result.new_documents
-        accordions, list_texts = set_new_docs_ui(docs.documents)
+        bot.documents = result.new_documents
+        accordions, list_texts = set_new_docs_ui(bot.documents)
     return ['', history, gr.Column(scale=1, visible=True), *accordions, *list_texts, s]
 
 
@@ -44,7 +75,7 @@ def set_new_docs_ui(documents):
     new_texts = []
     for i in range(len(accordions)):
         if i < len(documents):
-            new_accordions.append(gr.update(accordions[i].elem_id, label=f"{documents[i]['text'][:45]}...", visible=True, open=False))
+            new_accordions.append(gr.update(accordions[i].elem_id, label=f"{documents[i]['celex']}: {documents[i]['text'][:40]}...", visible=True, open=False))
             new_texts.append(gr.update(list_texts[i].elem_id, value=f"{documents[i]['text']}...", visible=True))
         else:
             new_accordions.append(gr.update(accordions[i].elem_id, label="", visible=False))
@@ -53,15 +84,20 @@
 
 
 def clean_page():
-    docs.documents = []
-    accordions, list_texts = set_new_docs_ui(docs.documents)
-    return ["", [], None, *accordions, *list_texts]
+    bot.documents = []
+    accordions, list_texts = set_new_docs_ui(bot.documents)
+    return ["", [], None, *accordions, *list_texts, gr.Column(visible=False)]
 
 list_texts = []
 accordions = []
 states = []
 delete_buttons = []
 
+if CONFIG['vectorDB'].get('rerank'):
+    n_context_docs = CONFIG['vectorDB']['rerank']['kwargs']['top_n']
+else:
+    n_context_docs = CONFIG['vectorDB']['retriever_args']['search_kwargs']['k']
+
 block = gr.Blocks()
 with block:
 
@@ -71,15 +107,16 @@ with block:
     state = gr.State(value=None)
     with gr.Row():
         with gr.Column(scale=3):
+            drop_down = gr.Dropdown(label='Choose a version', choices=[attribute.value for attribute in Versions], value=Versions.BASIC)
            chatbot = gr.Chatbot()
            with gr.Row():
-                message = gr.Textbox(scale=10)
-                submit = gr.Button("Send", scale=1)
-                clear = gr.Button("Clear", scale=1)
+                message = gr.Textbox(scale=10, label='', placeholder='Write a message...', container=False)
+                submit = gr.Button("Send message", scale=1)
+                clear = gr.Button("Reset chat", scale=1)
 
        with gr.Column(scale=1, visible=False) as col:
            gr.Markdown("""<h3><center>Context documents</center></h3>""")
-            for i in range(CONFIG['vectorDB']['retriever_args']['search_kwargs']['k']):
+            for i in range(n_context_docs):
                with gr.Accordion(label="", elem_id=f'accordion_{i}', open=False) as acc:
                    list_texts.append(gr.Textbox("", elem_id=f'text_{i}', show_label=False, lines=10))
                    btn = gr.Button(f"Remove document")
@@ -101,9 +138,10 @@
    Contact us: <a href="mailto:chat-eur-lex@igsg.cnr.it">chat-eur-lex@igsg.cnr.it</a>.</p>
    </div>""")
 
-    clear.click(clean_page, outputs=[message, chatbot, state, *accordions, *list_texts])
-    message.submit(get_answer, inputs=[message, chatbot, state], outputs=[message, chatbot, col, *accordions, *list_texts, state])
-    submit.click(get_answer, inputs=[message, chatbot, state], outputs=[message, chatbot, col, *accordions, *list_texts, state])
+    drop_down.change(reinit, inputs=[drop_down], outputs=[message, chatbot, state, *accordions, *list_texts, col])
+    clear.click(clean_page, outputs=[message, chatbot, state, *accordions, *list_texts, col])
+    message.submit(get_answer, inputs=[message, chatbot, state, drop_down], outputs=[message, chatbot, col, *accordions, *list_texts, state])
+    submit.click(get_answer, inputs=[message, chatbot, state, drop_down], outputs=[message, chatbot, col, *accordions, *list_texts, state])
    for i, b in enumerate(delete_buttons):
        b.click(remove_doc, inputs=states[i], outputs=[*accordions, *list_texts])
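
A small sketch of what reinit does to the collection name when the version dropdown changes (assuming the initial value chat-eur-lex from config.yaml):

    import regex as re

    name = "chat-eur-lex"
    name += "-akn"                      # user selects the Akoma Ntoso version
    assert name == "chat-eur-lex-akn"
    name = re.sub(r'-akn$', '', name)   # user selects any non-AKN version
    assert name == "chat-eur-lex"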
chat_utils.py CHANGED
@@ -1,6 +1,9 @@
 from dataclasses import dataclass
 from typing import Optional, List
 from langchain.pydantic_v1 import BaseModel, Field
+from langchain_core.runnables import ConfigurableField
+from langchain_core.runnables.base import RunnableLambda
+from operator import itemgetter
 
 SYSTEM_PROMPT = (
     "You are an assistant specialized in the legal and compliance field who must answer and converse with the user using the context provided. " +
@@ -59,12 +62,11 @@ def get_init_modules(config):
     mod_chat = __import__("langchain_community.chat_message_histories",
                           fromlist=[config["chatDB"]["class"]])
     chatDB_class = getattr(mod_chat, config["chatDB"]["class"])
-    retriever = get_vectorDB_module(config['vectorDB'], embedder)
+    retriever, retriever_chain = get_vectorDB_module(config['vectorDB'], embedder)
 
-    return embedder, llm, chatDB_class, retriever
+    return embedder, llm, chatDB_class, retriever, retriever_chain
 
-
-def get_vectorDB_module(db_config, embedder, metadata=None):
+def get_vectorDB_module(db_config, embedder):
     mod_chat = __import__("langchain_community.vectorstores",
                           fromlist=[db_config["class"]])
     vectorDB_class = getattr(mod_chat, db_config["class"])
@@ -85,13 +87,10 @@ def get_vectorDB_module(db_config, embedder, metadata=None):
 
         client = QdrantClient(**client_kwargs)
 
-        if metadata is None:
-            metadata = {}
         retriever = vectorDB_class(
             client, embeddings=embedder, **db_kwargs).as_retriever(
                 search_type=db_config["retriever_args"]["search_type"],
-                search_kwargs={**db_config["retriever_args"]["search_kwargs"], **metadata},
-                filter=metadata
+                search_kwargs={**db_config["retriever_args"]["search_kwargs"]}
         )
 
     else:
@@ -100,4 +99,29 @@
             search_kwargs=db_config["retriever_args"]["search_kwargs"]
         )
 
-    return retriever
+    retriever = retriever.configurable_fields(
+        search_kwargs=ConfigurableField(
+            id="search_kwargs",
+            name="Search Kwargs",
+            description="The search kwargs to use. Includes dynamic category adjustment.",
+        )
+    )
+
+    chain = (RunnableLambda(lambda x: x['question']) | retriever)
+
+    if db_config.get("rerank"):
+        if db_config["rerank"]["class"] == "CohereRerank":
+            module_compressors = __import__("langchain.retrievers.document_compressors",
+                                            fromlist=[db_config["rerank"]["class"]])
+            rerank_class = getattr(module_compressors, db_config["rerank"]["class"])
+            rerank = rerank_class(**db_config["rerank"]["kwargs"])
+
+            chain = ({
+                "docs": chain,
+                "query": itemgetter("question"),
+            } | RunnableLambda(lambda x: rerank.compress_documents(x['docs'], x['query'])))
+        else:
+            raise NotImplementedError(db_config["rerank"]["class"])
+    return retriever, chain
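
A minimal sketch of the per-invocation override that the ConfigurableField above enables, mirroring how EurLexChat.get_relevant_docs uses it; retriever and retriever_chain come from get_vectorDB_module, and ids_filter stands in for a Qdrant filter built elsewhere:

    search_kwargs = dict(retriever.search_kwargs)   # start from the YAML defaults
    search_kwargs["filter"] = ids_filter            # hypothetical rest.Filter instance
    docs = retriever_chain.invoke(
        {"question": "air quality limit values"},
        config={"configurable": {"search_kwargs": search_kwargs}},
    )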
config.py CHANGED
@@ -24,12 +24,22 @@ CONFIG["llm"]["kwargs"]["openai_organization"] = OPENAI_ORG_KEY
 CONFIG["vectorDB"]["kwargs"]["url"] = QDRANT_URL
 CONFIG["vectorDB"]["kwargs"]["api_key"] = QDRANT_KEY
 
+
 # if the history should be stored on AWS DynamoDB
 # otherwise it will be stored on local FS to the output_path defined in the config.yaml file
 if CONFIG['chatDB']['class'] == 'DynamoDBChatMessageHistory':
-    CHATDB_TABLE_NAME = os.getenv("CHATDB_TABLE_NAME", CONFIG["chatDB"]["kwargs"].get("table_name", "ChatEurlexHistory"))
-    AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", CONFIG["chatDB"]["kwargs"].get("aws_access_key_id", ""))
-    AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", CONFIG["chatDB"]["kwargs"].get("aws_secret_access_key", ""))
+    CHATDB_TABLE_NAME = os.getenv("CHATDB_TABLE_NAME",
+                                  CONFIG["chatDB"]["kwargs"].get("table_name", "ChatEurlexHistory"))
+    AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID",
+                                  CONFIG["chatDB"]["kwargs"].get("aws_access_key_id", ""))
+    AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY",
+                                      CONFIG["chatDB"]["kwargs"].get("aws_secret_access_key", ""))
     CONFIG["chatDB"]["kwargs"]["table_name"] = CHATDB_TABLE_NAME
     CONFIG["chatDB"]["kwargs"]["aws_access_key_id"] = AWS_ACCESS_KEY_ID
     CONFIG["chatDB"]["kwargs"]["aws_secret_access_key"] = AWS_SECRET_ACCESS_KEY
+
+# if Cohere reranking is enabled, look for the API key and assign it to the CONFIG
+if CONFIG['vectorDB'].get('rerank'):
+    COHERE_KEY = os.getenv("COHERE_API_KEY",
+                           CONFIG["vectorDB"]["rerank"]["kwargs"].get("cohere_api_key", ""))
+    CONFIG["vectorDB"]["rerank"]["kwargs"]["cohere_api_key"] = COHERE_KEY
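
The precedence used throughout config.py: the environment variable wins, and the YAML value (often an empty placeholder) is only a fallback. A standalone illustration of the same pattern:

    import os

    yaml_value = CONFIG["vectorDB"]["rerank"]["kwargs"].get("cohere_api_key", "")
    cohere_key = os.getenv("COHERE_API_KEY", yaml_value)  # env var takes precedence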
config.yaml CHANGED
@@ -4,15 +4,22 @@ vectorDB:
   url: ""
   api_key: ""
   collection_name: chat-eur-lex
+  timeout: 60
 
   retriever_args:
     search_type: mmr
     search_kwargs:
-      k: 15
+      k: 100
       fetch_k: 300
-      score_threshold: 0.0
       lambda_mult: 0.8
 
+  rerank:
+    class: CohereRerank
+    kwargs:
+      cohere_api_key: ""
+      model: rerank-multilingual-v3.0
+      top_n: 15
+
 embeddings:
   class: OpenAIEmbeddings
   kwargs:
@@ -22,9 +29,9 @@ embeddings:
 llm:
   class: ChatOpenAI
   use_context_function: True
-  max_context_size: 6000
+  max_context_size: 12000
   kwargs:
-    model_name: gpt-4
+    model_name: gpt-4o
     temperature: 0.8
 
 
@@ -35,4 +42,4 @@ chatDB:
     aws_access_key_id: ''
     aws_secret_access_key: ''
 
-max_history_messages: 5
+max_history_messages: 10
consts.py ADDED
@@ -0,0 +1,73 @@
+JUSTICE_CELEXES = [
+    "32024D0414",
+    "32023D2098",
+    "32023D0133",
+    "32022D0998",
+    "32022D0494",
+    "32021D1711",
+    "32021D1943",
+    "32021R0693",
+    "32020D1117",
+    "32019D1798",
+    "32019D1564",
+    "32019R1111",
+    "32019D0844",
+    "32019R0629",
+    "32019D0598",
+    "32018R1990",
+    "32018R1935",
+    "32018D1275",
+    "32018D1103",
+    "32018D1094",
+    "02018D1696-20200711",
+    "32018D0856",
+    "02017R1939-20210110",
+    "32017D0973",
+    "32016D1990",
+    "32016R1192",
+    "32016R1104",
+    "32016R1103",
+    "32016D0947",
+    "32016D0954",
+    "32016D0454",
+    "32015R2422",
+    "32015D1380",
+    "32014R1329",
+    "32014D0887",
+    "32014D0444",
+    "32013L0048",
+    "02012R1215-20150110",
+    "32012R0650",
+    "32011R0969",
+    "32009D0026",
+    "02009R0004-20150312",
+    "32008R0593",
+    "32007D0712",
+    "32005F0667",
+    "32005D0150",
+    "32004D0407",
+    "32002D0971"
+]
+
+POLLUTION_CELEXES = [
+    "32022D0591",
+    "02018R0842-20230516",
+    "32006D0871",
+    "22006A1208(04)",
+    "32021R1119",
+    "32021R0783",
+    "32020R0852",
+    "02019R0856-20210811",
+    "02017R1369-20210501",
+    "32016D1841",
+    "22016A1019(01)",
+    "32015L2193",
+    "02015R0757-20161216",
+    "32023R1115",
+    "32023R0955",
+    "32022D0591",
+    "02018R2067-20210101",
+    "02018R2067-20210101",
+    "32021R1119",
+    "32020R1294"
+]
requirements.txt CHANGED
@@ -1,8 +1,9 @@
-langchain==0.1.6
+langchain==0.1.14
 lxml==4.9.2
-tiktoken==0.6.0
+tiktoken==0.7.0
 qdrant-client==1.7.3
 transformers==4.37.2
 openai==1.12.0
 gradio==4.18.0
-boto3==1.34
+boto3==1.34
+cohere==5.5.8