commit
- .gitignore +1 -0
- LICENSE +9 -0
- README.md +28 -13
- app/__init__.py +0 -0
- app/chroma.py +103 -0
- app/config/__init__.py +0 -0
- app/config/load.py +16 -0
- app/config/models/__init__.py +0 -0
- app/config/models/configs.py +118 -0
- app/config/models/openai.py +29 -0
- app/config/models/vertexai.py +37 -0
- app/config/types.ts +65 -0
- app/main.py +202 -0
- app/parsers/__init__.py +0 -0
- app/parsers/markdown.py +431 -0
- app/parsers/splitter.py +141 -0
- app/pipeline.py +165 -0
- app/ranking.py +55 -0
- app/splade.py +179 -0
- app/utils.py +17 -0
- config/config.js +40 -0
- config/openai.js +31 -0
- config/vertexai.js +31 -0
- documents/falcon-refinedweb.zip +3 -0
- evaluation_dataset.json +0 -0
- requirements.txt +20 -0
.gitignore
ADDED
@@ -0,0 +1 @@
/documents/falcon-refinedweb/
LICENSE
ADDED
@@ -0,0 +1,9 @@
MIT License

Copyright (c) 2024 Denis Lapchev

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
README.md
CHANGED
@@ -1,13 +1,28 @@
## Prerequisites

* Tested on Linux (Ubuntu 22.04) and macOS (Apple Silicon).
* Python 3.10


### Before you start

Install the Python dependencies from `requirements.txt` and unzip `documents/falcon-refinedweb.zip` into the `documents/falcon-refinedweb` folder.

### Create document embeddings

```bash
python -m app.main index -c config/config.js
```

### Predict on the dataset

```bash
python -m app.main predict -c config/config.js -m config/openai.js
```

### Evaluate the dataset

```bash
python -m app.main evaluate -c config/config.js -m config/openai.js
```
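The unzip step can also be scripted. A minimal sketch using only the Python standard library, with the paths from the instructions above (it assumes the archive extracts directly into the target folder):

```python
import zipfile
from pathlib import Path

# Unpack the bundled corpus into the folder expected by the indexing step.
archive = Path("documents/falcon-refinedweb.zip")
target = Path("documents/falcon-refinedweb")
target.mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(archive) as zf:
    zf.extractall(target)
```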
app/__init__.py
ADDED
File without changes
app/chroma.py
ADDED
@@ -0,0 +1,103 @@
import shutil
from pathlib import Path
from typing import List, Optional, Tuple

import tqdm
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from loguru import logger

from app.config.models.configs import Config
from app.parsers.splitter import Document
from app.utils import torch_device


class ChromaDenseVectorDB:
    def __init__(self, persist_folder: str, config: Config):
        self._persist_folder = persist_folder
        self._config = config
        logger.info(f"Embedding model config: {config}")
        self._embeddings = SentenceTransformerEmbeddings(model_name=config.embeddings.embedding_model.model_name,
                                                         model_kwargs={"device": torch_device()})
        self.batch_size = 200

        self._retriever = None
        self._vectordb = None

    @property
    def retriever(self):
        if self._retriever is None:
            self._retriever = self._load_retriever()
        return self._retriever

    @property
    def vectordb(self):
        if self._vectordb is None:
            self._vectordb = Chroma(
                persist_directory=self._persist_folder,
                embedding_function=self._embeddings,
            )
        return self._vectordb

    def generate_embeddings(
        self,
        docs: List[Document],
        clear_persist_folder: bool = True,
    ):
        if clear_persist_folder:
            pf = Path(self._persist_folder)
            if pf.exists() and pf.is_dir():
                logger.warning(f"Deleting the content of: {pf}")
                shutil.rmtree(pf)

        logger.info("Generating and persisting the embeddings..")

        vectordb = None
        for group in tqdm.tqdm(
            chunker(docs, size=self.batch_size),
            total=int(len(docs) / self.batch_size),
        ):
            ids = [d.metadata["document_id"] for d in group]
            if vectordb is None:
                vectordb = Chroma.from_documents(
                    documents=group,
                    embedding=self._embeddings,
                    ids=ids,
                    persist_directory=self._persist_folder,
                )
            else:
                vectordb.add_texts(
                    texts=[doc.page_content for doc in group],
                    embedding=self._embeddings,
                    ids=ids,
                    metadatas=[doc.metadata for doc in group],
                )
        logger.info("Generated embeddings. Persisting...")
        if vectordb is not None:
            vectordb.persist()

    def _load_retriever(self, **kwargs):
        return self.vectordb.as_retriever(**kwargs)

    def get_documents_by_id(self, document_ids: List[str]) -> List[Document]:
        results = self.retriever.vectorstore.get(ids=document_ids, include=["metadatas", "documents"])  # type: ignore
        docs = [
            Document(page_content=d, metadata=m)
            for d, m in zip(results["documents"], results["metadatas"])
        ]
        return docs

    def similarity_search_with_relevance_scores(
        self, query: str, filter: Optional[dict]
    ) -> List[Tuple[Document, float]]:
        if isinstance(filter, dict) and len(filter) > 1:
            filter = {"$and": [{key: {"$eq": value}} for key, value in filter.items()]}
        print("Filter = ", filter)

        return self.retriever.vectorstore.similarity_search_with_relevance_scores(
            query, k=self._config.semantic_search.max_k, filter=filter
        )


def chunker(seq, size):
    return (seq[pos: pos + size] for pos in range(0, len(seq), size))
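A minimal usage sketch for `ChromaDenseVectorDB` (not part of this commit): it assumes a `Config` loaded from the app config shipped here, and that each document carries the `document_id` metadata that `generate_embeddings` requires.

```python
from app.chroma import ChromaDenseVectorDB
from app.config.load import load_config
from app.parsers.splitter import Document

config = load_config("config/config.js")
db = ChromaDenseVectorDB(persist_folder=str(config.embeddings.embeddings_path), config=config)

# Index a couple of placeholder documents.
docs = [
    Document(page_content="SPLADE is a sparse retrieval model.", metadata={"document_id": "doc-1"}),
    Document(page_content="Chroma stores dense vector embeddings.", metadata={"document_id": "doc-2"}),
]
db.generate_embeddings(docs=docs)

# Dense retrieval with relevance scores; filter is optional metadata filtering.
hits = db.similarity_search_with_relevance_scores("what is SPLADE?", filter=None)
for doc, score in hits:
    print(score, doc.page_content)
```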
app/config/__init__.py
ADDED
File without changes
app/config/load.py
ADDED
@@ -0,0 +1,16 @@
import execjs

from app.config.models.configs import Config


def load_config(app_config_path: str, model_config_path: str = None) -> Config:
    doc_config_dict = load_js_object(app_config_path)
    if model_config_path is not None:
        model_config_dict = load_js_object(model_config_path)

    return Config(**(doc_config_dict if model_config_path is None else {**doc_config_dict, "llm": model_config_dict}))


def load_js_object(config_path: str) -> dict:
    with open(config_path, "r") as f:
        return execjs.compile(f.read()).eval("module.exports")
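A short sketch of how `load_config` is meant to be called (assuming the `config/config.js` and `config/openai.js` files added in this commit export their objects via `module.exports`):

```python
from app.config.load import load_config

# App config only: the "llm" section stays unset.
app_config = load_config("config/config.js")

# App config plus a model config: the model dict is validated into Config.llm.
full_config = load_config("config/config.js", "config/openai.js")
print(full_config.llm.type)
```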
app/config/models/__init__.py
ADDED
File without changes
app/config/models/configs.py
ADDED
@@ -0,0 +1,118 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from uuid import UUID, uuid4

from loguru import logger
from pydantic import (
    BaseModel,
    DirectoryPath,
    Field,
    field_validator,
    ConfigDict,
    ValidationInfo,
)

from app.config.models.openai import OpenAIModelConfig
from app.config.models.vertexai import VertexAIModelConfig


def create_uuid() -> str:
    return str(uuid4())


class Document(BaseModel):
    """Interface for interacting with a document."""

    page_content: str
    metadata: dict = Field(default_factory=dict)


class SentenseTransformerEmbeddingModel(BaseModel):
    model_config = ConfigDict()
    model_config["protected_namespaces"] = ()
    model_name: str
    additional_kwargs: dict = Field(default_factory=dict)


class DocumentPathSettings(BaseModel):
    doc_path: Union[DirectoryPath, str]
    additional_parser_settings: Dict[str, Any] = Field(default_factory=dict)
    passage_prefix: str = ""
    label: str = ""  # Optional label, will be included in the metadata


class EmbedddingsSpladeConfig(BaseModel):
    n_batch: int = 3


class EmbeddingsConfig(BaseModel):
    model_config = ConfigDict(extra="forbid")

    embedding_model: SentenseTransformerEmbeddingModel
    embeddings_path: Union[DirectoryPath, str]
    document_settings: List[DocumentPathSettings]
    chunk_sizes: List[int] = [1024]
    splade_config: EmbedddingsSpladeConfig = EmbedddingsSpladeConfig(n_batch=5)

    @property
    def labels(self) -> List[str]:
        """Returns list of labels in document settings"""
        return [setting.label for setting in self.document_settings if setting.label]


class SemanticSearchConfig(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")
    max_k: int = 15
    max_char_size: int = 2048
    query_prefix: str = ""


class LLMConfig(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")
    model_config["protected_namespaces"] = ()

    type: str
    params: dict

    @field_validator("params")
    def validate_params(cls, value, info: ValidationInfo):
        values = info.data
        type_ = values.get("type")
        if type_ == 'vertexai':
            config = VertexAIModelConfig(
                **value
            )  # An attempt to force conversion to the required model config
        if type_ == 'openai':
            config = OpenAIModelConfig(
                **value
            )
        logger.info(
            f"Loading model parameters in configuration class {config.__class__.__name__}"
        )
        return config


class ResponseModel(BaseModel):
    id: UUID = Field(default_factory=create_uuid)
    question: str
    response: str
    average_score: float
    semantic_search: List[str] = Field(default_factory=list)
    hyde_response: str = ""


class Config(BaseModel):
    cache_folder: Path
    embeddings: EmbeddingsConfig
    semantic_search: SemanticSearchConfig
    llm: Optional[LLMConfig] = None

    def check_embeddings_exist(self) -> bool:
        """Checks if embeddings exist in the specified folder"""

        p_splade = (
            Path(self.embeddings.embeddings_path) / "splade" / "splade_embeddings.npz"
        )
        p_embeddings = Path(self.embeddings.embeddings_path)
        all_parquets = list(p_embeddings.glob("*.parquet"))
        return p_splade.exists() and len(all_parquets) > 0
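A minimal sketch of constructing `Config` directly from a dict, mirroring what `load_config` does after evaluating the JavaScript config. All field values below are illustrative, not the ones shipped in `config/config.js`:

```python
from app.config.models.configs import Config

# Hypothetical values; directory fields accept plain strings as well as paths.
config = Config(
    cache_folder="cache",
    embeddings={
        "embedding_model": {"model_name": "maidalun1020/bce-embedding-base_v1"},
        "embeddings_path": "embeddings",
        "document_settings": [{"doc_path": "documents/falcon-refinedweb"}],
        "chunk_sizes": [1024],
    },
    semantic_search={"max_k": 15, "max_char_size": 2048},
)
print(config.check_embeddings_exist())
```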
app/config/models/openai.py
ADDED
@@ -0,0 +1,29 @@
from typing import Optional

from langchain_community.chat_models import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, ConfigDict


class OpenAIModelConfig(BaseModel):
    model_config = ConfigDict()
    model_config["protected_namespaces"] = ()
    prompt_template: str
    model_kwargs: dict = {}


class OpenAIModel:
    def __init__(self, config: OpenAIModelConfig):
        self.config = config
        self._model = None

    @property
    def model(self):
        return ChatOpenAI(**self.config.model_kwargs)

    @property
    def prompt(self) -> Optional[PromptTemplate]:
        if self.config.prompt_template:
            return PromptTemplate(
                input_variables=["context", "question"], template=self.config.prompt_template
            )
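A usage sketch for `OpenAIModel` (not part of this commit; the prompt template and model kwargs below are placeholders, the real ones live in `config/openai.js`):

```python
from app.config.models.openai import OpenAIModel, OpenAIModelConfig

cfg = OpenAIModelConfig(
    prompt_template="Use the context to answer.\nContext: {context}\nQuestion: {question}",
    model_kwargs={"model_name": "gpt-3.5-turbo", "temperature": 0.0},
)
llm = OpenAIModel(cfg)
chat_model = llm.model   # a ChatOpenAI instance built from model_kwargs
prompt = llm.prompt      # a PromptTemplate with context/question placeholders
```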
app/config/models/vertexai.py
ADDED
@@ -0,0 +1,37 @@
from typing import Optional

from langchain_core.prompts import PromptTemplate
from langchain_google_vertexai import ChatVertexAI
from pydantic import BaseModel, ConfigDict
from vertexai.generative_models import HarmCategory, HarmBlockThreshold


class VertexAIModelConfig(BaseModel):
    model_config = ConfigDict()
    model_config["protected_namespaces"] = ()
    prompt_template: str
    model_kwargs: dict = {}


class VertexAIModel:
    def __init__(self, config: VertexAIModelConfig):
        self.config = config
        self._model = None

    @property
    def model(self):
        return ChatVertexAI(**self.config.model_kwargs,
                            safety_settings={
                                HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
                                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
                            })

    @property
    def prompt(self) -> Optional[PromptTemplate]:
        if self.config.prompt_template:
            return PromptTemplate(
                input_variables=["context", "question"], template=self.config.prompt_template
            )
app/config/types.ts
ADDED
@@ -0,0 +1,65 @@
// Configuration for embeddings, including paths, models, and document settings
type EmbeddingsConfig = {
  embeddings_path: string; // Path where embeddings will be saved
  embedding_model: { // Optional embedding model specification
    model_name: string; // Name of the model
  };
  splade_config: { // Optional configuration for SPLADE
    n_batch: number; // Batch size for processing
  };
  chunk_sizes: number[]; // Chunk sizes for splitting during embedding
  document_settings: { // Settings for document processing
    doc_path: string; // Path to documents
    additional_parser_settings?: { // Optional settings for parsing documents
      md: { // Settings for Markdown documents
        skip_first: boolean; // Whether to skip the first section
        merge_sections: boolean; // Whether to merge sections
        remove_images: boolean; // Whether to remove images from documents
      };
    };
    passage_prefix: string; // Prefix for passages
  }[];
};

// Configuration for semantic search functionality
type SemanticSearchConfig = {
  max_k: number; // Maximum number of results to return
  max_char_size: number; // Max character size for context provided to models
  query_prefix: string; // Prefix for queries
};

export type AppConfig = {
  cache_folder: string;
  embeddings: EmbeddingsConfig;
  semantic_search: SemanticSearchConfig;
};

// Type definition for the LLM configuration section for OpenAI models
type OpenAIConfig = {
  type: 'openai'; // Specifies the use of an OpenAI model
  params: {
    prompt_template: string; // Template for constructing prompts for the model. It includes placeholders for context and questions.
    model_kwargs: { // Keyword arguments for configuring the model's inference behavior
      openai_api_key: string; // API key for accessing the OpenAI API
      temperature: number; // Temperature setting for controlling the randomness of response generation. A value of 0.0 generates deterministic responses.
      model_name: string; // Specifies the name of the model to be used for generating responses.
    };
  };
};

// Type definition for the LLM configuration section for Google VertexAI models
type VertexAIConfig = {
  type: 'vertexai'; // Specifies the use of a Google VertexAI model
  params: {
    prompt_template: string; // Template for constructing prompts for the model. It includes placeholders for context and questions.
    model_kwargs: { // Keyword arguments for configuring the model's inference behavior
      model_name: string; // Specifies the name of the model to be used for generating responses.
      temperature: number; // Temperature setting for controlling the randomness of response generation. A value of 0.0 generates deterministic responses.
    };
  };
};

// Type definition for the LLM configuration section
export type LLMConfig = OpenAIConfig | VertexAIConfig;
app/main.py
ADDED
@@ -0,0 +1,202 @@
import os
from typing import Tuple

import click
import pandas as pd
from datasets import Dataset
from langchain.chains import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_google_vertexai import ChatVertexAI
from loguru import logger
from ragas import evaluate
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    answer_relevancy,
    context_precision, answer_correctness,
)
from tqdm import tqdm

from app.chroma import ChromaDenseVectorDB
from app.config.load import load_config
from app.config.models.configs import Config
from app.config.models.vertexai import VertexAIModel
from app.parsers.splitter import DocumentSplitter
from app.pipeline import LLMBundle
from app.ranking import BCEReranker
from app.splade import SpladeSparseVectorDB


def get_hash_mapping_filenames(
    config: Config,
    file_to_hash_fn: str = "file_hash_mappings.snappy.parquet",
    docid_to_hash_fn="docid_hash_mappings.snappy.parquet",
) -> Tuple[str, str]:
    file_hashes_fn = os.path.join(config.embeddings.embeddings_path, file_to_hash_fn)
    docid_hashes_fn = os.path.join(config.embeddings.embeddings_path, docid_to_hash_fn)
    return file_hashes_fn, docid_hashes_fn


@click.group()
def main():
    pass


@main.command(name="index")
@click.option(
    "-c",
    "app_config_path",
    required=True,
    help="Specifies App JavaScript configuration file (should be module exported)"
)
def create_index(app_config_path):
    config = load_config(app_config_path)

    dense_db = ChromaDenseVectorDB(
        persist_folder=str(config.embeddings.embeddings_path), config=config
    )
    splitter = DocumentSplitter(config)
    all_docs, all_hash_filename_mappings, all_hash_docid_mappings = splitter.split()

    # dense embeddings
    dense_db.generate_embeddings(docs=all_docs)

    # sparse embeddings
    sparse_db = SpladeSparseVectorDB(config)
    sparse_db.generate_embeddings(docs=all_docs)

    file_hashes_fn, docid_hashes_fn = get_hash_mapping_filenames(config)

    all_hash_filename_mappings.to_parquet(
        file_hashes_fn, compression="snappy", index=False
    )

    all_hash_docid_mappings.to_parquet(
        docid_hashes_fn, compression="snappy", index=False
    )

    logger.info("Document Embeddings Generated")


@main.command("predict")
@click.option(
    "-c",
    "app_config_path",
    required=True,
    type=click.Path(exists=True, dir_okay=False, file_okay=True),
    help="Specifies App JavaScript configuration file (should be module exported)",
)
@click.option(
    "-m",
    "model_config_path",
    required=True,
    type=click.Path(exists=True, dir_okay=False, file_okay=True),
    help="Specifies Model JavaScript configuration file (should be module exported)",
)
def predict_pipeline(app_config_path: str, model_config_path: str):
    config = load_config(app_config_path, model_config_path)

    # llm = OpenAIModel(config=config.llm.params)
    llm = VertexAIModel(config=config.llm.params)

    chain = load_qa_chain(llm=llm.model, prompt=llm.prompt)

    store = ChromaDenseVectorDB(
        persist_folder=str(config.embeddings.embeddings_path), config=config
    )
    store._load_retriever()

    reranker = BCEReranker()

    chunk_sizes = config.embeddings.chunk_sizes

    splade = SpladeSparseVectorDB(config=config)
    splade.load()

    hyde_chain = LLMChain(
        llm=llm.model,
        prompt=PromptTemplate(
            template="Write a short passage to answer the question: {question}",
            input_variables=["question"],
        ),
    )

    llm_bundle = LLMBundle(
        chain=chain,
        reranker=reranker,
        chunk_sizes=chunk_sizes,
        sparse_db=splade,
        dense_db=store,
        hyde_chain=hyde_chain,
    )

    test_dataset = pd.read_json("evaluation_dataset.json", lines=True)
    evaluate_data = {
        "question": [],
        "answer": [],
        "contexts": [],  # should be a list[list[str]]
        'ground_truth': [],
        'context_ground_truth': []
    }

    test_dataset = test_dataset.head(10)

    for idx, row in tqdm(test_dataset.iterrows()):
        output = llm_bundle.get_and_parse_response(
            query=row["question"],
            config=config,
        )
        response = output.response

        evaluate_data["question"].append(row["question"])
        evaluate_data["answer"].append(response)
        evaluate_data["contexts"].append(output.semantic_search)
        evaluate_data["ground_truth"].append(row["answer"])
        evaluate_data["context_ground_truth"].append(row["context"])

    evaluate_dataset = Dataset.from_dict(evaluate_data)

    # store the evaluation dataset

    evaluate_dataset.to_pandas().to_json("evaluation_output.json", orient="records", lines=True)


@main.command("evaluate")
def evaluate_pipeline():
    ragas_vertexai_llm = ChatVertexAI(model_name="gemini-pro")
    ragas_vertexai_llm = LangchainLLMWrapper(ragas_vertexai_llm)
    vertexai_embeddings = SentenceTransformerEmbeddings(model_name="maidalun1020/bce-embedding-base_v1")
    vertexai_embeddings = LangchainEmbeddingsWrapper(vertexai_embeddings)

    metrics = [
        # the accuracy of the generated answer when compared to the ground truth
        answer_correctness,
        # evaluates whether all the ground-truth relevant items present in the contexts are ranked higher or not
        context_precision,
        # how pertinent the generated answer is to the given prompt
        answer_relevancy,
    ]

    evaluate_dataset = pd.read_json("evaluation_output.json", lines=True)
    evaluate_dataset = Dataset.from_pandas(evaluate_dataset)

    evaluate_result = evaluate(
        dataset=evaluate_dataset,
        metrics=metrics,
        llm=ragas_vertexai_llm,
        embeddings=vertexai_embeddings,
        is_async=True
    )

    evaluate_result_df = evaluate_result.to_pandas()
    # drop the contexts, context_ground_truth
    evaluate_result_df = evaluate_result_df.drop(columns=["contexts", "context_ground_truth"])
    # print the mean for answer_correctness context_precision answer_relevancy columns
    print(evaluate_result_df.mean(numeric_only=True))
    evaluate_result_df.to_csv("evaluation_results.csv", index=False)


if __name__ == "__main__":
    main()
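The `predict` command reads `evaluation_dataset.json` as JSON lines and pulls the `question`, `answer` and `context` columns from each record. A sketch of writing such a file with placeholder rows; the exact field shapes are inferred from how `predict_pipeline` consumes them and may not match the bundled dataset:

```python
import pandas as pd

records = [
    {
        "question": "What is the Falcon RefinedWeb dataset?",
        "answer": "A large filtered web corpus used to train the Falcon models.",
        "context": ["RefinedWeb is a web-scale corpus built from CommonCrawl."],
    },
]
# One JSON object per line, as expected by pd.read_json(..., lines=True).
pd.DataFrame(records).to_json("evaluation_dataset.json", orient="records", lines=True)
```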
app/parsers/__init__.py
ADDED
File without changes
app/parsers/markdown.py
ADDED
@@ -0,0 +1,431 @@
import re
import urllib.parse
from collections import namedtuple
from enum import Enum
from pathlib import Path
from typing import Generator, List, Union, Tuple

from loguru import logger

FORMATTING_SEQUENCES = {"*", "**", "***", "_", "__", "~~", "||"}
CODE_BLOCK_SEQUENCES = {"`", "``", "```"}
ALL_SEQUENCES = FORMATTING_SEQUENCES | CODE_BLOCK_SEQUENCES
MAX_FORMATTING_SEQUENCE_LENGTH = max(len(seq) for seq in ALL_SEQUENCES)


class SplitCandidates(Enum):
    SPACE = 1
    NEWLINE = 2
    LAST_CHAR = 3


SPLIT_CANDIDATES_PREFRENCE = [
    SplitCandidates.NEWLINE,
    SplitCandidates.SPACE,
    SplitCandidates.LAST_CHAR,
]

BLOCK_SPLIT_CANDIDATES = [r"\n#\s+", r"\n##\s+", r"\n###\s+"]
CODE_BLOCK_LEVEL = 10

MarkdownChunk = namedtuple("MarkdownChunk", "string level")


class SplitCandidateInfo:
    last_seen: int
    active_sequences: List[str]
    active_sequences_length: int

    def __init__(self):
        self.last_seen = None
        self.active_sequences = []
        self.active_sequences_length = 0

    def process_sequence(self, seq: str, is_in_code_block: bool):
        if is_in_code_block:
            if self.active_sequences and seq == self.active_sequences[-1]:
                last_seq = self.active_sequences.pop()
                self.active_sequences_length -= len(last_seq)
            return True
        elif seq in CODE_BLOCK_SEQUENCES:
            self.active_sequences.append(seq)
            self.active_sequences_length += len(seq)
            return True
        else:
            for k in range(len(self.active_sequences) - 1, -1, -1):
                if seq == self.active_sequences[k]:
                    sequences_being_removed = self.active_sequences[k:]
                    self.active_sequences = self.active_sequences[:k]
                    self.active_sequences_length -= sum(
                        len(seq) for seq in sequences_being_removed
                    )
                    return False
            self.active_sequences.append(seq)
            self.active_sequences_length += len(seq)
            return False

    def copy_from(self, other):
        self.last_seen = other.last_seen
        self.active_sequences = other.active_sequences.copy()
        self.active_sequences_length = other.active_sequences_length


def physical_split(markdown: str, max_chunk_size: int) -> Generator[str, None, None]:
    if max_chunk_size <= MAX_FORMATTING_SEQUENCE_LENGTH:
        raise ValueError(
            f"max_chunk_size must be greater than {MAX_FORMATTING_SEQUENCE_LENGTH}"
        )

    split_candidates = {
        SplitCandidates.SPACE: SplitCandidateInfo(),
        SplitCandidates.NEWLINE: SplitCandidateInfo(),
        SplitCandidates.LAST_CHAR: SplitCandidateInfo(),
    }
    is_in_code_block = False

    chunk_start_from, chunk_char_count, chunk_prefix = 0, 0, ""

    def split_chunk():
        for split_variant in SPLIT_CANDIDATES_PREFRENCE:
            split_candidate = split_candidates[split_variant]
            if split_candidate.last_seen is None:
                continue
            chunk_end = split_candidate.last_seen + (
                1 if split_variant == SplitCandidates.LAST_CHAR else 0
            )
            chunk = (
                chunk_prefix
                + markdown[chunk_start_from:chunk_end]
                + "".join(reversed(split_candidate.active_sequences))
            )

            next_chunk_prefix = "".join(split_candidate.active_sequences)
            next_chunk_char_count = len(next_chunk_prefix)
            next_chunk_start_from = chunk_end + (
                0 if split_variant == SplitCandidates.LAST_CHAR else 1
            )

            split_candidates[SplitCandidates.NEWLINE] = SplitCandidateInfo()
            split_candidates[SplitCandidates.SPACE] = SplitCandidateInfo()
            return (
                chunk,
                next_chunk_start_from,
                next_chunk_char_count,
                next_chunk_prefix,
            )

    i = 0
    while i < len(markdown):
        for j in range(MAX_FORMATTING_SEQUENCE_LENGTH, 0, -1):
            seq = markdown[i: i + j]
            if seq in ALL_SEQUENCES:
                last_char_split_candidate_len = (
                    chunk_char_count
                    + split_candidates[
                        SplitCandidates.LAST_CHAR
                    ].active_sequences_length
                    + len(seq)
                )
                if last_char_split_candidate_len >= max_chunk_size:
                    (
                        next_chunk,
                        chunk_start_from,
                        chunk_char_count,
                        chunk_prefix,
                    ) = split_chunk()
                    yield next_chunk
                is_in_code_block = split_candidates[
                    SplitCandidates.LAST_CHAR
                ].process_sequence(seq, is_in_code_block)
                i += len(seq)
                chunk_char_count += len(seq)
                split_candidates[SplitCandidates.LAST_CHAR].last_seen = i - 1
                break

        if i >= len(markdown):
            break

        split_candidates[SplitCandidates.LAST_CHAR].last_seen = i
        chunk_char_count += 1
        if markdown[i] == "\n":
            split_candidates[SplitCandidates.NEWLINE].copy_from(
                split_candidates[SplitCandidates.LAST_CHAR]
            )
        elif markdown[i] == " ":
            split_candidates[SplitCandidates.SPACE].copy_from(
                split_candidates[SplitCandidates.LAST_CHAR]
            )

        last_char_split_candidate_len = (
            chunk_char_count
            + split_candidates[SplitCandidates.LAST_CHAR].active_sequences_length
        )
        if last_char_split_candidate_len == max_chunk_size:
            next_chunk, chunk_start_from, chunk_char_count, chunk_prefix = split_chunk()
            yield next_chunk

        i += 1

    if chunk_start_from < len(markdown):
        yield chunk_prefix + markdown[chunk_start_from:]


def get_logical_blocks_recursively(
    markdown: str, max_chunk_size: int, all_sections: list, split_candidate_index=0
) -> List[MarkdownChunk]:
    if split_candidate_index >= len(BLOCK_SPLIT_CANDIDATES):
        for chunk in physical_split(markdown, max_chunk_size):
            all_sections.append(
                MarkdownChunk(string=chunk, level=split_candidate_index)
            )
        return all_sections
    chunks = []
    add_index = 0
    for add_index, split_candidate in enumerate(
        BLOCK_SPLIT_CANDIDATES[split_candidate_index:]
    ):
        chunks = re.split(split_candidate, markdown)
        if len(chunks) > 1:
            break

    for i, chunk in enumerate(chunks):
        level = split_candidate_index + add_index
        if i > 0:
            level += 1

        prefix = "\n\n" + "#" * level + " "
        if not chunk.strip():
            continue

        if len(chunk) <= max_chunk_size:
            all_sections.append(MarkdownChunk(string=prefix + chunk, level=level - 1))
        else:
            get_logical_blocks_recursively(
                chunk,
                max_chunk_size,
                all_sections,
                split_candidate_index=split_candidate_index + add_index + 1,
            )
    return all_sections


def markdown_splitter(
    path: Union[str, Path], max_chunk_size: int, **additional_splitter_settings
) -> List[dict]:
    try:
        with open(path, "r") as f:
            markdown = f.read()
    except OSError:
        return []

    if len(markdown) < max_chunk_size:
        return [{"text": markdown, "metadata": {"heading": ""}}]

    sections = [MarkdownChunk(string="", level=0)]

    markdown, additional_metadata = preprocess_markdown(
        markdown, additional_splitter_settings
    )

    # Split by code and non-code
    chunks = markdown.split("```")

    for i, chunk in enumerate(chunks):
        if i % 2 == 0:  # Every even element (0 indexed) is a non-code
            logical_blocks = get_logical_blocks_recursively(
                chunk, max_chunk_size=max_chunk_size, all_sections=[]
            )
            sections += logical_blocks
        else:  # Process the code section
            rows = chunk.split("\n")
            code = rows[1:]

            lang = rows[0]  # Get the language name

            # Provide a hint to LLM
            all_code_rows = (
                [
                    f"\nFollowing is a code section in {lang}, delimited by triple backticks:",
                    f"```{lang}",
                ]
                + code
                + ["```"]
            )
            all_code_str = "\n".join(all_code_rows)

            # Merge code to a previous logical block if there is enough space
            if len(sections[-1].string) + len(all_code_str) < max_chunk_size:
                sections[-1] = MarkdownChunk(
                    string=sections[-1].string + all_code_str, level=sections[-1].level
                )

            # If code block is larger than max size, physically split it
            elif len(all_code_str) >= max_chunk_size:
                code_chunks = physical_split(
                    all_code_str, max_chunk_size=max_chunk_size
                )
                for cchunk in code_chunks:
                    # Assign language header to the code chunk, if doesn't exist
                    if f"```{lang}" not in cchunk:
                        cchunk_rows = cchunk.split("```")
                        cchunk = f"```{lang}\n" + cchunk_rows[1] + "```"

                    sections.append(
                        MarkdownChunk(string=cchunk, level=CODE_BLOCK_LEVEL)
                    )

            # Otherwise, add as a single chunk
            else:
                sections.append(
                    MarkdownChunk(string=all_code_str, level=CODE_BLOCK_LEVEL)
                )

    all_out = postprocess_sections(
        sections,
        max_chunk_size,
        additional_splitter_settings,
        additional_metadata,
        path,
    )
    return all_out


def preprocess_markdown(markdown: str, additional_settings: dict) -> Tuple[str, dict]:
    preprocess_remove_images = additional_settings.get("remove_images", False)
    preprocess_remove_extra_newlines = additional_settings.get(
        "remove_extra_newlines", True
    )
    preprocess_find_metadata = additional_settings.get("find_metadata", dict())

    if preprocess_remove_images:
        markdown = remove_images(markdown)

    if preprocess_remove_extra_newlines:
        markdown = remove_extra_newlines(markdown)

    additional_metadata = {}

    if preprocess_find_metadata:
        if not isinstance(preprocess_find_metadata, dict):
            raise TypeError(
                f"find_metadata settings should be of type dict. Got {type(preprocess_find_metadata)}"
            )

        for label, search_string in preprocess_find_metadata.items():
            logger.info(f"Looking for metadata: {search_string}")
            metadata = find_metadata(markdown, search_string)
            if metadata:
                logger.info(f"\tFound metadata for {label} - {metadata}")
                additional_metadata[label] = metadata

    return markdown, additional_metadata


def postprocess_sections(
    sections: List[MarkdownChunk],
    max_chunk_size: int,
    additional_settings: dict,
    additional_metadata: dict,
    path: Union[str, Path],
) -> List[dict]:
    all_out = []

    skip_first = additional_settings.get("skip_first", False)
    merge_headers = additional_settings.get("merge_sections", False)

    # Remove all empty sections
    sections = [s for s in sections if s.string]

    if sections and skip_first:
        # remove first section
        sections = sections[1:]

    if sections and merge_headers:
        # Merge sections
        sections = merge_sections(sections, max_chunk_size=max_chunk_size)

    current_heading = ""

    sections_metadata = {"Document name": Path(path).name}

    for s in sections:
        stripped_string = s.string.strip()
        doc_metadata = {}
        if len(stripped_string) > 0:
            heading = ""
            if stripped_string.startswith("#"):  # heading detected
                heading = stripped_string.split("\n")[0].replace("#", "").strip()
                stripped_heading = heading.replace("#", "").replace(" ", "").strip()
                if not stripped_heading:
                    heading = ""
                if s.level == 0:
                    current_heading = heading
                doc_metadata["heading"] = urllib.parse.quote(
                    heading
                )  # isolate the heading
            else:
                doc_metadata["heading"] = ""

            final_section = add_section_metadata(
                stripped_string,
                section_metadata={
                    **sections_metadata,
                    **{"Subsection of": current_heading},
                    **additional_metadata,
                },
            )
            all_out.append({"text": final_section, "metadata": doc_metadata})
    return all_out


def remove_images(page_md: str) -> str:
    return re.sub(r"""!\[[^\]]*\]\((.*?)\s*("(?:.*[^"])")?\s*\)""", "", page_md)


def remove_extra_newlines(page_md) -> str:
    page_md = re.sub(r"\n{3,}", "\n\n", page_md)
    return page_md


def add_section_metadata(s, section_metadata: dict):
    metadata_s = ""
    for k, v in section_metadata.items():
        if v:
            metadata_s += f"{k}: {v}\n"
    metadata = f"Metadata applicable to the next chunk of text delimited by five stars:\n>> METADATA START\n{metadata_s}>> METADATA END\n\n"

    return metadata + "*****\n" + s + "\n*****"


def find_metadata(page_md: str, search_string: str) -> str:
    pattern = rf"{search_string}(.*)"
    match = re.search(pattern, page_md)
    if match:
        return match.group(1)
    return ""


def merge_sections(
    sections: List[MarkdownChunk], max_chunk_size: int
) -> List[MarkdownChunk]:
    current_section = sections[0]
    all_out = []

    prev_level = 0
    for s in sections[1:]:
        if (
            len(current_section.string + s.string) > max_chunk_size
            or s.level <= prev_level
        ):
            all_out.append(current_section)
            current_section = s
            prev_level = 0
        else:
            current_section = MarkdownChunk(
                string=current_section.string + s.string, level=current_section.level
            )
            prev_level = s.level if s.level != CODE_BLOCK_LEVEL else prev_level

    all_out.append(current_section)

    return all_out
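A usage sketch for `markdown_splitter` (not part of this commit; the file path is a placeholder, and the keyword arguments are the md parser settings this splitter recognises):

```python
from app.parsers.markdown import markdown_splitter

# Split a Markdown file into chunks of at most 1024 characters, dropping
# images and merging small sections under their headings.
chunks = markdown_splitter(
    "documents/falcon-refinedweb/example.md",
    max_chunk_size=1024,
    remove_images=True,
    merge_sections=True,
    skip_first=False,
)
for chunk in chunks[:3]:
    print(chunk["metadata"]["heading"], len(chunk["text"]))
```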
app/parsers/splitter.py
ADDED
@@ -0,0 +1,141 @@
import hashlib
import urllib.parse
import uuid
from pathlib import Path
from typing import List, Tuple

import pandas as pd
from loguru import logger

from app.config.models.configs import Document, Config
from app.parsers.markdown import markdown_splitter

HASH_BLOCKSIZE = 65536


class DocumentSplitter:
    def __init__(self, config: Config) -> None:

        self.document_path_settings = config.embeddings.document_settings
        self.chunk_sizes = config.embeddings.chunk_sizes

    def split(
        self,
        limit: int = None,
    ) -> Tuple[List[Document], pd.DataFrame, pd.DataFrame]:
        all_docs = []
        hash_filename_mappings = []
        hash_docid_mappings = []

        for setting in self.document_path_settings:
            passage_prefix = setting.passage_prefix
            docs_path = Path(setting.doc_path)

            extension = "md"
            for chunk_size in self.chunk_sizes:
                paths = [p for p in list(docs_path.glob(f"**/*.{extension}"))]

                additional_parser_settings = setting.additional_parser_settings.get(
                    extension, dict()
                )

                (
                    docs,
                    hf_mappings,
                    hd_mappings,
                ) = self._get_documents_from_custom_splitter(
                    document_paths=paths,
                    splitter_func=markdown_splitter,
                    max_size=chunk_size,
                    passage_prefix=passage_prefix,
                    **additional_parser_settings,
                )

                all_docs.extend(docs)
                hash_filename_mappings.extend(hf_mappings)
                hash_docid_mappings.extend(hd_mappings)

        all_hash_filename_mappings = pd.DataFrame(hash_filename_mappings)
        all_hash_docid_mappings = pd.concat(hash_docid_mappings, axis=0)

        if limit:
            all_docs = all_docs[:limit]
            all_hash_filename_mappings = all_hash_filename_mappings[:limit]
            all_hash_docid_mappings = all_hash_docid_mappings[:limit]

        return all_docs, all_hash_filename_mappings, all_hash_docid_mappings

    def _get_documents_from_custom_splitter(
        self,
        document_paths: List[Path],
        splitter_func,
        max_size,
        passage_prefix: str,
        **additional_kwargs,
    ) -> Tuple[List[Document], List[dict], List[pd.DataFrame]]:
        all_docs = []
        hash_filename_mappings = []
        hash_docid_mappings = []

        for path in document_paths:

            filepath = str(path)

            # path.suffix already includes the leading dot (e.g. ".md")
            filename = filepath.split("/")[-1].replace(path.suffix, "")

            if path.suffix != ".md":
                continue

            additional_kwargs.update({"filename": filepath})
            docs_data = splitter_func(path, max_size, **additional_kwargs)
            file_hash = get_md5_hash(path)

            path = urllib.parse.quote(str(path))  # type: ignore
            logger.info(path)

            docs = [
                Document(
                    page_content=passage_prefix + d["text"],
                    metadata={
                        **d["metadata"],
                        **{
                            "source": str(path),
                            "chunk_size": max_size,
                            "document_id": str(uuid.uuid1()),
                            "label": filename,
                        },
                    },
                )
                for d in docs_data
            ]

            for d in docs:
                if 'page' in d.metadata and d.metadata['page'] is None:
                    d.metadata['page'] = -1

            all_docs.extend(docs)

            hash_filename_mappings.append(dict(filename=filepath, filehash=file_hash))

            df_hash_docid = (
                pd.DataFrame()
                .assign(docid=[d.metadata["document_id"] for d in docs])
                .assign(filehash=file_hash)
            )

            hash_docid_mappings.append(df_hash_docid)

        logger.info(f"Got {len(all_docs)} nodes.")
        return all_docs, hash_filename_mappings, hash_docid_mappings


def get_md5_hash(file_path: Path) -> str:
    hasher = hashlib.md5()

    with open(file_path, "rb") as file:
        buf = file.read(HASH_BLOCKSIZE)
        while buf:
            hasher.update(buf)
            buf = file.read(HASH_BLOCKSIZE)

    return hasher.hexdigest()
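A short sketch of driving `DocumentSplitter` directly, mirroring what the `index` command does (it assumes `config/config.js` points `doc_path` at the unzipped corpus):

```python
from app.config.load import load_config
from app.parsers.splitter import DocumentSplitter

config = load_config("config/config.js")
splitter = DocumentSplitter(config)

# Returns the chunked documents plus two mapping tables:
# file path -> file hash, and document id -> file hash.
docs, file_hash_map, docid_hash_map = splitter.split(limit=100)
print(len(docs), file_hash_map.shape, docid_hash_map.shape)
```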
app/pipeline.py
ADDED
@@ -0,0 +1,165 @@
import string
from typing import List, Optional, Tuple

from langchain.chains import LLMChain
from langchain.chains.base import Chain
from loguru import logger

from app.chroma import ChromaDenseVectorDB
from app.config.models.configs import (
    ResponseModel,
    Config, SemanticSearchConfig,
)
from app.ranking import BCEReranker, rerank
from app.splade import SpladeSparseVectorDB


class LLMBundle:
    def __init__(
        self,
        chain: Chain,
        dense_db: ChromaDenseVectorDB,
        reranker: BCEReranker,
        sparse_db: SpladeSparseVectorDB,
        chunk_sizes: List[int],
        hyde_chain: Optional[LLMChain] = None
    ) -> None:
        self.chain = chain
        self.dense_db = dense_db
        self.reranker = reranker
        self.sparse_db = sparse_db
        self.chunk_sizes = chunk_sizes
        self.hyde_chain = hyde_chain

    def get_relevant_documents(
        self,
        original_query: str,
        query: str,
        config: SemanticSearchConfig,
        label: str,
    ) -> Tuple[List[str], float]:
        most_relevant_docs = []
        docs = []

        current_reranker_score, reranker_score = -1e5, -1e5

        for chunk_size in self.chunk_sizes:
            all_relevant_docs = []
            all_relevant_doc_ids = set()
            logger.debug("Evaluating query: {}", query)
            if config.query_prefix:
                logger.info(f"Adding query prefix for retrieval: {config.query_prefix}")
                query = config.query_prefix + query
            sparse_search_docs_ids, sparse_scores = self.sparse_db.query(
                search=query, n=config.max_k, label=label, chunk_size=chunk_size
            )

            logger.info(f"Stage 1: Got {len(sparse_search_docs_ids)} documents.")

            filter = (
                {"chunk_size": chunk_size}
                if len(self.chunk_sizes) > 1
                else dict()
            )

            if label:
                filter.update({"label": label})

            if (
                not filter
            ):
                filter = None

            logger.info(f"Dense embeddings filter: {filter}")

            res = self.dense_db.similarity_search_with_relevance_scores(
                query, filter=filter
            )
            dense_search_doc_ids = [r[0].metadata["document_id"] for r in res]

            all_doc_ids = (
                set(sparse_search_docs_ids).union(set(dense_search_doc_ids))
            ).difference(all_relevant_doc_ids)
            if all_doc_ids:
                relevant_docs = self.dense_db.get_documents_by_id(
                    document_ids=list(all_doc_ids)
                )
                all_relevant_docs += relevant_docs

            # Re-rank embeddings
            reranker_score, relevant_docs = rerank(
                rerank_model=self.reranker,
                query=original_query,
                docs=all_relevant_docs,
            )
            if reranker_score > current_reranker_score:
                docs = relevant_docs
                current_reranker_score = reranker_score

        len_ = 0

        for doc in docs:
            doc_length = len(doc.page_content)
            if len_ + doc_length < config.max_char_size:
                most_relevant_docs.append(doc)
                len_ += doc_length

        return most_relevant_docs, current_reranker_score

    def get_and_parse_response(
        self,
        query: str,
        config: Config,
        label: str = "",
    ) -> ResponseModel:
        original_query = query

        # Add HyDE queries
        hyde_response = self.hyde_chain.run(query)

        query += hyde_response

        logger.info(f"query: {query}")

        semantic_search_config = config.semantic_search
        most_relevant_docs, score = self.get_relevant_documents(
            original_query, query, semantic_search_config, label
        )

        res = self.chain(
            {"input_documents": most_relevant_docs, "question": original_query},
        )

        out = ResponseModel(
            response=res["output_text"],
            question=query,
            average_score=score,
            hyde_response="",
        )
        for doc in res["input_documents"]:
            out.semantic_search.append(doc.page_content)

        return out


class PartialFormatter(string.Formatter):
    def __init__(self, missing="~~", bad_fmt="!!"):
        self.missing, self.bad_fmt = missing, bad_fmt

    def get_field(self, field_name, args, kwargs):
        try:
            val = super(PartialFormatter, self).get_field(field_name, args, kwargs)
        except (KeyError, AttributeError):
            val = None, field_name
        return val

    def format_field(self, value, spec):
        if value is None:
            return self.missing
        try:
            return super(PartialFormatter, self).format_field(value, spec)
        except ValueError:
            if self.bad_fmt is not None:
                return self.bad_fmt
            else:
                raise
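`PartialFormatter` is defined here but not exercised elsewhere in the commit; a minimal sketch of its behaviour with placeholder values:

```python
from app.pipeline import PartialFormatter

fmt = PartialFormatter(missing="~~")
# Missing keys are rendered as the "missing" marker instead of raising KeyError.
print(fmt.format("Q: {question}\nContext: {context}", question="What is SPLADE?"))
# -> Q: What is SPLADE?
#    Context: ~~
```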
app/ranking.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import statistics
from typing import List
from typing import Tuple

import torch
from loguru import logger
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from app.config.models.configs import Document


class BCEReranker:
    def __init__(self) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained("maidalun1020/bce-reranker-base_v1")
        self.model = AutoModelForSequenceClassification.from_pretrained(
            "maidalun1020/bce-reranker-base_v1"
        )
        self.model.eval()
        logger.info("Initialized BCE Reranker")

    def get_scores(self, query: str, docs: List[Document]) -> List[float]:
        logger.info("Reranking documents ... ")
        features = [[query, doc.page_content] for doc in docs]
        with torch.no_grad():
            inputs = self.tokenizer(
                features,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )
            scores = (
                self.model(**inputs, return_dict=True)
                .logits.view(-1)
                .float()
                .tolist()
            )
        return scores


def rerank(
    rerank_model: BCEReranker, query: str, docs: List[Document]
) -> Tuple[float, List[Document]]:
    logger.info("Reranking...")
    scores = rerank_model.get_scores(query, docs)
    for score, d in zip(scores, docs):
        d.metadata["score"] = score

    sorted_scores = sorted(scores, reverse=True)

    logger.info(sorted_scores)
    median_ = statistics.mean(sorted_scores[:10])
    return median_, [
        doc for doc in sorted(docs, key=lambda it: it.metadata["score"], reverse=True)
    ]
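For reference, a hedged usage sketch of `rerank` as it is called from `app/pipeline.py` above. The `Document` constructor arguments are an assumption based on how `page_content` and `metadata` are accessed in this module. Note that, despite the name `median_`, the returned score is the mean of the top-10 reranker scores.

```python
from app.config.models.configs import Document  # fields assumed: page_content, metadata
from app.ranking import BCEReranker, rerank

reranker = BCEReranker()

docs = [
    Document(page_content="RefinedWeb is a large filtered web dataset.", metadata={"document_id": "1"}),
    Document(page_content="An unrelated passage about cooking.", metadata={"document_id": "2"}),
]

# Returns (mean of the top-10 scores, documents sorted by reranker score, best first);
# each document also gets its score written into doc.metadata["score"].
score, ranked = rerank(rerank_model=reranker, query="What is RefinedWeb?", docs=docs)
```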
app/splade.py
ADDED
@@ -0,0 +1,179 @@
import os
import pickle
from collections import defaultdict
from typing import List, Tuple

import numpy as np
import scipy
import torch
import tqdm
from loguru import logger
from transformers import AutoModelForMaskedLM, AutoTokenizer

from app.config.models.configs import Config, Document
from app.utils import torch_device, split


class SpladeSparseVectorDB:
    def __init__(
        self,
        config: Config,
    ) -> None:
        self._config = config

        # cuda or mps or cpu
        self._device = torch_device()
        logger.info(f"Setting device to {self._device}")

        self.tokenizer = AutoTokenizer.from_pretrained(
            "naver/splade-v3", device=self._device, use_fast=True
        )
        self.model = AutoModelForMaskedLM.from_pretrained("naver/splade-v3")
        self.model.to(self._device)
        self._embeddings = None
        self._ids = None
        self._l2_norm_matrix = None
        self._labels_to_ind = defaultdict(list)
        self._chunk_size_to_ind = defaultdict(list)

        self.n_batch = config.embeddings.splade_config.n_batch

    def _get_batch_embeddings(
        self, docs: List[str]
    ) -> np.ndarray:
        tokens = self.tokenizer(
            docs, return_tensors="pt", padding=True, truncation=True
        ).to(self._device)

        output = self.model(**tokens)

        vecs = (
            torch.max(
                torch.log(1 + torch.relu(output.logits))
                * tokens.attention_mask.unsqueeze(-1),
                dim=1,
            )[0]
            .squeeze()
            .detach()
            .cpu()
            .numpy()
        )

        del output
        del tokens

        return vecs

    def _get_embedding_fnames(self):
        folder_name = os.path.join(self._config.embeddings.embeddings_path, "splade")
        fn_embeddings = os.path.join(folder_name, "splade_embeddings.npz")
        fn_ids = os.path.join(folder_name, "splade_ids.pickle")
        fn_metadatas = os.path.join(folder_name, "splade_metadatas.pickle")
        return folder_name, fn_embeddings, fn_ids, fn_metadatas

    def load(self) -> None:
        _, fn_embeddings, fn_ids, fn_metadatas = self._get_embedding_fnames()
        try:
            self._embeddings = scipy.sparse.load_npz(fn_embeddings)
            with open(fn_ids, "rb") as fp:
                self._ids = np.array(pickle.load(fp))

            with open(fn_metadatas, "rb") as fm:
                self._metadatas = np.array(pickle.load(fm))

            self._l2_norm_matrix = scipy.sparse.linalg.norm(self._embeddings, axis=1)

            for ind, m in enumerate(self._metadatas):
                if m["label"]:
                    self._labels_to_ind[m["label"]].append(ind)

                self._chunk_size_to_ind[m["chunk_size"]].append(ind)

            logger.info(f"SPLADE: Got {len(self._labels_to_ind)} labels.")

        except FileNotFoundError:
            raise FileNotFoundError(
                "Embeddings don't exist"
            )
        logger.info(f"Loaded sparse embeddings from {fn_embeddings}")

    def generate_embeddings(
        self, docs: List[Document], persist: bool = True
    ) -> Tuple[np.ndarray, List[str], List[dict]]:

        chunk_size = self.n_batch

        ids = [d.metadata["document_id"] for d in docs]
        metadatas = [d.metadata for d in docs]

        vecs = []
        for chunk in tqdm.tqdm(
            split(docs, chunk_size=chunk_size), total=int(len(docs) / chunk_size)
        ):
            texts = [d.page_content for d in chunk if d.page_content]
            vecs.append(self._get_batch_embeddings(texts))

        embeddings = np.vstack(vecs)

        if persist:
            self.persist_embeddings(embeddings, metadatas, ids)
        return embeddings, ids, metadatas

    def persist_embeddings(self, embeddings, metadatas, ids):
        folder_name, fn_embeddings, fn_ids, fn_metadatas = self._get_embedding_fnames()
        csr_embeddings = scipy.sparse.csr_matrix(embeddings)

        if not os.path.exists(folder_name):
            os.makedirs(folder_name)

        scipy.sparse.save_npz(fn_embeddings, csr_embeddings)
        self.save_list(ids, fn_ids)
        self.save_list(metadatas, fn_metadatas)
        logger.info(f"Saved embeddings to {fn_embeddings}")

    def query(
        self, search: str, chunk_size: int, n: int = 50, label: str = ""
    ) -> Tuple[np.ndarray, np.ndarray]:
        if self._embeddings is None or self._ids is None:
            logger.info("Loading embeddings...")
            self.load()

        if (
            label
            and label in self._labels_to_ind
            and self._embeddings is not None
            and self._ids is not None
        ):
            indices = sorted(
                list(
                    set(self._labels_to_ind[label]).intersection(
                        set(self._chunk_size_to_ind[chunk_size])
                    )
                )
            )

        else:
            indices = sorted(list(set(self._chunk_size_to_ind[chunk_size])))

        embeddings = self._embeddings[indices]
        ids = self._ids[indices]
        l2_norm_matrix = scipy.sparse.linalg.norm(embeddings, axis=1)

        embed_query = self._get_batch_embeddings(docs=[search])
        l2_norm_query = scipy.linalg.norm(embed_query)

        if embeddings is not None and l2_norm_matrix is not None and ids is not None:
            cosine_similarity = embeddings.dot(embed_query) / (
                l2_norm_matrix * l2_norm_query
            )
            most_similar = np.argsort(cosine_similarity)

            top_similar_indices = most_similar[-n:][::-1]
            return (
                ids[top_similar_indices],
                cosine_similarity[top_similar_indices],
            )

    def save_list(self, list_: list, fname: str) -> None:
        with open(fname, "wb") as fp:
            pickle.dump(list_, fp)
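The core of `_get_batch_embeddings` is SPLADE's pooling: ReLU-ed MLM logits are log-saturated, masked by the attention mask, and max-pooled over token positions, yielding one vocabulary-sized, mostly-zero vector per text. A standalone sketch of that step, mirroring the code above outside the class (the input sentence is illustrative):

```python
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("naver/splade-v3")
model = AutoModelForMaskedLM.from_pretrained("naver/splade-v3")
model.eval()

tokens = tokenizer(["splade builds sparse lexical vectors"], return_tensors="pt")
with torch.no_grad():
    logits = model(**tokens).logits  # (batch, seq_len, vocab_size)

# log(1 + relu(logits)), masked by attention, then max-pooled over the sequence dimension
weights = torch.max(
    torch.log1p(torch.relu(logits)) * tokens.attention_mask.unsqueeze(-1), dim=1
).values  # (batch, vocab_size), mostly zeros

top = torch.topk(weights[0], k=5)
print([tokenizer.decode([int(i)]) for i in top.indices])  # dominant vocabulary terms
```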
app/utils.py
ADDED
@@ -0,0 +1,17 @@
from typing import List

import torch


def torch_device():
    device = (
        f"cuda:{torch.cuda.current_device()}"
        if torch.cuda.is_available()
        else ("mps" if torch.backends.mps.is_available() else "cpu")
    )
    return device


def split(iterable: List, chunk_size: int):
    for i in range(0, len(iterable), chunk_size):
        yield iterable[i: i + chunk_size]
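`split` is the batching helper used by the SPLADE indexer above; a quick usage sketch:

```python
batches = list(split(list(range(10)), chunk_size=4))
# -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
```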
config/config.js
ADDED
@@ -0,0 +1,40 @@
/**
 * @typedef {import('app/config/types').AppConfig} AppConfig
 */

/**
 * @type {AppConfig}
 */
const config = {
  cache_folder: ".cache",
  embeddings: {
    embeddings_path: ".embeddings",
    embedding_model: {
      model_name: "maidalun1020/bce-embedding-base_v1",
    },
    splade_config: {
      n_batch: 8,
    },
    chunk_sizes: [1024],
    document_settings: [
      {
        doc_path: "documents/falcon-refinedweb",
        additional_parser_settings: {
          md: {
            skip_first: true,
            merge_sections: false,
            remove_images: true,
          },
        },
        passage_prefix: "passage: ",
      },
    ],
  },
  semantic_search: {
    max_k: 15,
    max_char_size: 4096,
    query_prefix: "query: ",
  },
};

module.exports = config;
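The configuration is a CommonJS module rather than YAML or JSON. How `app/config/load.py` consumes it is not shown in this diff; a plausible sketch, assuming it relies on PyExecJS (which is pinned in requirements.txt), would be:

```python
import json

import execjs  # PyExecJS

with open("config/config.js") as f:
    # Drop the CommonJS export so the source can be evaluated in a bare JS context.
    source = f.read().replace("module.exports = config;", "")

ctx = execjs.compile(source)
config_dict = json.loads(ctx.eval("JSON.stringify(config)"))
print(config_dict["semantic_search"]["max_k"])  # -> 15
```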
config/openai.js
ADDED
@@ -0,0 +1,31 @@
/**
 * @typedef {import('app/config/types').LLMConfig} LLMConfig
 */

/**
 * @type {LLMConfig}
 */

const config = {
  type: "openai",
  params: {
    prompt_template: `
### Instruction:
Use the following pieces of context to answer the question at the end. If the answer isn't in the context, say that you don't know, don't try to make up an answer.

### Context:
---------------
{context}
---------------

### Question: {question}
### Response:
`,
    model_kwargs: {
      model_name: "gpt-3.5-turbo",
      temperature: 0.0,
    },
  },
};

module.exports = config;
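The `prompt_template` exposes `{context}` and `{question}` placeholders. A hedged sketch of how such a template can be wrapped in a LangChain `PromptTemplate` (the actual wiring in `app/pipeline.py` is not shown in this section):

```python
from langchain.prompts import PromptTemplate

template = """### Instruction:
Use the following pieces of context to answer the question at the end. If the answer isn't in the context, say that you don't know, don't try to make up an answer.

### Context:
---------------
{context}
---------------

### Question: {question}
### Response:"""

prompt = PromptTemplate(template=template, input_variables=["context", "question"])
print(prompt.format(context="<retrieved passages>", question="What is RefinedWeb?"))
```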
config/vertexai.js
ADDED
@@ -0,0 +1,31 @@
/**
 * @typedef {import('app/config/types').LLMConfig} LLMConfig
 */

/**
 * @type {LLMConfig}
 */

const config = {
  type: "vertexai",
  params: {
    prompt_template: `
### Instruction:
Use the following pieces of context to answer the question at the end. If the answer isn't in the context, say that you don't know, don't try to make up an answer.

### Context:
---------------
{context}
---------------

### Question: {question}
### Response:
`,
    model_kwargs: {
      model_name: "gemini-pro",
      temperature: 0.0,
    },
  },
};

module.exports = config;
documents/falcon-refinedweb.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7b0345300dfa1636ae1a44d868624818dff553bde63fd4c647b2ab5a5db813fa
size 22271742
evaluation_dataset.json
ADDED
The diff for this file is too large to render.
requirements.txt
ADDED
@@ -0,0 +1,20 @@
llama-cpp-python==0.2.55
langchain==0.1.13
pydantic~=2.5
transformers~=4.36
loguru~=0.7.2
termcolor~=2.4.0
click~=8.1.7
unstructured~=0.12.4
sentence-transformers~=2.6
tqdm==4.65.0
pandas~=2.2.1
chromadb==0.4.15
numpy~=1.26.4
scipy~=1.12.0
torch~=2.2.1
PyExecJS~=1.5.1
fastparquet~=2024.2.0
ragas==0.1.0
langchain-google-vertexai~=1.0.1
datasets~=2.18.0