kabylake committed on
Commit 7bd11ed
1 Parent(s): 25c2fcb
.gitignore ADDED
@@ -0,0 +1 @@
+ /documents/falcon-refinedweb/
LICENSE ADDED
@@ -0,0 +1,9 @@
+ MIT License
+
+ Copyright (c) 2024 Denis Lapchev
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
README.md CHANGED
@@ -1,13 +1,28 @@
- ---
- title: Project
- emoji: 📚
- colorFrom: gray
- colorTo: blue
- sdk: gradio
- sdk_version: 4.26.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ ## Prerequisites
+
+ * Tested on Linux (Ubuntu 22.04) and macOS (Apple Silicon).
+ * Python 3.10
+
+ ### Setup
+
+ Install the dependencies from `requirements.txt`, then unzip `documents/falcon-refinedweb.zip` into the `documents/falcon-refinedweb` folder, for example as shown below.
+
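+ A minimal setup sketch, assuming a fresh Python 3.10 virtual environment (the exact `unzip` target depends on the archive layout):
+
+ ```bash
+ python3.10 -m venv .venv && source .venv/bin/activate
+ pip install -r requirements.txt
+ unzip documents/falcon-refinedweb.zip -d documents/falcon-refinedweb
+ ```
+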
+ ### Create document embeddings
+
+ ```bash
+ python -m app.main index -c config/config.js
+ ```
+
+ ### Predict on the dataset
+
+ ```bash
+ python -m app.main predict -c config/config.js -m config/vertexai.js
+ ```
+
+ ### Evaluate the dataset
+
+ ```bash
+ python -m app.main evaluate
+ ```
app/__init__.py ADDED
File without changes
app/chroma.py ADDED
@@ -0,0 +1,103 @@
+ import shutil
+ from pathlib import Path
+ from typing import List, Optional, Tuple
+
+ import tqdm
+ from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from loguru import logger
+
+ from app.config.models.configs import Config
+ from app.parsers.splitter import Document
+ from app.utils import torch_device
+
+
+ class ChromaDenseVectorDB:
+     def __init__(self, persist_folder: str, config: Config):
+         self._persist_folder = persist_folder
+         self._config = config
+         logger.info(f"Embedding model config: {config}")
+         self._embeddings = SentenceTransformerEmbeddings(
+             model_name=config.embeddings.embedding_model.model_name,
+             model_kwargs={"device": torch_device()},
+         )
+         self.batch_size = 200
+
+         self._retriever = None
+         self._vectordb = None
+
+     @property
+     def retriever(self):
+         if self._retriever is None:
+             self._retriever = self._load_retriever()
+         return self._retriever
+
+     @property
+     def vectordb(self):
+         if self._vectordb is None:
+             self._vectordb = Chroma(
+                 persist_directory=self._persist_folder,
+                 embedding_function=self._embeddings,
+             )
+         return self._vectordb
+
+     def generate_embeddings(
+         self,
+         docs: List[Document],
+         clear_persist_folder: bool = True,
+     ):
+         if clear_persist_folder:
+             pf = Path(self._persist_folder)
+             if pf.exists() and pf.is_dir():
+                 logger.warning(f"Deleting the content of: {pf}")
+                 shutil.rmtree(pf)
+
+         logger.info("Generating and persisting the embeddings...")
+
+         vectordb = None
+         for group in tqdm.tqdm(
+             chunker(docs, size=self.batch_size),
+             total=int(len(docs) / self.batch_size),
+         ):
+             ids = [d.metadata["document_id"] for d in group]
+             if vectordb is None:
+                 vectordb = Chroma.from_documents(
+                     documents=group,
+                     embedding=self._embeddings,
+                     ids=ids,
+                     persist_directory=self._persist_folder,
+                 )
+             else:
+                 vectordb.add_texts(
+                     texts=[doc.page_content for doc in group],
+                     embedding=self._embeddings,
+                     ids=ids,
+                     metadatas=[doc.metadata for doc in group],
+                 )
+         logger.info("Generated embeddings. Persisting...")
+         if vectordb is not None:
+             vectordb.persist()
+
+     def _load_retriever(self, **kwargs):
+         return self.vectordb.as_retriever(**kwargs)
+
+     def get_documents_by_id(self, document_ids: List[str]) -> List[Document]:
+         results = self.retriever.vectorstore.get(ids=document_ids, include=["metadatas", "documents"])  # type: ignore
+         docs = [
+             Document(page_content=d, metadata=m)
+             for d, m in zip(results["documents"], results["metadatas"])
+         ]
+         return docs
+
+     def similarity_search_with_relevance_scores(
+         self, query: str, filter: Optional[dict]
+     ) -> List[Tuple[Document, float]]:
+         # Chroma expects multi-key filters as an explicit $and of $eq clauses
+         if isinstance(filter, dict) and len(filter) > 1:
+             filter = {"$and": [{key: {"$eq": value}} for key, value in filter.items()]}
+             logger.debug(f"Filter = {filter}")
+
+         return self.retriever.vectorstore.similarity_search_with_relevance_scores(
+             query, k=self._config.semantic_search.max_k, filter=filter
+         )
+
+
+ def chunker(seq, size):
+     return (seq[pos: pos + size] for pos in range(0, len(seq), size))
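The multi-key filter rewrite in `similarity_search_with_relevance_scores` can be checked without a vector store; a pure-dict sketch of the `$and`/`$eq` form Chroma expects:

```python
# How a multi-key metadata filter is rewritten into Chroma's $and/$eq form
filter = {"chunk_size": 1024, "label": "doc1"}
where = {"$and": [{k: {"$eq": v}} for k, v in filter.items()]}
print(where)
# {'$and': [{'chunk_size': {'$eq': 1024}}, {'label': {'$eq': 'doc1'}}]}
```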
app/config/__init__.py ADDED
File without changes
app/config/load.py ADDED
@@ -0,0 +1,16 @@
+ from typing import Optional
+
+ import execjs
+
+ from app.config.models.configs import Config
+
+
+ def load_config(app_config_path: str, model_config_path: Optional[str] = None) -> Config:
+     doc_config_dict = load_js_object(app_config_path)
+     if model_config_path is None:
+         return Config(**doc_config_dict)
+
+     model_config_dict = load_js_object(model_config_path)
+     return Config(**{**doc_config_dict, "llm": model_config_dict})
+
+
+ def load_js_object(config_path: str) -> dict:
+     with open(config_path, "r") as f:
+         return execjs.compile(f.read()).eval("module.exports")
app/config/models/__init__.py ADDED
File without changes
app/config/models/configs.py ADDED
@@ -0,0 +1,118 @@
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Union
+ from uuid import UUID, uuid4
+
+ from loguru import logger
+ from pydantic import (
+     BaseModel,
+     DirectoryPath,
+     Field,
+     field_validator,
+     ConfigDict,
+     ValidationInfo,
+ )
+
+ from app.config.models.openai import OpenAIModelConfig
+ from app.config.models.vertexai import VertexAIModelConfig
+
+
+ def create_uuid() -> str:
+     return str(uuid4())
+
+
+ class Document(BaseModel):
+     """Interface for interacting with a document."""
+
+     page_content: str
+     metadata: dict = Field(default_factory=dict)
+
+
+ class SentenseTransformerEmbeddingModel(BaseModel):
+     model_config = ConfigDict()
+     model_config["protected_namespaces"] = ()
+     model_name: str
+     additional_kwargs: dict = Field(default_factory=dict)
+
+
+ class DocumentPathSettings(BaseModel):
+     doc_path: Union[DirectoryPath, str]
+     additional_parser_settings: Dict[str, Any] = Field(default_factory=dict)
+     passage_prefix: str = ""
+     label: str = ""  # Optional label, will be included in the metadata
+
+
+ class EmbedddingsSpladeConfig(BaseModel):
+     n_batch: int = 3
+
+
+ class EmbeddingsConfig(BaseModel):
+     model_config = ConfigDict(extra="forbid")
+
+     embedding_model: SentenseTransformerEmbeddingModel
+     embeddings_path: Union[DirectoryPath, str]
+     document_settings: List[DocumentPathSettings]
+     chunk_sizes: List[int] = [1024]
+     splade_config: EmbedddingsSpladeConfig = EmbedddingsSpladeConfig(n_batch=5)
+
+     @property
+     def labels(self) -> List[str]:
+         """Returns the list of labels in document settings."""
+         return [setting.label for setting in self.document_settings if setting.label]
+
+
+ class SemanticSearchConfig(BaseModel):
+     model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")
+     max_k: int = 15
+     max_char_size: int = 2048
+     query_prefix: str = ""
+
+
+ class LLMConfig(BaseModel):
+     model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")
+     model_config["protected_namespaces"] = ()
+
+     type: str
+     params: dict
+
+     @field_validator("params")
+     def validate_params(cls, value, info: ValidationInfo):
+         type_ = info.data.get("type")
+         # Force conversion of the raw dict into the model config class matching `type`
+         if type_ == "vertexai":
+             config = VertexAIModelConfig(**value)
+         elif type_ == "openai":
+             config = OpenAIModelConfig(**value)
+         else:
+             raise ValueError(f"Unknown LLM type: {type_}")
+         logger.info(
+             f"Loaded model parameters into configuration class {type(config).__name__}"
+         )
+         return config
+
+
+ class ResponseModel(BaseModel):
+     id: UUID = Field(default_factory=create_uuid)
+     question: str
+     response: str
+     average_score: float
+     semantic_search: List[str] = Field(default_factory=list)
+     hyde_response: str = ""
+
+
+ class Config(BaseModel):
+     cache_folder: Path
+     embeddings: EmbeddingsConfig
+     semantic_search: SemanticSearchConfig
+     llm: Optional[LLMConfig] = None
+
+     def check_embeddings_exist(self) -> bool:
+         """Checks if embeddings exist in the specified folder."""
+
+         p_splade = (
+             Path(self.embeddings.embeddings_path) / "splade" / "splade_embeddings.npz"
+         )
+         p_embeddings = Path(self.embeddings.embeddings_path)
+         all_parquets = list(p_embeddings.glob("*.parquet"))
+         return p_splade.exists() and len(all_parquets) > 0
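A quick sketch of what the `params` validator does: given `type: "openai"`, the raw dict is coerced into `OpenAIModelConfig` (assuming the repo's packages are importable; the template string is a placeholder):

```python
from app.config.models.configs import LLMConfig

cfg = LLMConfig(
    type="openai",
    params={
        "prompt_template": "{context}\n{question}",  # placeholder template
        "model_kwargs": {"model_name": "gpt-3.5-turbo", "temperature": 0.0},
    },
)
print(type(cfg.params).__name__)  # -> OpenAIModelConfig
```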
app/config/models/openai.py ADDED
@@ -0,0 +1,29 @@
+ from typing import Optional
+
+ from langchain_community.chat_models import ChatOpenAI
+ from langchain_core.prompts import PromptTemplate
+ from pydantic import BaseModel, ConfigDict
+
+
+ class OpenAIModelConfig(BaseModel):
+     model_config = ConfigDict()
+     model_config["protected_namespaces"] = ()
+     prompt_template: str
+     model_kwargs: dict = {}
+
+
+ class OpenAIModel:
+     def __init__(self, config: OpenAIModelConfig):
+         self.config = config
+         self._model = None
+
+     @property
+     def model(self):
+         # Cache the chat model instead of re-creating it on every access
+         if self._model is None:
+             self._model = ChatOpenAI(**self.config.model_kwargs)
+         return self._model
+
+     @property
+     def prompt(self) -> Optional[PromptTemplate]:
+         if self.config.prompt_template:
+             return PromptTemplate(
+                 input_variables=["context", "question"],
+                 template=self.config.prompt_template,
+             )
+         return None
app/config/models/vertexai.py ADDED
@@ -0,0 +1,37 @@
+ from typing import Optional
+
+ from langchain_core.prompts import PromptTemplate
+ from langchain_google_vertexai import ChatVertexAI
+ from pydantic import BaseModel, ConfigDict
+ from vertexai.generative_models import HarmCategory, HarmBlockThreshold
+
+
+ class VertexAIModelConfig(BaseModel):
+     model_config = ConfigDict()
+     model_config["protected_namespaces"] = ()
+     prompt_template: str
+     model_kwargs: dict = {}
+
+
+ class VertexAIModel:
+     def __init__(self, config: VertexAIModelConfig):
+         self.config = config
+         self._model = None
+
+     @property
+     def model(self):
+         # Cache the chat model; all safety blocking is disabled for evaluation runs
+         if self._model is None:
+             self._model = ChatVertexAI(
+                 **self.config.model_kwargs,
+                 safety_settings={
+                     HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
+                     HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
+                     HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
+                     HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
+                     HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
+                 },
+             )
+         return self._model
+
+     @property
+     def prompt(self) -> Optional[PromptTemplate]:
+         if self.config.prompt_template:
+             return PromptTemplate(
+                 input_variables=["context", "question"],
+                 template=self.config.prompt_template,
+             )
+         return None
app/config/types.ts ADDED
@@ -0,0 +1,65 @@
+ // Configuration for embeddings, including paths, models, and document settings
+ type EmbeddingsConfig = {
+   embeddings_path: string; // Path where embeddings will be saved
+   embedding_model: { // Embedding model specification
+     model_name: string; // Name of the model
+   };
+   splade_config: { // Configuration for SPLADE
+     n_batch: number; // Batch size for processing
+   };
+   chunk_sizes: number[]; // Chunk sizes for splitting during embedding
+   document_settings: { // Settings for document processing
+     doc_path: string; // Path to documents
+     additional_parser_settings?: { // Optional settings for parsing documents
+       md: { // Settings for Markdown documents
+         skip_first: boolean; // Whether to skip the first section
+         merge_sections: boolean; // Whether to merge sections
+         remove_images: boolean; // Whether to remove images from documents
+       };
+     };
+     passage_prefix: string; // Prefix for passages
+   }[];
+ };
+
+ // Configuration for semantic search functionality
+ type SemanticSearchConfig = {
+   max_k: number; // Maximum number of results to return
+   max_char_size: number; // Max character size for context provided to models
+   query_prefix: string; // Prefix for queries
+ };
+
+ export type AppConfig = {
+   cache_folder: string;
+   embeddings: EmbeddingsConfig;
+   semantic_search: SemanticSearchConfig;
+ };
+
+ // Type definition for the LLM configuration section for OpenAI models
+ type OpenAIConfig = {
+   type: 'openai'; // Specifies the use of an OpenAI model
+   params: {
+     prompt_template: string; // Template for constructing prompts; includes placeholders for context and questions
+     model_kwargs: { // Keyword arguments for configuring the model's inference behavior
+       openai_api_key: string; // API key for accessing the OpenAI API
+       temperature: number; // Controls randomness of generation; 0.0 is deterministic
+       model_name: string; // Name of the model used for generating responses
+     };
+   };
+ };
+
+ // Type definition for the LLM configuration section for Google VertexAI models
+ type VertexAIConfig = {
+   type: 'vertexai'; // Specifies the use of a Google VertexAI model
+   params: {
+     prompt_template: string; // Template for constructing prompts; includes placeholders for context and questions
+     model_kwargs: { // Keyword arguments for configuring the model's inference behavior
+       model_name: string; // Name of the model used for generating responses
+       temperature: number; // Controls randomness of generation; 0.0 is deterministic
+     };
+   };
+ };
+
+ // Type definition for the LLM configuration section
+ export type LLMConfig = OpenAIConfig | VertexAIConfig;
app/main.py ADDED
@@ -0,0 +1,202 @@
+ import os
+ from typing import Tuple
+
+ import click
+ import pandas as pd
+ from datasets import Dataset
+ from langchain.chains import LLMChain
+ from langchain.chains.question_answering import load_qa_chain
+ from langchain.prompts import PromptTemplate
+ from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+ from langchain_google_vertexai import ChatVertexAI
+ from loguru import logger
+ from ragas import evaluate
+ from ragas.embeddings import LangchainEmbeddingsWrapper
+ from ragas.llms import LangchainLLMWrapper
+ from ragas.metrics import (
+     answer_relevancy,
+     context_precision,
+     answer_correctness,
+ )
+ from tqdm import tqdm
+
+ from app.chroma import ChromaDenseVectorDB
+ from app.config.load import load_config
+ from app.config.models.configs import Config
+ from app.config.models.vertexai import VertexAIModel
+ from app.parsers.splitter import DocumentSplitter
+ from app.pipeline import LLMBundle
+ from app.ranking import BCEReranker
+ from app.splade import SpladeSparseVectorDB
+
+
+ def get_hash_mapping_filenames(
+     config: Config,
+     file_to_hash_fn: str = "file_hash_mappings.snappy.parquet",
+     docid_to_hash_fn: str = "docid_hash_mappings.snappy.parquet",
+ ) -> Tuple[str, str]:
+     file_hashes_fn = os.path.join(config.embeddings.embeddings_path, file_to_hash_fn)
+     docid_hashes_fn = os.path.join(config.embeddings.embeddings_path, docid_to_hash_fn)
+     return file_hashes_fn, docid_hashes_fn
+
+
+ @click.group()
+ def main():
+     pass
+
+
+ @main.command(name="index")
+ @click.option(
+     "-c",
+     "app_config_path",
+     required=True,
+     help="Specifies the app JavaScript configuration file (should be module-exported)",
+ )
+ def create_index(app_config_path):
+     config = load_config(app_config_path)
+
+     dense_db = ChromaDenseVectorDB(
+         persist_folder=str(config.embeddings.embeddings_path), config=config
+     )
+     splitter = DocumentSplitter(config)
+     all_docs, all_hash_filename_mappings, all_hash_docid_mappings = splitter.split()
+
+     # dense embeddings
+     dense_db.generate_embeddings(docs=all_docs)
+
+     # sparse embeddings
+     sparse_db = SpladeSparseVectorDB(config)
+     sparse_db.generate_embeddings(docs=all_docs)
+
+     file_hashes_fn, docid_hashes_fn = get_hash_mapping_filenames(config)
+
+     all_hash_filename_mappings.to_parquet(
+         file_hashes_fn, compression="snappy", index=False
+     )
+
+     all_hash_docid_mappings.to_parquet(
+         docid_hashes_fn, compression="snappy", index=False
+     )
+
+     logger.info("Document embeddings generated.")
+
+
+ @main.command("predict")
+ @click.option(
+     "-c",
+     "app_config_path",
+     required=True,
+     type=click.Path(exists=True, dir_okay=False, file_okay=True),
+     help="Specifies the app JavaScript configuration file (should be module-exported)",
+ )
+ @click.option(
+     "-m",
+     "model_config_path",
+     required=True,
+     type=click.Path(exists=True, dir_okay=False, file_okay=True),
+     help="Specifies the model JavaScript configuration file (should be module-exported)",
+ )
+ def predict_pipeline(app_config_path: str, model_config_path: str):
+     config = load_config(app_config_path, model_config_path)
+
+     # The model class must match the `type` in the model config:
+     # OpenAIModel with config/openai.js, VertexAIModel with config/vertexai.js.
+     # llm = OpenAIModel(config=config.llm.params)
+     llm = VertexAIModel(config=config.llm.params)
+
+     chain = load_qa_chain(llm=llm.model, prompt=llm.prompt)
+
+     store = ChromaDenseVectorDB(
+         persist_folder=str(config.embeddings.embeddings_path), config=config
+     )
+     store._load_retriever()
+
+     reranker = BCEReranker()
+
+     chunk_sizes = config.embeddings.chunk_sizes
+
+     splade = SpladeSparseVectorDB(config=config)
+     splade.load()
+
+     hyde_chain = LLMChain(
+         llm=llm.model,
+         prompt=PromptTemplate(
+             template="Write a short passage to answer the question: {question}",
+             input_variables=["question"],
+         ),
+     )
+
+     llm_bundle = LLMBundle(
+         chain=chain,
+         reranker=reranker,
+         chunk_sizes=chunk_sizes,
+         sparse_db=splade,
+         dense_db=store,
+         hyde_chain=hyde_chain,
+     )
+
+     test_dataset = pd.read_json("evaluation_dataset.json", lines=True)
+     evaluate_data = {
+         "question": [],
+         "answer": [],
+         "contexts": [],  # should be a list[list[str]]
+         "ground_truth": [],
+         "context_ground_truth": [],
+     }
+
+     test_dataset = test_dataset.head(10)
+
+     for idx, row in tqdm(test_dataset.iterrows()):
+         output = llm_bundle.get_and_parse_response(
+             query=row["question"],
+             config=config,
+         )
+         response = output.response
+
+         evaluate_data["question"].append(row["question"])
+         evaluate_data["answer"].append(response)
+         evaluate_data["contexts"].append(output.semantic_search)
+         evaluate_data["ground_truth"].append(row["answer"])
+         evaluate_data["context_ground_truth"].append(row["context"])
+
+     evaluate_dataset = Dataset.from_dict(evaluate_data)
+
+     # store the evaluation dataset
+     evaluate_dataset.to_pandas().to_json(
+         "evaluation_output.json", orient="records", lines=True
+     )
+
+
+ @main.command("evaluate")
+ def evaluate_pipeline():
+     # ragas judges answers with a VertexAI LLM and BCE sentence embeddings
+     ragas_llm = LangchainLLMWrapper(ChatVertexAI(model_name="gemini-pro"))
+     ragas_embeddings = LangchainEmbeddingsWrapper(
+         SentenceTransformerEmbeddings(model_name="maidalun1020/bce-embedding-base_v1")
+     )
+
+     metrics = [
+         # accuracy of the generated answer compared to the ground truth
+         answer_correctness,
+         # whether all ground-truth relevant items in the contexts are ranked highly
+         context_precision,
+         # how pertinent the generated answer is to the given prompt
+         answer_relevancy,
+     ]
+
+     evaluate_dataset = pd.read_json("evaluation_output.json", lines=True)
+     evaluate_dataset = Dataset.from_pandas(evaluate_dataset)
+
+     evaluate_result = evaluate(
+         dataset=evaluate_dataset,
+         metrics=metrics,
+         llm=ragas_llm,
+         embeddings=ragas_embeddings,
+         is_async=True,
+     )
+
+     evaluate_result_df = evaluate_result.to_pandas()
+     # drop the bulky context columns before reporting
+     evaluate_result_df = evaluate_result_df.drop(columns=["contexts", "context_ground_truth"])
+     # print the mean of the answer_correctness, context_precision and answer_relevancy columns
+     print(evaluate_result_df.mean(numeric_only=True))
+     evaluate_result_df.to_csv("evaluation_results.csv", index=False)
+
+
+ if __name__ == "__main__":
+     main()
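`predict` reads `evaluation_dataset.json` from the working directory as JSON lines; judging from the loop above, each record carries `question`, `answer`, and `context` keys. A hypothetical one-record file for a smoke test:

```python
import pandas as pd

record = {
    "question": "What is RefinedWeb?",           # sent through the RAG pipeline
    "answer": "A filtered web-scale corpus.",    # ground-truth answer
    "context": ["RefinedWeb is a dataset ..."],  # ground-truth supporting passages
}
pd.DataFrame([record]).to_json("evaluation_dataset.json", orient="records", lines=True)
```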
app/parsers/__init__.py ADDED
File without changes
app/parsers/markdown.py ADDED
@@ -0,0 +1,431 @@
+ import re
+ import urllib.parse
+ from collections import namedtuple
+ from enum import Enum
+ from pathlib import Path
+ from typing import Generator, List, Union, Tuple
+
+ from loguru import logger
+
+ FORMATTING_SEQUENCES = {"*", "**", "***", "_", "__", "~~", "||"}
+ CODE_BLOCK_SEQUENCES = {"`", "``", "```"}
+ ALL_SEQUENCES = FORMATTING_SEQUENCES | CODE_BLOCK_SEQUENCES
+ MAX_FORMATTING_SEQUENCE_LENGTH = max(len(seq) for seq in ALL_SEQUENCES)
+
+
+ class SplitCandidates(Enum):
+     SPACE = 1
+     NEWLINE = 2
+     LAST_CHAR = 3
+
+
+ SPLIT_CANDIDATES_PREFERENCE = [
+     SplitCandidates.NEWLINE,
+     SplitCandidates.SPACE,
+     SplitCandidates.LAST_CHAR,
+ ]
+
+ BLOCK_SPLIT_CANDIDATES = [r"\n#\s+", r"\n##\s+", r"\n###\s+"]
+ CODE_BLOCK_LEVEL = 10
+
+ MarkdownChunk = namedtuple("MarkdownChunk", "string level")
+
+
+ class SplitCandidateInfo:
+     last_seen: int
+     active_sequences: List[str]
+     active_sequences_length: int
+
+     def __init__(self):
+         self.last_seen = None
+         self.active_sequences = []
+         self.active_sequences_length = 0
+
+     def process_sequence(self, seq: str, is_in_code_block: bool):
+         """Processes a formatting sequence and returns the updated is_in_code_block state."""
+         if is_in_code_block:
+             # Inside a code block, only the matching closing sequence has an effect
+             if self.active_sequences and seq == self.active_sequences[-1]:
+                 last_seq = self.active_sequences.pop()
+                 self.active_sequences_length -= len(last_seq)
+                 return False
+             return True
+         elif seq in CODE_BLOCK_SEQUENCES:
+             self.active_sequences.append(seq)
+             self.active_sequences_length += len(seq)
+             return True
+         else:
+             # Toggle a formatting sequence: close it if it is already open
+             for k in range(len(self.active_sequences) - 1, -1, -1):
+                 if seq == self.active_sequences[k]:
+                     sequences_being_removed = self.active_sequences[k:]
+                     self.active_sequences = self.active_sequences[:k]
+                     self.active_sequences_length -= sum(
+                         len(s) for s in sequences_being_removed
+                     )
+                     return False
+             self.active_sequences.append(seq)
+             self.active_sequences_length += len(seq)
+             return False
+
+     def copy_from(self, other):
+         self.last_seen = other.last_seen
+         self.active_sequences = other.active_sequences.copy()
+         self.active_sequences_length = other.active_sequences_length
+
+
+ def physical_split(markdown: str, max_chunk_size: int) -> Generator[str, None, None]:
+     if max_chunk_size <= MAX_FORMATTING_SEQUENCE_LENGTH:
+         raise ValueError(
+             f"max_chunk_size must be greater than {MAX_FORMATTING_SEQUENCE_LENGTH}"
+         )
+
+     split_candidates = {
+         SplitCandidates.SPACE: SplitCandidateInfo(),
+         SplitCandidates.NEWLINE: SplitCandidateInfo(),
+         SplitCandidates.LAST_CHAR: SplitCandidateInfo(),
+     }
+     is_in_code_block = False
+
+     chunk_start_from, chunk_char_count, chunk_prefix = 0, 0, ""
+
+     def split_chunk():
+         for split_variant in SPLIT_CANDIDATES_PREFERENCE:
+             split_candidate = split_candidates[split_variant]
+             if split_candidate.last_seen is None:
+                 continue
+             chunk_end = split_candidate.last_seen + (
+                 1 if split_variant == SplitCandidates.LAST_CHAR else 0
+             )
+             chunk = (
+                 chunk_prefix
+                 + markdown[chunk_start_from:chunk_end]
+                 + "".join(reversed(split_candidate.active_sequences))
+             )
+
+             next_chunk_prefix = "".join(split_candidate.active_sequences)
+             next_chunk_char_count = len(next_chunk_prefix)
+             next_chunk_start_from = chunk_end + (
+                 0 if split_variant == SplitCandidates.LAST_CHAR else 1
+             )
+
+             split_candidates[SplitCandidates.NEWLINE] = SplitCandidateInfo()
+             split_candidates[SplitCandidates.SPACE] = SplitCandidateInfo()
+             return (
+                 chunk,
+                 next_chunk_start_from,
+                 next_chunk_char_count,
+                 next_chunk_prefix,
+             )
+
+     i = 0
+     while i < len(markdown):
+         for j in range(MAX_FORMATTING_SEQUENCE_LENGTH, 0, -1):
+             seq = markdown[i: i + j]
+             if seq in ALL_SEQUENCES:
+                 last_char_split_candidate_len = (
+                     chunk_char_count
+                     + split_candidates[
+                         SplitCandidates.LAST_CHAR
+                     ].active_sequences_length
+                     + len(seq)
+                 )
+                 if last_char_split_candidate_len >= max_chunk_size:
+                     (
+                         next_chunk,
+                         chunk_start_from,
+                         chunk_char_count,
+                         chunk_prefix,
+                     ) = split_chunk()
+                     yield next_chunk
+                 is_in_code_block = split_candidates[
+                     SplitCandidates.LAST_CHAR
+                 ].process_sequence(seq, is_in_code_block)
+                 i += len(seq)
+                 chunk_char_count += len(seq)
+                 split_candidates[SplitCandidates.LAST_CHAR].last_seen = i - 1
+                 break
+
+         if i >= len(markdown):
+             break
+
+         split_candidates[SplitCandidates.LAST_CHAR].last_seen = i
+         chunk_char_count += 1
+         if markdown[i] == "\n":
+             split_candidates[SplitCandidates.NEWLINE].copy_from(
+                 split_candidates[SplitCandidates.LAST_CHAR]
+             )
+         elif markdown[i] == " ":
+             split_candidates[SplitCandidates.SPACE].copy_from(
+                 split_candidates[SplitCandidates.LAST_CHAR]
+             )
+
+         last_char_split_candidate_len = (
+             chunk_char_count
+             + split_candidates[SplitCandidates.LAST_CHAR].active_sequences_length
+         )
+         if last_char_split_candidate_len == max_chunk_size:
+             next_chunk, chunk_start_from, chunk_char_count, chunk_prefix = split_chunk()
+             yield next_chunk
+
+         i += 1
+
+     if chunk_start_from < len(markdown):
+         yield chunk_prefix + markdown[chunk_start_from:]
+
+
+ def get_logical_blocks_recursively(
+     markdown: str, max_chunk_size: int, all_sections: list, split_candidate_index=0
+ ) -> List[MarkdownChunk]:
+     if split_candidate_index >= len(BLOCK_SPLIT_CANDIDATES):
+         for chunk in physical_split(markdown, max_chunk_size):
+             all_sections.append(
+                 MarkdownChunk(string=chunk, level=split_candidate_index)
+             )
+         return all_sections
+     chunks = []
+     add_index = 0
+     for add_index, split_candidate in enumerate(
+         BLOCK_SPLIT_CANDIDATES[split_candidate_index:]
+     ):
+         chunks = re.split(split_candidate, markdown)
+         if len(chunks) > 1:
+             break
+
+     for i, chunk in enumerate(chunks):
+         level = split_candidate_index + add_index
+         if i > 0:
+             level += 1
+
+         prefix = "\n\n" + "#" * level + " "
+         if not chunk.strip():
+             continue
+
+         if len(chunk) <= max_chunk_size:
+             all_sections.append(MarkdownChunk(string=prefix + chunk, level=level - 1))
+         else:
+             get_logical_blocks_recursively(
+                 chunk,
+                 max_chunk_size,
+                 all_sections,
+                 split_candidate_index=split_candidate_index + add_index + 1,
+             )
+     return all_sections
+
+
+ def markdown_splitter(
+     path: Union[str, Path], max_chunk_size: int, **additional_splitter_settings
+ ) -> List[dict]:
+     try:
+         with open(path, "r") as f:
+             markdown = f.read()
+     except OSError:
+         return []
+
+     if len(markdown) < max_chunk_size:
+         return [{"text": markdown, "metadata": {"heading": ""}}]
+
+     sections = [MarkdownChunk(string="", level=0)]
+
+     markdown, additional_metadata = preprocess_markdown(
+         markdown, additional_splitter_settings
+     )
+
+     # Split by code and non-code
+     chunks = markdown.split("```")
+
+     for i, chunk in enumerate(chunks):
+         if i % 2 == 0:  # Every even element (0 indexed) is a non-code
+             logical_blocks = get_logical_blocks_recursively(
+                 chunk, max_chunk_size=max_chunk_size, all_sections=[]
+             )
+             sections += logical_blocks
+         else:  # Process the code section
+             rows = chunk.split("\n")
+             code = rows[1:]
+
+             lang = rows[0]  # Get the language name
+
+             # Provide a hint to the LLM
+             all_code_rows = (
+                 [
+                     f"\nFollowing is a code section in {lang}, delimited by triple backticks:",
+                     f"```{lang}",
+                 ]
+                 + code
+                 + ["```"]
+             )
+             all_code_str = "\n".join(all_code_rows)
+
+             # Merge code to a previous logical block if there is enough space
+             if len(sections[-1].string) + len(all_code_str) < max_chunk_size:
+                 sections[-1] = MarkdownChunk(
+                     string=sections[-1].string + all_code_str, level=sections[-1].level
+                 )
+
+             # If the code block is larger than max size, physically split it
+             elif len(all_code_str) >= max_chunk_size:
+                 code_chunks = physical_split(
+                     all_code_str, max_chunk_size=max_chunk_size
+                 )
+                 for cchunk in code_chunks:
+                     # Assign a language header to the code chunk, if it doesn't exist
+                     if f"```{lang}" not in cchunk:
+                         cchunk_rows = cchunk.split("```")
+                         cchunk = f"```{lang}\n" + cchunk_rows[1] + "```"
+
+                     sections.append(
+                         MarkdownChunk(string=cchunk, level=CODE_BLOCK_LEVEL)
+                     )
+
+             # Otherwise, add as a single chunk
+             else:
+                 sections.append(
+                     MarkdownChunk(string=all_code_str, level=CODE_BLOCK_LEVEL)
+                 )
+
+     all_out = postprocess_sections(
+         sections,
+         max_chunk_size,
+         additional_splitter_settings,
+         additional_metadata,
+         path,
+     )
+     return all_out
+
+
+ def preprocess_markdown(markdown: str, additional_settings: dict) -> Tuple[str, dict]:
+     preprocess_remove_images = additional_settings.get("remove_images", False)
+     preprocess_remove_extra_newlines = additional_settings.get(
+         "remove_extra_newlines", True
+     )
+     preprocess_find_metadata = additional_settings.get("find_metadata", dict())
+
+     if preprocess_remove_images:
+         markdown = remove_images(markdown)
+
+     if preprocess_remove_extra_newlines:
+         markdown = remove_extra_newlines(markdown)
+
+     additional_metadata = {}
+
+     if preprocess_find_metadata:
+         if not isinstance(preprocess_find_metadata, dict):
+             raise TypeError(
+                 f"find_metadata settings should be of type dict. Got {type(preprocess_find_metadata)}"
+             )
+
+         for label, search_string in preprocess_find_metadata.items():
+             logger.info(f"Looking for metadata: {search_string}")
+             metadata = find_metadata(markdown, search_string)
+             if metadata:
+                 logger.info(f"\tFound metadata for {label} - {metadata}")
+                 additional_metadata[label] = metadata
+
+     return markdown, additional_metadata
+
+
+ def postprocess_sections(
+     sections: List[MarkdownChunk],
+     max_chunk_size: int,
+     additional_settings: dict,
+     additional_metadata: dict,
+     path: Union[str, Path],
+ ) -> List[dict]:
+     all_out = []
+
+     skip_first = additional_settings.get("skip_first", False)
+     merge_headers = additional_settings.get("merge_sections", False)
+
+     # Remove all empty sections
+     sections = [s for s in sections if s.string]
+
+     if sections and skip_first:
+         # remove the first section
+         sections = sections[1:]
+
+     if sections and merge_headers:
+         # Merge sections
+         sections = merge_sections(sections, max_chunk_size=max_chunk_size)
+
+     current_heading = ""
+
+     sections_metadata = {"Document name": Path(path).name}
+
+     for s in sections:
+         stripped_string = s.string.strip()
+         doc_metadata = {}
+         if len(stripped_string) > 0:
+             heading = ""
+             if stripped_string.startswith("#"):  # heading detected
+                 heading = stripped_string.split("\n")[0].replace("#", "").strip()
+                 stripped_heading = heading.replace("#", "").replace(" ", "").strip()
+                 if not stripped_heading:
+                     heading = ""
+             if s.level == 0:
+                 current_heading = heading
+                 doc_metadata["heading"] = urllib.parse.quote(
+                     heading
+                 )  # isolate the heading
+             else:
+                 doc_metadata["heading"] = ""
+
+             final_section = add_section_metadata(
+                 stripped_string,
+                 section_metadata={
+                     **sections_metadata,
+                     **{"Subsection of": current_heading},
+                     **additional_metadata,
+                 },
+             )
+             all_out.append({"text": final_section, "metadata": doc_metadata})
+     return all_out
+
+
+ def remove_images(page_md: str) -> str:
+     return re.sub(r"""!\[[^\]]*\]\((.*?)\s*("(?:.*[^"])")?\s*\)""", "", page_md)
+
+
+ def remove_extra_newlines(page_md) -> str:
+     page_md = re.sub(r"\n{3,}", "\n\n", page_md)
+     return page_md
+
+
+ def add_section_metadata(s, section_metadata: dict):
+     metadata_s = ""
+     for k, v in section_metadata.items():
+         if v:
+             metadata_s += f"{k}: {v}\n"
+     metadata = f"Metadata applicable to the next chunk of text delimited by five stars:\n>> METADATA START\n{metadata_s}>> METADATA END\n\n"
+
+     return metadata + "*****\n" + s + "\n*****"
+
+
+ def find_metadata(page_md: str, search_string: str) -> str:
+     pattern = rf"{search_string}(.*)"
+     match = re.search(pattern, page_md)
+     if match:
+         return match.group(1)
+     return ""
+
+
+ def merge_sections(
+     sections: List[MarkdownChunk], max_chunk_size: int
+ ) -> List[MarkdownChunk]:
+     current_section = sections[0]
+     all_out = []
+
+     prev_level = 0
+     for s in sections[1:]:
+         if (
+             len(current_section.string + s.string) > max_chunk_size
+             or s.level <= prev_level
+         ):
+             all_out.append(current_section)
+             current_section = s
+             prev_level = 0
+         else:
+             current_section = MarkdownChunk(
+                 string=current_section.string + s.string, level=current_section.level
+             )
+             prev_level = s.level if s.level != CODE_BLOCK_LEVEL else prev_level
+
+     all_out.append(current_section)
+
+     return all_out
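A minimal sketch of the splitter on a synthetic file, with the chunk size deliberately small so the recursive and physical splitting both fire (file name and content are placeholders):

```python
from pathlib import Path
from app.parsers.markdown import markdown_splitter

md = "# Intro\n\n" + "Falcon RefinedWeb is a large filtered web corpus. " * 4
Path("sample.md").write_text(md)

for chunk in markdown_splitter("sample.md", max_chunk_size=128):
    print(chunk["metadata"], len(chunk["text"]))
```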
app/parsers/splitter.py ADDED
@@ -0,0 +1,141 @@
+ import hashlib
+ import urllib.parse
+ import uuid
+ from pathlib import Path
+ from typing import List, Optional, Tuple
+
+ import pandas as pd
+ from loguru import logger
+
+ from app.config.models.configs import Document, Config
+ from app.parsers.markdown import markdown_splitter
+
+ HASH_BLOCKSIZE = 65536
+
+
+ class DocumentSplitter:
+     def __init__(self, config: Config) -> None:
+         self.document_path_settings = config.embeddings.document_settings
+         self.chunk_sizes = config.embeddings.chunk_sizes
+
+     def split(
+         self,
+         limit: Optional[int] = None,
+     ) -> Tuple[List[Document], pd.DataFrame, pd.DataFrame]:
+         all_docs = []
+         hash_filename_mappings = []
+         hash_docid_mappings = []
+
+         for setting in self.document_path_settings:
+             passage_prefix = setting.passage_prefix
+             docs_path = Path(setting.doc_path)
+
+             extension = "md"
+             for chunk_size in self.chunk_sizes:
+                 paths = list(docs_path.glob(f"**/*.{extension}"))
+
+                 additional_parser_settings = setting.additional_parser_settings.get(
+                     extension, dict()
+                 )
+
+                 (
+                     docs,
+                     hf_mappings,
+                     hd_mappings,
+                 ) = self._get_documents_from_custom_splitter(
+                     document_paths=paths,
+                     splitter_func=markdown_splitter,
+                     max_size=chunk_size,
+                     passage_prefix=passage_prefix,
+                     **additional_parser_settings,
+                 )
+
+                 all_docs.extend(docs)
+                 hash_filename_mappings.extend(hf_mappings)
+                 hash_docid_mappings.extend(hd_mappings)
+
+         all_hash_filename_mappings = pd.DataFrame(hash_filename_mappings)
+         all_hash_docid_mappings = pd.concat(hash_docid_mappings, axis=0)
+
+         if limit:
+             all_docs = all_docs[:limit]
+             all_hash_filename_mappings = all_hash_filename_mappings[:limit]
+             all_hash_docid_mappings = all_hash_docid_mappings[:limit]
+
+         return all_docs, all_hash_filename_mappings, all_hash_docid_mappings
+
+     def _get_documents_from_custom_splitter(
+         self,
+         document_paths: List[Path],
+         splitter_func,
+         max_size,
+         passage_prefix: str,
+         **additional_kwargs,
+     ) -> Tuple[List[Document], List[dict], List[pd.DataFrame]]:
+         all_docs = []
+         hash_filename_mappings = []
+         hash_docid_mappings = []
+
+         for path in document_paths:
+             filepath = str(path)
+             filename = path.stem  # file name without the extension
+
+             if path.suffix != ".md":
+                 continue
+
+             additional_kwargs.update({"filename": filepath})
+             docs_data = splitter_func(path, max_size, **additional_kwargs)
+             file_hash = get_md5_hash(path)
+
+             path = urllib.parse.quote(str(path))  # type: ignore
+             logger.info(path)
+
+             docs = [
+                 Document(
+                     page_content=passage_prefix + d["text"],
+                     metadata={
+                         **d["metadata"],
+                         **{
+                             "source": str(path),
+                             "chunk_size": max_size,
+                             "document_id": str(uuid.uuid1()),
+                             "label": filename,
+                         },
+                     },
+                 )
+                 for d in docs_data
+             ]
+
+             for d in docs:
+                 if "page" in d.metadata and d.metadata["page"] is None:
+                     d.metadata["page"] = -1
+
+             all_docs.extend(docs)
+
+             hash_filename_mappings.append(dict(filename=filepath, filehash=file_hash))
+
+             df_hash_docid = (
+                 pd.DataFrame()
+                 .assign(docid=[d.metadata["document_id"] for d in docs])
+                 .assign(filehash=file_hash)
+             )
+
+             hash_docid_mappings.append(df_hash_docid)
+
+         logger.info(f"Got {len(all_docs)} nodes.")
+         return all_docs, hash_filename_mappings, hash_docid_mappings
+
+
+ def get_md5_hash(file_path: Path) -> str:
+     hasher = hashlib.md5()
+
+     with open(file_path, "rb") as file:
+         buf = file.read(HASH_BLOCKSIZE)
+         while buf:
+             hasher.update(buf)
+             buf = file.read(HASH_BLOCKSIZE)
+
+     return hasher.hexdigest()
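The two parquet mappings written by `index` let chunks be traced back to the exact file content that produced them; the MD5 helper is easy to check on any local file (the file name here is a placeholder):

```python
from pathlib import Path
from app.parsers.splitter import get_md5_hash

Path("sample.md").write_text("# Title\n")
print(get_md5_hash(Path("sample.md")))  # stable across runs for identical bytes
```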
app/pipeline.py ADDED
@@ -0,0 +1,165 @@
+ import string
+ from typing import List, Optional, Tuple
+
+ from langchain.chains import LLMChain
+ from langchain.chains.base import Chain
+ from loguru import logger
+
+ from app.chroma import ChromaDenseVectorDB
+ from app.config.models.configs import (
+     Document,
+     ResponseModel,
+     Config,
+     SemanticSearchConfig,
+ )
+ from app.ranking import BCEReranker, rerank
+ from app.splade import SpladeSparseVectorDB
+
+
+ class LLMBundle:
+     def __init__(
+         self,
+         chain: Chain,
+         dense_db: ChromaDenseVectorDB,
+         reranker: BCEReranker,
+         sparse_db: SpladeSparseVectorDB,
+         chunk_sizes: List[int],
+         hyde_chain: Optional[LLMChain] = None,
+     ) -> None:
+         self.chain = chain
+         self.dense_db = dense_db
+         self.reranker = reranker
+         self.sparse_db = sparse_db
+         self.chunk_sizes = chunk_sizes
+         self.hyde_chain = hyde_chain
+
+     def get_relevant_documents(
+         self,
+         original_query: str,
+         query: str,
+         config: SemanticSearchConfig,
+         label: str,
+     ) -> Tuple[List[Document], float]:
+         most_relevant_docs = []
+         docs = []
+
+         current_reranker_score, reranker_score = -1e5, -1e5
+
+         for chunk_size in self.chunk_sizes:
+             all_relevant_docs = []
+             all_relevant_doc_ids = set()
+             logger.debug("Evaluating query: {}", query)
+             if config.query_prefix:
+                 logger.info(f"Adding query prefix for retrieval: {config.query_prefix}")
+                 query = config.query_prefix + query
+             sparse_search_docs_ids, sparse_scores = self.sparse_db.query(
+                 search=query, n=config.max_k, label=label, chunk_size=chunk_size
+             )
+
+             logger.info(f"Stage 1: Got {len(sparse_search_docs_ids)} documents.")
+
+             filter = (
+                 {"chunk_size": chunk_size} if len(self.chunk_sizes) > 1 else dict()
+             )
+
+             if label:
+                 filter.update({"label": label})
+
+             if not filter:
+                 filter = None
+
+             logger.info(f"Dense embeddings filter: {filter}")
+
+             res = self.dense_db.similarity_search_with_relevance_scores(
+                 query, filter=filter
+             )
+             dense_search_doc_ids = [r[0].metadata["document_id"] for r in res]
+
+             all_doc_ids = (
+                 set(sparse_search_docs_ids).union(set(dense_search_doc_ids))
+             ).difference(all_relevant_doc_ids)
+             if all_doc_ids:
+                 relevant_docs = self.dense_db.get_documents_by_id(
+                     document_ids=list(all_doc_ids)
+                 )
+                 all_relevant_docs += relevant_docs
+
+             # Re-rank the union of sparse and dense candidates
+             reranker_score, relevant_docs = rerank(
+                 rerank_model=self.reranker,
+                 query=original_query,
+                 docs=all_relevant_docs,
+             )
+             if reranker_score > current_reranker_score:
+                 docs = relevant_docs
+                 current_reranker_score = reranker_score
+
+         # Keep the best-ranked documents that fit into the character budget
+         len_ = 0
+
+         for doc in docs:
+             doc_length = len(doc.page_content)
+             if len_ + doc_length < config.max_char_size:
+                 most_relevant_docs.append(doc)
+                 len_ += doc_length
+
+         return most_relevant_docs, current_reranker_score
+
+     def get_and_parse_response(
+         self,
+         query: str,
+         config: Config,
+         label: str = "",
+     ) -> ResponseModel:
+         original_query = query
+
+         # Expand the query with a hypothetical answer (HyDE), if a chain is provided
+         hyde_response = ""
+         if self.hyde_chain is not None:
+             hyde_response = self.hyde_chain.run(query)
+             query += hyde_response
+
+         logger.info(f"query: {query}")
+
+         semantic_search_config = config.semantic_search
+         most_relevant_docs, score = self.get_relevant_documents(
+             original_query, query, semantic_search_config, label
+         )
+
+         res = self.chain(
+             {"input_documents": most_relevant_docs, "question": original_query},
+         )
+
+         out = ResponseModel(
+             response=res["output_text"],
+             question=query,
+             average_score=score,
+             hyde_response=hyde_response,
+         )
+         for doc in res["input_documents"]:
+             out.semantic_search.append(doc.page_content)
+
+         return out
+
+
+ class PartialFormatter(string.Formatter):
+     def __init__(self, missing="~~", bad_fmt="!!"):
+         self.missing, self.bad_fmt = missing, bad_fmt
+
+     def get_field(self, field_name, args, kwargs):
+         try:
+             val = super(PartialFormatter, self).get_field(field_name, args, kwargs)
+         except (KeyError, AttributeError):
+             val = None, field_name
+         return val
+
+     def format_field(self, value, spec):
+         if value is None:
+             return self.missing
+         try:
+             return super(PartialFormatter, self).format_field(value, spec)
+         except ValueError:
+             if self.bad_fmt is not None:
+                 return self.bad_fmt
+             else:
+                 raise
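`PartialFormatter` isn't referenced elsewhere in this commit, but its contract is simple: fields missing from the arguments render as a placeholder instead of raising `KeyError`. A short check (with the repo's dependencies installed):

```python
from app.pipeline import PartialFormatter

fmt = PartialFormatter()
print(fmt.format("{context} / {question}", context="some text"))
# -> 'some text / ~~'  (the missing 'question' becomes the placeholder)
```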
app/ranking.py ADDED
@@ -0,0 +1,55 @@
+ import statistics
+ from typing import List, Tuple
+
+ import torch
+ from loguru import logger
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ from app.config.models.configs import Document
+
+
+ class BCEReranker:
+     def __init__(self) -> None:
+         self.tokenizer = AutoTokenizer.from_pretrained("maidalun1020/bce-reranker-base_v1")
+         self.model = AutoModelForSequenceClassification.from_pretrained(
+             "maidalun1020/bce-reranker-base_v1"
+         )
+         self.model.eval()
+         logger.info("Initialized BCE Reranker")
+
+     def get_scores(self, query: str, docs: List[Document]) -> List[float]:
+         logger.info("Reranking documents ...")
+         features = [[query, doc.page_content] for doc in docs]
+         with torch.no_grad():
+             inputs = self.tokenizer(
+                 features,
+                 padding=True,
+                 truncation=True,
+                 max_length=512,
+                 return_tensors="pt",
+             )
+             scores = (
+                 self.model(**inputs, return_dict=True)
+                 .logits.view(-1)
+                 .float()
+                 .tolist()
+             )
+         return scores
+
+
+ def rerank(
+     rerank_model: BCEReranker, query: str, docs: List[Document]
+ ) -> Tuple[float, List[Document]]:
+     logger.info("Reranking...")
+     scores = rerank_model.get_scores(query, docs)
+     for score, d in zip(scores, docs):
+         d.metadata["score"] = score
+
+     sorted_scores = sorted(scores, reverse=True)
+
+     logger.info(sorted_scores)
+     # Aggregate relevance as the mean of the top-10 reranker scores
+     mean_top_score = statistics.mean(sorted_scores[:10])
+     return mean_top_score, [
+         doc for doc in sorted(docs, key=lambda it: it.metadata["score"], reverse=True)
+     ]
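A minimal sketch of the reranking stage in isolation; the BCE reranker weights are downloaded from Hugging Face on first use:

```python
from app.config.models.configs import Document
from app.ranking import BCEReranker, rerank

docs = [
    Document(page_content="Paris is the capital of France."),
    Document(page_content="Bananas are rich in potassium."),
]
score, ranked = rerank(BCEReranker(), query="What is the capital of France?", docs=docs)
print(round(score, 3), [d.page_content[:20] for d in ranked])  # relevant passage first
```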
app/splade.py ADDED
@@ -0,0 +1,179 @@
+ import os
+ import pickle
+ from collections import defaultdict
+ from typing import List, Tuple
+
+ import numpy as np
+ import scipy
+ import torch
+ import tqdm
+ from loguru import logger
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+ from app.config.models.configs import Config, Document
+ from app.utils import torch_device, split
+
+
+ class SpladeSparseVectorDB:
+     def __init__(
+         self,
+         config: Config,
+     ) -> None:
+         self._config = config
+
+         # cuda, mps or cpu
+         self._device = torch_device()
+         logger.info(f"Setting device to {self._device}")
+
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             "naver/splade-v3", device=self._device, use_fast=True
+         )
+         self.model = AutoModelForMaskedLM.from_pretrained("naver/splade-v3")
+         self.model.to(self._device)
+         self._embeddings = None
+         self._ids = None
+         self._metadatas = None
+         self._l2_norm_matrix = None
+         self._labels_to_ind = defaultdict(list)
+         self._chunk_size_to_ind = defaultdict(list)
+
+         self.n_batch = config.embeddings.splade_config.n_batch
+
+     def _get_batch_embeddings(self, docs: List[str]) -> np.ndarray:
+         tokens = self.tokenizer(
+             docs, return_tensors="pt", padding=True, truncation=True
+         ).to(self._device)
+
+         output = self.model(**tokens)
+
+         # SPLADE pooling: attention-masked max over tokens of log(1 + relu(logits))
+         vecs = (
+             torch.max(
+                 torch.log(1 + torch.relu(output.logits))
+                 * tokens.attention_mask.unsqueeze(-1),
+                 dim=1,
+             )[0]
+             .squeeze()
+             .detach()
+             .cpu()
+             .numpy()
+         )
+
+         del output
+         del tokens
+
+         return vecs
+
+     def _get_embedding_fnames(self):
+         folder_name = os.path.join(self._config.embeddings.embeddings_path, "splade")
+         fn_embeddings = os.path.join(folder_name, "splade_embeddings.npz")
+         fn_ids = os.path.join(folder_name, "splade_ids.pickle")
+         fn_metadatas = os.path.join(folder_name, "splade_metadatas.pickle")
+         return folder_name, fn_embeddings, fn_ids, fn_metadatas
+
+     def load(self) -> None:
+         _, fn_embeddings, fn_ids, fn_metadatas = self._get_embedding_fnames()
+         try:
+             self._embeddings = scipy.sparse.load_npz(fn_embeddings)
+             with open(fn_ids, "rb") as fp:
+                 self._ids = np.array(pickle.load(fp))
+
+             with open(fn_metadatas, "rb") as fm:
+                 self._metadatas = np.array(pickle.load(fm))
+
+             self._l2_norm_matrix = scipy.sparse.linalg.norm(self._embeddings, axis=1)
+
+             for ind, m in enumerate(self._metadatas):
+                 if m["label"]:
+                     self._labels_to_ind[m["label"]].append(ind)
+
+                 self._chunk_size_to_ind[m["chunk_size"]].append(ind)
+
+             logger.info(f"SPLADE: Got {len(self._labels_to_ind)} labels.")
+
+         except FileNotFoundError:
+             raise FileNotFoundError(
+                 f"Sparse embeddings don't exist in {fn_embeddings}. Run the `index` command first."
+             )
+         logger.info(f"Loaded sparse embeddings from {fn_embeddings}")
+
+     def generate_embeddings(
+         self, docs: List[Document], persist: bool = True
+     ) -> Tuple[np.ndarray, List[str], List[dict]]:
+         chunk_size = self.n_batch
+
+         ids = [d.metadata["document_id"] for d in docs]
+         metadatas = [d.metadata for d in docs]
+
+         vecs = []
+         for chunk in tqdm.tqdm(
+             split(docs, chunk_size=chunk_size), total=int(len(docs) / chunk_size)
+         ):
+             texts = [d.page_content for d in chunk if d.page_content]
+             vecs.append(self._get_batch_embeddings(texts))
+
+         embeddings = np.vstack(vecs)
+
+         if persist:
+             self.persist_embeddings(embeddings, metadatas, ids)
+         return embeddings, ids, metadatas
+
+     def persist_embeddings(self, embeddings, metadatas, ids):
+         folder_name, fn_embeddings, fn_ids, fn_metadatas = self._get_embedding_fnames()
+         csr_embeddings = scipy.sparse.csr_matrix(embeddings)
+
+         if not os.path.exists(folder_name):
+             os.makedirs(folder_name)
+
+         scipy.sparse.save_npz(fn_embeddings, csr_embeddings)
+         self.save_list(ids, fn_ids)
+         self.save_list(metadatas, fn_metadatas)
+         logger.info(f"Saved embeddings to {fn_embeddings}")
+
+     def query(
+         self, search: str, chunk_size: int, n: int = 50, label: str = ""
+     ) -> Tuple[np.ndarray, np.ndarray]:
+         if self._embeddings is None or self._ids is None:
+             logger.info("Loading embeddings...")
+             self.load()
+
+         if (
+             label
+             and label in self._labels_to_ind
+             and self._embeddings is not None
+             and self._ids is not None
+         ):
+             indices = sorted(
+                 set(self._labels_to_ind[label]).intersection(
+                     self._chunk_size_to_ind[chunk_size]
+                 )
+             )
+         else:
+             indices = sorted(set(self._chunk_size_to_ind[chunk_size]))
+
+         embeddings = self._embeddings[indices]
+         ids = self._ids[indices]
+         l2_norm_matrix = scipy.sparse.linalg.norm(embeddings, axis=1)
+
+         embed_query = self._get_batch_embeddings(docs=[search])
+         l2_norm_query = scipy.linalg.norm(embed_query)
+
+         if embeddings is not None and l2_norm_matrix is not None and ids is not None:
+             cosine_similarity = embeddings.dot(embed_query) / (
+                 l2_norm_matrix * l2_norm_query
+             )
+             most_similar = np.argsort(cosine_similarity)
+
+             top_similar_indices = most_similar[-n:][::-1]
+             return (
+                 ids[top_similar_indices],
+                 cosine_similarity[top_similar_indices],
+             )
+
+     def save_list(self, list_: list, fname: str) -> None:
+         with open(fname, "wb") as fp:
+             pickle.dump(list_, fp)
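The pooling inside `_get_batch_embeddings` is the standard SPLADE formulation: an attention-masked max over tokens of log(1 + ReLU(logits)), giving one vocabulary-sized vector per document. A dependency-light sketch on random logits (no model download):

```python
import torch

logits = torch.randn(2, 5, 30522)   # (batch, tokens, vocab), as a MaskedLM head would return
mask = torch.ones(2, 5)             # attention mask
vecs = torch.max(torch.log1p(torch.relu(logits)) * mask.unsqueeze(-1), dim=1)[0]
print(vecs.shape)                   # torch.Size([2, 30522]): one vector per document
```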
app/utils.py ADDED
@@ -0,0 +1,17 @@
+ from typing import List
+
+ import torch
+
+
+ def torch_device():
+     device = (
+         f"cuda:{torch.cuda.current_device()}"
+         if torch.cuda.is_available()
+         else ("mps" if torch.backends.mps.is_available() else "cpu")
+     )
+     return device
+
+
+ def split(iterable: List, chunk_size: int):
+     for i in range(0, len(iterable), chunk_size):
+         yield iterable[i: i + chunk_size]
config/config.js ADDED
@@ -0,0 +1,40 @@
+ /**
+  * @typedef {import('app/config/types').AppConfig} AppConfig
+  */
+
+ /**
+  * @type {AppConfig}
+  */
+ const config = {
+   cache_folder: ".cache",
+   embeddings: {
+     embeddings_path: ".embeddings",
+     embedding_model: {
+       model_name: "maidalun1020/bce-embedding-base_v1",
+     },
+     splade_config: {
+       n_batch: 8,
+     },
+     chunk_sizes: [1024],
+     document_settings: [
+       {
+         doc_path: "documents/falcon-refinedweb",
+         additional_parser_settings: {
+           md: {
+             skip_first: true,
+             merge_sections: false,
+             remove_images: true,
+           },
+         },
+         passage_prefix: "passage: ",
+       },
+     ],
+   },
+   semantic_search: {
+     max_k: 15,
+     max_char_size: 4096,
+     query_prefix: "query: ",
+   },
+ };
+
+ module.exports = config;
config/openai.js ADDED
@@ -0,0 +1,31 @@
+ /**
+  * @typedef {import('app/config/types').LLMConfig} LLMConfig
+  */
+
+ /**
+  * @type {LLMConfig}
+  */
+ const config = {
+   type: "openai",
+   params: {
+     prompt_template: `
+ ### Instruction:
+ Use the following pieces of context to answer the question at the end. If the answer isn't in the context, say that you don't know, don't try to make up an answer.
+
+ ### Context:
+ ---------------
+ {context}
+ ---------------
+
+ ### Question: {question}
+ ### Response:
+ `,
+     model_kwargs: {
+       model_name: "gpt-3.5-turbo",
+       temperature: 0.0,
+     },
+   },
+ };
+
+ module.exports = config;
config/vertexai.js ADDED
@@ -0,0 +1,31 @@
+ /**
+  * @typedef {import('app/config/types').LLMConfig} LLMConfig
+  */
+
+ /**
+  * @type {LLMConfig}
+  */
+ const config = {
+   type: "vertexai",
+   params: {
+     prompt_template: `
+ ### Instruction:
+ Use the following pieces of context to answer the question at the end. If the answer isn't in the context, say that you don't know, don't try to make up an answer.
+
+ ### Context:
+ ---------------
+ {context}
+ ---------------
+
+ ### Question: {question}
+ ### Response:
+ `,
+     model_kwargs: {
+       model_name: "gemini-pro",
+       temperature: 0.0,
+     },
+   },
+ };
+
+ module.exports = config;
documents/falcon-refinedweb.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b0345300dfa1636ae1a44d868624818dff553bde63fd4c647b2ab5a5db813fa
+ size 22271742
evaluation_dataset.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,20 @@
+ llama-cpp-python==0.2.55
+ langchain==0.1.13
+ pydantic~=2.5
+ transformers~=4.36
+ loguru~=0.7.2
+ termcolor~=2.4.0
+ click~=8.1.7
+ unstructured~=0.12.4
+ sentence-transformers~=2.6
+ tqdm==4.65.0
+ pandas~=2.2.1
+ chromadb==0.4.15
+ numpy~=1.26.4
+ scipy~=1.12.0
+ torch~=2.2.1
+ PyExecJS~=1.5.1
+ fastparquet~=2024.2.0
+ ragas==0.1.0
+ langchain-google-vertexai~=1.0.1
+ datasets~=2.18.0