davidmezzetti committed on
Commit
75af753
1 Parent(s): c6545db

Initial version

Browse files
Files changed (5) hide show
  1. .gitattributes +3 -0
  2. README.md +66 -0
  3. config.json +24 -0
  4. documents +3 -0
  5. embeddings +3 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ documents filter=lfs diff=lfs merge=lfs -text
37
+ embeddings filter=lfs diff=lfs merge=lfs -text
38
+
README.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ inference: false
3
+ language: en
4
+ license:
5
+ - cc0-1.0
6
+ library_name: txtai
7
+ tags:
8
+ - sentence-similarity
9
+ datasets:
10
+ - arxiv_dataset
11
+ ---
12
+
13
+ # arXiv txtai embeddings index
14
+
15
+ This is a [txtai](https://github.com/neuml/txtai) embeddings index for the [arXiv dataset](https://hf.co/datasets/arxiv_dataset) [metadata](https://info.arxiv.org/help/prep.html).
16
+
17
+ txtai must be [installed](https://neuml.github.io/txtai/install/) to use this index.
18
+
19
+ ## Example
20
+
21
+ This index can be loaded from the Hugging Face Hub with txtai as shown below.
22
+
23
+ ```python
24
+ from txtai.embeddings import Embeddings
25
+
26
+ # Load the index from the HF Hub
27
+ embeddings = Embeddings()
28
+ embeddings.load(provider="huggingface-hub", container="neuml/txtai-arxiv")
29
+
30
+ # Run a search
31
+ embeddings.search("txtai is an all-in-one embeddings database for semantic search, LLM orchestration and language model workflows.")
32
+ ```
33
+
34
+ ## Use Cases
35
+
36
+ An embeddings index generated by txtai is a fully encapsulated index format. It doesn't require a database server or dependencies outside of the Python install.
37
+
38
+ The arXiv index works well as a fact-based context source for retrieval augmented generation (RAG). In other words, search results from this index can be passed to LLM prompts as the context in which to answer questions.
39
+
40
+ Additionally, this index can help identify articles to cite in research. Passing a title and abstract as the search query will find similar existing articles.
41
+
42
+ ## Build the index
43
+
44
+ The following steps show how to build this index.
45
+
46
+ - Install required build dependencies
47
+ ```bash
48
+ pip install txtchat datasets
49
+ ```
50
+
51
+ - Follow these [instructions](https://huggingface.co/datasets/arxiv_dataset/blob/main/arxiv_dataset.py#L67) to download the dataset
52
+
53
+ - Build txtai-arxiv index
54
+ ```bash
55
+ python -m txtchat.data.arxiv.index \
56
+ -d <path to directory with file downloaded in previous step> \
57
+ -o txtai-arxiv
58
+ ```
59
+
60
+ ## More information
61
+
62
+ See the following links for more information on the arXiv metadata dataset.
63
+
64
+ - [Dataset on Hugging Face](https://huggingface.co/datasets/arxiv_dataset)
65
+ - [Dataset on Kaggle](https://www.kaggle.com/datasets/Cornell-University/arxiv)
66
+ - [Metadata description](https://info.arxiv.org/help/prep.html)
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "format": "json",
3
+ "path": "thenlper/gte-base",
4
+ "batch": 8192,
5
+ "encodebatch": 128,
6
+ "faiss": {
7
+ "quantize": true,
8
+ "sample": 0.05
9
+ },
10
+ "content": true,
11
+ "dimensions": 768,
12
+ "backend": "faiss",
13
+ "offset": 2399802,
14
+ "build": {
15
+ "create": "2024-01-15T06:00:38Z",
16
+ "python": "3.8.18",
17
+ "settings": {
18
+ "components": "IVF1386,SQ8"
19
+ },
20
+ "system": "Linux (x86_64)",
21
+ "txtai": "6.4.0"
22
+ },
23
+ "update": "2024-01-15T06:00:38Z"
24
+ }
documents ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4b751621223cf5a6a0331e2f89eaf5e7d622a5d07e1fe19112a0fc275ede3e8
3
+ size 4296163328
embeddings ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62f1464e1e69958255d959e7f03a7e7173592d62462d7783a30b4876c72a17b8
3
+ size 1866521560