# File: Graduation/pipelines/bm25s/examples/index_with_metadata.py
# Uploaded by DuyTa using huggingface_hub (commit 74b1bac, verified), 2.08 kB
"""
Sometimes, you might want to have a corpus consisting of dicts rather than pure text.
Dicts, and any other JSON-serializable objects, are supported by bm25s. This example shows you how to pass a list of dicts.
Note: If the elements in your corpus are not JSON-serializable, the corpus will not be properly saved. In those cases, you
should avoid passing the corpus to `bm25s.BM25(corpus=...)`.
"""
import bm25s
# Create your corpus here. Each document is a dict holding the text to index
# under "text" plus whatever extra fields you want returned with the hit.
corpus_json = [
    {"text": "a cat is a feline and likes to purr", "metadata": {"source": "internet"}},
    {"text": "a dog is the human's best friend and loves to play", "metadata": {"source": "encyclopedia"}},
    {"text": "a bird is a beautiful animal that can fly", "metadata": {"source": "cnn"}},
    {"text": "a fish is a creature that lives in water and swims", "metadata": {"source": "i made it up"}},
]

# Only the "text" field is tokenized; it is extracted in corpus order so that
# position i in corpus_text matches position i in corpus_json.
corpus_text = [doc["text"] for doc in corpus_json]

# Tokenize the corpus and only keep the ids (faster and saves memory)
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

# Create the BM25 retriever and attach your corpus_json to it
retriever = bm25s.BM25(corpus=corpus_json)

# Now, index the corpus_tokens (the corpus_json is not used yet)
retriever.index(corpus_tokens)

# Query the corpus
query = "does the fish purr like a cat?"
query_tokens = bm25s.tokenize(query)

# Get top-k results as a tuple of (doc, scores). Note that results
# will correspond to the corpus item at the corresponding index
# (you are responsible to make sure each element in corpus_json
# corresponds to each element in your tokenized corpus)
results, scores = retriever.retrieve(query_tokens, k=2)

# A single query was issued, so row 0 holds its ranked documents and scores;
# walk both rows in lockstep instead of indexing column by column.
for rank, (doc, score) in enumerate(zip(results[0], scores[0]), start=1):
    print(f"Rank {rank} (score: {score:.2f}): {doc}")

# You can save the arrays to a directory...
# Note that this will fail if your corpus passed to `BM25(corpus...)` is not serializable
retriever.save("animal_index_bm25")

# ...and load them when you need them.
# bm25s is already imported at the top of this file; in a separate script or
# session you would need `import bm25s` before this line.
reloaded_retriever = bm25s.BM25.load("animal_index_bm25", load_corpus=True)
# set load_corpus=False if you don't need the corpus