|
""" |
|
Sometimes you might want a corpus consisting of dicts rather than plain text.
|
|
|
Dicts, and any other JSON-serializable objects, are supported by bm25s. This example shows how to pass a list of dicts.
|
|
|
Note: If the elements in your corpus are not JSON-serializable, the corpus will not be saved properly. In those cases,
you should avoid passing the corpus to `bm25s.BM25(corpus=...)`.
|
""" |
|
import bm25s

# The corpus: each document is a dict carrying the searchable text under
# "text" plus free-form "metadata". Only the "text" field is indexed; the
# full dicts are what the retriever hands back at query time.
corpus_json = [
    {"text": "a cat is a feline and likes to purr", "metadata": {"source": "internet"}},
    {"text": "a dog is the human's best friend and loves to play", "metadata": {"source": "encyclopedia"}},
    {"text": "a bird is a beautiful animal that can fly", "metadata": {"source": "cnn"}},
    {"text": "a fish is a creature that lives in water and swims", "metadata": {"source": "i made it up"}},
]
corpus_text = [doc["text"] for doc in corpus_json]

# Tokenize the plain-text view of the corpus, using the "en" stopword setting.
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

# Attach the dict corpus to the retriever, then build the index from the
# tokens. The order of corpus_json must match the order of corpus_tokens so
# that each retrieved position maps back to the right dict.
retriever = bm25s.BM25(corpus=corpus_json)
retriever.index(corpus_tokens)

# Tokenize the query (default tokenizer settings).
query = "does the fish purr like a cat?"
query_tokens = bm25s.tokenize(query)

# Fetch the top-2 matches. Both returned arrays are indexed as
# [query, rank]; because a corpus was attached, `results` holds the
# corresponding items of corpus_json.
results, scores = retriever.retrieve(query_tokens, k=2)

# A single query was issued, so only row 0 is of interest.
for rank, (doc, score) in enumerate(zip(results[0], scores[0]), start=1):
    print(f"Rank {rank} (score: {score:.2f}): {doc}")

# Persist the retriever to the "animal_index_bm25" directory.
# NOTE(review): saving presumably requires the attached corpus to be
# JSON-serializable (see the module docstring) -- confirm against bm25s docs.
retriever.save("animal_index_bm25")

# Reload the retriever from disk, including the attached corpus.
# The import is repeated so this snippet can be copied into a fresh session
# on its own; it is redundant within this script.
import bm25s

reloaded_retriever = bm25s.BM25.load("animal_index_bm25", load_corpus=True)
|
|
|
|