DuyTa
/

Graduation

Model card Files Files and versions Community

Graduation / pipelines /bm25s /examples /index_with_metadata.py

DuyTa

Upload folder using huggingface_hub

74b1bac verified 4 months ago

raw

history blame

2.08 kB

	"""
	Sometimes, you might want to have a corpus consisting of dicts rather than pure text.

	Dicts, and any other JSON-serializable objects, are supported by bm25s. This example shows you how to pass a list of dicts.

	Note: If the elements of your corpus are not JSON-serializable, the corpus will not be properly saved. In those cases, you
	should avoid passing the corpus to `bm25s.BM25(corpus=...)`.
	"""
	import bm25s

	# Create your corpus here: each document is a dict holding the text to be
	# indexed under "text", plus arbitrary metadata.
	corpus_json = [
	    {"text": "a cat is a feline and likes to purr", "metadata": {"source": "internet"}},
	    {"text": "a dog is the human's best friend and loves to play", "metadata": {"source": "encyclopedia"}},
	    {"text": "a bird is a beautiful animal that can fly", "metadata": {"source": "cnn"}},
	    {"text": "a fish is a creature that lives in water and swims", "metadata": {"source": "i made it up"}},
	]
	# Only the "text" field is tokenized and indexed; the full dicts are what the
	# retriever hands back as results.
	corpus_text = [doc["text"] for doc in corpus_json]

	# Tokenize the corpus and only keep the ids (faster and saves memory)
	corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

	# Create the BM25 retriever and attach your corpus_json to it
	retriever = bm25s.BM25(corpus=corpus_json)
	# Now, index the corpus_tokens (the corpus_json is not used yet)
	retriever.index(corpus_tokens)

	# Query the corpus
	query = "does the fish purr like a cat?"
	# NOTE(review): the query is tokenized with tokenize()'s default settings,
	# while the corpus passes stopwords="en" explicitly; confirm the default
	# stopword list matches.
	query_tokens = bm25s.tokenize(query)

	# Get top-k results as a tuple of (doc, scores). Note that results
	# will correspond to the corpus item at the corresponding index
	# (you are responsible for making sure each element in corpus_json
	# corresponds to each element in your tokenized corpus)
	results, scores = retriever.retrieve(query_tokens, k=2)

	# Row 0 holds the hits for the single query above; walk documents and
	# scores in lockstep, numbering ranks from 1.
	for rank, (doc, score) in enumerate(zip(results[0], scores[0]), start=1):
	    print(f"Rank {rank} (score: {score:.2f}): {doc}")

	# You can save the arrays to a directory...
	# Note that this will fail if your corpus passed to `BM25(corpus...)` is not serializable
	index_dir = "animal_index_bm25"
	retriever.save(index_dir)

	# ...and load them when you need them (in a fresh session you would need to
	# `import bm25s` again first; here it is already imported above)
	reloaded_retriever = bm25s.BM25.load(index_dir, load_corpus=True)
	# set load_corpus=False if you don't need the corpus