from fastapi import FastAPI, UploadFile, File
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
# Standard-library imports
import os
import shutil
import time
import json
import uuid
from os import makedirs, getcwd
from os.path import join, exists, dirname
from itertools import islice
# Third-party imports
from datasets import load_dataset
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from qdrant_client import models, QdrantClient
# The file where NeuralSearcher is stored
from neural_searcher import NeuralSearcher
# The file where HybridSearcher is stored
from hybrid_searcher import HybridSearcher
app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
FILEPATH_PATTERN = "structured_data_doc.parquet"
NUM_PROC = os.cpu_count()

parent_path = dirname(getcwd())
temp_path = join(parent_path, 'temp')
if not exists(temp_path):
    makedirs(temp_path)
# Determine device based on GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the desired model
model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device=device
)
# Helper that splits an iterable into lists of at most n items,
# used below to upsert embeddings in batches
def batched(iterable, n):
    iterator = iter(iterable)
    while batch := list(islice(iterator, n)):
        yield batch
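# Illustrative example (not part of the app):
#   list(batched([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]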
batch_size = 100
# Create a local Qdrant instance persisted on disk under ./database
client2 = QdrantClient(path="database")

# Create a Qdrant collection for the embeddings
client2.create_collection(
    collection_name="law",
    vectors_config=models.VectorParams(
        size=model.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    ),
)
# Generate embeddings (in batches) for a given dataset split
def generate_embeddings(dataset, text_field, batch_size=32):
    embeddings = []
    with tqdm(total=len(dataset), desc="Generating embeddings for dataset") as pbar:
        for i in range(0, len(dataset), batch_size):
            batch_sentences = dataset[i:i + batch_size][text_field]
            batch_embeddings = model.encode(batch_sentences)
            embeddings.extend(batch_embeddings)
            pbar.update(len(batch_sentences))
    return embeddings
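# Illustrative example (not part of the app): all-MiniLM-L6-v2 produces one
# 384-dimensional vector per row, so
#   vecs = generate_embeddings(ds, "text")  # len(vecs) == len(ds)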
# Upload a JSON or Parquet file and index its embeddings into the "law" collection.
# The route decorator was restored here; the exact path is an assumption.
@app.post("/uploadfile/")
async def create_upload_file(text_field: str, file: UploadFile = File(...)):
    start_time = time.time()
    file_savePath = join(temp_path, file.filename)
    with open(file_savePath, 'wb') as f:
        shutil.copyfileobj(file.file, f)
    # Load the saved file into a Hugging Face dataset
    if file_savePath.endswith('.json'):
        full_dataset = load_dataset('json',
                                    data_files=file_savePath,
                                    split="train",
                                    cache_dir=temp_path,
                                    keep_in_memory=True,
                                    num_proc=NUM_PROC * 2)
    elif file_savePath.endswith('.parquet'):
        full_dataset = load_dataset("parquet",
                                    data_files=file_savePath,
                                    split="train",
                                    cache_dir=temp_path,
                                    keep_in_memory=True,
                                    num_proc=NUM_PROC * 2)
    else:
        raise NotImplementedError("Only .json and .parquet files are supported")
    # Generate and append embeddings to the train split
    law_embeddings = generate_embeddings(full_dataset, text_field)
    full_dataset = full_dataset.add_column("embeddings", law_embeddings)
    if 'uuid' not in full_dataset.column_names:
        full_dataset = full_dataset.add_column('uuid', [str(uuid.uuid4()) for _ in range(len(full_dataset))])
    # Upsert the embeddings in batches into the "law" collection created above
    for batch in batched(full_dataset, batch_size):
        ids = [point.pop("uuid") for point in batch]
        vectors = [point.pop("embeddings") for point in batch]
        client2.upsert(
            collection_name="law",
            points=models.Batch(
                ids=ids,
                vectors=vectors,
                payloads=batch,
            ),
        )
    end_time = time.time()
    elapsed_time = end_time - start_time
    return {"filename": file.filename, "message": "Done", "execution_time": elapsed_time}
# Upload a JSON file and index it for hybrid (dense + sparse) search.
# The route decorator was restored here; the exact path is an assumption.
@app.post("/upload4hybridsearch/")
async def upload_file_4_hyper_search(collection_name: str, text_field: str, file: UploadFile = File(...)):
    start_time = time.time()
    file_savePath = join(temp_path, file.filename)
    client2.set_model("sentence-transformers/all-MiniLM-L6-v2")
    # comment this line to use dense vectors only
    client2.set_sparse_model("prithivida/Splade_PP_en_v1")
    with open(file_savePath, 'wb') as f:
        shutil.copyfileobj(file.file, f)
print(f"Uploaded complete!") | |
client2.recreate_collection( | |
collection_name=collection_name, | |
vectors_config=client2.get_fastembed_vector_params(), | |
# comment this line to use dense vectors only | |
sparse_vectors_config=client2.get_fastembed_sparse_vector_params(), | |
) | |
print(f"The collection is created complete!") | |
    # Parse the saved file: one JSON object per line, with the text field as the
    # document body and the remaining fields as metadata
    if file_savePath.endswith('.json'):
        metadata = []
        documents = []
        with open(file_savePath) as fd:
            for line in fd:
                obj = json.loads(line)
                documents.append(obj.pop(text_field))
                metadata.append(obj)
        # Generate UUIDs for each document
        document_ids = [str(uuid.uuid4()) for _ in range(len(documents))]
        # Split documents and metadata into batches and upsert each batch;
        # client2.add() embeds the documents with the models configured above
        for i in range(0, len(documents), batch_size):
            batch_documents = documents[i:i + batch_size]
            batch_metadata = metadata[i:i + batch_size]
            batch_ids = document_ids[i:i + batch_size]
            client2.add(
                collection_name=collection_name,
                documents=batch_documents,
                metadata=batch_metadata,
                ids=batch_ids,
            )
            print(f"Upserted a batch of {len(batch_ids)} documents with unique UUIDs")
        print(f"All documents and metadata upserted in batches of {batch_size}")
    else:
        raise NotImplementedError("Only .json files are supported for hybrid indexing")
    end_time = time.time()
    elapsed_time = end_time - start_time
    return {"filename": file.filename, "message": "Done", "execution_time": elapsed_time}
# Search the "law" collection for the five nearest neighbors of a prompt.
# The route decorator was restored here; the exact path is an assumption.
@app.get("/search/")
def search(prompt: str):
    start_time = time.time()
    hits = client2.search(
        collection_name="law",
        query_vector=model.encode(prompt).tolist(),
        limit=5
    )
    for hit in hits:
        print(hit.payload, "score:", hit.score)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Execution time: {elapsed_time:.6f} seconds")
    return hits
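# Example request (illustrative; host, port, route path, and prompt are assumptions):
#   import requests
#   requests.get("http://localhost:8000/search/", params={"prompt": "contract termination"}).json()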
# Zip the on-disk Qdrant database and return it for download.
# The route decorator was restored here; the exact path is an assumption.
@app.get("/download-database/")
async def download_database():
    start_time = time.time()
    # Path to the database directory
    database_dir = join(os.getcwd(), 'database')
    # Path for the zip file
    zip_path = join(os.getcwd(), 'database.zip')
    # Create a zip file of the database directory
    shutil.make_archive(zip_path.replace('.zip', ''), 'zip', database_dir)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Execution time: {elapsed_time:.6f} seconds")
    # Return the zip file as a response for download
    return FileResponse(zip_path, media_type='application/zip', filename='database.zip')
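# Example request (illustrative; host, port, and route path are assumptions):
#   import requests
#   r = requests.get("http://localhost:8000/download-database/")
#   open("database.zip", "wb").write(r.content)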
# Dense (neural) search over an existing collection, filtered by city.
# The route decorator was restored here; the exact path is an assumption.
@app.get("/api/search")
def neural_search(q: str, city: str, collection_name: str):
    start_time = time.time()
    # Create a neural searcher instance and run the search, timing both
    # so the returned execution_time covers the query itself
    neural_searcher = NeuralSearcher(collection_name=collection_name)
    result = neural_searcher.search(text=q, city=city)
    end_time = time.time()
    elapsed_time = end_time - start_time
    return {"result": result, "execution_time": elapsed_time}
# Hybrid (dense + sparse) search over an existing collection, filtered by city.
# The route decorator was restored here; the exact path is an assumption.
@app.get("/api/hybrid-search")
def hybrid_search(q: str, city: str, collection_name: str):
    start_time = time.time()
    # Create a hybrid searcher instance and run the search, timing both
    # so the returned execution_time covers the query itself
    hybrid_searcher = HybridSearcher(collection_name=collection_name)
    result = hybrid_searcher.search(text=q, city=city)
    end_time = time.time()
    elapsed_time = end_time - start_time
    return {"result": result, "execution_time": elapsed_time}
# Root endpoint. The route decorator was restored here; the path is an assumption.
@app.get("/")
def api_home():
    return {'detail': 'Welcome to FastAPI Qdrant importer!'}
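# Run the app directly (entry-point sketch; module name, host, and port are assumptions)
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)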