QDrantRAG9

Sleeping

App Files Files Community

dinhquangson commited on Jun 18

Commit

4beb7b0

•

1 Parent(s): b68f37e

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -27

app.py CHANGED Viewed

@@ -1,37 +1,124 @@
-from fastapi import FastAPI
-from fastapi import File, UploadFile
-import shutil
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
 from os import makedirs,getcwd
 from os.path import join,exists,dirname
-from modelscope.models import Model
-from modelscope.pipelines import pipeline
-model = Model.from_pretrained('damo/multi-modal_convnext-roberta-base_vldoc-embedding')
-doc_VL_emb_pipeline = pipeline(task='document-vl-embedding', model=model)
 app = FastAPI()
 parent_path  = dirname(getcwd())
-temp_path = join(parent_path,'temp')
-if not exists(temp_path):
-    makedirs(temp_path)
-@app.post("/analyze")
-def pdf2images(file: UploadFile=File(...)):
-    file_savePath =  join(temp_path,file.filename)
-    with open(file_savePath,'wb') as f:
-        shutil.copyfileobj(file.file, f)
-    inp = {
-        'images': ['./demo.png'],
-        'ocr_info_paths': ['./demo.json']
-    }
-    result = doc_VL_emb_pipeline(inp)
-    print('Results of VLDoc: ')
-    for k, v in result.items():
-        print(f'{k}: {v}')
-    return result["img_embedding"],result["text_embedding"]

+from fastapi import FastAPI, UploadFile, File
+from fastapi.middleware.cors import CORSMiddleware
+# Loading
+import os
 from os import makedirs,getcwd
 from os.path import join,exists,dirname
+from datasets import load_dataset
+import torch
+from tqdm import tqdm
+from sentence_transformers import SentenceTransformer
+import uuid
+from qdrant_client import models, QdrantClient
+from itertools import islice
 app = FastAPI()
+FILEPATH_PATTERN = "structured_data_doc.parquet"
+NUM_PROC = os.cpu_count()
 parent_path  = dirname(getcwd())
+temp_path  = join(parent_path,'temp')
+if not exists(temp_path ):
+    makedirs(temp_path )
+# Determine device based on GPU availability
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+# Load the desired model
+model = SentenceTransformer(
+          'sentence-transformers/all-MiniLM-L6-v2',
+          device=device
+)
+# Create function to upsert embeddings in batches
+def batched(iterable, n):
+    iterator = iter(iterable)
+    while batch := list(islice(iterator, n)):
+        yield batch
+batch_size = 100
+# Create an in-memory Qdrant instance
+client2 = QdrantClient(":memory:")
+# Create a Qdrant collection for the embeddings
+client2.create_collection(
+    collection_name="law",
+    vectors_config=models.VectorParams(
+        size=model.get_sentence_embedding_dimension(),
+        distance=models.Distance.COSINE,
+    ),
+)
+# Create function to generate embeddings (in batches) for a given dataset split
+def generate_embeddings(dataset, batch_size=32):
+    embeddings = []
+    with tqdm(total=len(dataset), desc=f"Generating embeddings for dataset") as pbar:
+        for i in range(0, len(dataset), batch_size):
+            batch_sentences = dataset['content'][i:i+batch_size]
+            batch_embeddings = model.encode(batch_sentences)
+            embeddings.extend(batch_embeddings)
+            pbar.update(len(batch_sentences))
+    return embeddings
+@app.post("/uploadfile/")
+async def create_upload_file(file: UploadFile = File(...)):
+    # Here you can save the file and do other operations as needed
+    full_dataset = load_dataset("parquet",
+                data_files=FILEPATH_PATTERN,
+                split="train",
+                cache_path=temp_path,
+                keep_in_memory=True,
+                num_proc=NUM_PROC*2)
+    # Generate and append embeddings to the train split
+    law_embeddings = generate_embeddings(full_dataset)
+    full_dataset= full_dataset.add_column("embeddings", law_embeddings)
+    if not 'uuid' in full_dataset.column_names:
+      full_dataset = full_dataset.add_column('uuid', [str(uuid.uuid4()) for _ in range(len(full_dataset))])
+    # Upsert the embeddings in batches
+    for batch in batched(full_dataset, batch_size):
+        ids = [point.pop("uuid") for point in batch]
+        vectors = [point.pop("embeddings") for point in batch]
+        client2.upsert(
+            collection_name="law",
+            points=models.Batch(
+                ids=ids,
+                vectors=vectors,
+                payloads=batch,
+            ),
+        )
+    return {"filename": file.filename, "message": "Done"}
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+@app.get("/search")
+def search(prompt: str):
+    # Let's see what senators are saying about immigration policy
+    hits = client2.search(
+        collection_name="law",
+        query_vector=model.encode(prompt).tolist(),
+        limit=5
+    )
+    for hit in hits:
+      print(hit.payload, "score:", hit.score)
+    return {'detail': 'hit.payload', 'score:': hit.score}
+@app.get("/")
+def api_home():
+    return {'detail': 'Welcome to FastAPI Qdrant importer!'}