Spaces:

IR-IIITH
/

MultiAgent-OpenDomain-QnA-System

Sleeping

App Files Files Community

raghuv-aditya committed on Nov 20, 2024

Commit

e46379d

verified ·

1 Parent(s): 79457ed

Upload 18 files

Browse files

Files changed (19) hide show

.gitattributes +1 -0
Retrieval/.DS_Store +0 -0
Retrieval/__pycache__/bm25.cpython-311.pyc +0 -0
Retrieval/__pycache__/tf_idf.cpython-311.pyc +0 -0
Retrieval/__pycache__/vision.cpython-311.pyc +0 -0
Retrieval/bm25.py +14 -0
Retrieval/openSource.py +48 -0
Retrieval/savedModels/.DS_Store +0 -0
Retrieval/savedModels/bm25-1_0.pkl +3 -0
Retrieval/savedModels/document-vision-embeddings.json +3 -0
Retrieval/savedModels/document_matrix.pkl +3 -0
Retrieval/savedModels/document_matrix.zip +3 -0
Retrieval/savedModels/idf.pkl +3 -0
Retrieval/savedModels/ids.pkl +3 -0
Retrieval/savedModels/open_source_embeddings.pkl +3 -0
Retrieval/savedModels/tf_idf_dict.pkl +3 -0
Retrieval/savedModels/vocab.pkl +3 -0
Retrieval/tf_idf.py +66 -0
Retrieval/vision.py +174 -0

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 Datasets/mini_wiki_collection.json filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 Datasets/mini_wiki_collection.json filter=lfs diff=lfs merge=lfs -text
+Retrieval/savedModels/document-vision-embeddings.json filter=lfs diff=lfs merge=lfs -text

Retrieval/.DS_Store ADDED Viewed

Binary file (8.2 kB). View file

Retrieval/__pycache__/bm25.cpython-311.pyc ADDED Viewed

Binary file (1.24 kB). View file

Retrieval/__pycache__/tf_idf.cpython-311.pyc ADDED Viewed

Binary file (4.45 kB). View file

Retrieval/__pycache__/vision.cpython-311.pyc ADDED Viewed

Binary file (9.7 kB). View file

Retrieval/bm25.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import numpy as np
+import joblib
+from gensim.utils import simple_preprocess
+from rank_bm25 import BM25Okapi
+def bm25_pipeline(query, bm25_path="Retrieval/savedModels/bm25-1_0.pkl", ids_path="Retrieval/savedModels/ids.pkl", k=100):
+    bm25 = joblib.load(bm25_path)
+    ids = joblib.load(ids_path)
+    ranking = bm25.get_scores(simple_preprocess(query))
+    ranking = np.argsort(np.array(ranking))[::-1]
+    ranking = ranking[:k]
+    for j in range(len(ranking)):
+        ranking[j] = ids[ranking[j]]
+    return ranking

Retrieval/openSource.py ADDED Viewed

	@@ -0,0 +1,48 @@

+from tqdm import tqdm
+import joblib
+import numpy as np
+from sentence_transformers import SentenceTransformer, util
+# Load the sentence-embedding model once at import time; the encoding
+# helpers defined below in this module all share this single instance.
+model = SentenceTransformer('all-MiniLM-L6-v2')
+def get_documents_from_scores(scores):
+    rankings = []
+    for score in scores:
+        rankings.append(score[0])
+    return rankings
+def cosine_similarity(v1, v2):
+    v1 = np.array(v1)
+    v2 = np.array(v2)
+    if(np.linalg.norm(v1) != 0 and np.linalg.norm(v2) != 0):
+        sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
+    else:
+        sim = 0
+    return sim
+def get_open_source_embeddings(documents):
+    documents_embeddings = []
+    for document in tqdm(documents):
+        documents_embeddings.append(model.encode(document))
+    return documents_embeddings
+def open_source_rankings(query, document_embeddings, k):
+    query_embedding = model.encode(query)
+    scores = []
+    for idx, embedding in enumerate(document_embeddings):
+        scores.append((idx, cosine_similarity(query_embedding, embedding)))
+    scores = sorted(scores, key=lambda x: x[1], reverse=True)
+    scores = scores[:k]
+    rankings = get_documents_from_scores(scores)
+    return rankings, scores
+def open_source_pipeline(query, documents_embeddings_path="Retrieval/savedModels/open_source_embeddings.pkl", ids_path="Retrieval/savedModels/ids.pkl", k=100):
+    document_embeddings = joblib.load(documents_embeddings_path)
+    ids = joblib.load(ids_path)
+    rankings, scores = open_source_rankings(query, document_embeddings, k)
+    rankings2 = []
+    for ranking in tqdm(rankings):
+        rankings2.append(ids[ranking])
+    return rankings2

Retrieval/savedModels/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

Retrieval/savedModels/bm25-1_0.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ece3c19027cd35ca6dde2d4aac8412f726715b9ac135ab28ab84bdd480451c09
+size 9361012

Retrieval/savedModels/document-vision-embeddings.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c73ac57ca7de5276aef16fc2c1ccbd47ac2aea133784264239152ef4d4820274
+size 16544464

Retrieval/savedModels/document_matrix.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3bd045763d2222b592255289eb9f269d1cba3a45ec6f73507dca3bd70a7da7ec
+size 625240225

Retrieval/savedModels/document_matrix.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d377da907541907f1da87e18f02bf84f621f8337a2e63004c120ba049c1bc1a4
+size 5911195

Retrieval/savedModels/idf.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6f76f99e75d4b35f2e9aa06825f92f961d1a867061e242db347cfb45563c2e4f
+size 1533535

Retrieval/savedModels/ids.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b724a3d8820d865881b964a130948e1d780f8d6bdcb0e027f9e84bd4bba8480
+size 10071

Retrieval/savedModels/open_source_embeddings.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a3588adcbde10e19ffd96ae65ea2c0d799f9a86889bdf642c1607613951c3257
+size 1584194

Retrieval/savedModels/tf_idf_dict.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:765eed596ae38d7a54c78ecf7f60ab1e25c0da09bbf4e4e5ccbad10aa1438c6c
+size 13293122

Retrieval/savedModels/vocab.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c0cf1aa0710b6b11ecded1a4fe90e55c5502f223109713d02a4c580ea16583e6
+size 986100

Retrieval/tf_idf.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import numpy as np
+from collections import defaultdict
+from gensim.utils import simple_preprocess
+from tqdm import tqdm
+import joblib
+def get_tf_query(query):
+    k = len(query)
+    tf_query = defaultdict(lambda: 0)
+    for i in range(k):
+        tf_query[query[i]] += 1
+    for token in tf_query.keys():
+        tf_query[token] /= k
+    return tf_query
+def get_tf_idf_query(query, idf_dict):
+    query = simple_preprocess(query)
+    tf_idf_query = defaultdict(lambda: 0)
+    tf_query = get_tf_query(query)
+    for token in tf_query.keys():
+        tf_idf_query[token] = tf_query[token] * idf_dict[token]
+    return tf_idf_query
+def get_tf_idf_vector(tf_idf_instance, vocab):
+    temp = []
+    for key in vocab.keys():
+        temp.append(tf_idf_instance[key])
+    return temp
+def tf_idf_rankings(query, idf_dict, tf_idf_dict, vocab, document_matrix, k):
+    query_vector = np.reshape(np.array(get_tf_idf_vector(get_tf_idf_query(query, idf_dict), vocab)), (1, -1))
+    scores = []
+    dot_products = document_matrix @ query_vector.T
+    query_norm = np.linalg.norm(query_vector)
+    doc_norms = np.linalg.norm(document_matrix, axis=1, keepdims=True)
+    cosine_similarities = dot_products / (doc_norms * query_norm)
+    cosine_similarities = cosine_similarities.flatten()
+    rankings = np.argsort(cosine_similarities)[::-1]
+    rankings = rankings[:k]
+    scores = []
+    for rank in rankings:
+        scores.append(cosine_similarities[rank])
+    # scores = sorted(cosine_similarities, key=lambda x: x[1], reverse=True)
+    # scores = scores[:k]
+    # rankings = get_documents_from_scores(scores)
+    return rankings, scores
+def tf_idf_pipeline(query, idf_dict_path="Retrieval/savedModels/idf.pkl", tf_idf_dict_path="Retrieval/savedModels/tf_idf_dict.pkl", vocab_path="Retrieval/savedModels/vocab.pkl", document_matrix_path="Retrieval/savedModels/document_matrix.pkl", ids_path="Retrieval/savedModels/ids.pkl", k=100):
+    idf_dict = joblib.load(idf_dict_path)
+    print("idf loaded...")
+    tf_idf_dict = joblib.load(tf_idf_dict_path)
+    print("tf-idf loaded...")
+    vocab = joblib.load(vocab_path)
+    print("vocab loaded...")
+    document_matrix = joblib.load(document_matrix_path)
+    print("document_matrix loaded...")
+    ids = joblib.load(ids_path)
+    print("ids loaded")
+    rankings, scores = tf_idf_rankings(query, idf_dict, tf_idf_dict, vocab, document_matrix, k)
+    rankings2 = []
+    for ranking in tqdm(rankings):
+        rankings2.append(ids[ranking])
+    return rankings2

Retrieval/vision.py ADDED Viewed

	@@ -0,0 +1,174 @@

+import os
+from tqdm import tqdm
+import numpy as np
+from transformers import ViTModel, ViTFeatureExtractor, ViTImageProcessor
+from PIL import Image
+import re
+from fpdf import FPDF
+from datetime import datetime
+import fitz
+import joblib
+import json
+# ViT model and its matching image processor, both loaded from the same
+# checkpoint once at import time and shared by the embedding helpers below.
+model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
+processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
+def create_pdf(input_text):
+    # Create instance of FPDF class
+    pdf = FPDF()
+    # Add a page
+    pdf.add_page()
+    # Set font
+    pdf.set_font("Arial", size=10)
+    # Split the input text into multiple lines if necessary
+    # This ensures that the text fits the page and multiple pages are handled
+    pdf.multi_cell(0, 5, txt=input_text)
+    # Create a unique file name with the current time
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    file_name = f"temp/PDFs/{timestamp}.pdf"
+    # Create output directory if it doesn't exist
+    os.makedirs(os.path.dirname(file_name), exist_ok=True)
+    # Save the PDF
+    pdf.output(file_name)
+    # Return the file path
+    return file_name
+def pdf_to_image(pdf_path, zoom=2.0):
+    # Open the PDF file
+    pdf_document = fitz.open(pdf_path)
+    # Create a list to store image paths
+    image_paths = []
+    # Create an 'Images' directory if it doesn't exist
+    os.makedirs("temp/Images", exist_ok=True)
+    # Iterate over PDF pages and convert each to an image
+    for page_num in range(len(pdf_document)):
+        page = pdf_document.load_page(page_num)  # Load the page
+        # Set zoom level to improve quality
+        mat = fitz.Matrix(zoom, zoom)  # Create a transformation matrix with the zoom level
+        pix = page.get_pixmap(matrix=mat)  # Render the page to an image with the specified zoom
+        image_file = f'temp/Images/{os.path.basename(pdf_path)}_page_{page_num}.png'
+        pix.save(image_file)  # Save the image as PNG
+        image_paths.append(image_file)
+    # Return the list containing paths of all images
+    return image_paths
+def sanitize_text(text):
+    """
+    Cleans and standardizes text by keeping only alphanumeric characters and spaces.
+    Args:
+        text (str): Text to sanitize.
+    Returns:
+        str: Sanitized text.
+    """
+    if isinstance(text, str):
+        # Use regex to keep only alphanumeric characters and spaces
+        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
+        # Optionally, collapse multiple spaces into a single space
+        text = re.sub(r'\s+', ' ', text).strip()
+    return text
+def text_to_images(text):
+    text = sanitize_text(text)
+    pdf_path = create_pdf(text)
+    image_paths = pdf_to_image(pdf_path)
+    return image_paths
+def documents_to_images(path):
+    document_set = []
+    for filename in os.listdir(path):
+        file_path = os.path.join(path, filename)
+        if os.path.isfile(file_path):
+            with open(file_path, "r") as f:
+                content = f.read()
+                document_set.append(content)
+    document_image_paths = []
+    for document in document_set:
+        image_paths = text_to_images(document)
+        document_image_paths.append(image_paths)
+    return document_image_paths
+def single_unit_embedding(text):
+    image_paths = text_to_images(text)
+    temp = []
+    for image_path in image_paths:
+        image = Image.open(image_path)
+        inputs = processor(images=image, return_tensors="pt")
+        outputs = model(**inputs)
+        vector = outputs.last_hidden_state.mean(dim=1).detach().numpy()
+        temp.append(vector)
+    return np.mean(np.array(temp), axis=0)
+def single_image_embedding(image):
+    inputs = processor(images=image, return_tensors="pt")
+    outputs = model(**inputs)
+    vector = outputs.last_hidden_state.mean(dim=1).detach().numpy()
+    return vector
+def documents_to_vision_embeddings(documents):
+    document_vision_embeddings = []
+    for document in tqdm(documents):
+        vector = single_unit_embedding(document)
+        document_vision_embeddings.append(vector)
+    return document_vision_embeddings
+def queries_to_vision_embeddings(queries):
+    query_vision_embeddings = []
+    for query in tqdm(queries):
+        vector = single_unit_embedding(query)
+        query_vision_embeddings.append(vector)
+    return query_vision_embeddings
+def get_documents_from_scores(scores):
+    rankings = []
+    for score in scores:
+        rankings.append(score[0])
+    return rankings
+def cosine_similarity(v1, v2):
+    v1 = np.array(v1)
+    v2 = np.array(v2)
+    if(np.linalg.norm(v1) != 0 and np.linalg.norm(v2) != 0):
+        sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
+    else:
+        sim = 0
+    return sim
+def vision_rankings(query_embedding, document_embeddings, k):
+    # query_embedding = single_unit_embedding(query)
+    scores = []
+    for idx, embedding in enumerate(document_embeddings):
+        scores.append((idx, cosine_similarity(query_embedding[0], embedding[0])))
+    scores = sorted(scores, key=lambda x: x[1], reverse=True)
+    scores = scores[:k]
+    rankings = get_documents_from_scores(scores)
+    return rankings, scores
+def vision_pipeline(query, document_embeddings_path="Retrieval/savedModels/document-vision-embeddings.json", ids_path="Retrieval/savedModels/ids.pkl", k=100):
+    # document_embeddings = joblib.load(document_embeddings_path)
+    ids = joblib.load(ids_path)
+    with open(document_embeddings_path, "r") as f:
+        document_vision_embeddings2 = json.load(f)
+    document_vision_embeddings = []
+    for embedding in tqdm(document_vision_embeddings2):
+        document_vision_embeddings.append(np.array(embedding))
+    print("loaded embeddings")
+    query_embedding = single_unit_embedding(query)
+    rankings, scores = vision_rankings(query_embedding, document_vision_embeddings, k)
+    rankings2 = []
+    for ranking in rankings:
+        rankings2.append(ids[ranking])
+    return rankings2