# smart_search/app.py
import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import json
from typing import List, Dict

def load_model_and_tokenizer():
    """Load the tokenizer and encoder model from the bundled local directories."""
    tokenizer = AutoTokenizer.from_pretrained("models/tokenizer")
    model = AutoModel.from_pretrained("models/model")
    return tokenizer, model

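# Note: "models/tokenizer" and "models/model" are local paths shipped with this
# Space. Any Hugging Face encoder checkpoint that exposes last_hidden_state
# should drop in here, e.g. (hypothetical substitute)
# AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2").
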
def load_data(file_path: str = "data.json") -> List[Dict]:
    """Load the course catalog and flatten it into one record per subcourse."""
    with open(file_path, "r") as f:
        data = json.load(f)
    flattened_courses = []
    for course_category in data["courses"]:
        for subcourse in course_category["subcourses"]:
            # Skip empty entries and entries missing any required field.
            if not subcourse or not all(
                key in subcourse for key in ["name", "description", "link"]
            ):
                continue
            flattened_courses.append(
                {
                    "course_type": course_category["course_type"],
                    "name": subcourse["name"],
                    "description": subcourse["description"],
                    "link": subcourse["link"],
                    # Concatenated text used as the embedding input for search.
                    "content": f"{course_category['course_type']} - {subcourse['name']}: {subcourse['description']}",
                }
            )
    return flattened_courses

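# Expected shape of data.json, inferred from the fields read above:
# {
#   "courses": [
#     {"course_type": "...",
#      "subcourses": [{"name": "...", "description": "...", "link": "..."}]}
#   ]
# }
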
def get_embedding(text: str, tokenizer, model) -> torch.Tensor:
    """Encode text into a single vector via attention-masked mean pooling."""
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=512, padding=True
    )
    with torch.no_grad():
        outputs = model(**inputs)
    attention_mask = inputs["attention_mask"]
    token_embeddings = outputs.last_hidden_state
    # Expand the mask over the hidden dimension so padding tokens contribute
    # nothing to the mean.
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )

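# A worked sketch of the pooling above: with token hidden states h_t and
# attention mask m_t, the sentence embedding is
#     e = sum_t(m_t * h_t) / max(sum_t(m_t), 1e-9)
# i.e. the mean over non-padding token vectors; the clamp guards against
# division by zero on an all-padding input.
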
def precompute_embeddings(documents: List[Dict], tokenizer, model) -> torch.Tensor:
    """Embed every course once at startup so queries only embed the query text."""
    embeddings = []
    for doc in documents:
        # One document per forward pass; batching the tokenizer call would be
        # faster for a large catalog.
        embedding = get_embedding(doc["content"], tokenizer, model)
        embeddings.append(embedding)
    return torch.cat(embeddings, dim=0)

def semantic_search(
    query: str,
    doc_embeddings: torch.Tensor,
    documents: List[Dict],
    tokenizer,
    model,
    top_k: int = 3,
) -> List[Dict]:
    """Return the top_k documents ranked by cosine similarity to the query."""
    query_embedding = get_embedding(query, tokenizer, model)
    # The (1, H) query broadcasts against the (N, H) document matrix,
    # yielding N similarity scores.
    similarities = F.cosine_similarity(query_embedding, doc_embeddings)
    top_k_indices = torch.topk(similarities, min(top_k, len(documents))).indices
    results = []
    for idx in top_k_indices:
        doc = documents[idx.item()]
        results.append(
            {
                "course_type": doc["course_type"],
                "name": doc["name"],
                "description": doc["description"],
                "link": doc["link"],
            }
        )
    return results

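# Illustrative call (names as defined above):
#   semantic_search("python for beginners", doc_embeddings, documents,
#                   tokenizer, model, top_k=3)
# returns the three catalog entries whose pooled embeddings are most
# cosine-similar to the query embedding.
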
# Initialize once at import time so the Gradio handler reuses the same model,
# documents, and precomputed embeddings across requests.
try:
    print("Loading model and tokenizer...")
    tokenizer, model = load_model_and_tokenizer()
    print("Loading documents...")
    documents = load_data()
    if not documents:
        raise ValueError("No valid courses found in the data file")
    print("Precomputing embeddings...")
    doc_embeddings = precompute_embeddings(documents, tokenizer, model)
    print("Initialization complete!")
except Exception as e:
    print(f"Error during initialization: {str(e)}")
    raise

def search_interface(query: str) -> str:
    """Gradio handler: run the search and format the results as Markdown."""
    if not query.strip():
        return "Please enter a search query."
    results = semantic_search(query, doc_embeddings, documents, tokenizer, model)
    output = "# Search Results:\n\n"
    for i, result in enumerate(results, 1):
        output += f"## {i}. {result['course_type']} - {result['name']}\n"
        output += f"**Description:** {result['description']}\n"
        output += f"_[Link to Course]({result['link']})_\n\n"
    return output

app = gr.Interface(
    fn=search_interface,
    inputs=gr.Textbox(
        lines=2,
        placeholder="Enter your search query here (e.g., 'machine learning', 'python for beginners', 'deep learning')",
    ),
    outputs=gr.Markdown(
        value="## Search Results will be displayed here.",
        line_breaks=True,
        label="Search Results",
        show_label=True,
    ),
    title="Analytics Vidhya Course Search Engine",
    description="Search for courses using semantic similarity. Results are ordered by relevance.",
    allow_flagging="never",
)

app.launch()
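# Run locally with `python app.py`; by default Gradio serves the interface at
# http://127.0.0.1:7860.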