Spaces:
Running
Running
updated app.py
Browse files
added markdown for better display
removed scores
app.py
CHANGED
@@ -4,76 +4,98 @@ import torch
|
|
4 |
import torch.nn.functional as F
|
5 |
import json
|
6 |
from typing import List, Dict
|
7 |
-
|
8 |
|
9 |
def load_model_and_tokenizer():
|
10 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
11 |
-
model = AutoModel.from_pretrained(
|
12 |
return tokenizer, model
|
13 |
|
|
|
14 |
def load_data(file_path: str = "data.json") -> List[Dict]:
|
15 |
-
with open(file_path,
|
16 |
data = json.load(f)
|
17 |
flattened_courses = []
|
18 |
-
for course_category in data[
|
19 |
-
for subcourse in course_category[
|
20 |
-
if not subcourse or not all(
|
|
|
|
|
21 |
continue
|
22 |
-
|
23 |
-
flattened_courses.append(
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
30 |
return flattened_courses
|
31 |
|
|
|
32 |
def get_embedding(text: str, tokenizer, model) -> torch.Tensor:
|
33 |
-
inputs = tokenizer(
|
|
|
|
|
34 |
with torch.no_grad():
|
35 |
outputs = model(**inputs)
|
36 |
-
|
37 |
-
attention_mask = inputs[
|
38 |
token_embeddings = outputs.last_hidden_state
|
39 |
-
input_mask_expanded =
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
def precompute_embeddings(documents: List[Dict], tokenizer, model) -> torch.Tensor:
|
43 |
embeddings = []
|
44 |
for doc in documents:
|
45 |
-
embedding = get_embedding(doc[
|
46 |
embeddings.append(embedding)
|
47 |
return torch.cat(embeddings, dim=0)
|
48 |
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
query_embedding = get_embedding(query, tokenizer, model)
|
51 |
similarities = F.cosine_similarity(query_embedding, doc_embeddings)
|
52 |
top_k_indices = torch.topk(similarities, min(top_k, len(documents))).indices
|
53 |
-
|
54 |
results = []
|
55 |
for idx in top_k_indices:
|
56 |
doc = documents[idx.item()]
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
return results
|
67 |
|
|
|
68 |
try:
|
69 |
print("Loading model and tokenizer...")
|
70 |
tokenizer, model = load_model_and_tokenizer()
|
71 |
-
|
72 |
print("Loading documents...")
|
73 |
documents = load_data()
|
74 |
if not documents:
|
75 |
raise ValueError("No valid courses found in the data file")
|
76 |
-
|
77 |
print("Precomputing embeddings...")
|
78 |
doc_embeddings = precompute_embeddings(documents, tokenizer, model)
|
79 |
print("Initialization complete!")
|
@@ -81,27 +103,37 @@ except Exception as e:
|
|
81 |
print(f"Error during initialization: {str(e)}")
|
82 |
raise
|
83 |
|
|
|
84 |
def search_interface(query: str) -> str:
|
85 |
if not query.strip():
|
86 |
return "Please enter a search query."
|
87 |
-
|
88 |
results = semantic_search(query, doc_embeddings, documents, tokenizer, model)
|
89 |
-
|
90 |
-
output = "Search Results:\n\n"
|
91 |
for i, result in enumerate(results, 1):
|
92 |
-
output += f"{i}. {result['course_type']} - {result['name']}\n"
|
93 |
-
output += f"
|
94 |
-
output += f"
|
95 |
-
|
96 |
-
|
97 |
return output
|
98 |
|
|
|
99 |
app = gr.Interface(
|
100 |
fn=search_interface,
|
101 |
-
inputs=gr.Textbox(
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
)
|
106 |
|
107 |
-
app.launch()
|
|
|
4 |
import torch.nn.functional as F
|
5 |
import json
|
6 |
from typing import List, Dict
|
7 |
+
|
8 |
|
9 |
def load_model_and_tokenizer():
    """Load the tokenizer and the encoder model used for embeddings.

    The tokenizer is loaded from ``"models/tokenizer"`` and the model from
    ``"models/model"``.
    NOTE(review): these look like paths relative to the working directory;
    confirm the ``models/`` folder ships with the app.

    Returns:
        A ``(tokenizer, model)`` tuple, in that order.
    """
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    return (
        AutoTokenizer.from_pretrained("models/tokenizer"),
        AutoModel.from_pretrained("models/model"),
    )
|
13 |
|
14 |
+
|
15 |
def load_data(file_path: str = "data.json") -> List[Dict]:
    """Read the course catalogue JSON and flatten it to one record per subcourse.

    The file must contain a top-level ``"courses"`` list. Each item of that
    list must provide a ``"subcourses"`` list and, when it has at least one
    usable subcourse, a ``"course_type"`` value. Subcourses that are empty or
    lack any of ``name``, ``description`` or ``link`` are skipped silently.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list of dicts with the keys ``course_type``, ``name``,
        ``description``, ``link`` and ``content``. ``content`` is the combined
        ``"<course_type> - <name>: <description>"`` text.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If ``"courses"``, ``"subcourses"`` or ``"course_type"`` is
            missing where it is required.
    """
    required_keys = ("name", "description", "link")

    # Decode explicitly as UTF-8 instead of relying on the platform locale,
    # so non-ASCII course text loads the same way on every system.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    flattened_courses = []
    for course_category in data["courses"]:
        for subcourse in course_category["subcourses"]:
            # Skip placeholders and partially filled entries.
            if not subcourse or not all(key in subcourse for key in required_keys):
                continue

            flattened_courses.append(
                {
                    "course_type": course_category["course_type"],
                    "name": subcourse["name"],
                    "description": subcourse["description"],
                    "link": subcourse["link"],
                    "content": f"{course_category['course_type']} - {subcourse['name']}: {subcourse['description']}",
                }
            )
    return flattened_courses
|
36 |
|
37 |
+
|
38 |
def get_embedding(text: str, tokenizer, model) -> torch.Tensor:
    """Embed ``text`` by mask-weighted mean pooling of the model's hidden states.

    Args:
        text: The string to embed.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            truncation=True, max_length=512, padding=True)``; its result must
            support ``**`` unpacking and expose ``"attention_mask"``.
        model: Callable invoked with the tokenizer output as keyword
            arguments; its result must expose ``last_hidden_state``.

    Returns:
        The hidden states averaged over dimension 1, counting only positions
        where the attention mask is non-zero.
        NOTE(review): presumably shape ``(1, hidden_size)`` for a single
        input string; confirm against the model in use.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    )

    # Forward pass only; no gradients are needed.
    with torch.no_grad():
        model_out = model(**encoded)

    hidden_states = model_out.last_hidden_state
    # Broadcast the mask across the last dimension so masked positions
    # contribute zero to the sum.
    mask = encoded["attention_mask"].unsqueeze(-1).expand(hidden_states.size()).float()

    weighted_sum = (hidden_states * mask).sum(1)
    # Clamp the denominator to avoid dividing by zero on an all-zero mask.
    token_count = mask.sum(1).clamp(min=1e-9)
    return weighted_sum / token_count
|
53 |
+
|
54 |
|
55 |
def precompute_embeddings(documents: List[Dict], tokenizer, model) -> torch.Tensor:
    """Embed the ``"content"`` text of every document and stack the results.

    Args:
        documents: Records that each provide a ``"content"`` string.
        tokenizer: Passed through to ``get_embedding``.
        model: Passed through to ``get_embedding``.

    Returns:
        All per-document embeddings concatenated along dimension 0, in the
        same order as ``documents``.
    """
    per_document = [
        get_embedding(record["content"], tokenizer, model) for record in documents
    ]
    return torch.cat(per_document, dim=0)
|
61 |
|
62 |
+
|
63 |
+
def semantic_search(
    query: str,
    doc_embeddings: torch.Tensor,
    documents: List[Dict],
    tokenizer,
    model,
    top_k: int = 3,
) -> List[Dict]:
    """Return the documents most similar to ``query`` by cosine similarity.

    Args:
        query: Free-text search string.
        doc_embeddings: Precomputed document embeddings.
            NOTE(review): assumed to be row-aligned with ``documents``;
            confirm at the call site.
        documents: Records providing ``course_type``, ``name``,
            ``description`` and ``link``.
        tokenizer: Passed through to ``get_embedding``.
        model: Passed through to ``get_embedding``.
        top_k: Maximum number of results; capped at ``len(documents)``.

    Returns:
        Up to ``top_k`` dicts in the order produced by ``torch.topk`` over
        the similarity scores, each holding only ``course_type``, ``name``,
        ``description`` and ``link``.
    """
    query_vector = get_embedding(query, tokenizer, model)
    scores = F.cosine_similarity(query_vector, doc_embeddings)

    limit = min(top_k, len(documents))
    best_positions = torch.topk(scores, limit).indices.tolist()

    # Only the display fields are returned; the "content" text is left out.
    exposed_fields = ("course_type", "name", "description", "link")
    return [
        {field: documents[position][field] for field in exposed_fields}
        for position in best_positions
    ]
|
88 |
|
89 |
+
|
90 |
# One-time start-up, executed at import time. It fills the module-level
# globals `tokenizer`, `model`, `documents` and `doc_embeddings`, which
# `search_interface` reads on every request.
try:
    print("Loading model and tokenizer...")
    tokenizer, model = load_model_and_tokenizer()

    print("Loading documents...")
    documents = load_data()
    # An empty catalogue would leave nothing to embed or search, so fail early.
    if not documents:
        raise ValueError("No valid courses found in the data file")

    print("Precomputing embeddings...")
    doc_embeddings = precompute_embeddings(documents, tokenizer, model)
    print("Initialization complete!")
except Exception as e:
    # Report the failure, then re-raise so start-up aborts instead of
    # continuing without the globals above.
    print(f"Error during initialization: {str(e)}")
    raise
|
105 |
|
106 |
+
|
107 |
def search_interface(query: str) -> str:
    """Run a semantic search for ``query`` and render the hits as Markdown.

    Reads the module-level ``doc_embeddings``, ``documents``, ``tokenizer``
    and ``model`` that are set up during start-up.

    Args:
        query: Text typed by the user. ``None``, an empty string and a
            whitespace-only string are all treated as "no query".

    Returns:
        A Markdown string: a ``# Search Results:`` heading followed by one
        numbered section per hit with its description and course link, or a
        prompt asking for a query when none was given.
    """
    # Guard against None as well as blank text so a missing value gets the
    # friendly prompt instead of an AttributeError on `.strip()`.
    if not query or not query.strip():
        return "Please enter a search query."

    results = semantic_search(query, doc_embeddings, documents, tokenizer, model)

    # Collect the pieces and join once rather than growing a string with +=.
    parts = ["# Search Results:\n\n"]
    for i, result in enumerate(results, 1):
        parts.append(f"## {i}. {result['course_type']} - {result['name']}\n")
        parts.append(f"**Description:** {result['description']}\n")
        parts.append(f"_[Link to Course]({result['link']})_\n\n")

    return "".join(parts)
|
120 |
|
121 |
+
|
122 |
# Input widget: a two-line text box with example queries as the placeholder.
_query_box = gr.Textbox(
    lines=2,
    placeholder="Enter your search query here (e.g., 'machine learning', 'python for beginners', 'deep learning')",
)

# Output widget: a Markdown panel that shows a placeholder heading until the
# first search result replaces it.
_results_panel = gr.Markdown(
    value="## Search Results will be displayed here.",
    line_breaks=True,
    label="Search Results",
    show_label=True,
)

# Wire the search function to the widgets above; flagging is turned off.
app = gr.Interface(
    fn=search_interface,
    inputs=_query_box,
    outputs=_results_panel,
    title="Analytics Vidhya Course Search Engine",
    description="Search for courses using semantic similarity. Results are ordered by relevance.",
    allow_flagging="never",
)

app.launch()
|