AKT47 committed on
Commit
e101645
·
verified ·
1 Parent(s): edf1360

updated app.py

Browse files

added markdown for better display
removed scores

Files changed (1) hide show
  1. app.py +79 -47
app.py CHANGED
@@ -4,76 +4,98 @@ import torch
4
  import torch.nn.functional as F
5
  import json
6
  from typing import List, Dict
7
- import os
8
 
9
  def load_model_and_tokenizer():
10
- tokenizer = AutoTokenizer.from_pretrained('models/tokenizer')
11
- model = AutoModel.from_pretrained('models/model')
12
  return tokenizer, model
13
 
 
14
  def load_data(file_path: str = "data.json") -> List[Dict]:
15
- with open(file_path, 'r') as f:
16
  data = json.load(f)
17
  flattened_courses = []
18
- for course_category in data['courses']:
19
- for subcourse in course_category['subcourses']:
20
- if not subcourse or not all(key in subcourse for key in ['name', 'description', 'link']):
 
 
21
  continue
22
-
23
- flattened_courses.append({
24
- 'course_type': course_category['course_type'],
25
- 'name': subcourse['name'],
26
- 'description': subcourse['description'],
27
- 'link': subcourse['link'],
28
- 'content': f"{course_category['course_type']} - {subcourse['name']}: {subcourse['description']}"
29
- })
 
 
30
  return flattened_courses
31
 
 
32
  def get_embedding(text: str, tokenizer, model) -> torch.Tensor:
33
- inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
 
 
34
  with torch.no_grad():
35
  outputs = model(**inputs)
36
-
37
- attention_mask = inputs['attention_mask']
38
  token_embeddings = outputs.last_hidden_state
39
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
40
- return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
 
 
 
 
41
 
42
  def precompute_embeddings(documents: List[Dict], tokenizer, model) -> torch.Tensor:
43
  embeddings = []
44
  for doc in documents:
45
- embedding = get_embedding(doc['content'], tokenizer, model)
46
  embeddings.append(embedding)
47
  return torch.cat(embeddings, dim=0)
48
 
49
- def semantic_search(query: str, doc_embeddings: torch.Tensor, documents: List[Dict], tokenizer, model, top_k: int = 5) -> List[Dict]:
 
 
 
 
 
 
 
 
50
  query_embedding = get_embedding(query, tokenizer, model)
51
  similarities = F.cosine_similarity(query_embedding, doc_embeddings)
52
  top_k_indices = torch.topk(similarities, min(top_k, len(documents))).indices
53
-
54
  results = []
55
  for idx in top_k_indices:
56
  doc = documents[idx.item()]
57
- score = similarities[idx].item()
58
- results.append({
59
- 'course_type': doc['course_type'],
60
- 'name': doc['name'],
61
- 'description': doc['description'],
62
- 'link': doc['link'],
63
- 'score': f"{score:.4f}"
64
- })
65
-
66
  return results
67
 
 
68
  try:
69
  print("Loading model and tokenizer...")
70
  tokenizer, model = load_model_and_tokenizer()
71
-
72
  print("Loading documents...")
73
  documents = load_data()
74
  if not documents:
75
  raise ValueError("No valid courses found in the data file")
76
-
77
  print("Precomputing embeddings...")
78
  doc_embeddings = precompute_embeddings(documents, tokenizer, model)
79
  print("Initialization complete!")
@@ -81,27 +103,37 @@ except Exception as e:
81
  print(f"Error during initialization: {str(e)}")
82
  raise
83
 
 
84
  def search_interface(query: str) -> str:
85
  if not query.strip():
86
  return "Please enter a search query."
87
-
88
  results = semantic_search(query, doc_embeddings, documents, tokenizer, model)
89
-
90
- output = "Search Results:\n\n"
91
  for i, result in enumerate(results, 1):
92
- output += f"{i}. {result['course_type']} - {result['name']}\n"
93
- output += f"Relevance Score: {result['score']}\n"
94
- output += f"Description: {result['description']}\n"
95
- output += f"Link: {result['link']}\n\n"
96
-
97
  return output
98
 
 
99
  app = gr.Interface(
100
  fn=search_interface,
101
- inputs=gr.Textbox(lines=2, placeholder="Enter your search query here (e.g., 'machine learning', 'python for beginners', 'deep learning')"),
102
- outputs=gr.Textbox(lines=20, label="Search Results"),
103
- title="Course Search Engine",
104
- description="Search for courses using semantic similarity. Results are ordered by relevance."
 
 
 
 
 
 
 
 
 
105
  )
106
 
107
- app.launch()
 
4
  import torch.nn.functional as F
5
  import json
6
  from typing import List, Dict
7
+
8
 
9
def load_model_and_tokenizer(
    tokenizer_path: str = "models/tokenizer",
    model_path: str = "models/model",
):
    """Load the locally bundled Hugging Face tokenizer and encoder model.

    Parameters
    ----------
    tokenizer_path: directory holding the saved tokenizer (defaults to
        the repo-local checkpoint, preserving the original behavior).
    model_path: directory holding the saved encoder weights.

    Returns
    -------
    (tokenizer, model) tuple ready for `get_embedding`.
    """
    # Parameterized so alternative checkpoints can be loaded without
    # editing this function; call sites using no arguments are unchanged.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    model = AutoModel.from_pretrained(model_path)
    return tokenizer, model
13
 
14
+
15
def load_data(file_path: str = "data.json") -> List[Dict]:
    """Load the course catalog from JSON and flatten it for search.

    Parameters
    ----------
    file_path: path to a JSON file shaped like
        {"courses": [{"course_type": str,
                      "subcourses": [{"name", "description", "link"}, ...]}]}

    Returns
    -------
    A flat list of dicts, one per valid subcourse, each carrying its
    category plus a pre-built "content" string used for embedding.

    Raises
    ------
    FileNotFoundError / json.JSONDecodeError for a missing or malformed
    file; KeyError if "courses", "course_type" or "subcourses" is absent.
    """
    # Explicit encoding: the platform default may not be UTF-8, which
    # would corrupt non-ASCII course names/descriptions on some systems.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    required = ("name", "description", "link")
    flattened_courses = []
    for course_category in data["courses"]:
        for subcourse in course_category["subcourses"]:
            # Skip null entries and entries missing any required field.
            if not subcourse or not all(key in subcourse for key in required):
                continue

            flattened_courses.append(
                {
                    "course_type": course_category["course_type"],
                    "name": subcourse["name"],
                    "description": subcourse["description"],
                    "link": subcourse["link"],
                    # Single text field fed to the embedding model.
                    "content": f"{course_category['course_type']} - {subcourse['name']}: {subcourse['description']}",
                }
            )
    return flattened_courses
36
 
37
+
38
def get_embedding(text: str, tokenizer, model) -> torch.Tensor:
    """Return a mean-pooled sentence embedding for *text*.

    Tokenizes (truncated to 512 tokens), runs the encoder without
    gradient tracking, then averages token embeddings weighted by the
    attention mask so padding positions do not dilute the mean.
    Result shape: (1, hidden_dim) for a single input string.
    """
    encoded = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=512, padding=True
    )
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state

    # Mask-weighted mean pool: zero out padding positions, then divide
    # by the (clamped, to avoid /0) count of real tokens per sequence.
    mask = encoded["attention_mask"].unsqueeze(-1).expand(hidden.size()).float()
    pooled = torch.sum(hidden * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
    return pooled
53
+
54
 
55
def precompute_embeddings(documents: List[Dict], tokenizer, model) -> torch.Tensor:
    """Embed every document's "content" field and stack the results.

    Returns a (len(documents), hidden_dim) tensor whose row i is the
    embedding of documents[i]. Assumes at least one document —
    torch.cat on an empty list raises; the caller validates this first.
    """
    vectors = [get_embedding(doc["content"], tokenizer, model) for doc in documents]
    return torch.cat(vectors, dim=0)
61
 
62
+
63
def semantic_search(
    query: str,
    doc_embeddings: torch.Tensor,
    documents: List[Dict],
    tokenizer,
    model,
    top_k: int = 3,
) -> List[Dict]:
    """Rank the precomputed course embeddings against *query*.

    Embeds the query with the same mean-pooling encoder, scores every
    document by cosine similarity, and returns up to *top_k* course
    dicts (course_type / name / description / link), best match first.
    """
    query_vec = get_embedding(query, tokenizer, model)
    scores = F.cosine_similarity(query_vec, doc_embeddings)
    # Never ask topk for more entries than exist.
    best = torch.topk(scores, min(top_k, len(documents))).indices.tolist()

    return [
        {
            "course_type": documents[i]["course_type"],
            "name": documents[i]["name"],
            "description": documents[i]["description"],
            "link": documents[i]["link"],
        }
        for i in best
    ]
88
 
89
+
90
  try:
91
  print("Loading model and tokenizer...")
92
  tokenizer, model = load_model_and_tokenizer()
93
+
94
  print("Loading documents...")
95
  documents = load_data()
96
  if not documents:
97
  raise ValueError("No valid courses found in the data file")
98
+
99
  print("Precomputing embeddings...")
100
  doc_embeddings = precompute_embeddings(documents, tokenizer, model)
101
  print("Initialization complete!")
 
103
  print(f"Error during initialization: {str(e)}")
104
  raise
105
 
106
+
107
def search_interface(query: str) -> str:
    """Gradio callback: turn a free-text query into Markdown results.

    Empty/whitespace-only queries short-circuit with a prompt message.
    Otherwise each match is rendered as a Markdown section: a numbered
    heading, a bold description label, and an italic course link.
    Relies on the module-level globals set up at import time
    (doc_embeddings, documents, tokenizer, model).
    """
    if not query.strip():
        return "Please enter a search query."

    matches = semantic_search(query, doc_embeddings, documents, tokenizer, model)

    # Collect chunks and join once instead of repeated `+=`.
    parts = ["# Search Results:\n\n"]
    for rank, item in enumerate(matches, 1):
        parts.append(
            f"## {rank}. {item['course_type']} - {item['name']}\n"
            f"**Description:** {item['description']}\n"
            f"_[Link to Course]({item['link']})_\n\n"
        )
    return "".join(parts)
120
 
121
+
122
# Gradio UI wiring: one free-text query box in, Markdown-rendered
# course results out, backed by search_interface above.
app = gr.Interface(
    fn=search_interface,
    inputs=gr.Textbox(
        lines=2,
        placeholder="Enter your search query here (e.g., 'machine learning', 'python for beginners', 'deep learning')",
    ),
    # Markdown component so the headings/bold/links built by
    # search_interface render instead of showing raw markup.
    outputs=gr.Markdown(
        value="## Search Results will be displayed here.",
        line_breaks=True,  # render single newlines as line breaks — TODO confirm against installed gradio version
        label="Search Results",
        show_label=True,
    ),
    title="Analytics Vidhya Course Search Engine",
    description="Search for courses using semantic similarity. Results are ordered by relevance.",
    allow_flagging="never",  # no feedback collection; hides the Flag button
)

# Launched unconditionally (no __main__ guard) — expected for a
# Hugging Face Spaces entry-point script.
app.launch()