import re
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch


class CourseSearchSystem:
    """Semantic search over free courses using MiniLM sentence embeddings.

    Workflow: call ``load_and_prepare_data(df)`` once to filter/clean the
    course catalog and precompute embeddings, then ``search_courses(query)``
    to get a formatted markdown recommendation string.
    """

    def __init__(self):
        # Lazy import: transformers is a heavy dependency, so pull it in
        # only when a system is actually constructed. This keeps importing
        # the module itself cheap and usable without transformers installed.
        from transformers import AutoModel, AutoTokenizer

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = 'sentence-transformers/all-MiniLM-L6-v2'
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name).to(self.device)
        self.model.eval()  # inference only: disables dropout, etc.

    def mean_pooling(self, model_output, attention_mask):
        """Mean-pool token embeddings into one sentence embedding per text.

        Padding positions are excluded via ``attention_mask``; the divisor
        is clamped so an all-padding row cannot cause division by zero.

        Args:
            model_output: transformer output; index 0 is the token
                embeddings tensor, assumed (batch, seq, dim).
            attention_mask: (batch, seq) mask of 1s for real tokens.

        Returns:
            Tensor of shape (batch, dim).
        """
        token_embeddings = model_output[0]
        # Broadcast mask over the embedding dimension so masked tokens
        # contribute zero to the sum.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(
            token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """Return L2-normalized sentence embeddings for ``texts``.

        Normalization means a plain dot product between embeddings equals
        cosine similarity, which ``search_courses`` relies on.
        """
        encoded_input = self.tokenizer(texts, padding=True, truncation=True,
                                       return_tensors='pt', max_length=512)
        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        sentence_embeddings = self.mean_pooling(
            model_output, encoded_input['attention_mask'])
        sentence_embeddings = torch.nn.functional.normalize(
            sentence_embeddings, p=2, dim=1)
        return sentence_embeddings.cpu().numpy()

    def preprocess_text(self, text: str) -> str:
        """Lowercase, strip punctuation, collapse whitespace; '' for NaN/None."""
        if pd.isna(text):
            return ""
        text = str(text)
        text = re.sub(r'[^\w\s]', ' ', text)  # punctuation -> space
        text = ' '.join(text.split())         # collapse whitespace runs
        return text.lower()

    def prepare_course_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Filter to free courses, fill missing fields, build ``search_text``.

        Works on an explicit ``.copy()`` of the filtered slice so the
        caller's DataFrame is never mutated and pandas does not emit
        SettingWithCopyWarning when the new column is assigned.
        """
        mask = df['Course Name'].str.contains('Free', case=False, na=False)
        free_courses = df[mask].copy()
        free_courses = free_courses.fillna({
            'Course Time': 0,
            'Ratings': 4.6,
            'Difficulty': 'Beginner',
            'Key Takeaways': 'Course details not available.'
        })
        free_courses['search_text'] = free_courses.apply(
            lambda x: f"{x['Course Name']} {x['Key Takeaways']} {x['Difficulty']}",
            axis=1
        )
        free_courses['search_text'] = free_courses['search_text'].apply(
            self.preprocess_text)
        return free_courses

    def load_and_prepare_data(self, df: pd.DataFrame) -> None:
        """Prepare the course catalog and precompute its embeddings."""
        self.courses_df = self.prepare_course_data(df)
        self.course_embeddings = self.get_embeddings(
            self.courses_df['search_text'].tolist())

    def _format_course(self, index: int, result: Dict) -> str:
        """Render one result dict as a markdown course section."""
        course_section = f"\n**{index}. {result['course_name']}**\n"

        rating_display = f"{result['ratings']}/5.0"
        course_section += f"**Rating:** {rating_display}\n"
        course_section += f"**Level:** {result['difficulty']}\n"

        # Duration of 0 means "unknown" (see prepare_course_data fillna),
        # so falsy values are skipped.
        if result['course_time']:
            course_section += f"**Duration:** {result['course_time']} hours\n"

        # Key takeaways become bullet points; the placeholder text used by
        # prepare_course_data is not worth rendering.
        if result['key_takeaways'] and result['key_takeaways'] != 'Course details not available.':
            course_section += "\n**What you'll learn:**\n"
            takeaways = result['key_takeaways'].split('.,')
            formatted_takeaways = []
            for takeaway in takeaways:
                cleaned = takeaway.strip('. ,')
                if cleaned:
                    if len(cleaned) > 100:  # keep bullets readable
                        cleaned = cleaned[:97] + "..."
                    formatted_takeaways.append(f"• {cleaned}")
            course_section += "\n".join(formatted_takeaways[:3])
            if len(takeaways) > 3:
                course_section += "\n• And more..."

        similarity_percentage = int(result['similarity_score'] * 100)
        course_section += f"\n**Match Score:** {similarity_percentage}%"
        course_section += f"\n\n[Start Course]({result['url']})\n"
        return course_section

    def generate_response(self, query: str, results: List[Dict]) -> str:
        """Generate a professional markdown response with course recommendations.

        Args:
            query: the (already preprocessed) user search string.
            results: dicts with keys course_name, ratings, difficulty,
                course_time, key_takeaways, similarity_score, url.

        Returns:
            A single markdown-formatted string.
        """
        response_parts = []

        # Introduction phrased by result count.
        if len(results) == 1:
            response_parts.append(
                f"I found an excellent free course matching your search for '{query}':")
        else:
            response_parts.append(
                f"I found {len(results)} relevant free courses matching your search for '{query}':")

        for i, result in enumerate(results, 1):
            response_parts.append(self._format_course(i, result))

        # Closing notes shared by every response.
        response_parts.append("\n---\n")
        response_parts.append("**Notes:**")
        response_parts.append("• Courses are sorted by relevance to your search")
        response_parts.append("• All courses are free and include hands-on projects")
        response_parts.append("• Certificates are provided upon completion")

        return "\n".join(response_parts)

    def search_courses(self, query: str, top_k: int = 5) -> str:
        """Search for courses and return a formatted response.

        Embeddings are unit-normalized, so the dot product below is cosine
        similarity; argsort + reverse gives the top_k most similar courses.
        """
        query = self.preprocess_text(query)
        query_embedding = self.get_embeddings([query])[0]
        similarities = np.dot(self.course_embeddings, query_embedding)
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        results = []
        for idx in top_indices:
            course = self.courses_df.iloc[idx]
            results.append({
                'course_name': course['Course Name'],
                'key_takeaways': course['Key Takeaways'],
                'course_time': course['Course Time'],
                'ratings': course['Ratings'],
                'difficulty': course['Difficulty'],
                'similarity_score': similarities[idx],
                'url': course['Website']
            })

        return self.generate_response(query, results)