"""Embedding-based search over Analytics Vidhya's free courses."""

import re
from typing import Any, Dict, List, Optional, Tuple  # Tuple kept for importers

import numpy as np
import pandas as pd

try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    # The package is only needed when CourseSearchSystem has to build its own
    # encoder; callers that inject ``model`` can work without it installed.
    SentenceTransformer = None

DEFAULT_MODEL_NAME = 'all-MiniLM-L6-v2'

# Placeholder stored for courses without takeaways; also used to decide
# whether the "Key Takeaways" section is rendered.
NO_DETAILS_TEXT = 'Course details not available.'

# Per-column replacements for missing values in the course table.
_FILL_DEFAULTS = {
    'Course Time': 0,
    'Ratings': 4.6,
    'Difficulty': 'Beginner',
    'Key Takeaways': NO_DETAILS_TEXT,
}

# Anything that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def _cosine_similarities(matrix: np.ndarray, vector: np.ndarray) -> np.ndarray:
    """Return the cosine similarity between each row of *matrix* and *vector*.

    Rows (or a vector) with zero norm get a similarity of 0.0 instead of NaN.
    """
    dots = matrix @ vector
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector)
    return np.divide(dots, norms, out=np.zeros_like(dots, dtype=float), where=norms > 0)


class CourseSearchSystem:
    """Rank free courses against a text query and render a markdown answer.

    Typical use: construct, call ``load_and_prepare_data(df)`` once, then call
    ``search_courses(query)`` as often as needed.
    """

    def __init__(self, model: Optional[Any] = None, model_name: str = DEFAULT_MODEL_NAME):
        """Create the search system.

        Args:
            model: Optional encoder exposing ``encode(list_of_str, **kwargs)``
                and returning one vector per input string. When omitted, a
                ``SentenceTransformer(model_name)`` is created.
            model_name: Name of the sentence-transformers model to load when
                ``model`` is not supplied.

        Raises:
            ImportError: If no ``model`` is given and sentence-transformers is
                not installed.
        """
        if model is None:
            if SentenceTransformer is None:
                raise ImportError(
                    "sentence-transformers is required to build the default "
                    "embedding model; install it or pass `model=`."
                )
            model = SentenceTransformer(model_name)
        self.model = model
        self.courses_df = None
        self.course_embeddings = None

    def preprocess_text(self, text: str) -> str:
        """Return *text* lower-cased with punctuation removed and whitespace collapsed.

        Missing values (``None``/NaN) become the empty string.
        """
        if pd.isna(text):
            return ""
        without_punctuation = _PUNCTUATION_RE.sub(' ', str(text))
        return ' '.join(without_punctuation.split()).lower()

    def prepare_course_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Select the free courses, fill gaps and add a ``search_text`` column.

        Args:
            df: Course table; the columns 'Course Name', 'Key Takeaways' and
                'Difficulty' are read here.

        Returns:
            A new frame holding only rows whose name contains "free"
            (case-insensitive), with missing values replaced by defaults and
            a normalised ``search_text`` column used for embedding.
        """
        is_free = df['Course Name'].str.contains('Free', case=False, na=False)
        free_courses = df.loc[is_free].fillna(_FILL_DEFAULTS)

        # Built with zip instead of a row-wise DataFrame.apply: same text,
        # without constructing a Series per row.
        search_text = [
            self.preprocess_text(f"{name} {takeaways} {difficulty}")
            for name, takeaways, difficulty in zip(
                free_courses['Course Name'],
                free_courses['Key Takeaways'],
                free_courses['Difficulty'],
            )
        ]
        free_courses['search_text'] = pd.Series(
            search_text, index=free_courses.index, dtype='object'
        )
        return free_courses

    def load_and_prepare_data(self, df: pd.DataFrame):
        """Prepare *df* and compute one embedding per remaining course."""
        self.courses_df = self.prepare_course_data(df)
        texts = self.courses_df['search_text'].tolist()
        if not texts:
            # Nothing to embed; search_courses() reports "no matches".
            self.course_embeddings = np.empty((0, 0))
            return
        self.course_embeddings = self.model.encode(texts, show_progress_bar=True)

    def _format_course(self, position: int, result: Dict) -> str:
        """Render one search result as a markdown section."""
        rating = result['ratings']
        stars = "⭐" * int(rating) + ("½" if rating % 1 >= 0.5 else "")

        pieces = [
            f"\n### {position}. {result['course_name']}\n",
            f"**Rating:** {stars} ({rating})\n",
            f"**Difficulty:** {result['difficulty']}\n",
        ]

        # A duration of 0 is the fill value for "unknown", so it is omitted.
        if result['course_time']:
            pieces.append(f"**Duration:** {result['course_time']} hours\n")

        takeaways_text = result['key_takeaways']
        if takeaways_text and takeaways_text != NO_DETAILS_TEXT:
            pieces.append("\n**Key Takeaways:**\n")
            # Individual takeaways are separated by '.,' in the source data.
            for raw_takeaway in takeaways_text.split('.,'):
                cleaned_takeaway = raw_takeaway.strip('. ,')
                if cleaned_takeaway:
                    pieces.append(f"- {cleaned_takeaway}\n")

        pieces.append(f"\n🔗 [Access the course here]({result['url']})\n")
        return "".join(pieces)

    def generate_response(self, query: str, results: List[Dict]) -> str:
        """Build the markdown answer for *query* from ranked *results*.

        Args:
            query: The query text to echo back to the user.
            results: Dicts with the keys 'course_name', 'ratings',
                'difficulty', 'course_time', 'key_takeaways' and 'url'.

        Returns:
            A markdown string; a short "nothing found" message when
            *results* is empty.
        """
        if not results:
            return (
                f"I've searched through Analytics Vidhya's free courses related to '{query}' "
                "but couldn't find any matching courses. "
                "Would you like to try a different topic?"
            )

        response_parts = [
            f"I've searched through Analytics Vidhya's free courses related to '{query}' and found some excellent matches. Here are the most relevant courses:"
        ]
        response_parts.extend(
            self._format_course(position, result)
            for position, result in enumerate(results, 1)
        )
        response_parts.append(
            "\nEach of these courses is free and available on the Analytics Vidhya platform. Would you like me to provide more specific details about any of these courses or help you find courses on a different topic?"
        )
        return "\n".join(response_parts)

    def search_courses(self, query: str, top_k: int = 5) -> str:
        """Return a formatted answer listing the *top_k* courses closest to *query*.

        Args:
            query: Free-text search query; shown to the user as typed.
            top_k: Maximum number of courses to list. Values below 1 yield no
                courses; values above the catalogue size list every course.

        Raises:
            RuntimeError: If ``load_and_prepare_data`` has not been called.
        """
        if self.courses_df is None or self.course_embeddings is None:
            raise RuntimeError(
                "Course data has not been loaded; call load_and_prepare_data() first."
            )

        # Clamp explicitly: a slice of [-0:] would select every course.
        limit = min(max(int(top_k), 0), len(self.courses_df))
        if limit == 0:
            return self.generate_response(query, [])

        # Only the embedding sees the normalised text; the response keeps the
        # user's original wording.
        cleaned_query = self.preprocess_text(query)
        query_embedding = np.asarray(self.model.encode([cleaned_query])[0], dtype=float)
        course_matrix = np.asarray(self.course_embeddings, dtype=float)

        # Cosine similarity keeps the ranking independent of vector length,
        # whether or not the encoder normalises its output.
        similarities = _cosine_similarities(course_matrix, query_embedding)
        top_indices = np.argsort(similarities)[-limit:][::-1]

        results = []
        for idx in top_indices:
            course = self.courses_df.iloc[idx]
            results.append({
                'course_name': course['Course Name'],
                'key_takeaways': course['Key Takeaways'],
                'course_time': course['Course Time'],
                'ratings': course['Ratings'],
                'difficulty': course['Difficulty'],
                'similarity_score': float(similarities[idx]),
                'url': course['Website'],
            })

        return self.generate_response(query, results)


def test_search_system(df: pd.DataFrame):
    """Run a handful of sample queries against *df* and print each response."""
    search_system = CourseSearchSystem()
    search_system.load_and_prepare_data(df)

    test_queries = [
        "machine learning for beginners",
        "natural language processing",
        "computer vision courses",
        "data preprocessing tutorials",
        "generative AI learning",
    ]

    for query in test_queries:
        print(f"\nTesting query: '{query}'\n")
        response = search_system.search_courses(query, top_k=3)
        print(response)
        print("\n" + "="*80 + "\n")


# This is a manual demo that needs a DataFrame argument, not a pytest test;
# the flag stops test runners from collecting it because of its name.
test_search_system.__test__ = False


if __name__ == "__main__":
    df = pd.read_csv('course_data.csv')
    test_search_system(df)