Spaces:

TravisBoltz
/

default

Sleeping

App Files Files Community

default / generate_recommendations.py

TravisBoltz

Upload 62 files

b4263ca verified 17 days ago

raw

history blame contribute delete

12.9 kB

	import torch
	import torch.nn as nn
	import pandas as pd
	import numpy as np
	import json
	import logging
	from sklearn.preprocessing import LabelEncoder
	from torch.utils.data import DataLoader
	import os
	from train_model import HybridMusicRecommender, MusicRecommenderDataset

	# Set up logging
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Add safe globals for numpy types
	torch.serialization.add_safe_globals([
	np.generic, # Allow numpy scalar types
	np.ndarray, # Allow numpy arrays
	np.dtype, # Allow numpy dtypes
	np.float64, # Allow specific numpy types
	np.float32,
	np.int64,
	np.int32
	])

	class RecommendationGenerator:
	def __init__(self, model_path: str, catalog_data: pd.DataFrame, encoders_path: str):
	self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	self.catalog_data = catalog_data

	# Load model checkpoint with safety settings
	logger.info(f"Loading model from {model_path}")
	try:
	self.checkpoint = torch.load(model_path, map_location=self.device)
	logger.info("Model loaded successfully")
	except Exception as e:
	logger.error(f"Error loading model: {str(e)}")
	raise

	# Get config and encoders from the checkpoint
	self.config = self.checkpoint.get('config', {})
	if not self.config:
	# Try loading from config file as fallback
	try:
	with open('config/model_config.json', 'r') as f:
	self.config = json.load(f)
	except FileNotFoundError:
	logger.warning("Config file not found, using default values")
	self.config = {
	'embedding_dim': 64,
	'hidden_layers': [256, 128, 64],
	'dropout': 0.3
	}

	# Load encoders with safety settings
	torch.serialization.add_safe_globals([LabelEncoder])
	self.encoders = torch.load(encoders_path, weights_only=False)

	# Print shape info for debugging
	logger.info("Encoder class counts:")
	for key, encoder in self.encoders.items():
	if isinstance(encoder, LabelEncoder):
	logger.info(f"{key}: {len(encoder.classes_)}")

	# Get state dict dimensions with safety checks
	state_dict = self.checkpoint['model_state_dict']
	self.embedding_dims = {
	'num_users': state_dict['user_embedding.weight'].shape[0],
	'num_music': state_dict['music_embedding.weight'].shape[0],
	'num_artists': state_dict['artist_embedding.weight'].shape[0],
	'num_genres': len(self.encoders['genre_encoder'].classes_),
	'num_numerical': 12
	}

	logger.info("Model dimensions from state dict:")
	for key, value in self.embedding_dims.items():
	logger.info(f"{key}: {value}")

	# Safety check for catalog data
	max_music_id = self.catalog_data['music_id'].nunique()
	if max_music_id >= self.embedding_dims['num_music']:
	logger.warning(f"Catalog contains music IDs larger than model capacity. Filtering out excess items.")
	valid_music_ids = set(self.encoders['music_encoder'].transform(
	self.encoders['music_encoder'].classes_[:self.embedding_dims['num_music']]
	))
	self.catalog_data = self.catalog_data[
	self.catalog_data['music_id'].apply(
	lambda x: self.encoders['music_encoder'].transform([x])[0] in valid_music_ids
	)
	]
	logger.info(f"Filtered catalog size: {len(self.catalog_data)}")

	self.model = self._initialize_model(self.embedding_dims)

	def _initialize_model(self, embedding_dims):
	"""Initialize and load the model from checkpoint."""
	# Get dimensions from encoders
	model = HybridMusicRecommender(
	num_users=embedding_dims['num_users'],
	num_music=embedding_dims['num_music'],
	num_artists=embedding_dims['num_artists'],
	num_genres=embedding_dims['num_genres'],
	num_numerical=embedding_dims['num_numerical'],
	embedding_dim=64,
	layers=[256, 128, 64],
	dropout=0.2
	)

	# Load state dict from checkpoint
	state_dict = self.checkpoint['model_state_dict']
	model.load_state_dict(state_dict)

	# Move model to device and set to eval mode
	model = model.to(self.device)
	model.eval()

	return model

	def generate_recommendations(self, user_info: dict, n_recommendations: int = 10) -> pd.DataFrame:
	"""
	Generate music recommendations for a specific user.

	Args:
	user_info: Dictionary containing user information (age, gender, user_id)
	n_recommendations: Number of recommendations to generate

	Returns:
	DataFrame containing recommended songs with predicted play counts
	"""
	# Create a temporary DataFrame with all songs for the user
	user_candidates = self.catalog_data.copy()
	user_candidates['age'] = user_info['age']
	user_candidates['gender'] = user_info['gender']
	user_candidates['user_id'] = user_info['user_id']

	# Debug user encoding with more detailed error handling
	try:
	encoded_user = self.encoders['user_encoder'].transform([user_info['user_id']])[0]
	logger.info(f"User ID {user_info['user_id']} encoded as: {encoded_user}")
	except Exception as e:
	logger.warning(f"Error encoding user ID: {str(e)}")
	logger.warning("Using default encoding (0)")
	encoded_user = 0
	user_candidates['user_id'] = '0' # Use default user ID

	# Debug catalog data
	print(f"\nCatalog Statistics:")
	print(f"Total songs: {len(user_candidates)}")
	print(f"Unique artists: {user_candidates['artist_name'].nunique()}")
	print(f"Unique genres: {user_candidates['main_genre'].nunique()}")

	try:
	# Create dataset with safety checks
	test_dataset = MusicRecommenderDataset(
	user_candidates,
	mode='test',
	encoders=self.encoders,
	embedding_dims=self.embedding_dims # Pass embedding dimensions
	)
	test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

	# Generate predictions
	predictions = []
	indices = []

	with torch.no_grad():
	for i, batch in enumerate(test_loader):
	batch = {k: v.to(self.device) for k, v in batch.items()}
	pred = self.model(batch)
	predictions.extend(pred.cpu().numpy())
	indices.extend(range(i * test_loader.batch_size,
	min((i + 1) * test_loader.batch_size, len(test_dataset))))
	except Exception as e:
	logger.error(f"Error generating recommendations: {str(e)}")
	raise

	# Create recommendations DataFrame and ensure uniqueness
	recommendations = pd.DataFrame({
	'music': user_candidates['music'].values[indices],
	'artist_name': user_candidates['artist_name'].values[indices],
	'genre': user_candidates['main_genre'].values[indices],
	'predicted_plays': predictions
	})

	# Drop duplicates keeping first occurrence (highest predicted play count)
	recommendations = recommendations.drop_duplicates(subset=['music'], keep='first')

	# Convert predictions to scalar values and sort
	recommendations['predicted_plays'] = recommendations['predicted_plays'].apply(lambda x: float(x[0]))

	# Sort by predicted plays and get top N recommendations
	recommendations = recommendations.sort_values('predicted_plays', ascending=False)
	recommendations = recommendations.head(n_recommendations)

	# Debug predictions
	print(f"\nPrediction Statistics:")
	min_pred = recommendations['predicted_plays'].min()
	max_pred = recommendations['predicted_plays'].max()
	std_pred = recommendations['predicted_plays'].std()
	print(f"Prediction range: {min_pred:.2f} to {max_pred:.2f}")
	print(f"Prediction std: {std_pred:.2f}")

	# Print top recommendations with better formatting
	print("\nTop 10 Recommended Songs:")
	pd.set_option('display.max_columns', None)
	pd.set_option('display.width', None)
	print(recommendations.to_string(index=False, float_format=lambda x: '{:.2f}'.format(x) if isinstance(x, (float, np.float32, np.float64)) else str(x)))

	return recommendations.reset_index(drop=True)

	class HybridMusicRecommender(nn.Module):
	def __init__(self, num_users, num_music, num_artists, num_genres, num_numerical,
	embedding_dim=64, layers=[256, 128, 64], dropout=0.2):
	super().__init__()

	# Embedding layers
	self.user_embedding = nn.Embedding(num_users, embedding_dim)
	self.music_embedding = nn.Embedding(num_music, embedding_dim)
	self.artist_embedding = nn.Embedding(num_artists, embedding_dim)
	self.genre_embedding = nn.Embedding(num_genres, embedding_dim)

	# Feature processing layers with residual connections
	self.numerical_layer = nn.Sequential(
	nn.Linear(num_numerical, embedding_dim),
	nn.ReLU(),
	nn.BatchNorm1d(embedding_dim)
	)

	self.binary_layer = nn.Sequential(
	nn.Linear(2, embedding_dim),
	nn.ReLU(),
	nn.BatchNorm1d(embedding_dim)
	)

	# Calculate total input features
	total_features = embedding_dim * 6 # 4 embeddings + numerical + binary

	# MLP layers with residual connections
	self.fc_layers = nn.ModuleList()
	input_dim = total_features

	for layer_size in layers:
	self.fc_layers.append(nn.ModuleDict({
	'main': nn.Sequential(
	nn.Linear(input_dim, layer_size),
	nn.ReLU(),
	nn.BatchNorm1d(layer_size),
	nn.Dropout(dropout)
	),
	'residual': nn.Linear(input_dim, layer_size) if input_dim != layer_size else None
	}))
	input_dim = layer_size

	self.final_layer = nn.Linear(layers[-1], 1)

	def forward(self, batch):
	# Get embeddings
	user_emb = self.user_embedding(batch['user_id'])
	music_emb = self.music_embedding(batch['music_id'])
	artist_emb = self.artist_embedding(batch['artist_id'])
	genre_emb = self.genre_embedding(batch['genre_id'])

	# Process numerical features
	numerical = self.numerical_layer(batch['numerical_features'])

	# Process binary features
	binary = torch.stack([batch['explicit'], batch['gender']], dim=1).float()
	binary = self.binary_layer(binary)

	# Concatenate all features
	x = torch.cat([
	user_emb, music_emb, artist_emb, genre_emb, numerical, binary
	], dim=1)

	# Apply MLP layers with residual connections
	for layer in self.fc_layers:
	identity = x
	x = layer['main'](x)
	if layer['residual'] is not None:
	x = x + layer['residual'](identity)

	# Final prediction
	return self.final_layer(x)

	def main():
	# Example usage
	BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	model_path = 'checkpoints/best_model.pth'
	catalog_data = pd.read_csv(os.path.join(BASE_DIR, 'data', 'test_data.csv'))
	encoders_path = os.path.join(BASE_DIR, 'data', 'data_encoders.pt')

	# Initialize recommendation generator
	recommender = RecommendationGenerator(model_path, catalog_data, encoders_path)

	# Example user
	user_info = {
	'age': 32,
	'gender': 'M',
	'genre': 'Pop',
	'music': 'Shape of You',
	'user_id': '44d39c6e5e7b45bfc2187fb3c89be58c5a3dc6a54d2a0075402c551c14ea1459'
	}

	# Generate recommendations
	recommendations = recommender.generate_recommendations(user_info, n_recommendations=10)

	print("\nTop 10 Recommended Songs:")
	print(recommendations.to_string(index=False))

	if __name__ == "__main__":
	main()