# src/evaluate.py
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import json
from model import TransformerModel
from utils import load_vocab
from tqdm import tqdm
import os


class TextDataset(Dataset):
    def __init__(self, data_path, vocab, seq_length=50):
        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        # Assumes the vocabulary defines '<unk>' and '<pad>' special tokens
        self.vocab = vocab
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data)

    def numericalize(self, tokens):
        # Map tokens to indices, falling back to the <unk> index for out-of-vocabulary tokens
        return [self.vocab.get(token, self.vocab['<unk>']) for token in tokens]

    def __getitem__(self, idx):
        tokens = self.data[idx]
        numericalized = self.numericalize(tokens)
        # Pad or truncate to seq_length + 1 so input and target are each seq_length long
        if len(numericalized) < self.seq_length + 1:
            numericalized += [self.vocab['<pad>']] * (self.seq_length + 1 - len(numericalized))
        else:
            numericalized = numericalized[:self.seq_length + 1]
        input_seq = torch.tensor(numericalized[:-1], dtype=torch.long)
        target_seq = torch.tensor(numericalized[1:], dtype=torch.long)
        return input_seq, target_seq


def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs = torch.stack(inputs)
    targets = torch.stack(targets)
    return inputs, targets


def get_dataloader(data_path, vocab, batch_size=64, seq_length=50):
    dataset = TextDataset(data_path, vocab, seq_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    return dataloader


def evaluate_model(config):
    # Load vocabulary
    vocab = load_vocab(config['vocab_path'])
    vocab_size = len(vocab)

    # Initialize model
    model = TransformerModel(
        vocab_size=vocab_size,
        embed_size=config['embed_size'],
        num_heads=config['num_heads'],
        hidden_dim=config['hidden_dim'],
        num_layers=config['num_layers'],
        dropout=config['dropout']
    )

    # Load model weights
    model.load_state_dict(torch.load(config['model_path'], map_location=torch.device('cpu')))
    model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Loss function: ignore padding positions when computing the loss
    criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])

    # DataLoader
    dataloader = get_dataloader(
        data_path=config['data_path'],
        vocab=vocab,
        batch_size=config['batch_size'],
        seq_length=config['seq_length']
    )

    total_loss = 0
    total_sequences = 0

    with torch.no_grad():
        for inputs, targets in tqdm(dataloader, desc="Evaluating"):
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Causal mask so each position only attends to earlier positions
            src_mask = model.generate_square_subsequent_mask(inputs.size(1)).to(device)
            outputs = model(inputs, src_mask)

            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))

            # Weight each batch's mean loss by its number of sequences; this averages
            # per sequence, which approximates the per-token average when padding is sparse
            total_loss += loss.item() * inputs.size(0)
            total_sequences += inputs.size(0)

    average_loss = total_loss / total_sequences
    perplexity = torch.exp(torch.tensor(average_loss))

    print(f"Average Loss: {average_loss:.4f}")
    print(f"Perplexity: {perplexity:.4f}")


if __name__ == "__main__":
    config = {
        'vocab_path': 'vocab.json',
        'data_path': 'data/processed/tokenized_data.json',
        'model_path': 'models/3ed0k4_model_epoch10.pth',  # Update accordingly
        'embed_size': 256,
        'num_heads': 8,
        'hidden_dim': 512,
        'num_layers': 4,
        'dropout': 0.1,
        'batch_size': 64,
        'seq_length': 50,
    }
    evaluate_model(config)