# src/evaluate.py
import json
import math

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Project-local modules: the Transformer definition and the vocab loader.
from model import TransformerModel
from utils import load_vocab
class TextDataset(Dataset):
    """Tokenized-text dataset yielding fixed-length (input, target) pairs."""

    def __init__(self, data_path, vocab, seq_length=50):
        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)  # list of token lists
        self.vocab = vocab
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data)

    def numericalize(self, tokens):
        # Map tokens to ids, falling back to <UNK> for out-of-vocabulary tokens.
        return [self.vocab.get(token, self.vocab['<UNK>']) for token in tokens]

    def __getitem__(self, idx):
        tokens = self.data[idx]
        numericalized = self.numericalize(tokens)
        # Pad or truncate to seq_length + 1 so that input and target
        # are each exactly seq_length tokens long.
        if len(numericalized) < self.seq_length + 1:
            numericalized += [self.vocab['<PAD>']] * (self.seq_length + 1 - len(numericalized))
        else:
            numericalized = numericalized[:self.seq_length + 1]
        # Next-token prediction: the target is the input shifted left by one.
        input_seq = torch.tensor(numericalized[:-1], dtype=torch.long)
        target_seq = torch.tensor(numericalized[1:], dtype=torch.long)
        return input_seq, target_seq
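
# A quick shape check, as a sketch (the two-token vocab here is hypothetical;
# the real mapping comes from vocab.json):
#
#   vocab = {'<PAD>': 0, '<UNK>': 1}
#   ds = TextDataset('data/processed/tokenized_data.json', vocab, seq_length=50)
#   x, y = ds[0]   # each a torch.long tensor of shape (50,)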
def collate_fn(batch):
    # Every sample is already fixed-length, so stacking is sufficient
    # (the default collate would behave the same way here).
    inputs, targets = zip(*batch)
    return torch.stack(inputs), torch.stack(targets)
def get_dataloader(data_path, vocab, batch_size=64, seq_length=50):
    dataset = TextDataset(data_path, vocab, seq_length)
    # No shuffling during evaluation; sample order does not affect the metrics.
    return DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
def evaluate_model(config):
    # Load vocabulary
    vocab = load_vocab(config['vocab_path'])
    vocab_size = len(vocab)

    # Initialize model
    model = TransformerModel(
        vocab_size=vocab_size,
        embed_size=config['embed_size'],
        num_heads=config['num_heads'],
        hidden_dim=config['hidden_dim'],
        num_layers=config['num_layers'],
        dropout=config['dropout']
    )

    # Load the weights onto CPU first, then move to the available device.
    model.load_state_dict(torch.load(config['model_path'], map_location=torch.device('cpu')))
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Loss function; padding positions are excluded from the average.
    criterion = nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'])

    # DataLoader
    dataloader = get_dataloader(
        data_path=config['data_path'],
        vocab=vocab,
        batch_size=config['batch_size'],
        seq_length=config['seq_length']
    )

    total_loss = 0.0
    total_tokens = 0
    with torch.no_grad():
        for inputs, targets in tqdm(dataloader, desc="Evaluating"):
            inputs = inputs.to(device)
            targets = targets.to(device)
            # Causal mask so each position only attends to earlier positions.
            src_mask = model.generate_square_subsequent_mask(inputs.size(1)).to(device)
            outputs = model(inputs, src_mask)
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            # criterion averages over non-pad tokens, so weight each batch by
            # its non-pad token count rather than by its batch size.
            num_tokens = (targets != vocab['<PAD>']).sum().item()
            total_loss += loss.item() * num_tokens
            total_tokens += num_tokens

    # Perplexity is the exponential of the average per-token cross-entropy.
    average_loss = total_loss / total_tokens
    perplexity = math.exp(average_loss)
    print(f"Average Loss: {average_loss:.4f}")
    print(f"Perplexity: {perplexity:.4f}")
if __name__ == "__main__":
    config = {
        'vocab_path': 'vocab.json',
        'data_path': 'data/processed/tokenized_data.json',
        'model_path': 'models/3ed0k4_model_epoch10.pth',  # Update accordingly
        # Architecture settings below must match the trained checkpoint.
        'embed_size': 256,
        'num_heads': 8,
        'hidden_dim': 512,
        'num_layers': 4,
        'dropout': 0.1,
        'batch_size': 64,
        'seq_length': 50,
    }
    evaluate_model(config)
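
# Usage, assuming the default layout in the config above (vocab.json at the
# repo root, the checkpoint under models/):
#
#   python src/evaluate.py
#
# The script prints the average cross-entropy loss and the perplexity over
# the evaluation dataset.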