Spaces:
Runtime error
Runtime error
from collections import Counter | |
from itertools import chain | |
import math | |
import torch | |
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction | |
def ngrams(sequence, n): | |
return [tuple(sequence[i:i+n]) for i in range(len(sequence)-n+1)] | |
def count_ngrams(sequence, max_n): | |
counts = Counter() | |
for n in range(1, max_n + 1): | |
counts.update(ngrams(sequence, n)) | |
return counts | |
def self_bleu(outputs): | |
smoothing_function = SmoothingFunction().method1 | |
scores = [] | |
for i in range(len(outputs)): | |
references = outputs[:i] + outputs[i+1:] | |
# Avoid calculating BLEU score for empty references | |
if references: | |
scores.append(sentence_bleu(references, outputs[i], smoothing_function=smoothing_function)) | |
# If all references are empty, return a default value | |
if not scores: | |
return 0 | |
return sum(scores) / len(scores) | |
def dist_n(outputs, n): | |
all_ngrams = list(chain(*[ngrams(output, n) for output in outputs])) | |
unique_ngrams = set(all_ngrams) | |
return len(unique_ngrams) / len(all_ngrams) if all_ngrams else 0 | |
def perplexity(model, tokenizer, texts): | |
encodings = tokenizer(texts, return_tensors='pt', padding=True, truncation=True) | |
max_length = model.config.n_positions | |
stride = 512 | |
lls = [] | |
for i in range(0, encodings.input_ids.size(1), stride): | |
begin_loc = max(i + stride - max_length, 0) | |
end_loc = i + stride | |
trg_len = end_loc - i | |
input_ids = encodings.input_ids[:, begin_loc:end_loc].to(model.device) | |
target_ids = input_ids.clone() | |
target_ids[:, :-trg_len] = -100 | |
with torch.no_grad(): | |
outputs = model(input_ids, labels=target_ids) | |
log_likelihood = outputs.loss * trg_len | |
lls.append(log_likelihood) | |
ppl = torch.exp(torch.stack(lls).sum() / end_loc) | |
return ppl.item() | |
def js_divergence(p, q): | |
def kl_divergence(p, q): | |
return sum(p[i] * math.log(p[i] / q[i]) for i in range(len(p)) if p[i] != 0 and q[i] != 0) | |
p_norm = [float(i)/sum(p) for i in p] | |
q_norm = [float(i)/sum(q) for i in q] | |
m = [(p_norm[i] + q_norm[i]) / 2 for i in range(len(p_norm))] | |
return (kl_divergence(p_norm, m) + kl_divergence(q_norm, m)) / 2 | |