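"""Evaluate precomputed candidate summaries against reference summaries.

For each example, the highest-scoring candidate summary is selected and
compared to the reference(s) using token-level F1 and ROUGE, along with
length and compression-ratio statistics.
"""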
import argparse
import json
import numpy as np
import tqdm
from pathlib import Path
from pprint import pprint
from collections import defaultdict, Counter
from transformers import AutoTokenizer
import scrl.utils as utils
from scrl.model import load_checkpoint
from scrl.eval_metrics import compute_token_f1, rouge_scorer, ROUGE_TYPES
from nltk import word_tokenize

def get_hc_summary(output):
    # Pick the candidate summary with the highest score.
    i = np.argmax(output["scores"])
    summary = output["summaries"][i]
    mask = output["masks"][i]
    return summary
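
# Expected inputs (one JSON object per line):
#   --outputs: {"summaries": [...], "scores": [...], "masks": [...]}
#   --dataset: {"text": "...", "summaries": ["...", ...]}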
def main(args):
    outputs = list(utils.read_jsonl(args.outputs))
    dataset = list(utils.read_jsonl(args.dataset))
    all_scores = defaultdict(list)
    for i, item in tqdm.tqdm(enumerate(dataset)):
        src = item["text"]
        if args.lower_src:
            src = src.lower()
        tgts = item["summaries"]
        pred = get_hc_summary(outputs[i])
        if args.max_chars > 0:
            pred = pred[:args.max_chars]
        if args.pretokenized:
            src_tokens = src.split()
        else:
            src_tokens = word_tokenize(src)
        pred_tokens = word_tokenize(pred)
        if args.lower_summary:
            pred_tokens = [t.lower() for t in pred_tokens]
        item_scores = defaultdict(list)
        # Score the prediction against each reference summary.
        for tgt in tgts:
            if args.pretokenized:
                tgt_tokens = tgt.split()
            else:
                tgt_tokens = word_tokenize(tgt)
            if args.lower_summary:
                tgt_tokens = [t.lower() for t in tgt_tokens]
            token_fscore = compute_token_f1(tgt_tokens, pred_tokens, use_counts=True)
            rouge_scores = rouge_scorer.score(tgt, pred)
            for rouge_type, rouge_type_scores in rouge_scores.items():
                item_scores[f"{rouge_type}-p"].append(rouge_type_scores.precision)
                item_scores[f"{rouge_type}-r"].append(rouge_type_scores.recall)
                item_scores[f"{rouge_type}-f"].append(rouge_type_scores.fmeasure)
            item_scores["token-f1"].append(token_fscore)
            item_scores["tgt-len"].append(len(tgt_tokens))
            item_scores["tgt-cr"].append(len(tgt_tokens) / len(src_tokens))
        # Average over references, then aggregate across the dataset.
        for k, values in item_scores.items():
            item_mean = np.mean(values)
            all_scores[k].append(item_mean)
        all_scores["pred-len"].append(len(pred_tokens))
        all_scores["src-len"].append(len(src_tokens))
        all_scores["pred-cr"].append(len(pred_tokens) / len(src_tokens))
        if args.verbose:
            print("SRC:", src)
            print("TGT:", tgts[0])
            print("PRED:", pred)
            print("=" * 100)
print("="*100) | |
print("RESULTS:") | |
print("="*20, "Length (#tokens):", "="*20) | |
for metric in ("src-len", "tgt-len", "pred-len"): | |
mean = np.mean(all_scores[metric]) | |
print(f"{metric}: {mean:.2f}") | |
print() | |
print("="*20, "Compression ratio:", "="*20) | |
for metric in ("tgt-cr", "pred-cr"): | |
mean = np.mean(all_scores[metric]) | |
print(f"{metric}: {mean:.2f}") | |
print() | |
print("="*20, "Token F1-Score:", "="*20) | |
mean = np.mean(all_scores["token-f1"]) | |
print(f"f1-score: {mean:.3f}") | |
print() | |
print("="*20, "ROUGE F1-Scores:", "="*20) | |
for rouge_type in ROUGE_TYPES: | |
mean = np.mean(all_scores[f"{rouge_type}-f"]) | |
print(f"{rouge_type}: {mean:.4f}") | |
print() | |
print("="*20, "ROUGE Recall:", "="*20) | |
for rouge_type in ROUGE_TYPES: | |
mean = np.mean(all_scores[f"{rouge_type}-r"]) | |
print(f"{rouge_type}: {mean:.4f}") | |
print() | |

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True)
    parser.add_argument('--outputs', required=True)
    parser.add_argument('--pretokenized', action="store_true")
    parser.add_argument('--max-chars', type=int, default=-1)
    parser.add_argument('--verbose', action="store_true")
    parser.add_argument('--lower-src', action="store_true")
    parser.add_argument('--lower-summary', action="store_true")
    return parser.parse_args()

if __name__ == '__main__':
    main(parse_args())
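
# Example invocation (script and file names are placeholders):
#   python evaluate_hc_output.py \
#       --dataset data/test-data.jsonl \
#       --outputs data/hc-outputs.jsonl \
#       --verbose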