"""
Created on 02.02.24

Module for raw ROUGE score calculation from:

@inproceedings{straka-etal-2018-sumeczech,
    title = "{S}ume{C}zech: Large {C}zech News-Based Summarization Dataset",
    author = "Straka, Milan and
      Mediankin, Nikita and
      Kocmi, Tom and
      {\v{Z}}abokrtsk{\'y}, Zden{\v{e}}k and
      Hude{\v{c}}ek, Vojt{\v{e}}ch and
      Haji{\v{c}}, Jan",
    editor = "Calzolari, Nicoletta and
      Choukri, Khalid and
      Cieri, Christopher and
      Declerck, Thierry and
      Goggi, Sara and
      Hasida, Koiti and
      Isahara, Hitoshi and
      Maegaard, Bente and
      Mariani, Joseph and
      Mazo, H{\'e}l{\`e}ne and
      Moreno, Asuncion and
      Odijk, Jan and
      Piperidis, Stelios and
      Tokunaga, Takenobu",
    booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
    month = may,
    year = "2018",
    address = "Miyazaki, Japan",
    publisher = "European Language Resources Association (ELRA)",
    url = "https://aclanthology.org/L18-1551",
}

:author: Martin Dočekal
"""
|
import collections
import re
from typing import Optional, Sequence

import datasets
import evaluate
import numpy as np
|
|
class AggregateScore(collections.namedtuple("AggregateScore", ["low", "mid", "high"])):
    """
    Tuple containing confidence intervals for scores.
    Taken from: https://github.com/google-research/google-research/blob/master/rouge/scoring.py
    """
|
|
class Score(
        collections.namedtuple("Score", ["precision", "recall", "fmeasure"])):
    """Tuple containing precision, recall, and f-measure values."""
|
|
class BootstrapAggregator(object):
    """Aggregates scores to provide confidence intervals.
    Taken from: https://github.com/google-research/google-research/blob/master/rouge/scoring.py

    Sample usage:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'])
        aggregator = BootstrapAggregator()
        aggregator.add_scores(scorer.score("one two three", "one two"))
        aggregator.add_scores(scorer.score("one two five six", "seven eight"))
        result = aggregator.aggregate()
        print(result)
        {'rougeL': AggregateScore(
            low=Score(precision=0.0, recall=0.0, fmeasure=0.0),
            mid=Score(precision=0.5, recall=0.33, fmeasure=0.40),
            high=Score(precision=1.0, recall=0.66, fmeasure=0.80)),
         'rouge1': AggregateScore(
            low=Score(precision=0.0, recall=0.0, fmeasure=0.0),
            mid=Score(precision=0.5, recall=0.33, fmeasure=0.40),
            high=Score(precision=1.0, recall=0.66, fmeasure=0.80))}
    """
|
    def __init__(self, confidence_interval=0.95, n_samples=1000):
        """Initializes a BootstrapAggregator object.

        Args:
            confidence_interval: Confidence interval to compute on the mean as a
                decimal.
            n_samples: Number of samples to use for bootstrap resampling.

        Raises:
            ValueError: If invalid argument is given.
        """

        if confidence_interval < 0 or confidence_interval > 1:
            raise ValueError("confidence_interval must be in range [0, 1]")
        if n_samples <= 0:
            raise ValueError("n_samples must be positive")

        self._n_samples = n_samples
        self._confidence_interval = confidence_interval
        self._scores = collections.defaultdict(list)
|
    def add_scores(self, scores):
        """Adds a sample for future aggregation.

        Args:
            scores: Dict mapping score_type strings to a namedtuple object/class
                representing a score.
        """

        for score_type, score in scores.items():
            self._scores[score_type].append(score)
|
    def aggregate(self):
        """Aggregates scores previously added using add_scores.

        Returns:
            A dict mapping score_type to AggregateScore objects.
        """

        result = {}
        for score_type, scores in self._scores.items():
            score_matrix = np.vstack(tuple(scores))
            percentiles = self._bootstrap_resample(score_matrix)
            intervals = tuple(
                scores[0].__class__(*percentiles[j, :]) for j in range(3))
            result[score_type] = AggregateScore(
                low=intervals[0], mid=intervals[1], high=intervals[2])
        return result
|
    def _bootstrap_resample(self, matrix):
        """Performs bootstrap resampling on a matrix of scores.

        Args:
            matrix: A 2-d matrix of (sample, measure).

        Returns:
            A 2-d matrix of (bounds, measure). There are three bounds: low (row 0),
            mid (row 1) and high (row 2). Mid is the median of the bootstrapped
            means, while low and high bounds are specified by
            self._confidence_interval (which defaults to 0.95, meaning it will
            return the 2.5th and 97.5th percentiles for a 95% confidence interval
            on the mean).
        """

        # Resample with replacement and record the mean of each resample.
        sample_mean = np.zeros((self._n_samples, matrix.shape[1]))
        for i in range(self._n_samples):
            sample_idx = np.random.choice(
                np.arange(matrix.shape[0]), size=matrix.shape[0])
            sample = matrix[sample_idx, :]
            sample_mean[i, :] = np.mean(sample, axis=0)

        # Take percentiles of the bootstrap distribution of the means.
        percentile_delta = (1 - self._confidence_interval) / 2
        q = 100 * np.array([percentile_delta, 0.5, 1 - percentile_delta])
        return np.percentile(sample_mean, q, axis=0)
|
|
class RougeRawOriginal:
    """
    This is the original implementation of the RougeRAW metric.
    Computes the RougeRAW-1, RougeRAW-2, and RougeRAW-L metrics.
    """
|
    class FScore:
        """F1 score representation."""

        def __init__(self, correct, gold, system):
            # correct/gold/system are counts of matched, reference, and predicted
            # units; f is the harmonic mean of precision and recall.
            self.p = correct / system if system else 0.
            self.r = correct / gold if gold else 0.
            self.f = 2 * correct / (system + gold) if system + gold else 0.
|
    def _rouge_n(self, n, gold_words, system_words):
        """Compute Rouge-n for given words."""

        def n_grams(n, words):
            ngrams = {}
            total = 0
            for i in range(len(words) - n + 1):
                ngram = "\t".join(words[i:i + n])
                ngrams[ngram] = 1 + ngrams.get(ngram, 0)
                total += 1
            return ngrams, total

        gold_ngrams, gold_total = n_grams(n, gold_words)
        system_ngrams, system_total = n_grams(n, system_words)

        # Clipped n-gram overlap between the system and gold n-gram counts.
        intersection = 0
        for ngram in system_ngrams:
            intersection += min(system_ngrams[ngram], gold_ngrams.get(ngram, 0))

        return self.FScore(intersection, gold_total, system_total)
|
    def _rouge_l(self, gold_words, system_words):
        """Compute Rouge-L for given words."""
        # Dynamic programming over the longest common subsequence (LCS):
        # lcs[r][s] holds the LCS length of gold_words[:r + 1] and
        # system_words[:s + 1].
        lcs = [[0] * len(system_words) for _ in gold_words]
        for r in range(len(gold_words)):
            for s in range(len(system_words)):
                if gold_words[r] == system_words[s]:
                    lcs[r][s] = 1 + (lcs[r - 1][s - 1] if r and s else 0)
                lcs[r][s] = max(lcs[r][s], lcs[r - 1][s] if r else 0)
                lcs[r][s] = max(lcs[r][s], lcs[r][s - 1] if s else 0)

        return self.FScore(lcs[-1][-1], len(gold_words), len(system_words))
|
    def _tokenize(self, text):
        """Tokenize given text."""
        # Insert spaces at word boundaries, collapse whitespace runs, and split.
        # The flags must be passed as a keyword argument: the fourth positional
        # argument of re.sub is `count`, so passing re.UNICODE there would
        # silently cap the number of substitutions.
        text = re.sub(r"\b", " ", text, flags=re.UNICODE)
        return re.sub(r"\s+", " ", text, flags=re.UNICODE).strip().split(" ")
|
    def document(self, gold, system):
        """Compute RougeRAW-1, RougeRAW-2, RougeRAW-L for given documents.
        Each document should be a string.
        """

        assert isinstance(gold, str) and isinstance(system, str), "Expected string arguments"

        lc_gold_words = [word.lower() for word in self._tokenize(gold)]
        lc_system_words = [word.lower() for word in self._tokenize(system)]

        return {
            "1": self._rouge_n(1, lc_gold_words, lc_system_words),
            "2": self._rouge_n(2, lc_gold_words, lc_system_words),
            "L": self._rouge_l(lc_gold_words, lc_system_words),
        }
|
    def corpus(self, gold, system, aggregate=True):
        """Compute RougeRAW-1, RougeRAW-2, RougeRAW-L for given corpora.
        Each corpus should be a collection of documents, each document a string.

        If aggregate is True, the lower, mid, and upper bounds of the confidence interval are returned.
        """

        assert isinstance(gold, list) and isinstance(system, list), "Expected list arguments"
        assert len(gold) == len(system), "Given corpora should be of the same length"

        if aggregate:
            aggregator = BootstrapAggregator()
        else:
            rouge = {key: self.FScore(0, 0, 0) for key in ["1", "2", "L"]}

        for gold_document, system_document in zip(gold, system):
            for key, value in self.document(gold_document, system_document).items():
                if aggregate:
                    aggregator.add_scores({
                        key: Score(precision=value.p, recall=value.r, fmeasure=value.f)
                    })
                else:
                    rouge[key].p += value.p
                    rouge[key].r += value.r
                    rouge[key].f += value.f

        # Average the per-document scores; guard against an empty corpus to
        # avoid division by zero.
        if not aggregate and gold:
            for key in rouge:
                rouge[key].p /= len(gold)
                rouge[key].r /= len(gold)
                rouge[key].f /= len(gold)

        if aggregate:
            rouge = {}
            for k, ag_score in aggregator.aggregate().items():
                rouge[k + "_low_precision"] = float(ag_score.low.precision)
                rouge[k + "_low_recall"] = float(ag_score.low.recall)
                rouge[k + "_low_fmeasure"] = float(ag_score.low.fmeasure)

                rouge[k + "_mid_precision"] = float(ag_score.mid.precision)
                rouge[k + "_mid_recall"] = float(ag_score.mid.recall)
                rouge[k + "_mid_fmeasure"] = float(ag_score.mid.fmeasure)

                rouge[k + "_high_precision"] = float(ag_score.high.precision)
                rouge[k + "_high_recall"] = float(ag_score.high.recall)
                rouge[k + "_high_fmeasure"] = float(ag_score.high.fmeasure)

        return rouge
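
# A minimal sketch of using RougeRawOriginal directly, without the `evaluate`
# wrapper below (an illustration added here, not part of the original script):
#
#   scorer = RougeRawOriginal()
#   scores = scorer.document("the cat is on the mat", "the cat sat on the mat")
#   print(scores["1"].p, scores["1"].r, scores["1"].f)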
|
|
|
|
|
_CITATION = """\
@inproceedings{straka-etal-2018-sumeczech,
    title = "{S}ume{C}zech: Large {C}zech News-Based Summarization Dataset",
    author = "Straka, Milan and
      Mediankin, Nikita and
      Kocmi, Tom and
      {\v{Z}}abokrtsk{\'y}, Zden{\v{e}}k and
      Hude{\v{c}}ek, Vojt{\v{e}}ch and
      Haji{\v{c}}, Jan",
    editor = "Calzolari, Nicoletta and
      Choukri, Khalid and
      Cieri, Christopher and
      Declerck, Thierry and
      Goggi, Sara and
      Hasida, Koiti and
      Isahara, Hitoshi and
      Maegaard, Bente and
      Mariani, Joseph and
      Mazo, H{\'e}l{\`e}ne and
      Moreno, Asuncion and
      Odijk, Jan and
      Piperidis, Stelios and
      Tokunaga, Takenobu",
    booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
    month = may,
    year = "2018",
    address = "Miyazaki, Japan",
    publisher = "European Language Resources Association (ELRA)",
    url = "https://aclanthology.org/L18-1551",
}
"""
|
|
_DESCRIPTION = """\
ROUGE RAW is a language-agnostic variant of ROUGE without a stemmer, stop words, and synonyms.
This is a wrapper around the original http://hdl.handle.net/11234/1-2615 script.
"""
|
|
_KWARGS_DESCRIPTION = """
ROUGE RAW metric for a list of predictions and references.
Args:
    predictions: list of predictions to evaluate. Each prediction should be a string with tokens separated by spaces.
    references: list of references, one for each prediction. Each reference should be a string with tokens separated by spaces.
    select: (Optional) string. The name of the metric to return, using the key formats described below, e.g. '1_mid_fmeasure' (or '1_fmeasure' when aggregate is False). If None, all metrics are returned as a dictionary.
    aggregate: (Optional) bool, defaults to True. If True, bootstrapped confidence intervals are computed; if False, plain corpus-level averages are returned.
Returns:
    This metric outputs a dictionary containing the scores.
    There are precision, recall, and F1 values for RougeRAW-1, RougeRAW-2, and RougeRAW-L. By default the bootstrapped confidence intervals are calculated, meaning that for each metric there are low, mid, and high values specifying the confidence interval.

    Key format:
    ```
    {1|2|L}_{low|mid|high}_{precision|recall|fmeasure}
    e.g.: 1_low_precision
    ```

    If aggregate is False the format is:
    ```
    {1|2|L}_{precision|recall|fmeasure}
    e.g.: 1_precision
    ```
Examples:
    >>> rougeraw = evaluate.load('CZLC/rouge_raw')
    >>> predictions = ["the cat is on the mat", "hello there"]
    >>> references = ["the cat is on the mat", "hello there"]
    >>> results = rougeraw.compute(predictions=predictions, references=references, aggregate=False)
    >>> print(results)
    {'1_precision': 1.0, '1_recall': 1.0, '1_fmeasure': 1.0, '2_precision': 1.0, '2_recall': 1.0, '2_fmeasure': 1.0, 'L_precision': 1.0, 'L_recall': 1.0, 'L_fmeasure': 1.0}
"""
|
|
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class RougeRaw(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            reference_urls=[
                "http://hdl.handle.net/11234/1-2615",
            ],
        )
|
    def _compute(self, predictions: Sequence[str], references: Sequence[str], select: Optional[str] = None,
                 aggregate: bool = True):
        # The original scorer expects (gold, system) argument order.
        res = RougeRawOriginal().corpus(references, predictions, aggregate=aggregate)

        if not aggregate:
            res = {
                "1_precision": res["1"].p,
                "1_recall": res["1"].r,
                "1_fmeasure": res["1"].f,
                "2_precision": res["2"].p,
                "2_recall": res["2"].r,
                "2_fmeasure": res["2"].f,
                "L_precision": res["L"].p,
                "L_recall": res["L"].r,
                "L_fmeasure": res["L"].f,
            }

        if select is not None:
            return res[select]
        return res
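

# A minimal runnable sketch (an addition for illustration, not part of the
# original script): it exercises both the plain corpus-level averages and the
# bootstrapped confidence intervals produced by RougeRawOriginal.
if __name__ == "__main__":
    gold = ["the cat is on the mat", "hello there general kenobi"]
    system = ["the cat sat on the mat", "hello there"]

    scorer = RougeRawOriginal()

    # Plain per-corpus averages of precision/recall/F1 for each variant.
    plain = scorer.corpus(gold, system, aggregate=False)
    for key in ("1", "2", "L"):
        print(f"RougeRAW-{key}: P={plain[key].p:.3f} R={plain[key].r:.3f} F={plain[key].f:.3f}")

    # Bootstrapped 95% confidence intervals (low/mid/high keys).
    aggregated = scorer.corpus(gold, system, aggregate=True)
    print(f"RougeRAW-1 F1 95% CI: [{aggregated['1_low_fmeasure']:.3f}, {aggregated['1_high_fmeasure']:.3f}]")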
|
|