Spaces:

evaluate-metric
/

google_bleu

Running

App Files Files Community

google_bleu / google_bleu.py

lvwerra HF staff

Update Space (evaluate main: dfdd0cc0)

70a043b over 2 years ago

raw

history blame

8.69 kB

	# Copyright 2020 The HuggingFace Evaluate Authors.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	""" Google BLEU (aka GLEU) metric. """

	from typing import Dict, List

	import datasets
	from nltk.translate import gleu_score

	import evaluate
	from evaluate import EvaluationModuleInfo

	from .tokenizer_13a import Tokenizer13a


	# BibTeX entry for the paper this metric cites (Wu et al., 2016, arXiv:1609.08144).
	# Passed as `citation` to `EvaluationModuleInfo` in `GoogleBleu._info`.
	_CITATION = """\
	@misc{wu2016googles,
	title={Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation},
	author={Yonghui Wu and Mike Schuster and Zhifeng Chen and Quoc V. Le and Mohammad Norouzi and Wolfgang Macherey
	and Maxim Krikun and Yuan Cao and Qin Gao and Klaus Macherey and Jeff Klingner and Apurva Shah and Melvin
	Johnson and Xiaobing Liu and Łukasz Kaiser and Stephan Gouws and Yoshikiyo Kato and Taku Kudo and Hideto
	Kazawa and Keith Stevens and George Kurian and Nishant Patil and Wei Wang and Cliff Young and
	Jason Smith and Jason Riesa and Alex Rudnick and Oriol Vinyals and Greg Corrado and Macduff Hughes
	and Jeffrey Dean},
	year={2016},
	eprint={1609.08144},
	archivePrefix={arXiv},
	primaryClass={cs.CL}
	}
	"""

	# Prose description of the GLEU score: the minimum of n-gram recall and
	# precision, ranging from 0 to 1. Used as `description` in `GoogleBleu._info`
	# and passed to the `add_start_docstrings` decorator on the class.
	_DESCRIPTION = """\
	The BLEU score has some undesirable properties when used for single
	sentences, as it was designed to be a corpus measure. We therefore
	use a slightly different score for our RL experiments which we call
	the 'GLEU score'. For the GLEU score, we record all sub-sequences of
	1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then
	compute a recall, which is the ratio of the number of matching n-grams
	to the number of total n-grams in the target (ground truth) sequence,
	and a precision, which is the ratio of the number of matching n-grams
	to the number of total n-grams in the generated output sequence. Then
	GLEU score is simply the minimum of recall and precision. This GLEU
	score's range is always between 0 (no matches) and 1 (all match) and
	it is symmetrical when switching output and target. According to
	our experiments, GLEU score correlates quite well with the BLEU
	metric on a corpus level but does not have its drawbacks for our per
	sentence reward objective.
	"""

	# Usage documentation for the metric: argument list, return value, and four
	# doctest-style examples (single reference, multiple references, custom
	# `min_len`, custom `min_len`/`max_len`). Used as `inputs_description` in
	# `GoogleBleu._info` and passed to the `add_start_docstrings` decorator.
	_KWARGS_DESCRIPTION = """\
	Computes corpus-level Google BLEU (GLEU) score of translated segments against one or more references.
	Instead of averaging the sentence level GLEU scores (i.e. macro-average precision), Wu et al. (2016) sum up the matching
	tokens and the max of hypothesis and reference tokens for each sentence, then compute using the aggregate values.

	Args:
	predictions (list of str): list of translations to score.
	references (list of list of str): list of lists of references for each translation.
	tokenizer : approach used for tokenizing `predictions` and `references`.
	The default tokenizer is `tokenizer_13a`, a minimal tokenization approach that is equivalent to `mteval-v13a`, used by WMT.
	This can be replaced by any function that takes a string as input and returns a list of tokens as output.
	min_len (int): The minimum order of n-gram this function should extract. Defaults to 1.
	max_len (int): The maximum order of n-gram this function should extract. Defaults to 4.

	Returns:
	'google_bleu': google_bleu score

	Examples:
	Example 1:
	>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', \
	'he read the book because he was interested in world history']
	>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat'], \
	['he was interested in world history because he read the book']]
	>>> google_bleu = evaluate.load("google_bleu")
	>>> results = google_bleu.compute(predictions=predictions, references=references)
	>>> print(round(results["google_bleu"], 2))
	0.44

	Example 2:
	>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', \
	'he read the book because he was interested in world history']
	>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat', \
	'It is a guide to action that ensures that the rubber duck will never heed the cat commands', \
	'It is the practical guide for the rubber duck army never to heed the directions of the cat'], \
	['he was interested in world history because he read the book']]
	>>> google_bleu = evaluate.load("google_bleu")
	>>> results = google_bleu.compute(predictions=predictions, references=references)
	>>> print(round(results["google_bleu"], 2))
	0.61

	Example 3:
	>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', \
	'he read the book because he was interested in world history']
	>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat', \
	'It is a guide to action that ensures that the rubber duck will never heed the cat commands', \
	'It is the practical guide for the rubber duck army never to heed the directions of the cat'], \
	['he was interested in world history because he read the book']]
	>>> google_bleu = evaluate.load("google_bleu")
	>>> results = google_bleu.compute(predictions=predictions, references=references, min_len=2)
	>>> print(round(results["google_bleu"], 2))
	0.53

	Example 4:
	>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', \
	'he read the book because he was interested in world history']
	>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat', \
	'It is a guide to action that ensures that the rubber duck will never heed the cat commands', \
	'It is the practical guide for the rubber duck army never to heed the directions of the cat'], \
	['he was interested in world history because he read the book']]
	>>> google_bleu = evaluate.load("google_bleu")
	>>> results = google_bleu.compute(predictions=predictions,references=references, min_len=2, max_len=6)
	>>> print(round(results["google_bleu"], 2))
	0.4
	"""


	@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
	class GoogleBleu(evaluate.EvaluationModule):
	    # Evaluation module that tokenizes predictions/references and delegates the
	    # corpus-level GLEU computation to `nltk.translate.gleu_score.corpus_gleu`.

	    def _info(self) -> EvaluationModuleInfo:
	        """Return the module metadata and the accepted input schemas.

	        Two feature layouts are declared: `references` given as a sequence of
	        strings per prediction, or as a single string per prediction.
	        """
	        return evaluate.EvaluationModuleInfo(
	            description=_DESCRIPTION,
	            citation=_CITATION,
	            inputs_description=_KWARGS_DESCRIPTION,
	            features=[
	                # Multiple references per prediction.
	                datasets.Features(
	                    {
	                        "predictions": datasets.Value("string", id="sequence"),
	                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
	                    }
	                ),
	                # Exactly one reference per prediction, given as a bare string.
	                datasets.Features(
	                    {
	                        "predictions": datasets.Value("string", id="sequence"),
	                        "references": datasets.Value("string", id="sequence"),
	                    }
	                ),
	            ],
	        )

	    def _compute(
	        self,
	        predictions: List[str],
	        references: List[List[str]],
	        tokenizer=Tokenizer13a(),
	        min_len: int = 1,
	        max_len: int = 4,
	    ) -> Dict[str, float]:
	        """Compute the corpus-level Google BLEU (GLEU) score.

	        Args:
	            predictions: Hypothesis sentences, one string per example.
	            references: For each prediction, either a list of reference
	                strings or a single reference string.
	            tokenizer: Callable applied to every prediction and reference
	                string; its result is handed to NLTK as the token sequence.
	                The default instance is created once, when the class body is
	                evaluated, and is shared by all calls that omit the argument.
	            min_len: Forwarded to `corpus_gleu` as the minimum n-gram order.
	            max_len: Forwarded to `corpus_gleu` as the maximum n-gram order.

	        Returns:
	            A dict with the single key ``"google_bleu"`` holding the value
	            returned by `gleu_score.corpus_gleu`.
	        """
	        # Wrap bare-string references so every entry is a list of references.
	        # The check is done per entry rather than on `references[0]` only:
	        # empty input no longer raises IndexError here, and a bare string that
	        # appears after a list entry is not iterated character by character.
	        references = [[ref] if isinstance(ref, str) else ref for ref in references]

	        tokenized_references = [[tokenizer(r) for r in refs] for refs in references]
	        tokenized_predictions = [tokenizer(p) for p in predictions]

	        score = gleu_score.corpus_gleu(
	            list_of_references=tokenized_references,
	            hypotheses=tokenized_predictions,
	            min_len=min_len,
	            max_len=max_len,
	        )
	        return {"google_bleu": score}