# Source code for mindspore.train.metrics.bleu_score

# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""BleuScore."""
from __future__ import absolute_import

from collections import Counter
import numpy as np

from mindspore import _checkparam as validator
from mindspore.train.metrics.metric import Metric, rearrange_inputs


class BleuScore(Metric):
    """
    Calculates the BLEU score.

    BLEU (bilingual evaluation understudy) is a metric for evaluating the quality of text translated by machine.

    Args:
        n_gram (int): The n_gram value ranges from 1 to 4. Default: 4.
        smooth (bool): Whether or not to apply smoothing. Default: False.

    Raises:
        ValueError: If the value range of n_gram is not from 1 to 4.

    Supported Platforms:
        ``Ascend`` ``GPU`` ``CPU``

    Examples:
        >>> from mindspore.train import BleuScore
        >>>
        >>> candidate_corpus = [['i', 'have', 'a', 'pen', 'on', 'my', 'desk']]
        >>> reference_corpus = [[['i', 'have', 'a', 'pen', 'in', 'my', 'desk'],
        ...                      ['there', 'is', 'a', 'pen', 'on', 'the', 'desk']]]
        >>> metric = BleuScore()
        >>> metric.clear()
        >>> metric.update(candidate_corpus, reference_corpus)
        >>> bleu_score = metric.eval()
        >>> print(bleu_score)
        0.5946035575013605
    """

    def __init__(self, n_gram=4, smooth=False):
        super().__init__()
        self.n_gram = validator.check_value_type("n_gram", n_gram, [int])
        if self.n_gram > 4 or self.n_gram < 1:
            raise ValueError("For 'BleuScore', the argument 'n_gram' should range from 1 to 4, "
                             "but got {}.".format(n_gram))

        self.smooth = validator.check_value_type("smooth", smooth, [bool])
        self.clear()

    def clear(self):
        """Clear the internal evaluation result."""
        # Per-order accumulators: index k holds the totals for (k + 1)-grams.
        self._numerator = np.zeros(self.n_gram)
        self._denominator = np.zeros(self.n_gram)
        # Kept for compatibility; no method of this class reads it.
        self._precision_scores = np.zeros(self.n_gram)
        # Running candidate length and closest-reference length.
        self._c = 0.0
        self._r = 0.0
        self._trans_len = 0
        self._ref_len = 0
        self._is_update = False

    def _count_ngram(self, ngram_input_list, n_gram):
        """
        Count every n-gram of order 1 to `n_gram` that occurs in a token sequence.

        Args:
            ngram_input_list (list): A list of tokens of a translated text or a reference text.
            n_gram (int): The highest n-gram order to count, ranges from 1 to 4.

        Returns:
            collections.Counter, maps each n-gram (a tuple of tokens) to its number of occurrences.
        """
        token_count = len(ngram_input_list)
        return Counter(
            tuple(ngram_input_list[start:start + order])
            for order in range(1, n_gram + 1)
            for start in range(token_count - order + 1)
        )

    @rearrange_inputs
    def update(self, *inputs):
        """
        Updates the internal evaluation result with `candidate_corpus` and `reference_corpus`.

        Args:
            inputs(iterator): Input `candidate_corpus` and `reference_corpus`. `candidate_corpus` and
                `reference_corpus` are both a list. The `candidate_corpus` is an iterable of machine
                translated corpus. The `reference_corpus` is an iterable object of iterables of
                reference corpus.

        Raises:
            ValueError: If the number of inputs is not 2.
            ValueError: If the lengths of `candidate_corpus` and `reference_corpus` are not equal.
            ValueError: If an item of `reference_corpus` contains no reference.
        """
        if len(inputs) != 2:
            raise ValueError("For 'BleuScore.update', it needs 2 inputs (candidate_corpus, reference_corpus), "
                             "but got {}.".format(len(inputs)))
        candidate_corpus, reference_corpus = inputs

        if len(candidate_corpus) != len(reference_corpus):
            raise ValueError("For 'BleuScore.update', 'candidate_corpus' (inputs[0]) and 'reference_corpus' "
                             "(inputs[1]) should be equal in length, but got {}, {}"
                             .format(len(candidate_corpus), len(reference_corpus)))

        for index, (candidate, references) in enumerate(zip(candidate_corpus, reference_corpus)):
            candidate_len = len(candidate)
            ref_lengths = [len(ref) for ref in references]
            # Reject an empty reference set before touching any accumulator for this pair;
            # otherwise min() below fails with an unhelpful message after `_c` was already changed.
            if not ref_lengths:
                raise ValueError("For 'BleuScore.update', each item of 'reference_corpus' (inputs[1]) should "
                                 "contain at least one reference, but got an empty item at index {}."
                                 .format(index))

            self._c += candidate_len
            # Closest reference length; on ties min() keeps the first reference, as list.index() did.
            self._r += min(ref_lengths, key=lambda ref_len: abs(candidate_len - ref_len))

            translation_counter = self._count_ngram(candidate, self.n_gram)
            # Union keeps, for each n-gram, its maximum count over all references.
            reference_counter = Counter()
            for ref in references:
                reference_counter |= self._count_ngram(ref, self.n_gram)

            # Intersection clips each candidate count to that reference maximum.
            clipped_counter = translation_counter & reference_counter

            for ngram, count in clipped_counter.items():
                self._numerator[len(ngram) - 1] += count

            for ngram, count in translation_counter.items():
                self._denominator[len(ngram) - 1] += count

        self._trans_len = np.array(self._c)
        self._ref_len = np.array(self._r)
        self._is_update = True

    def eval(self):
        """
        Computes the bleu score.

        Returns:
            The bleu score as a NumPy float64 scalar. When any n-gram order has no clipped match,
            the score is returned as a zero-dimensional ``numpy.ndarray`` holding 0.0.

        Raises:
            RuntimeError: If the update method is not called first, an error will be reported.
        """
        if not self._is_update:
            raise RuntimeError("Please call the 'update' method before calling 'eval' method.")

        # NOTE: this check runs before smoothing, so a zero clipped count for any order gives a
        # score of 0.0 even when `smooth` is True. It also keeps the division and log below away
        # from zero counts in the unsmoothed branch.
        if min(self._numerator) == 0.0:
            return np.array(0.0)

        if self.smooth:
            # Add-one smoothing on both clipped and total counts.
            precision_scores = (self._numerator + 1.0) / (self._denominator + 1.0)
        else:
            precision_scores = self._numerator / self._denominator

        # Uniformly weighted geometric mean of the per-order precisions.
        weights = np.full(self.n_gram, 1.0 / self.n_gram)
        geometric_mean = np.exp(np.sum(weights * np.log(precision_scores)))

        # Only candidates that are not longer than the closest references are penalised.
        if self._c > self._r:
            brevity_penalty = np.array(1.0)
        else:
            brevity_penalty = np.exp(1 - (self._ref_len / self._trans_len))

        return brevity_penalty * geometric_mean