Spaces:

giulio98
/

codebleu

Runtime error

App Files Files Community

giulio98 committed on Nov 7, 2022

Commit

b7f853e

•

1 Parent(s): aff5728

Upload 7 files

Browse files

Files changed (6) hide show

bleu.py +584 -128
calc_code_bleu.py +72 -0
dataflow_match.py +9 -1274
readme.txt +1 -0
syntax_match.py +9 -1274
weighted_ngram_match.py +4 -102

bleu.py CHANGED Viewed

@@ -1,134 +1,590 @@
-# Copyright 2017 Google Inc. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Python implementation of BLEU and smooth-BLEU.
-This module provides a Python implementation of BLEU and smooth-BLEU.
-Smooth BLEU is computed following the method outlined in the paper:
-Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
-evaluation metrics for machine translation. COLING 2004.
-"""
-import collections
 import math
-def _get_ngrams(segment, max_order):
-  """Extracts all n-grams upto a given maximum order from an input segment.
-  Args:
-    segment: text segment from which n-grams will be extracted.
-    max_order: maximum length in tokens of the n-grams returned by this
-        methods.
-  Returns:
-    The Counter containing all n-grams upto max_order in segment
-    with a count of how many times each n-gram occurred.
-  """
-  ngram_counts = collections.Counter()
-  for order in range(1, max_order + 1):
-    for i in range(0, len(segment) - order + 1):
-      ngram = tuple(segment[i:i+order])
-      ngram_counts[ngram] += 1
-  return ngram_counts
-def compute_bleu(reference_corpus, translation_corpus, max_order=4,
-                 smooth=False):
-  """Computes BLEU score of translated segments against one or more references.
-  Args:
-    reference_corpus: list of lists of references for each translation. Each
-        reference should be tokenized into a list of tokens.
-    translation_corpus: list of translations to score. Each translation
-        should be tokenized into a list of tokens.
-    max_order: Maximum n-gram order to use when computing BLEU score.
-    smooth: Whether or not to apply Lin et al. 2004 smoothing.
-  Returns:
-    3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
-    precisions and brevity penalty.
-  """
-  matches_by_order = [0] * max_order
-  possible_matches_by_order = [0] * max_order
-  reference_length = 0
-  translation_length = 0
-  for (references, translation) in zip(reference_corpus,
-                                       translation_corpus):
-    reference_length += min(len(r) for r in references)
-    translation_length += len(translation)
-    merged_ref_ngram_counts = collections.Counter()
     for reference in references:
-      merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
-    translation_ngram_counts = _get_ngrams(translation, max_order)
-    overlap = translation_ngram_counts & merged_ref_ngram_counts
-    for ngram in overlap:
-      matches_by_order[len(ngram)-1] += overlap[ngram]
-    for order in range(1, max_order+1):
-      possible_matches = len(translation) - order + 1
-      if possible_matches > 0:
-        possible_matches_by_order[order-1] += possible_matches
-  precisions = [0] * max_order
-  for i in range(0, max_order):
-    if smooth:
-      precisions[i] = ((matches_by_order[i] + 1.) /
-                       (possible_matches_by_order[i] + 1.))
     else:
-      if possible_matches_by_order[i] > 0:
-        precisions[i] = (float(matches_by_order[i]) /
-                         possible_matches_by_order[i])
-      else:
-        precisions[i] = 0.0
-  if min(precisions) > 0:
-    p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
-    geo_mean = math.exp(p_log_sum)
-  else:
-    geo_mean = 0
-  ratio = float(translation_length) / reference_length
-  if ratio > 1.0:
-    bp = 1.
-  else:
-    bp = math.exp(1 - 1. / ratio)
-  bleu = geo_mean * bp
-  return (bleu, precisions, bp, ratio, translation_length, reference_length)
-def _bleu(ref_file, trans_file, subword_option=None):
-    max_order = 4
-    smooth = True
-    ref_files = [ref_file]
-    reference_text = []
-    for reference_filename in ref_files:
-        with open(reference_filename) as fh:
-            reference_text.append(fh.readlines())
-    per_segment_references = []
-    for references in zip(*reference_text):
-        reference_list = []
-        for reference in references:
-            reference_list.append(reference.strip().split())
-        per_segment_references.append(reference_list)
-    translations = []
-    with open(trans_file) as fh:
-        for line in fh:
-            translations.append(line.strip().split())
-    bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
-    return round(100 * bleu_score,2)

+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: BLEU Score
 #
+# Copyright (C) 2001-2020 NLTK Project
+# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
+# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+"""BLEU score implementation."""
 import math
+import sys
+from fractions import Fraction
+import warnings
+from collections import Counter
+from .utils import ngrams
+import pdb
+def sentence_bleu(
+    references,
+    hypothesis,
+    weights=(0.25, 0.25, 0.25, 0.25),
+    smoothing_function=None,
+    auto_reweigh=False,
+):
+    """
+    Calculate BLEU score (Bilingual Evaluation Understudy) from
+    Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
+    "BLEU: a method for automatic evaluation of machine translation."
+    In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf
+    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+    ...               'ensures', 'that', 'the', 'military', 'always',
+    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']
+    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
+    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
+    ...               'that', 'party', 'direct']
+    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
+    ...               'heed', 'Party', 'commands']
+    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+    ...               'guarantees', 'the', 'military', 'forces', 'always',
+    ...               'being', 'under', 'the', 'command', 'of', 'the',
+    ...               'Party']
+    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
+    ...               'of', 'the', 'party']
+    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
+    0.5045...
+    If there is no n-gram overlap for any order of n-grams, BLEU returns the
+    value 0. This is because the precision for the order of n-grams without
+    overlap is 0, and the geometric mean in the final BLEU score computation
+    multiplies the 0 with the precision of other n-grams. This results in 0
+    (independently of the precision of the other n-gram orders). The following
+    example has zero 3-gram and 4-gram overlaps:
+    >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS
+    0.0
+    To avoid this harsh behaviour when no ngram overlaps are found a smoothing
+    function can be used.
+    >>> chencherry = SmoothingFunction()
+    >>> sentence_bleu([reference1, reference2, reference3], hypothesis2,
+    ...     smoothing_function=chencherry.method1) # doctest: +ELLIPSIS
+    0.0370...
+    The default BLEU calculates a score for up to 4-grams using uniform
+    weights (this is called BLEU-4). To evaluate your translations with
+    higher/lower order ngrams, use customized weights. E.g. when accounting
+    for up to 5-grams with uniform weights (this is called BLEU-5) use:
+    >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.)
+    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
+    0.3920...
+    :param references: reference sentences
+    :type references: list(list(str))
+    :param hypothesis: a hypothesis sentence
+    :type hypothesis: list(str)
+    :param weights: weights for unigrams, bigrams, trigrams and so on
+    :type weights: list(float)
+    :param smoothing_function:
+    :type smoothing_function: SmoothingFunction
+    :param auto_reweigh: Option to re-normalize the weights uniformly.
+    :type auto_reweigh: bool
+    :return: The sentence-level BLEU score.
+    :rtype: float
+    """
+    return corpus_bleu(
+        [references], [hypothesis], weights, smoothing_function, auto_reweigh
+    )
+def corpus_bleu(
+    list_of_references,
+    hypotheses,
+    weights=(0.25, 0.25, 0.25, 0.25),
+    smoothing_function=None,
+    auto_reweigh=False,
+):
+    """
+    Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
+    the hypotheses and their respective references.
+    Instead of averaging the sentence level BLEU scores (i.e. macro-average
+    precision), the original BLEU metric (Papineni et al. 2002) accounts for
+    the micro-average precision (i.e. summing the numerators and denominators
+    over all hypothesis-reference(s) pairs before the division).
+    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+    ...         'ensures', 'that', 'the', 'military', 'always',
+    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
+    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
+    ...          'heed', 'Party', 'commands']
+    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+    ...          'guarantees', 'the', 'military', 'forces', 'always',
+    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
+    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
+    ...          'of', 'the', 'party']
+    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
+    ...         'interested', 'in', 'world', 'history']
+    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
+    ...          'because', 'he', 'read', 'the', 'book']
+    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
+    >>> hypotheses = [hyp1, hyp2]
+    >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
+    0.5920...
+    The example below shows that corpus_bleu() is different from averaging
+    sentence_bleu() over the hypotheses:
+    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
+    >>> score2 = sentence_bleu([ref2a], hyp2)
+    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
+    0.6223...
+    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
+    :type list_of_references: list(list(list(str)))
+    :param hypotheses: a list of hypothesis sentences
+    :type hypotheses: list(list(str))
+    :param weights: weights for unigrams, bigrams, trigrams and so on
+    :type weights: list(float)
+    :param smoothing_function:
+    :type smoothing_function: SmoothingFunction
+    :param auto_reweigh: Option to re-normalize the weights uniformly.
+    :type auto_reweigh: bool
+    :return: The corpus-level BLEU score.
+    :rtype: float
+    """
+    # Before proceeding to compute BLEU, perform sanity checks.
+    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
+    p_denominators = Counter()  # Key = ngram order, and value = no. of ngrams in hyp.
+    hyp_lengths, ref_lengths = 0, 0
+    assert len(list_of_references) == len(hypotheses), (
+        "The number of hypotheses and their reference(s) should be the " "same "
+    )
+    # Iterate through each hypothesis and their corresponding references.
+    for references, hypothesis in zip(list_of_references, hypotheses):
+        # For each order of ngram, calculate the numerator and
+        # denominator for the corpus-level modified precision.
+        for i, _ in enumerate(weights, start=1):
+            p_i = modified_precision(references, hypothesis, i)
+            p_numerators[i] += p_i.numerator
+            p_denominators[i] += p_i.denominator
+        # Calculate the hypothesis length and the closest reference length.
+        # Adds them to the corpus-level hypothesis and reference counts.
+        hyp_len = len(hypothesis)
+        hyp_lengths += hyp_len
+        ref_lengths += closest_ref_length(references, hyp_len)
+    # Calculate corpus-level brevity penalty.
+    bp = brevity_penalty(ref_lengths, hyp_lengths)
+    # Uniformly re-weighting based on maximum hypothesis lengths if largest
+    # order of n-grams < 4 and weights is set at default.
+    if auto_reweigh:
+        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
+            weights = (1 / hyp_lengths,) * hyp_lengths
+    # Collects the various precision values for the different ngram orders.
+    p_n = [
+        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
+        for i, _ in enumerate(weights, start=1)
+    ]
+    # Returns 0 if there are no matching n-grams.
+    # We only need to check for p_numerators[1] == 0, since if there are
+    # no unigram matches, there won't be any higher order ngram matches.
+    if p_numerators[1] == 0:
+        return 0
+    # If no smoothing function is given, fall back to method1 of the SmoothingFunction class.
+    if not smoothing_function:
+        smoothing_function = SmoothingFunction().method1
+    # Smoothen the modified precision.
+    # Note: smoothing_function() may convert values into floats;
+    #       it tries to retain the Fraction object as much as the
+    #       smoothing method allows.
+    p_n = smoothing_function(
+        p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
+    )
+    s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
+    s = bp * math.exp(math.fsum(s))
+    return s
+def modified_precision(references, hypothesis, n):
+    """
+    Calculate modified ngram precision.
+    The normal precision method may lead to some wrong translations with
+    high-precision, e.g., the translation, in which a word of reference
+    repeats several times, has very high precision.
+    This function only returns the Fraction object that contains the numerator
+    and denominator necessary to calculate the corpus-level precision.
+    To calculate the modified precision for a single pair of hypothesis and
+    references, cast the Fraction object into a float.
+    The famous "the the the ... " example shows that you can get an inflated
+    (unmodified) precision by duplicating high frequency words.
+        >>> reference1 = 'the cat is on the mat'.split()
+        >>> reference2 = 'there is a cat on the mat'.split()
+        >>> hypothesis1 = 'the the the the the the the'.split()
+        >>> references = [reference1, reference2]
+        >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
+        0.2857...
+    In the modified n-gram precision, a reference word will be considered
+    exhausted after a matching hypothesis word is identified, e.g.
+        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+        ...               'ensures', 'that', 'the', 'military', 'will',
+        ...               'forever', 'heed', 'Party', 'commands']
+        >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+        ...               'guarantees', 'the', 'military', 'forces', 'always',
+        ...               'being', 'under', 'the', 'command', 'of', 'the',
+        ...               'Party']
+        >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+        ...               'army', 'always', 'to', 'heed', 'the', 'directions',
+        ...               'of', 'the', 'party']
+        >>> hypothesis = 'of the'.split()
+        >>> references = [reference1, reference2, reference3]
+        >>> float(modified_precision(references, hypothesis, n=1))
+        1.0
+        >>> float(modified_precision(references, hypothesis, n=2))
+        1.0
+    An example of a normal machine translation hypothesis:
+        >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+        ...               'ensures', 'that', 'the', 'military', 'always',
+        ...               'obeys', 'the', 'commands', 'of', 'the', 'party']
+        >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
+        ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
+        ...               'that', 'party', 'direct']
+        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+        ...               'ensures', 'that', 'the', 'military', 'will',
+        ...               'forever', 'heed', 'Party', 'commands']
+        >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+        ...               'guarantees', 'the', 'military', 'forces', 'always',
+        ...               'being', 'under', 'the', 'command', 'of', 'the',
+        ...               'Party']
+        >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+        ...               'army', 'always', 'to', 'heed', 'the', 'directions',
+        ...               'of', 'the', 'party']
+        >>> references = [reference1, reference2, reference3]
+        >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
+        0.9444...
+        >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
+        0.5714...
+        >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
+        0.5882352941176471
+        >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
+        0.07692...
+    :param references: A list of reference translations.
+    :type references: list(list(str))
+    :param hypothesis: A hypothesis translation.
+    :type hypothesis: list(str)
+    :param n: The ngram order.
+    :type n: int
+    :return: BLEU's modified precision for the nth order ngram.
+    :rtype: Fraction
+    """
+    # Extracts all ngrams in hypothesis
+    # Set an empty Counter if hypothesis is empty.
+    counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
+    # Extract a union of references' counts.
+    # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
+    max_counts = {}
     for reference in references:
+        reference_counts = (
+            Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
+        )
+        for ngram in counts:
+            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])
+    # Assigns the intersection between hypothesis and references' counts.
+    clipped_counts = {
+        ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()
+    }
+    numerator = sum(clipped_counts.values())
+    # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
+    # Usually this happens when the ngram order is > len(hypothesis).
+    denominator = max(1, sum(counts.values()))
+    return Fraction(numerator, denominator, _normalize=False)
+def closest_ref_length(references, hyp_len):
+    """
+    This function finds the reference whose length is closest to that of the
+    hypothesis. The closest reference length is referred to as the *r* variable
+    in the brevity penalty formula of Papineni et al. (2002).
+    :param references: A list of reference translations.
+    :type references: list(list(str))
+    :param hyp_len: The length of the hypothesis.
+    :type hyp_len: int
+    :return: The length of the reference that's closest to the hypothesis.
+    :rtype: int
+    """
+    ref_lens = (len(reference) for reference in references)
+    closest_ref_len = min(
+        ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len)
+    )
+    return closest_ref_len
+def brevity_penalty(closest_ref_len, hyp_len):
+    """
+    Calculate brevity penalty.
+    As the modified n-gram precision still has the problem from the short
+    length sentence, brevity penalty is used to modify the overall BLEU
+    score according to length.
+    An example from the paper. There are three references with length 12, 15
+    and 17. And a concise hypothesis of the length 12. The brevity penalty is 1.
+        >>> reference1 = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
+        >>> reference2 = list('aaaaaaaaaaaaaaa')   # i.e. ['a'] * 15
+        >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
+        >>> hypothesis = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
+        >>> references = [reference1, reference2, reference3]
+        >>> hyp_len = len(hypothesis)
+        >>> closest_ref_len =  closest_ref_length(references, hyp_len)
+        >>> brevity_penalty(closest_ref_len, hyp_len)
+        1.0
+    In case a hypothesis translation is shorter than the references, penalty is
+    applied.
+        >>> references = [['a'] * 28, ['a'] * 28]
+        >>> hypothesis = ['a'] * 12
+        >>> hyp_len = len(hypothesis)
+        >>> closest_ref_len =  closest_ref_length(references, hyp_len)
+        >>> brevity_penalty(closest_ref_len, hyp_len)
+        0.2635971381157267
+    The length of the closest reference is used to compute the penalty. If the
+    length of a hypothesis is 12, and the reference lengths are 13 and 2, the
+    penalty is applied because the hypothesis length (12) is less than the
+    closest reference length (13).
+        >>> references = [['a'] * 13, ['a'] * 2]
+        >>> hypothesis = ['a'] * 12
+        >>> hyp_len = len(hypothesis)
+        >>> closest_ref_len =  closest_ref_length(references, hyp_len)
+        >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
+        0.9200...
+    The brevity penalty doesn't depend on reference order. More importantly,
+    when two reference sentences are at the same distance, the shortest
+    reference sentence length is used.
+        >>> references = [['a'] * 13, ['a'] * 11]
+        >>> hypothesis = ['a'] * 12
+        >>> hyp_len = len(hypothesis)
+        >>> closest_ref_len =  closest_ref_length(references, hyp_len)
+        >>> bp1 = brevity_penalty(closest_ref_len, hyp_len)
+        >>> hyp_len = len(hypothesis)
+        >>> closest_ref_len =  closest_ref_length(reversed(references), hyp_len)
+        >>> bp2 = brevity_penalty(closest_ref_len, hyp_len)
+        >>> bp1 == bp2 == 1
+        True
+    A test example from mteval-v13a.pl (starting from the line 705):
+        >>> references = [['a'] * 11, ['a'] * 8]
+        >>> hypothesis = ['a'] * 7
+        >>> hyp_len = len(hypothesis)
+        >>> closest_ref_len =  closest_ref_length(references, hyp_len)
+        >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
+        0.8668...
+        >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
+        >>> hypothesis = ['a'] * 7
+        >>> hyp_len = len(hypothesis)
+        >>> closest_ref_len =  closest_ref_length(references, hyp_len)
+        >>> brevity_penalty(closest_ref_len, hyp_len)
+        1.0
+    :param hyp_len: The length of the hypothesis for a single sentence OR the
+    sum of all the hypotheses' lengths for a corpus
+    :type hyp_len: int
+    :param closest_ref_len: The length of the closest reference for a single
+    hypothesis OR the sum of all the closest references for every hypotheses.
+    :type closest_ref_len: int
+    :return: BLEU's brevity penalty.
+    :rtype: float
+    """
+    if hyp_len > closest_ref_len:
+        return 1
+    # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
+    elif hyp_len == 0:
+        return 0
     else:
+        return math.exp(1 - closest_ref_len / hyp_len)
+class SmoothingFunction:
+    """
+    This is an implementation of the smoothing techniques
+    for segment-level BLEU scores that was presented in
+    Boxing Chen and Collin Cherry (2014) A Systematic Comparison of
+    Smoothing Techniques for Sentence-Level BLEU. In WMT14.
+    http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
+    """
+    def __init__(self, epsilon=0.1, alpha=5, k=5):
+        """
+        This will initialize the parameters required for the various smoothing
+        techniques, the default values are set to the numbers used in the
+        experiments from Chen and Cherry (2014).
+        >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
+        ...                 'that', 'the', 'military', 'always', 'obeys', 'the',
+        ...                 'commands', 'of', 'the', 'party']
+        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
+        ...               'that', 'the', 'military', 'will', 'forever', 'heed',
+        ...               'Party', 'commands']
+        >>> chencherry = SmoothingFunction()
+        >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
+        0.4489...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
+        0.4905...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
+        0.4135...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
+        0.4905...
+        :param epsilon: the epsilon value used in method 1
+        :type epsilon: float
+        :param alpha: the alpha value used in method 6
+        :type alpha: int
+        :param k: the k value used in method 4
+        :type k: int
+        """
+        self.epsilon = epsilon
+        self.alpha = alpha
+        self.k = k
+    def method0(self, p_n, *args, **kwargs):
+        """
+        No smoothing.
+        """
+        p_n_new = []
+        for i, p_i in enumerate(p_n):
+            if p_i.numerator != 0:
+                p_n_new.append(p_i)
+            else:
+                _msg = str(
+                    "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
+                    "Therefore the BLEU score evaluates to 0, independently of\n"
+                    "how many N-gram overlaps of lower order it contains.\n"
+                    "Consider using lower n-gram order or use "
+                    "SmoothingFunction()"
+                ).format(i + 1)
+                warnings.warn(_msg)
+                # When numerator==0 where denominator==0 or !=0, the result
+                # for the precision score should be equal to 0 or undefined.
+                # Due to BLEU geometric mean computation in logarithm space,
+                # we substitute sys.float_info.min, a tiny positive float, so
+                # that math.log() stays defined and the score is effectively 0.
+                p_n_new.append(sys.float_info.min)
+        return p_n_new
+    def method1(self, p_n, *args, **kwargs):
+        """
+        Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
+        """
+        return [
+            (p_i.numerator + self.epsilon) / p_i.denominator
+            if p_i.numerator == 0
+            else p_i
+            for p_i in p_n
+        ]
+    def method2(self, p_n, *args, **kwargs):
+        """
+        Smoothing method 2: Add 1 to both numerator and denominator from
+        Chin-Yew Lin and Franz Josef Och (2004) Automatic evaluation of
+        machine translation quality using longest common subsequence and
+        skip-bigram statistics. In ACL04.
+        """
+        return [
+            Fraction(p_i.numerator + 1, p_i.denominator + 1, _normalize=False)
+            for p_i in p_n
+        ]
+    def method3(self, p_n, *args, **kwargs):
+        """
+        Smoothing method 3: NIST geometric sequence smoothing
+        The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
+        precision score whose matching n-gram count is null.
+        k is 1 for the first 'n' value for which the n-gram match count is null.
+        For example, if the text contains:
+         - one 2-gram match
+         - and (consequently) two 1-gram matches
+        the n-gram count for each individual precision score would be:
+         - n=1  =>  prec_count = 2     (two unigrams)
+         - n=2  =>  prec_count = 1     (one bigram)
+         - n=3  =>  prec_count = 1/2   (no trigram,  taking 'smoothed' value of 1 / ( 2^k ), with k=1)
+         - n=4  =>  prec_count = 1/4   (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
+        """
+        incvnt = 1  # From the mteval-v13a.pl, it's referred to as k.
+        for i, p_i in enumerate(p_n):
+            if p_i.numerator == 0:
+                p_n[i] = 1 / (2 ** incvnt * p_i.denominator)
+                incvnt += 1
+        return p_n
+    def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+        """
+        Smoothing method 4:
+        Shorter translations may have inflated precision values due to having
+        smaller denominators; therefore, we give them proportionally
+        smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
+        suggest dividing by 1/ln(len(T)), where T is the length of the translation.
+        """
+        hyp_len = hyp_len if hyp_len else len(hypothesis)
+        for i, p_i in enumerate(p_n):
+            if p_i.numerator == 0 and hyp_len != 0:
+                incvnt = i + 1 * self.k / math.log(
+                    hyp_len
+                )  # Note that this K is different from the K from NIST.
+                p_n[i] = incvnt / p_i.denominator
+        return p_n
+    def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+        """
+        Smoothing method 5:
+        The matched counts for similar values of n should be similar. To
+        calculate the n-gram matched count, it averages the n−1, n and n+1 gram
+        matched counts.
+        """
+        hyp_len = hyp_len if hyp_len else len(hypothesis)
+        m = {}
+        # Requires a precision value for an additional ngram order.
+        p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
+        m[-1] = p_n[0] + 1
+        for i, p_i in enumerate(p_n):
+            p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
+            m[i] = p_n[i]
+        return p_n
+    def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+        """
+        Smoothing method 6:
+        Interpolates the maximum likelihood estimate of the precision *p_n* with
+        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
+        between pn and pn−1 will be the same as that between pn−1 and pn−2; from
+        Gao and He (2013) Training MRF-Based Phrase Translation Models using
+        Gradient Ascent. In NAACL.
+        """
+        hyp_len = hyp_len if hyp_len else len(hypothesis)
+        # This smoothing only works when p_1 and p_2 is non-zero.
+        # Raise an error with an appropriate message when the input is too short
+        # to use this smoothing technique.
+        assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
+        for i, p_i in enumerate(p_n):
+            if i in [0, 1]:  # Skips the first 2 orders of ngrams.
+                continue
+            else:
+                pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
+                # No. of ngrams in translation that matches the reference.
+                m = p_i.numerator
+                # No. of ngrams in translation.
+                l = sum(1 for _ in ngrams(hypothesis, i + 1))
+                # Calculates the interpolated precision.
+                p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
+        return p_n
+    def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+        """
+        Smoothing method 7:
+        Interpolates methods 4 and 5.
+        """
+        hyp_len = hyp_len if hyp_len else len(hypothesis)
+        p_n = self.method4(p_n, references, hypothesis, hyp_len)
+        p_n = self.method5(p_n, references, hypothesis, hyp_len)
+        return p_n

calc_code_bleu.py ADDED Viewed

	@@ -0,0 +1,72 @@

+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+# -*- coding:utf-8 -*-
+import argparse
+from .bleu import corpus_bleu
+from .weighted_ngram_match import corpus_weighted_ngram_match
+from .syntax_match import corpus_syntax_match
+from .dataflow_match import corpus_dataflow_match
+import os
+def calculate(predictions, references, language="python", alpha=0.25, beta=0.25, gamma=0.25, theta=0.25):
+    # preprocess inputs
+    pre_references = [[s.strip() for s in my_list] for my_list in references]
+    hypothesis = [s.strip() for s in predictions]
+    for i in range(len(pre_references)):
+        assert len(hypothesis) == len(pre_references[i])
+    references = []
+    for i in range(len(hypothesis)):
+        ref_for_instance = []
+        for j in range(len(pre_references)):
+            ref_for_instance.append(pre_references[j][i])
+        references.append(ref_for_instance)
+    assert len(references) == len(pre_references)*len(hypothesis)
+    # calculate ngram match (BLEU)
+    tokenized_hyps = [x.split() for x in hypothesis]
+    tokenized_refs = [[x.split() for x in reference] for reference in references]
+    ngram_match_score = corpus_bleu(tokenized_refs,tokenized_hyps)
+    # calculate weighted ngram match
+    # from os import listdir
+    # from os.path import isfile, join
+    # onlyfiles = [f for f in listdir("./keywords") if isfile(join("keywords", f))]
+    # print(onlyfiles)
+    curr_path = os.path.dirname(os.path.abspath(__file__))
+    keywords = [x.strip() for x in open(curr_path + "/keywords/" + language +'.txt', 'r', encoding='utf-8').readlines()]
+    def make_weights(reference_tokens, key_word_list):
+        return {token:1 if token in key_word_list else 0.2 \
+                for token in reference_tokens}
+    tokenized_refs_with_weights = [[[reference_tokens, make_weights(reference_tokens, keywords)]\
+                for reference_tokens in reference] for reference in tokenized_refs]
+    weighted_ngram_match_score = corpus_weighted_ngram_match(tokenized_refs_with_weights,tokenized_hyps)
+    # calculate syntax match
+    syntax_match_score = corpus_syntax_match(references, hypothesis, language)
+    # calculate dataflow match
+    dataflow_match_score = corpus_dataflow_match(references, hypothesis, language)
+    code_bleu_score = alpha*ngram_match_score\
+                    + beta*weighted_ngram_match_score\
+                    + gamma*syntax_match_score\
+                    + theta*dataflow_match_score
+    return {
+        "ngram_match_score": ngram_match_score,
+        "weighted_ngram_match_score": weighted_ngram_match_score,
+        "syntax_match_score": syntax_match_score,
+        "dataflow_match_score": dataflow_match_score,
+        "code_bleu_score": code_bleu_score
+    }

dataflow_match.py CHANGED Viewed

@@ -1,1280 +1,14 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 from tree_sitter import Language, Parser
 import pdb
-import re
-from io import StringIO
-import  tokenize
-def remove_comments_and_docstrings(source,lang):
-    if lang in ['python']:
-        """
-        Returns 'source' minus comments and docstrings.
-        """
-        io_obj = StringIO(source)
-        out = ""
-        prev_toktype = tokenize.INDENT
-        last_lineno = -1
-        last_col = 0
-        for tok in tokenize.generate_tokens(io_obj.readline):
-            token_type = tok[0]
-            token_string = tok[1]
-            start_line, start_col = tok[2]
-            end_line, end_col = tok[3]
-            ltext = tok[4]
-            if start_line > last_lineno:
-                last_col = 0
-            if start_col > last_col:
-                out += (" " * (start_col - last_col))
-            # Remove comments:
-            if token_type == tokenize.COMMENT:
-                pass
-            # This series of conditionals removes docstrings:
-            elif token_type == tokenize.STRING:
-                if prev_toktype != tokenize.INDENT:
-            # This is likely a docstring; double-check we're not inside an operator:
-                    if prev_toktype != tokenize.NEWLINE:
-                        if start_col > 0:
-                            out += token_string
-            else:
-                out += token_string
-            prev_toktype = token_type
-            last_col = end_col
-            last_lineno = end_line
-        temp=[]
-        for x in out.split('\n'):
-            if x.strip()!="":
-                temp.append(x)
-        return '\n'.join(temp)
-    elif lang in ['ruby']:
-        return source
-    else:
-        def replacer(match):
-            s = match.group(0)
-            if s.startswith('/'):
-                return " " # note: a space and not an empty string
-            else:
-                return s
-        pattern = re.compile(
-            r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
-            re.DOTALL | re.MULTILINE
-        )
-        temp=[]
-        for x in re.sub(pattern, replacer, source).split('\n'):
-            if x.strip()!="":
-                temp.append(x)
-        return '\n'.join(temp)
-def tree_to_token_index(root_node):
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        return [(root_node.start_point,root_node.end_point)]
-    else:
-        code_tokens=[]
-        for child in root_node.children:
-            code_tokens+=tree_to_token_index(child)
-        return code_tokens
-def tree_to_variable_index(root_node,index_to_code):
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        index=(root_node.start_point,root_node.end_point)
-        _,code=index_to_code[index]
-        if root_node.type!=code:
-            return [(root_node.start_point,root_node.end_point)]
-        else:
-            return []
-    else:
-        code_tokens=[]
-        for child in root_node.children:
-            code_tokens+=tree_to_variable_index(child,index_to_code)
-        return code_tokens
-def index_to_code_token(index,code):
-    start_point=index[0]
-    end_point=index[1]
-    if start_point[0]==end_point[0]:
-        s=code[start_point[0]][start_point[1]:end_point[1]]
-    else:
-        s=""
-        s+=code[start_point[0]][start_point[1]:]
-        for i in range(start_point[0]+1,end_point[0]):
-            s+=code[i]
-        s+=code[end_point[0]][:end_point[1]]
-    return s
-def DFG_python(root_node,index_to_code,states):
-    """Recursively build data-flow edges for the subtree rooted at ``root_node``.
-
-    ``index_to_code`` maps a ``(start_point, end_point)`` span to an
-    ``(idx, code)`` pair. ``states`` maps a token text to the list of token
-    indices currently recorded for it; a shallow copy is taken up front.
-
-    Returns ``(edges, states)``. Each edge is a 5-tuple
-    ``(code, idx, relation, source_codes, source_idxs)`` with ``relation``
-    being ``'comesFrom'`` or ``'computedFrom'``; edges are sorted by ``idx``.
-    """
-    # Node types that select the specialised branches below.
-    assignment=['assignment','augmented_assignment','for_in_clause']
-    if_statement=['if_statement']
-    for_statement=['for_statement']
-    while_statement=['while_statement']
-    do_first_statement=['for_in_clause']
-    def_statement=['default_parameter']
-    states=states.copy()
-    # Token-level node: leaf or string literal, but never a comment.
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
-        if root_node.type==code:
-            # Text equals the node type: no edge is produced.
-            return [],states
-        elif code in states:
-            # Known name: link this use to the recorded indices.
-            return [(code,idx,'comesFrom',[code],states[code].copy())],states
-        else:
-            # Unknown name: only identifiers are recorded in states.
-            if root_node.type=='identifier':
-                states[code]=[idx]
-            return [(code,idx,'comesFrom',[],[])],states
-    elif root_node.type in def_statement:
-        name=root_node.child_by_field_name('name')
-        value=root_node.child_by_field_name('value')
-        DFG=[]
-        if value is None:
-            # No value: every name token gets an edge with no sources.
-            indexs=tree_to_variable_index(name,index_to_code)
-            for index in indexs:
-                idx,code=index_to_code[index]
-                DFG.append((code,idx,'comesFrom',[],[]))
-                states[code]=[idx]
-            return sorted(DFG,key=lambda x:x[1]),states
-        else:
-            # With a value: process the value first, then link each name
-            # token to each value token.
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            temp,states=DFG_python(value,index_to_code,states)
-            DFG+=temp
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in assignment:
-        if root_node.type=='for_in_clause':
-            right_nodes=[root_node.children[-1]]
-            left_nodes=[root_node.child_by_field_name('left')]
-        else:
-            if root_node.child_by_field_name('right') is None:
-                return [],states
-            # Pair left and right children element-wise; fall back to pairing
-            # the whole sides when the counts differ or a side has no children.
-            left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=',']
-            right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=',']
-            if len(right_nodes)!=len(left_nodes):
-                left_nodes=[root_node.child_by_field_name('left')]
-                right_nodes=[root_node.child_by_field_name('right')]
-            if len(left_nodes)==0:
-                left_nodes=[root_node.child_by_field_name('left')]
-            if len(right_nodes)==0:
-                right_nodes=[root_node.child_by_field_name('right')]
-        DFG=[]
-        # Right-hand sides are processed before the targets are recorded.
-        for node in right_nodes:
-            temp,states=DFG_python(node,index_to_code,states)
-            DFG+=temp
-        for left_node,right_node in zip(left_nodes,right_nodes):
-            left_tokens_index=tree_to_variable_index(left_node,index_to_code)
-            right_tokens_index=tree_to_variable_index(right_node,index_to_code)
-            temp=[]
-            for token1_index in left_tokens_index:
-                idx1,code1=index_to_code[token1_index]
-                temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
-                             [index_to_code[x][0] for x in right_tokens_index]))
-                states[code1]=[idx1]
-            DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in if_statement:
-        DFG=[]
-        current_states=states.copy()
-        others_states=[]
-        # tag records whether an else-like child was seen.
-        tag=False
-        if 'else' in root_node.type:
-            tag=True
-        for child in root_node.children:
-            if 'else' in child.type:
-                tag=True
-            if child.type not in ['elif_clause','else_clause']:
-                temp,current_states=DFG_python(child,index_to_code,current_states)
-                DFG+=temp
-            else:
-                # elif/else clauses start from the states before the if.
-                temp,new_states=DFG_python(child,index_to_code,states)
-                DFG+=temp
-                others_states.append(new_states)
-        others_states.append(current_states)
-        if tag is False:
-            # Without an else-like child the incoming states are merged too.
-            others_states.append(states)
-        # Merge the per-branch states into one dict of sorted, unique indices.
-        new_states={}
-        for dic in others_states:
-            for key in dic:
-                if key not in new_states:
-                    new_states[key]=dic[key].copy()
-                else:
-                    new_states[key]+=dic[key]
-        for key in new_states:
-            new_states[key]=sorted(list(set(new_states[key])))
-        return sorted(DFG,key=lambda x:x[1]),new_states
-    elif root_node.type in for_statement:
-        DFG=[]
-        # The loop is processed twice; presumably so the second pass sees
-        # states written by the first (loop-carried flow) — TODO confirm.
-        for i in range(2):
-            right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=',']
-            left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=',']
-            if len(right_nodes)!=len(left_nodes):
-                left_nodes=[root_node.child_by_field_name('left')]
-                right_nodes=[root_node.child_by_field_name('right')]
-            if len(left_nodes)==0:
-                left_nodes=[root_node.child_by_field_name('left')]
-            if len(right_nodes)==0:
-                right_nodes=[root_node.child_by_field_name('right')]
-            for node in right_nodes:
-                temp,states=DFG_python(node,index_to_code,states)
-                DFG+=temp
-            for left_node,right_node in zip(left_nodes,right_nodes):
-                left_tokens_index=tree_to_variable_index(left_node,index_to_code)
-                right_tokens_index=tree_to_variable_index(right_node,index_to_code)
-                temp=[]
-                for token1_index in left_tokens_index:
-                    idx1,code1=index_to_code[token1_index]
-                    temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
-                                 [index_to_code[x][0] for x in right_tokens_index]))
-                    states[code1]=[idx1]
-                DFG+=temp
-            if  root_node.children[-1].type=="block":
-                temp,states=DFG_python(root_node.children[-1],index_to_code,states)
-                DFG+=temp
-        # Collapse edges sharing (code, idx, relation), unioning their sources.
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in while_statement:
-        DFG=[]
-        # All children are walked twice, then duplicate edges are merged.
-        for i in range(2):
-            for child in root_node.children:
-                temp,states=DFG_python(child,index_to_code,states)
-                DFG+=temp
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    else:
-        # Generic node: children listed in do_first_statement are processed
-        # before all remaining children, threading states through both passes.
-        DFG=[]
-        for child in root_node.children:
-            if child.type in do_first_statement:
-                temp,states=DFG_python(child,index_to_code,states)
-                DFG+=temp
-        for child in root_node.children:
-            if child.type not in do_first_statement:
-                temp,states=DFG_python(child,index_to_code,states)
-                DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-def DFG_java(root_node,index_to_code,states):
-    """Recursively build data-flow edges for the subtree rooted at ``root_node``.
-
-    ``index_to_code`` maps a ``(start_point, end_point)`` span to an
-    ``(idx, code)`` pair. ``states`` maps a token text to the list of token
-    indices currently recorded for it; a shallow copy is taken up front.
-
-    Returns ``(edges, states)``. Each edge is a 5-tuple
-    ``(code, idx, relation, source_codes, source_idxs)`` with ``relation``
-    being ``'comesFrom'`` or ``'computedFrom'``; edges are sorted by ``idx``.
-    """
-    # Node types that select the specialised branches below.
-    assignment=['assignment_expression']
-    def_statement=['variable_declarator']
-    increment_statement=['update_expression']
-    if_statement=['if_statement','else']
-    for_statement=['for_statement']
-    enhanced_for_statement=['enhanced_for_statement']
-    while_statement=['while_statement']
-    do_first_statement=[]
-    states=states.copy()
-    # Token-level node: leaf or string literal, but never a comment.
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
-        if root_node.type==code:
-            # Text equals the node type: no edge is produced.
-            return [],states
-        elif code in states:
-            # Known name: link this use to the recorded indices.
-            return [(code,idx,'comesFrom',[code],states[code].copy())],states
-        else:
-            # Unknown name: only identifiers are recorded in states.
-            if root_node.type=='identifier':
-                states[code]=[idx]
-            return [(code,idx,'comesFrom',[],[])],states
-    elif root_node.type in def_statement:
-        name=root_node.child_by_field_name('name')
-        value=root_node.child_by_field_name('value')
-        DFG=[]
-        if value is None:
-            # No value: every name token gets an edge with no sources.
-            indexs=tree_to_variable_index(name,index_to_code)
-            for index in indexs:
-                idx,code=index_to_code[index]
-                DFG.append((code,idx,'comesFrom',[],[]))
-                states[code]=[idx]
-            return sorted(DFG,key=lambda x:x[1]),states
-        else:
-            # With a value: process the value first, then link each name
-            # token to each value token.
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            temp,states=DFG_java(value,index_to_code,states)
-            DFG+=temp
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in assignment:
-        left_nodes=root_node.child_by_field_name('left')
-        right_nodes=root_node.child_by_field_name('right')
-        DFG=[]
-        # The right-hand side is processed before the targets are recorded.
-        temp,states=DFG_java(right_nodes,index_to_code,states)
-        DFG+=temp
-        name_indexs=tree_to_variable_index(left_nodes,index_to_code)
-        value_indexs=tree_to_variable_index(right_nodes,index_to_code)
-        for index1 in name_indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in value_indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in increment_statement:
-        DFG=[]
-        # Every variable token in the expression is linked to every variable
-        # token in it, itself included.
-        indexs=tree_to_variable_index(root_node,index_to_code)
-        for index1 in indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in if_statement:
-        DFG=[]
-        current_states=states.copy()
-        others_states=[]
-        # flag flips once the first if_statement/else child is met; from then
-        # on every child starts from the states before the if.
-        flag=False
-        # tag records whether an else-like node was seen.
-        tag=False
-        if 'else' in root_node.type:
-            tag=True
-        for child in root_node.children:
-            if 'else' in child.type:
-                tag=True
-            if child.type not in if_statement and flag is False:
-                temp,current_states=DFG_java(child,index_to_code,current_states)
-                DFG+=temp
-            else:
-                flag=True
-                temp,new_states=DFG_java(child,index_to_code,states)
-                DFG+=temp
-                others_states.append(new_states)
-        others_states.append(current_states)
-        if tag is False:
-            # Without an else-like node the incoming states are merged too.
-            others_states.append(states)
-        # Merge the per-branch states into one dict of sorted, unique indices.
-        new_states={}
-        for dic in others_states:
-            for key in dic:
-                if key not in new_states:
-                    new_states[key]=dic[key].copy()
-                else:
-                    new_states[key]+=dic[key]
-        for key in new_states:
-            new_states[key]=sorted(list(set(new_states[key])))
-        return sorted(DFG,key=lambda x:x[1]),new_states
-    elif root_node.type in for_statement:
-        DFG=[]
-        # First pass over all children.
-        for child in root_node.children:
-            temp,states=DFG_java(child,index_to_code,states)
-            DFG+=temp
-        # Second pass over the children that follow the
-        # local_variable_declaration child; the declaration is not re-run.
-        flag=False
-        for child in root_node.children:
-            if flag:
-                temp,states=DFG_java(child,index_to_code,states)
-                DFG+=temp
-            elif child.type=="local_variable_declaration":
-                flag=True
-        # Collapse edges sharing (code, idx, relation), unioning their sources.
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in enhanced_for_statement:
-        name=root_node.child_by_field_name('name')
-        value=root_node.child_by_field_name('value')
-        body=root_node.child_by_field_name('body')
-        DFG=[]
-        # Value, loop variable and body are processed twice; presumably so the
-        # second pass sees states written by the first — TODO confirm.
-        for i in range(2):
-            temp,states=DFG_java(value,index_to_code,states)
-            DFG+=temp
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            temp,states=DFG_java(body,index_to_code,states)
-            DFG+=temp
-        # Collapse edges sharing (code, idx, relation), unioning their sources.
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in while_statement:
-        DFG=[]
-        # All children are walked twice, then duplicate edges are merged.
-        for i in range(2):
-            for child in root_node.children:
-                temp,states=DFG_java(child,index_to_code,states)
-                DFG+=temp
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    else:
-        # Generic node: do_first_statement is empty here, so the first loop
-        # adds nothing and the second walks every child in order.
-        DFG=[]
-        for child in root_node.children:
-            if child.type in do_first_statement:
-                temp,states=DFG_java(child,index_to_code,states)
-                DFG+=temp
-        for child in root_node.children:
-            if child.type not in do_first_statement:
-                temp,states=DFG_java(child,index_to_code,states)
-                DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-def DFG_csharp(root_node,index_to_code,states):
-    """Recursively build data-flow edges for the subtree rooted at ``root_node``.
-
-    ``index_to_code`` maps a ``(start_point, end_point)`` span to an
-    ``(idx, code)`` pair. ``states`` maps a token text to the list of token
-    indices currently recorded for it; a shallow copy is taken up front.
-
-    Returns ``(edges, states)``. Each edge is a 5-tuple
-    ``(code, idx, relation, source_codes, source_idxs)`` with ``relation``
-    being ``'comesFrom'`` or ``'computedFrom'``; edges are sorted by ``idx``.
-    """
-    # Node types that select the specialised branches below.
-    assignment=['assignment_expression']
-    def_statement=['variable_declarator']
-    increment_statement=['postfix_unary_expression']
-    if_statement=['if_statement','else']
-    for_statement=['for_statement']
-    enhanced_for_statement=['for_each_statement']
-    while_statement=['while_statement']
-    do_first_statement=[]
-    states=states.copy()
-    # Token-level node: leaf or string literal, but never a comment.
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
-        if root_node.type==code:
-            # Text equals the node type: no edge is produced.
-            return [],states
-        elif code in states:
-            # Known name: link this use to the recorded indices.
-            return [(code,idx,'comesFrom',[code],states[code].copy())],states
-        else:
-            # Unknown name: only identifiers are recorded in states.
-            if root_node.type=='identifier':
-                states[code]=[idx]
-            return [(code,idx,'comesFrom',[],[])],states
-    elif root_node.type in def_statement:
-        # Name and value are taken positionally: a declarator with exactly
-        # two children is read as (name, value); any other child count is
-        # treated as having no value.
-        if len(root_node.children)==2:
-            name=root_node.children[0]
-            value=root_node.children[1]
-        else:
-            name=root_node.children[0]
-            value=None
-        DFG=[]
-        if value is None:
-            # No value: every name token gets an edge with no sources.
-            indexs=tree_to_variable_index(name,index_to_code)
-            for index in indexs:
-                idx,code=index_to_code[index]
-                DFG.append((code,idx,'comesFrom',[],[]))
-                states[code]=[idx]
-            return sorted(DFG,key=lambda x:x[1]),states
-        else:
-            # With a value: process the value first, then link each name
-            # token to each value token.
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            temp,states=DFG_csharp(value,index_to_code,states)
-            DFG+=temp
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in assignment:
-        left_nodes=root_node.child_by_field_name('left')
-        right_nodes=root_node.child_by_field_name('right')
-        DFG=[]
-        # The right-hand side is processed before the targets are recorded.
-        temp,states=DFG_csharp(right_nodes,index_to_code,states)
-        DFG+=temp
-        name_indexs=tree_to_variable_index(left_nodes,index_to_code)
-        value_indexs=tree_to_variable_index(right_nodes,index_to_code)
-        for index1 in name_indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in value_indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in increment_statement:
-        DFG=[]
-        # Every variable token in the expression is linked to every variable
-        # token in it, itself included.
-        indexs=tree_to_variable_index(root_node,index_to_code)
-        for index1 in indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in if_statement:
-        DFG=[]
-        current_states=states.copy()
-        others_states=[]
-        # flag flips once the first if_statement/else child is met; from then
-        # on every child starts from the states before the if.
-        flag=False
-        # tag records whether an else-like node was seen.
-        tag=False
-        if 'else' in root_node.type:
-            tag=True
-        for child in root_node.children:
-            if 'else' in child.type:
-                tag=True
-            if child.type not in if_statement and flag is False:
-                temp,current_states=DFG_csharp(child,index_to_code,current_states)
-                DFG+=temp
-            else:
-                flag=True
-                temp,new_states=DFG_csharp(child,index_to_code,states)
-                DFG+=temp
-                others_states.append(new_states)
-        others_states.append(current_states)
-        if tag is False:
-            # Without an else-like node the incoming states are merged too.
-            others_states.append(states)
-        # Merge the per-branch states into one dict of sorted, unique indices.
-        new_states={}
-        for dic in others_states:
-            for key in dic:
-                if key not in new_states:
-                    new_states[key]=dic[key].copy()
-                else:
-                    new_states[key]+=dic[key]
-        for key in new_states:
-            new_states[key]=sorted(list(set(new_states[key])))
-        return sorted(DFG,key=lambda x:x[1]),new_states
-    elif root_node.type in for_statement:
-        DFG=[]
-        # First pass over all children.
-        for child in root_node.children:
-            temp,states=DFG_csharp(child,index_to_code,states)
-            DFG+=temp
-        # Second pass over the children that follow a child of type
-        # "local_variable_declaration"; nothing is re-run if none is found.
-        flag=False
-        for child in root_node.children:
-            if flag:
-                temp,states=DFG_csharp(child,index_to_code,states)
-                DFG+=temp
-            elif child.type=="local_variable_declaration":
-                flag=True
-        # Collapse edges sharing (code, idx, relation), unioning their sources.
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in enhanced_for_statement:
-        # The loop variable and the iterated expression are read from the
-        # 'left' and 'right' fields of the node.
-        name=root_node.child_by_field_name('left')
-        value=root_node.child_by_field_name('right')
-        body=root_node.child_by_field_name('body')
-        DFG=[]
-        # Value, loop variable and body are processed twice; presumably so the
-        # second pass sees states written by the first — TODO confirm.
-        for i in range(2):
-            temp,states=DFG_csharp(value,index_to_code,states)
-            DFG+=temp
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            temp,states=DFG_csharp(body,index_to_code,states)
-            DFG+=temp
-        # Collapse edges sharing (code, idx, relation), unioning their sources.
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in while_statement:
-        DFG=[]
-        # All children are walked twice, then duplicate edges are merged.
-        for i in range(2):
-            for child in root_node.children:
-                temp,states=DFG_csharp(child,index_to_code,states)
-                DFG+=temp
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    else:
-        # Generic node: do_first_statement is empty here, so the first loop
-        # adds nothing and the second walks every child in order.
-        DFG=[]
-        for child in root_node.children:
-            if child.type in do_first_statement:
-                temp,states=DFG_csharp(child,index_to_code,states)
-                DFG+=temp
-        for child in root_node.children:
-            if child.type not in do_first_statement:
-                temp,states=DFG_csharp(child,index_to_code,states)
-                DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-def DFG_ruby(root_node,index_to_code,states):
-    """Recursively collect data-flow edges for a Ruby syntax-tree node.
-
-    Args:
-        root_node: syntax-tree node; only its `type`, `children`,
-            `start_point`, `end_point` and `child_by_field_name` are used
-            here (presumably a tree_sitter node -- confirm against caller).
-        index_to_code: mapping from `(start_point, end_point)` to an
-            `(idx, code)` pair for each token.
-        states: dict mapping a variable's code text to the list of `idx`
-            values where its current value was last set.
-
-    Returns:
-        `(edges, states)`. `edges` is a list of
-        `(code, idx, relation, source_codes, source_idxs)` tuples sorted by
-        `idx`, where `relation` is 'comesFrom' or 'computedFrom'. `states`
-        is the mapping after processing this node.
-
-    NOTE(review): `states` is copied only in the leaf branch below; the
-    def/assignment/for branches assign into the dict passed in by the
-    caller. DFG_go, DFG_php and DFG_javascript copy on entry instead.
-    """
-    assignment=['assignment','operator_assignment']
-    if_statement=['if','elsif','else','unless','when']
-    for_statement=['for']
-    while_statement=['while_modifier','until']
-    do_first_statement=[]
-    def_statement=['keyword_parameter']
-    # Leaf token (no children, or a string-like literal), excluding comments.
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        states=states.copy()
-        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
-        # Node type equal to its own text: presumably a keyword or
-        # punctuation token -- it contributes no edge.
-        if root_node.type==code:
-            return [],states
-        elif code in states:
-            # Known variable: link this use to the recorded definition sites.
-            return [(code,idx,'comesFrom',[code],states[code].copy())],states
-        else:
-            # First sighting: only identifiers are recorded in `states`.
-            if root_node.type=='identifier':
-                states[code]=[idx]
-            return [(code,idx,'comesFrom',[],[])],states
-    elif root_node.type in def_statement:
-        name=root_node.child_by_field_name('name')
-        value=root_node.child_by_field_name('value')
-        DFG=[]
-        if value is None:
-            # No value: each name becomes a definition with no sources.
-            indexs=tree_to_variable_index(name,index_to_code)
-            for index in indexs:
-                idx,code=index_to_code[index]
-                DFG.append((code,idx,'comesFrom',[],[]))
-                states[code]=[idx]
-            return sorted(DFG,key=lambda x:x[1]),states
-        else:
-            # With a value: process the value first, then emit one edge per
-            # (name variable, value variable) pair.
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            temp,states=DFG_ruby(value,index_to_code,states)
-            DFG+=temp
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in assignment:
-        # Pair up left/right children (ignoring ',' separators); if the
-        # counts differ or a side has no children, fall back to treating
-        # each side as a single node.
-        left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=',']
-        right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=',']
-        if len(right_nodes)!=len(left_nodes):
-            left_nodes=[root_node.child_by_field_name('left')]
-            right_nodes=[root_node.child_by_field_name('right')]
-        if len(left_nodes)==0:
-            left_nodes=[root_node.child_by_field_name('left')]
-        if len(right_nodes)==0:
-            right_nodes=[root_node.child_by_field_name('right')]
-        # operator_assignment: target is the first child, source the last.
-        if root_node.type=="operator_assignment":
-            left_nodes=[root_node.children[0]]
-            right_nodes=[root_node.children[-1]]
-        DFG=[]
-        # Right-hand sides are processed before any target is redefined.
-        for node in right_nodes:
-            temp,states=DFG_ruby(node,index_to_code,states)
-            DFG+=temp
-        for left_node,right_node in zip(left_nodes,right_nodes):
-            left_tokens_index=tree_to_variable_index(left_node,index_to_code)
-            right_tokens_index=tree_to_variable_index(right_node,index_to_code)
-            temp=[]
-            for token1_index in left_tokens_index:
-                idx1,code1=index_to_code[token1_index]
-                temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
-                             [index_to_code[x][0] for x in right_tokens_index]))
-                states[code1]=[idx1]
-            DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in if_statement:
-        DFG=[]
-        current_states=states.copy()
-        others_states=[]
-        # `tag` records whether an 'else'-typed node is present.
-        tag=False
-        if 'else' in root_node.type:
-            tag=True
-        for child in root_node.children:
-            if 'else' in child.type:
-                tag=True
-            if child.type not in if_statement:
-                # Non-branch children are threaded through current_states.
-                temp,current_states=DFG_ruby(child,index_to_code,current_states)
-                DFG+=temp
-            else:
-                # Nested branch nodes each start from the incoming states.
-                temp,new_states=DFG_ruby(child,index_to_code,states)
-                DFG+=temp
-                others_states.append(new_states)
-        others_states.append(current_states)
-        # Without an else, the incoming states are also a possible outcome.
-        if tag is False:
-            others_states.append(states)
-        # Merge all outcomes: union the index lists per variable.
-        new_states={}
-        for dic in others_states:
-            for key in dic:
-                if key not in new_states:
-                    new_states[key]=dic[key].copy()
-                else:
-                    new_states[key]+=dic[key]
-        for key in new_states:
-            new_states[key]=sorted(list(set(new_states[key])))
-        return sorted(DFG,key=lambda x:x[1]),new_states
-    elif root_node.type in for_statement:
-        DFG=[]
-        # The loop is processed twice; presumably so that definitions made
-        # in the body are visible to uses on the second pass -- confirm.
-        for i in range(2):
-            left_nodes=[root_node.child_by_field_name('pattern')]
-            right_nodes=[root_node.child_by_field_name('value')]
-            assert len(right_nodes)==len(left_nodes)
-            for node in right_nodes:
-                temp,states=DFG_ruby(node,index_to_code,states)
-                DFG+=temp
-            for left_node,right_node in zip(left_nodes,right_nodes):
-                left_tokens_index=tree_to_variable_index(left_node,index_to_code)
-                right_tokens_index=tree_to_variable_index(right_node,index_to_code)
-                temp=[]
-                for token1_index in left_tokens_index:
-                    idx1,code1=index_to_code[token1_index]
-                    temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
-                                 [index_to_code[x][0] for x in right_tokens_index]))
-                    states[code1]=[idx1]
-                DFG+=temp
-            temp,states=DFG_ruby(root_node.child_by_field_name('body'),index_to_code,states)
-            DFG+=temp
-        # Collapse edges that share (code, idx, relation) by unioning their
-        # source lists; source idxs are sorted, source codes are not.
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in while_statement:
-        DFG=[]
-        # All children are processed twice, then duplicate edges are merged
-        # exactly as in the for-branch above.
-        for i in range(2):
-            for child in root_node.children:
-                temp,states=DFG_ruby(child,index_to_code,states)
-                DFG+=temp
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    else:
-        # Any other node: recurse into children in order. The first loop is
-        # a no-op here because do_first_statement is empty.
-        DFG=[]
-        for child in root_node.children:
-            if child.type in do_first_statement:
-                temp,states=DFG_ruby(child,index_to_code,states)
-                DFG+=temp
-        for child in root_node.children:
-            if child.type not in do_first_statement:
-                temp,states=DFG_ruby(child,index_to_code,states)
-                DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-def DFG_go(root_node,index_to_code,states):
-    """Recursively collect data-flow edges for a Go syntax-tree node.
-
-    Args:
-        root_node: syntax-tree node; only its `type`, `children`,
-            `start_point`, `end_point` and `child_by_field_name` are used
-            here (presumably a tree_sitter node -- confirm against caller).
-        index_to_code: mapping from `(start_point, end_point)` to an
-            `(idx, code)` pair for each token.
-        states: dict mapping a variable's code text to the list of `idx`
-            values where its current value was last set. A shallow copy is
-            taken on entry, so the caller's dict is not reassigned into.
-
-    Returns:
-        `(edges, states)`. `edges` is a list of
-        `(code, idx, relation, source_codes, source_idxs)` tuples sorted by
-        `idx`, where `relation` is 'comesFrom' or 'computedFrom'. `states`
-        is the mapping after processing this node.
-    """
-    assignment=['assignment_statement',]
-    def_statement=['var_spec']
-    increment_statement=['inc_statement']
-    if_statement=['if_statement','else']
-    for_statement=['for_statement']
-    # NOTE(review): the next two lists are assigned but never read in this
-    # function.
-    enhanced_for_statement=[]
-    while_statement=[]
-    do_first_statement=[]
-    states=states.copy()
-    # Leaf token (no children, or a string-like literal), excluding comments.
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
-        # Node type equal to its own text: presumably a keyword or
-        # punctuation token -- it contributes no edge.
-        if root_node.type==code:
-            return [],states
-        elif code in states:
-            # Known variable: link this use to the recorded definition sites.
-            return [(code,idx,'comesFrom',[code],states[code].copy())],states
-        else:
-            # First sighting: only identifiers are recorded in `states`.
-            if root_node.type=='identifier':
-                states[code]=[idx]
-            return [(code,idx,'comesFrom',[],[])],states
-    elif root_node.type in def_statement:
-        name=root_node.child_by_field_name('name')
-        value=root_node.child_by_field_name('value')
-        DFG=[]
-        if value is None:
-            # No value: each name becomes a definition with no sources.
-            indexs=tree_to_variable_index(name,index_to_code)
-            for index in indexs:
-                idx,code=index_to_code[index]
-                DFG.append((code,idx,'comesFrom',[],[]))
-                states[code]=[idx]
-            return sorted(DFG,key=lambda x:x[1]),states
-        else:
-            # With a value: process the value first, then emit one edge per
-            # (name variable, value variable) pair.
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            temp,states=DFG_go(value,index_to_code,states)
-            DFG+=temp
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in assignment:
-        # The right side is processed before the targets are redefined;
-        # every left variable is 'computedFrom' every right variable.
-        left_nodes=root_node.child_by_field_name('left')
-        right_nodes=root_node.child_by_field_name('right')
-        DFG=[]
-        temp,states=DFG_go(right_nodes,index_to_code,states)
-        DFG+=temp
-        name_indexs=tree_to_variable_index(left_nodes,index_to_code)
-        value_indexs=tree_to_variable_index(right_nodes,index_to_code)
-        for index1 in name_indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in value_indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in increment_statement:
-        # Each variable in the statement is 'computedFrom' every variable in
-        # the statement, itself included.
-        DFG=[]
-        indexs=tree_to_variable_index(root_node,index_to_code)
-        for index1 in indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in if_statement:
-        DFG=[]
-        current_states=states.copy()
-        others_states=[]
-        # `flag` flips once the first branch-typed child is seen; from then
-        # on every child is evaluated from the incoming states.
-        flag=False
-        # `tag` records whether an 'else'-typed node is present.
-        tag=False
-        if 'else' in root_node.type:
-            tag=True
-        for child in root_node.children:
-            if 'else' in child.type:
-                tag=True
-            if child.type not in if_statement and flag is False:
-                temp,current_states=DFG_go(child,index_to_code,current_states)
-                DFG+=temp
-            else:
-                flag=True
-                temp,new_states=DFG_go(child,index_to_code,states)
-                DFG+=temp
-                others_states.append(new_states)
-        others_states.append(current_states)
-        if tag is False:
-            others_states.append(states)
-        # Merge all outcomes: union the index lists per variable.
-        new_states={}
-        for dic in others_states:
-            for key in dic:
-                if key not in new_states:
-                    new_states[key]=dic[key].copy()
-                else:
-                    new_states[key]+=dic[key]
-        # The incoming states are folded in unconditionally here, so the
-        # `tag is False` append above looks redundant -- NOTE(review).
-        for key in states:
-            if key not in new_states:
-                new_states[key]=states[key]
-            else:
-                new_states[key]+=states[key]
-        for key in new_states:
-            new_states[key]=sorted(list(set(new_states[key])))
-        return sorted(DFG,key=lambda x:x[1]),new_states
-    elif root_node.type in for_statement:
-        DFG=[]
-        # First pass: every child in order.
-        for child in root_node.children:
-            temp,states=DFG_go(child,index_to_code,states)
-            DFG+=temp
-        # Second pass: at the `for_clause` child, process its `update` field
-        # (if any), then re-process every child that follows it. Presumably
-        # this models values flowing around the loop -- confirm.
-        flag=False
-        for child in root_node.children:
-            if flag:
-                temp,states=DFG_go(child,index_to_code,states)
-                DFG+=temp
-            elif child.type=="for_clause":
-                if child.child_by_field_name('update') is not None:
-                    temp,states=DFG_go(child.child_by_field_name('update'),index_to_code,states)
-                    DFG+=temp
-                flag=True
-        # Collapse edges that share (code, idx, relation) by unioning their
-        # source lists; source idxs are sorted, source codes are not.
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    else:
-        # Any other node: recurse into children in order. The first loop is
-        # a no-op here because do_first_statement is empty.
-        DFG=[]
-        for child in root_node.children:
-            if child.type in do_first_statement:
-                temp,states=DFG_go(child,index_to_code,states)
-                DFG+=temp
-        for child in root_node.children:
-            if child.type not in do_first_statement:
-                temp,states=DFG_go(child,index_to_code,states)
-                DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-def DFG_php(root_node,index_to_code,states):
-    """Recursively collect data-flow edges for a PHP syntax-tree node.
-
-    Args:
-        root_node: syntax-tree node; only its `type`, `children`,
-            `start_point`, `end_point` and `child_by_field_name` are used
-            here (presumably a tree_sitter node -- confirm against caller).
-        index_to_code: mapping from `(start_point, end_point)` to an
-            `(idx, code)` pair for each token.
-        states: dict mapping a variable's code text to the list of `idx`
-            values where its current value was last set. A shallow copy is
-            taken on entry, so the caller's dict is not reassigned into.
-
-    Returns:
-        `(edges, states)`. `edges` is a list of
-        `(code, idx, relation, source_codes, source_idxs)` tuples sorted by
-        `idx`, where `relation` is 'comesFrom' or 'computedFrom'. `states`
-        is the mapping after processing this node.
-    """
-    assignment=['assignment_expression','augmented_assignment_expression']
-    def_statement=['simple_parameter']
-    increment_statement=['update_expression']
-    if_statement=['if_statement','else_clause']
-    for_statement=['for_statement']
-    enhanced_for_statement=['foreach_statement']
-    while_statement=['while_statement']
-    do_first_statement=[]
-    states=states.copy()
-    # Leaf token (no children, or a string-like literal), excluding comments.
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
-        # Node type equal to its own text: presumably a keyword or
-        # punctuation token -- it contributes no edge.
-        if root_node.type==code:
-            return [],states
-        elif code in states:
-            # Known variable: link this use to the recorded definition sites.
-            return [(code,idx,'comesFrom',[code],states[code].copy())],states
-        else:
-            # First sighting: only identifiers are recorded in `states`.
-            if root_node.type=='identifier':
-                states[code]=[idx]
-            return [(code,idx,'comesFrom',[],[])],states
-    elif root_node.type in def_statement:
-        # Parameters: the optional source is the 'default_value' field.
-        name=root_node.child_by_field_name('name')
-        value=root_node.child_by_field_name('default_value')
-        DFG=[]
-        if value is None:
-            # No default: each name becomes a definition with no sources.
-            indexs=tree_to_variable_index(name,index_to_code)
-            for index in indexs:
-                idx,code=index_to_code[index]
-                DFG.append((code,idx,'comesFrom',[],[]))
-                states[code]=[idx]
-            return sorted(DFG,key=lambda x:x[1]),states
-        else:
-            # With a default: process it first, then emit one edge per
-            # (name variable, value variable) pair.
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            temp,states=DFG_php(value,index_to_code,states)
-            DFG+=temp
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in assignment:
-        # The right side is processed before the targets are redefined;
-        # every left variable is 'computedFrom' every right variable.
-        left_nodes=root_node.child_by_field_name('left')
-        right_nodes=root_node.child_by_field_name('right')
-        DFG=[]
-        temp,states=DFG_php(right_nodes,index_to_code,states)
-        DFG+=temp
-        name_indexs=tree_to_variable_index(left_nodes,index_to_code)
-        value_indexs=tree_to_variable_index(right_nodes,index_to_code)
-        for index1 in name_indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in value_indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in increment_statement:
-        # Each variable in the expression is 'computedFrom' every variable
-        # in the expression, itself included.
-        DFG=[]
-        indexs=tree_to_variable_index(root_node,index_to_code)
-        for index1 in indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in if_statement:
-        DFG=[]
-        current_states=states.copy()
-        others_states=[]
-        # `flag` flips once the first branch-typed child is seen; from then
-        # on every child is evaluated from the incoming states.
-        flag=False
-        # NOTE(review): `tag` is set below but never read in this function.
-        tag=False
-        if 'else' in root_node.type:
-            tag=True
-        for child in root_node.children:
-            if 'else' in child.type:
-                tag=True
-            if child.type not in if_statement and flag is False:
-                temp,current_states=DFG_php(child,index_to_code,current_states)
-                DFG+=temp
-            else:
-                flag=True
-                temp,new_states=DFG_php(child,index_to_code,states)
-                DFG+=temp
-                others_states.append(new_states)
-        others_states.append(current_states)
-        # Merge all outcomes: union the index lists per variable.
-        new_states={}
-        for dic in others_states:
-            for key in dic:
-                if key not in new_states:
-                    new_states[key]=dic[key].copy()
-                else:
-                    new_states[key]+=dic[key]
-        # The incoming states are always folded into the merge as well.
-        for key in states:
-            if key not in new_states:
-                new_states[key]=states[key]
-            else:
-                new_states[key]+=states[key]
-        for key in new_states:
-            new_states[key]=sorted(list(set(new_states[key])))
-        return sorted(DFG,key=lambda x:x[1]),new_states
-    elif root_node.type in for_statement:
-        DFG=[]
-        # First pass: every child in order.
-        for child in root_node.children:
-            temp,states=DFG_php(child,index_to_code,states)
-            DFG+=temp
-        # Second pass: re-process every child that follows the first
-        # `assignment_expression` child. Presumably this models values
-        # flowing around the loop -- confirm.
-        flag=False
-        for child in root_node.children:
-            if flag:
-                temp,states=DFG_php(child,index_to_code,states)
-                DFG+=temp
-            elif child.type=="assignment_expression":
-                flag=True
-        # Collapse edges that share (code, idx, relation) by unioning their
-        # source lists; source idxs are sorted, source codes are not.
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in enhanced_for_statement:
-        # The first `variable_name` child is taken as the source (`value`),
-        # the second as the target (`name`).
-        # NOTE(review): if fewer than two such children exist, `name` (or
-        # `value`) stays None and is passed on unguarded below.
-        name=None
-        value=None
-        for child in root_node.children:
-            if child.type=='variable_name' and value is None:
-                value=child
-            elif child.type=='variable_name' and name is None:
-                name=child
-                break
-        body=root_node.child_by_field_name('body')
-        DFG=[]
-        # Source, binding and body are processed twice; presumably so that
-        # definitions made in the body are visible on the second pass.
-        for i in range(2):
-            temp,states=DFG_php(value,index_to_code,states)
-            DFG+=temp
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            temp,states=DFG_php(body,index_to_code,states)
-            DFG+=temp
-        # Collapse duplicate edges, as in the for-branch above.
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in while_statement:
-        DFG=[]
-        # All children are processed twice, then duplicate edges are merged.
-        for i in range(2):
-            for child in root_node.children:
-                temp,states=DFG_php(child,index_to_code,states)
-                DFG+=temp
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    else:
-        # Any other node: recurse into children in order. The first loop is
-        # a no-op here because do_first_statement is empty.
-        DFG=[]
-        for child in root_node.children:
-            if child.type in do_first_statement:
-                temp,states=DFG_php(child,index_to_code,states)
-                DFG+=temp
-        for child in root_node.children:
-            if child.type not in do_first_statement:
-                temp,states=DFG_php(child,index_to_code,states)
-                DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-def DFG_javascript(root_node,index_to_code,states):
-    """Recursively collect data-flow edges for a JavaScript syntax-tree node.
-
-    Args:
-        root_node: syntax-tree node; only its `type`, `children`,
-            `start_point`, `end_point` and `child_by_field_name` are used
-            here (presumably a tree_sitter node -- confirm against caller).
-        index_to_code: mapping from `(start_point, end_point)` to an
-            `(idx, code)` pair for each token.
-        states: dict mapping a variable's code text to the list of `idx`
-            values where its current value was last set. A shallow copy is
-            taken on entry, so the caller's dict is not reassigned into.
-
-    Returns:
-        `(edges, states)`. `edges` is a list of
-        `(code, idx, relation, source_codes, source_idxs)` tuples sorted by
-        `idx`, where `relation` is 'comesFrom' or 'computedFrom'. `states`
-        is the mapping after processing this node.
-    """
-    assignment=['assignment_pattern','augmented_assignment_expression']
-    def_statement=['variable_declarator']
-    increment_statement=['update_expression']
-    if_statement=['if_statement','else']
-    for_statement=['for_statement']
-    # NOTE(review): assigned but never read in this function.
-    enhanced_for_statement=[]
-    while_statement=['while_statement']
-    do_first_statement=[]
-    states=states.copy()
-    # Leaf token (no children, or a string-like literal), excluding comments.
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
-        # Node type equal to its own text: presumably a keyword or
-        # punctuation token -- it contributes no edge.
-        if root_node.type==code:
-            return [],states
-        elif code in states:
-            # Known variable: link this use to the recorded definition sites.
-            return [(code,idx,'comesFrom',[code],states[code].copy())],states
-        else:
-            # First sighting: only identifiers are recorded in `states`.
-            if root_node.type=='identifier':
-                states[code]=[idx]
-            return [(code,idx,'comesFrom',[],[])],states
-    elif root_node.type in def_statement:
-        name=root_node.child_by_field_name('name')
-        value=root_node.child_by_field_name('value')
-        DFG=[]
-        if value is None:
-            # No initializer: each name is a definition with no sources.
-            indexs=tree_to_variable_index(name,index_to_code)
-            for index in indexs:
-                idx,code=index_to_code[index]
-                DFG.append((code,idx,'comesFrom',[],[]))
-                states[code]=[idx]
-            return sorted(DFG,key=lambda x:x[1]),states
-        else:
-            # With an initializer: process it first, then emit one edge per
-            # (name variable, value variable) pair.
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            temp,states=DFG_javascript(value,index_to_code,states)
-            DFG+=temp
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in assignment:
-        # The right side is processed before the targets are redefined;
-        # every left variable is 'computedFrom' every right variable.
-        left_nodes=root_node.child_by_field_name('left')
-        right_nodes=root_node.child_by_field_name('right')
-        DFG=[]
-        temp,states=DFG_javascript(right_nodes,index_to_code,states)
-        DFG+=temp
-        name_indexs=tree_to_variable_index(left_nodes,index_to_code)
-        value_indexs=tree_to_variable_index(right_nodes,index_to_code)
-        for index1 in name_indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in value_indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in increment_statement:
-        # Each variable in the expression is 'computedFrom' every variable
-        # in the expression, itself included.
-        DFG=[]
-        indexs=tree_to_variable_index(root_node,index_to_code)
-        for index1 in indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in if_statement:
-        DFG=[]
-        current_states=states.copy()
-        others_states=[]
-        # `flag` flips once the first branch-typed child is seen; from then
-        # on every child is evaluated from the incoming states.
-        flag=False
-        # `tag` records whether an 'else'-typed node is present.
-        tag=False
-        if 'else' in root_node.type:
-            tag=True
-        for child in root_node.children:
-            if 'else' in child.type:
-                tag=True
-            if child.type not in if_statement and flag is False:
-                temp,current_states=DFG_javascript(child,index_to_code,current_states)
-                DFG+=temp
-            else:
-                flag=True
-                temp,new_states=DFG_javascript(child,index_to_code,states)
-                DFG+=temp
-                others_states.append(new_states)
-        others_states.append(current_states)
-        if tag is False:
-            others_states.append(states)
-        # Merge all outcomes: union the index lists per variable.
-        new_states={}
-        for dic in others_states:
-            for key in dic:
-                if key not in new_states:
-                    new_states[key]=dic[key].copy()
-                else:
-                    new_states[key]+=dic[key]
-        # The incoming states are folded in unconditionally here, so the
-        # `tag is False` append above looks redundant -- NOTE(review).
-        for key in states:
-            if key not in new_states:
-                new_states[key]=states[key]
-            else:
-                new_states[key]+=states[key]
-        for key in new_states:
-            new_states[key]=sorted(list(set(new_states[key])))
-        return sorted(DFG,key=lambda x:x[1]),new_states
-    elif root_node.type in for_statement:
-        DFG=[]
-        # First pass: every child in order.
-        for child in root_node.children:
-            temp,states=DFG_javascript(child,index_to_code,states)
-            DFG+=temp
-        # Second pass: re-process every child that follows the first
-        # `variable_declaration` child. Presumably this models values
-        # flowing around the loop -- confirm.
-        flag=False
-        for child in root_node.children:
-            if flag:
-                temp,states=DFG_javascript(child,index_to_code,states)
-                DFG+=temp
-            elif child.type=="variable_declaration":
-                flag=True
-        # Collapse edges that share (code, idx, relation) by unioning their
-        # source lists; source idxs are sorted, source codes are not.
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in while_statement:
-        DFG=[]
-        # All children are processed twice, then duplicate edges are merged.
-        for i in range(2):
-            for child in root_node.children:
-                temp,states=DFG_javascript(child,index_to_code,states)
-                DFG+=temp
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    else:
-        # Any other node: recurse into children in order. The first loop is
-        # a no-op here because do_first_statement is empty.
-        DFG=[]
-        for child in root_node.children:
-            if child.type in do_first_statement:
-                temp,states=DFG_javascript(child,index_to_code,states)
-                DFG+=temp
-        for child in root_node.children:
-            if child.type not in do_first_statement:
-                temp,states=DFG_javascript(child,index_to_code,states)
-                DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
 dfg_function={
     'python':DFG_python,
@@ -1291,8 +25,9 @@ dfg_function={
 def calc_dataflow_match(references, candidate, lang):
     return corpus_dataflow_match([references], [candidate], lang)
-def corpus_dataflow_match(references, candidates, lang):
-    LANGUAGE = Language('parser/my-languages.so', lang)
     parser = Parser()
     parser.set_language(LANGUAGE)
     parser = [parser,dfg_function[lang]]

 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
+from .parsercode.DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp
+from .parsercode.utils import (remove_comments_and_docstrings,
+                   tree_to_token_index,
+                   index_to_code_token,
+                   tree_to_variable_index)
 from tree_sitter import Language, Parser
 import pdb
+import os
 dfg_function={
     'python':DFG_python,
 def calc_dataflow_match(references, candidate, lang):
     return corpus_dataflow_match([references], [candidate], lang)
+def corpus_dataflow_match(references, candidates, lang):
+    curr_path = os.path.dirname(os.path.abspath(__file__))
+    LANGUAGE = Language(curr_path + '/parsercode/my-languages.so', lang)
     parser = Parser()
     parser.set_language(LANGUAGE)
     parser = [parser,dfg_function[lang]]

readme.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ python calc_code_bleu.py --refs reference_files --hyp candidate_file --language java (or c_sharp) --params 0.25,0.25,0.25,0.25 (default)

syntax_match.py CHANGED Viewed

@@ -1,1279 +1,13 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 from tree_sitter import Language, Parser
-import re
-from io import StringIO
-import  tokenize
-def remove_comments_and_docstrings(source,lang):
-    if lang in ['python']:
-        """
-        Returns 'source' minus comments and docstrings.
-        """
-        io_obj = StringIO(source)
-        out = ""
-        prev_toktype = tokenize.INDENT
-        last_lineno = -1
-        last_col = 0
-        for tok in tokenize.generate_tokens(io_obj.readline):
-            token_type = tok[0]
-            token_string = tok[1]
-            start_line, start_col = tok[2]
-            end_line, end_col = tok[3]
-            ltext = tok[4]
-            if start_line > last_lineno:
-                last_col = 0
-            if start_col > last_col:
-                out += (" " * (start_col - last_col))
-            # Remove comments:
-            if token_type == tokenize.COMMENT:
-                pass
-            # This series of conditionals removes docstrings:
-            elif token_type == tokenize.STRING:
-                if prev_toktype != tokenize.INDENT:
-            # This is likely a docstring; double-check we're not inside an operator:
-                    if prev_toktype != tokenize.NEWLINE:
-                        if start_col > 0:
-                            out += token_string
-            else:
-                out += token_string
-            prev_toktype = token_type
-            last_col = end_col
-            last_lineno = end_line
-        temp=[]
-        for x in out.split('\n'):
-            if x.strip()!="":
-                temp.append(x)
-        return '\n'.join(temp)
-    elif lang in ['ruby']:
-        return source
-    else:
-        def replacer(match):
-            s = match.group(0)
-            if s.startswith('/'):
-                return " " # note: a space and not an empty string
-            else:
-                return s
-        pattern = re.compile(
-            r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
-            re.DOTALL | re.MULTILINE
-        )
-        temp=[]
-        for x in re.sub(pattern, replacer, source).split('\n'):
-            if x.strip()!="":
-                temp.append(x)
-        return '\n'.join(temp)
-def tree_to_token_index(root_node):
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        return [(root_node.start_point,root_node.end_point)]
-    else:
-        code_tokens=[]
-        for child in root_node.children:
-            code_tokens+=tree_to_token_index(child)
-        return code_tokens
-def tree_to_variable_index(root_node,index_to_code):
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        index=(root_node.start_point,root_node.end_point)
-        _,code=index_to_code[index]
-        if root_node.type!=code:
-            return [(root_node.start_point,root_node.end_point)]
-        else:
-            return []
-    else:
-        code_tokens=[]
-        for child in root_node.children:
-            code_tokens+=tree_to_variable_index(child,index_to_code)
-        return code_tokens
-def index_to_code_token(index,code):
-    start_point=index[0]
-    end_point=index[1]
-    if start_point[0]==end_point[0]:
-        s=code[start_point[0]][start_point[1]:end_point[1]]
-    else:
-        s=""
-        s+=code[start_point[0]][start_point[1]:]
-        for i in range(start_point[0]+1,end_point[0]):
-            s+=code[i]
-        s+=code[end_point[0]][:end_point[1]]
-    return s
-def DFG_python(root_node,index_to_code,states):
-    assignment=['assignment','augmented_assignment','for_in_clause']
-    if_statement=['if_statement']
-    for_statement=['for_statement']
-    while_statement=['while_statement']
-    do_first_statement=['for_in_clause']
-    def_statement=['default_parameter']
-    states=states.copy()
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
-        if root_node.type==code:
-            return [],states
-        elif code in states:
-            return [(code,idx,'comesFrom',[code],states[code].copy())],states
-        else:
-            if root_node.type=='identifier':
-                states[code]=[idx]
-            return [(code,idx,'comesFrom',[],[])],states
-    elif root_node.type in def_statement:
-        name=root_node.child_by_field_name('name')
-        value=root_node.child_by_field_name('value')
-        DFG=[]
-        if value is None:
-            indexs=tree_to_variable_index(name,index_to_code)
-            for index in indexs:
-                idx,code=index_to_code[index]
-                DFG.append((code,idx,'comesFrom',[],[]))
-                states[code]=[idx]
-            return sorted(DFG,key=lambda x:x[1]),states
-        else:
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            temp,states=DFG_python(value,index_to_code,states)
-            DFG+=temp
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in assignment:
-        if root_node.type=='for_in_clause':
-            right_nodes=[root_node.children[-1]]
-            left_nodes=[root_node.child_by_field_name('left')]
-        else:
-            if root_node.child_by_field_name('right') is None:
-                return [],states
-            left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=',']
-            right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=',']
-            if len(right_nodes)!=len(left_nodes):
-                left_nodes=[root_node.child_by_field_name('left')]
-                right_nodes=[root_node.child_by_field_name('right')]
-            if len(left_nodes)==0:
-                left_nodes=[root_node.child_by_field_name('left')]
-            if len(right_nodes)==0:
-                right_nodes=[root_node.child_by_field_name('right')]
-        DFG=[]
-        for node in right_nodes:
-            temp,states=DFG_python(node,index_to_code,states)
-            DFG+=temp
-        for left_node,right_node in zip(left_nodes,right_nodes):
-            left_tokens_index=tree_to_variable_index(left_node,index_to_code)
-            right_tokens_index=tree_to_variable_index(right_node,index_to_code)
-            temp=[]
-            for token1_index in left_tokens_index:
-                idx1,code1=index_to_code[token1_index]
-                temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
-                             [index_to_code[x][0] for x in right_tokens_index]))
-                states[code1]=[idx1]
-            DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in if_statement:
-        DFG=[]
-        current_states=states.copy()
-        others_states=[]
-        tag=False
-        if 'else' in root_node.type:
-            tag=True
-        for child in root_node.children:
-            if 'else' in child.type:
-                tag=True
-            if child.type not in ['elif_clause','else_clause']:
-                temp,current_states=DFG_python(child,index_to_code,current_states)
-                DFG+=temp
-            else:
-                temp,new_states=DFG_python(child,index_to_code,states)
-                DFG+=temp
-                others_states.append(new_states)
-        others_states.append(current_states)
-        if tag is False:
-            others_states.append(states)
-        new_states={}
-        for dic in others_states:
-            for key in dic:
-                if key not in new_states:
-                    new_states[key]=dic[key].copy()
-                else:
-                    new_states[key]+=dic[key]
-        for key in new_states:
-            new_states[key]=sorted(list(set(new_states[key])))
-        return sorted(DFG,key=lambda x:x[1]),new_states
-    elif root_node.type in for_statement:
-        DFG=[]
-        for i in range(2):
-            right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=',']
-            left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=',']
-            if len(right_nodes)!=len(left_nodes):
-                left_nodes=[root_node.child_by_field_name('left')]
-                right_nodes=[root_node.child_by_field_name('right')]
-            if len(left_nodes)==0:
-                left_nodes=[root_node.child_by_field_name('left')]
-            if len(right_nodes)==0:
-                right_nodes=[root_node.child_by_field_name('right')]
-            for node in right_nodes:
-                temp,states=DFG_python(node,index_to_code,states)
-                DFG+=temp
-            for left_node,right_node in zip(left_nodes,right_nodes):
-                left_tokens_index=tree_to_variable_index(left_node,index_to_code)
-                right_tokens_index=tree_to_variable_index(right_node,index_to_code)
-                temp=[]
-                for token1_index in left_tokens_index:
-                    idx1,code1=index_to_code[token1_index]
-                    temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
-                                 [index_to_code[x][0] for x in right_tokens_index]))
-                    states[code1]=[idx1]
-                DFG+=temp
-            if  root_node.children[-1].type=="block":
-                temp,states=DFG_python(root_node.children[-1],index_to_code,states)
-                DFG+=temp
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in while_statement:
-        DFG=[]
-        for i in range(2):
-            for child in root_node.children:
-                temp,states=DFG_python(child,index_to_code,states)
-                DFG+=temp
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    else:
-        DFG=[]
-        for child in root_node.children:
-            if child.type in do_first_statement:
-                temp,states=DFG_python(child,index_to_code,states)
-                DFG+=temp
-        for child in root_node.children:
-            if child.type not in do_first_statement:
-                temp,states=DFG_python(child,index_to_code,states)
-                DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-def DFG_java(root_node,index_to_code,states):
-    assignment=['assignment_expression']
-    def_statement=['variable_declarator']
-    increment_statement=['update_expression']
-    if_statement=['if_statement','else']
-    for_statement=['for_statement']
-    enhanced_for_statement=['enhanced_for_statement']
-    while_statement=['while_statement']
-    do_first_statement=[]
-    states=states.copy()
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
-        if root_node.type==code:
-            return [],states
-        elif code in states:
-            return [(code,idx,'comesFrom',[code],states[code].copy())],states
-        else:
-            if root_node.type=='identifier':
-                states[code]=[idx]
-            return [(code,idx,'comesFrom',[],[])],states
-    elif root_node.type in def_statement:
-        name=root_node.child_by_field_name('name')
-        value=root_node.child_by_field_name('value')
-        DFG=[]
-        if value is None:
-            indexs=tree_to_variable_index(name,index_to_code)
-            for index in indexs:
-                idx,code=index_to_code[index]
-                DFG.append((code,idx,'comesFrom',[],[]))
-                states[code]=[idx]
-            return sorted(DFG,key=lambda x:x[1]),states
-        else:
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            temp,states=DFG_java(value,index_to_code,states)
-            DFG+=temp
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in assignment:
-        left_nodes=root_node.child_by_field_name('left')
-        right_nodes=root_node.child_by_field_name('right')
-        DFG=[]
-        temp,states=DFG_java(right_nodes,index_to_code,states)
-        DFG+=temp
-        name_indexs=tree_to_variable_index(left_nodes,index_to_code)
-        value_indexs=tree_to_variable_index(right_nodes,index_to_code)
-        for index1 in name_indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in value_indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in increment_statement:
-        DFG=[]
-        indexs=tree_to_variable_index(root_node,index_to_code)
-        for index1 in indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in if_statement:
-        DFG=[]
-        current_states=states.copy()
-        others_states=[]
-        flag=False
-        tag=False
-        if 'else' in root_node.type:
-            tag=True
-        for child in root_node.children:
-            if 'else' in child.type:
-                tag=True
-            if child.type not in if_statement and flag is False:
-                temp,current_states=DFG_java(child,index_to_code,current_states)
-                DFG+=temp
-            else:
-                flag=True
-                temp,new_states=DFG_java(child,index_to_code,states)
-                DFG+=temp
-                others_states.append(new_states)
-        others_states.append(current_states)
-        if tag is False:
-            others_states.append(states)
-        new_states={}
-        for dic in others_states:
-            for key in dic:
-                if key not in new_states:
-                    new_states[key]=dic[key].copy()
-                else:
-                    new_states[key]+=dic[key]
-        for key in new_states:
-            new_states[key]=sorted(list(set(new_states[key])))
-        return sorted(DFG,key=lambda x:x[1]),new_states
-    elif root_node.type in for_statement:
-        DFG=[]
-        for child in root_node.children:
-            temp,states=DFG_java(child,index_to_code,states)
-            DFG+=temp
-        flag=False
-        for child in root_node.children:
-            if flag:
-                temp,states=DFG_java(child,index_to_code,states)
-                DFG+=temp
-            elif child.type=="local_variable_declaration":
-                flag=True
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in enhanced_for_statement:
-        name=root_node.child_by_field_name('name')
-        value=root_node.child_by_field_name('value')
-        body=root_node.child_by_field_name('body')
-        DFG=[]
-        for i in range(2):
-            temp,states=DFG_java(value,index_to_code,states)
-            DFG+=temp
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            temp,states=DFG_java(body,index_to_code,states)
-            DFG+=temp
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in while_statement:
-        DFG=[]
-        for i in range(2):
-            for child in root_node.children:
-                temp,states=DFG_java(child,index_to_code,states)
-                DFG+=temp
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    else:
-        DFG=[]
-        for child in root_node.children:
-            if child.type in do_first_statement:
-                temp,states=DFG_java(child,index_to_code,states)
-                DFG+=temp
-        for child in root_node.children:
-            if child.type not in do_first_statement:
-                temp,states=DFG_java(child,index_to_code,states)
-                DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-def DFG_csharp(root_node,index_to_code,states):
-    assignment=['assignment_expression']
-    def_statement=['variable_declarator']
-    increment_statement=['postfix_unary_expression']
-    if_statement=['if_statement','else']
-    for_statement=['for_statement']
-    enhanced_for_statement=['for_each_statement']
-    while_statement=['while_statement']
-    do_first_statement=[]
-    states=states.copy()
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
-        if root_node.type==code:
-            return [],states
-        elif code in states:
-            return [(code,idx,'comesFrom',[code],states[code].copy())],states
-        else:
-            if root_node.type=='identifier':
-                states[code]=[idx]
-            return [(code,idx,'comesFrom',[],[])],states
-    elif root_node.type in def_statement:
-        if len(root_node.children)==2:
-            name=root_node.children[0]
-            value=root_node.children[1]
-        else:
-            name=root_node.children[0]
-            value=None
-        DFG=[]
-        if value is None:
-            indexs=tree_to_variable_index(name,index_to_code)
-            for index in indexs:
-                idx,code=index_to_code[index]
-                DFG.append((code,idx,'comesFrom',[],[]))
-                states[code]=[idx]
-            return sorted(DFG,key=lambda x:x[1]),states
-        else:
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            temp,states=DFG_csharp(value,index_to_code,states)
-            DFG+=temp
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in assignment:
-        left_nodes=root_node.child_by_field_name('left')
-        right_nodes=root_node.child_by_field_name('right')
-        DFG=[]
-        temp,states=DFG_csharp(right_nodes,index_to_code,states)
-        DFG+=temp
-        name_indexs=tree_to_variable_index(left_nodes,index_to_code)
-        value_indexs=tree_to_variable_index(right_nodes,index_to_code)
-        for index1 in name_indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in value_indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in increment_statement:
-        DFG=[]
-        indexs=tree_to_variable_index(root_node,index_to_code)
-        for index1 in indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in if_statement:
-        DFG=[]
-        current_states=states.copy()
-        others_states=[]
-        flag=False
-        tag=False
-        if 'else' in root_node.type:
-            tag=True
-        for child in root_node.children:
-            if 'else' in child.type:
-                tag=True
-            if child.type not in if_statement and flag is False:
-                temp,current_states=DFG_csharp(child,index_to_code,current_states)
-                DFG+=temp
-            else:
-                flag=True
-                temp,new_states=DFG_csharp(child,index_to_code,states)
-                DFG+=temp
-                others_states.append(new_states)
-        others_states.append(current_states)
-        if tag is False:
-            others_states.append(states)
-        new_states={}
-        for dic in others_states:
-            for key in dic:
-                if key not in new_states:
-                    new_states[key]=dic[key].copy()
-                else:
-                    new_states[key]+=dic[key]
-        for key in new_states:
-            new_states[key]=sorted(list(set(new_states[key])))
-        return sorted(DFG,key=lambda x:x[1]),new_states
-    elif root_node.type in for_statement:
-        DFG=[]
-        for child in root_node.children:
-            temp,states=DFG_csharp(child,index_to_code,states)
-            DFG+=temp
-        flag=False
-        for child in root_node.children:
-            if flag:
-                temp,states=DFG_csharp(child,index_to_code,states)
-                DFG+=temp
-            elif child.type=="local_variable_declaration":
-                flag=True
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in enhanced_for_statement:
-        name=root_node.child_by_field_name('left')
-        value=root_node.child_by_field_name('right')
-        body=root_node.child_by_field_name('body')
-        DFG=[]
-        for i in range(2):
-            temp,states=DFG_csharp(value,index_to_code,states)
-            DFG+=temp
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            temp,states=DFG_csharp(body,index_to_code,states)
-            DFG+=temp
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in while_statement:
-        DFG=[]
-        for i in range(2):
-            for child in root_node.children:
-                temp,states=DFG_csharp(child,index_to_code,states)
-                DFG+=temp
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    else:
-        DFG=[]
-        for child in root_node.children:
-            if child.type in do_first_statement:
-                temp,states=DFG_csharp(child,index_to_code,states)
-                DFG+=temp
-        for child in root_node.children:
-            if child.type not in do_first_statement:
-                temp,states=DFG_csharp(child,index_to_code,states)
-                DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-def DFG_ruby(root_node,index_to_code,states):
-    assignment=['assignment','operator_assignment']
-    if_statement=['if','elsif','else','unless','when']
-    for_statement=['for']
-    while_statement=['while_modifier','until']
-    do_first_statement=[]
-    def_statement=['keyword_parameter']
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
-        states=states.copy()
-        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
-        if root_node.type==code:
-            return [],states
-        elif code in states:
-            return [(code,idx,'comesFrom',[code],states[code].copy())],states
-        else:
-            if root_node.type=='identifier':
-                states[code]=[idx]
-            return [(code,idx,'comesFrom',[],[])],states
-    elif root_node.type in def_statement:
-        name=root_node.child_by_field_name('name')
-        value=root_node.child_by_field_name('value')
-        DFG=[]
-        if value is None:
-            indexs=tree_to_variable_index(name,index_to_code)
-            for index in indexs:
-                idx,code=index_to_code[index]
-                DFG.append((code,idx,'comesFrom',[],[]))
-                states[code]=[idx]
-            return sorted(DFG,key=lambda x:x[1]),states
-        else:
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            temp,states=DFG_ruby(value,index_to_code,states)
-            DFG+=temp
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in assignment:
-        left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=',']
-        right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=',']
-        if len(right_nodes)!=len(left_nodes):
-            left_nodes=[root_node.child_by_field_name('left')]
-            right_nodes=[root_node.child_by_field_name('right')]
-        if len(left_nodes)==0:
-            left_nodes=[root_node.child_by_field_name('left')]
-        if len(right_nodes)==0:
-            right_nodes=[root_node.child_by_field_name('right')]
-        if root_node.type=="operator_assignment":
-            left_nodes=[root_node.children[0]]
-            right_nodes=[root_node.children[-1]]
-        DFG=[]
-        for node in right_nodes:
-            temp,states=DFG_ruby(node,index_to_code,states)
-            DFG+=temp
-        for left_node,right_node in zip(left_nodes,right_nodes):
-            left_tokens_index=tree_to_variable_index(left_node,index_to_code)
-            right_tokens_index=tree_to_variable_index(right_node,index_to_code)
-            temp=[]
-            for token1_index in left_tokens_index:
-                idx1,code1=index_to_code[token1_index]
-                temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
-                             [index_to_code[x][0] for x in right_tokens_index]))
-                states[code1]=[idx1]
-            DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in if_statement:
-        DFG=[]
-        current_states=states.copy()
-        others_states=[]
-        tag=False
-        if 'else' in root_node.type:
-            tag=True
-        for child in root_node.children:
-            if 'else' in child.type:
-                tag=True
-            if child.type not in if_statement:
-                temp,current_states=DFG_ruby(child,index_to_code,current_states)
-                DFG+=temp
-            else:
-                temp,new_states=DFG_ruby(child,index_to_code,states)
-                DFG+=temp
-                others_states.append(new_states)
-        others_states.append(current_states)
-        if tag is False:
-            others_states.append(states)
-        new_states={}
-        for dic in others_states:
-            for key in dic:
-                if key not in new_states:
-                    new_states[key]=dic[key].copy()
-                else:
-                    new_states[key]+=dic[key]
-        for key in new_states:
-            new_states[key]=sorted(list(set(new_states[key])))
-        return sorted(DFG,key=lambda x:x[1]),new_states
-    elif root_node.type in for_statement:
-        DFG=[]
-        for i in range(2):
-            left_nodes=[root_node.child_by_field_name('pattern')]
-            right_nodes=[root_node.child_by_field_name('value')]
-            assert len(right_nodes)==len(left_nodes)
-            for node in right_nodes:
-                temp,states=DFG_ruby(node,index_to_code,states)
-                DFG+=temp
-            for left_node,right_node in zip(left_nodes,right_nodes):
-                left_tokens_index=tree_to_variable_index(left_node,index_to_code)
-                right_tokens_index=tree_to_variable_index(right_node,index_to_code)
-                temp=[]
-                for token1_index in left_tokens_index:
-                    idx1,code1=index_to_code[token1_index]
-                    temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
-                                 [index_to_code[x][0] for x in right_tokens_index]))
-                    states[code1]=[idx1]
-                DFG+=temp
-            temp,states=DFG_ruby(root_node.child_by_field_name('body'),index_to_code,states)
-            DFG+=temp
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in while_statement:
-        DFG=[]
-        for i in range(2):
-            for child in root_node.children:
-                temp,states=DFG_ruby(child,index_to_code,states)
-                DFG+=temp
-        dic={}
-        for x in DFG:
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    else:
-        DFG=[]
-        for child in root_node.children:
-            if child.type in do_first_statement:
-                temp,states=DFG_ruby(child,index_to_code,states)
-                DFG+=temp
-        for child in root_node.children:
-            if child.type not in do_first_statement:
-                temp,states=DFG_ruby(child,index_to_code,states)
-                DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-def DFG_go(root_node,index_to_code,states):  # Recursively walks root_node; returns (edge tuples sorted by idx, updated states). Edge = (code, idx, relation, [source codes], [source idxs]).
-    assignment=['assignment_statement',]  # node-type lists below select which branch handles root_node
-    def_statement=['var_spec']
-    increment_statement=['inc_statement']
-    if_statement=['if_statement','else']
-    for_statement=['for_statement']
-    enhanced_for_statement=[]  # not referenced by any branch in this function
-    while_statement=[]  # not referenced by any branch in this function
-    do_first_statement=[]  # empty here, so the fallback branch visits every child in its second loop
-    states=states.copy()  # shallow copy: rebinding keys below does not touch the caller's dict
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':  # leaf or string-like node, comments excluded
-        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]  # index_to_code is keyed by (start_point,end_point) and yields (idx,code)
-        if root_node.type==code:  # node type equals its own text: no edge is emitted
-            return [],states
-        elif code in states:  # text already recorded: link it to the recorded idx list
-            return [(code,idx,'comesFrom',[code],states[code].copy())],states
-        else:
-            if root_node.type=='identifier':  # only identifiers are recorded on first sighting
-                states[code]=[idx]
-            return [(code,idx,'comesFrom',[],[])],states
-    elif root_node.type in def_statement:
-        name=root_node.child_by_field_name('name')
-        value=root_node.child_by_field_name('value')
-        DFG=[]
-        if value is None:  # no 'value' field: each name token gets an edge with no sources
-            indexs=tree_to_variable_index(name,index_to_code)
-            for index in indexs:
-                idx,code=index_to_code[index]
-                DFG.append((code,idx,'comesFrom',[],[]))
-                states[code]=[idx]
-            return sorted(DFG,key=lambda x:x[1]),states
-        else:
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            temp,states=DFG_go(value,index_to_code,states)  # value subtree is processed before the name tokens are rebound
-            DFG+=temp
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))  # one edge per (name token, value token) pair
-                states[code1]=[idx1]  # the name now maps to this occurrence only
-            return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in assignment:
-        left_nodes=root_node.child_by_field_name('left')
-        right_nodes=root_node.child_by_field_name('right')
-        DFG=[]
-        temp,states=DFG_go(right_nodes,index_to_code,states)  # right side is processed before the left tokens are rebound
-        DFG+=temp
-        name_indexs=tree_to_variable_index(left_nodes,index_to_code)
-        value_indexs=tree_to_variable_index(right_nodes,index_to_code)
-        for index1 in name_indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in value_indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))  # one edge per (left token, right token) pair
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in increment_statement:
-        DFG=[]
-        indexs=tree_to_variable_index(root_node,index_to_code)
-        for index1 in indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in indexs:  # same index list on both sides, so each token is also paired with itself
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in if_statement:
-        DFG=[]
-        current_states=states.copy()  # states threaded through children seen before the first nested if/else child
-        others_states=[]  # result states of the remaining children, merged below
-        flag=False  # set once a child of an if_statement type is met; every later child then takes the else path
-        tag=False  # True when 'else' appears in this node's type or in a child's type
-        if 'else' in root_node.type:
-            tag=True
-        for child in root_node.children:
-            if 'else' in child.type:
-                tag=True
-            if child.type not in if_statement and flag is False:
-                temp,current_states=DFG_go(child,index_to_code,current_states)
-                DFG+=temp
-            else:
-                flag=True
-                temp,new_states=DFG_go(child,index_to_code,states)  # starts from the incoming states, not current_states
-                DFG+=temp
-                others_states.append(new_states)
-        others_states.append(current_states)
-        if tag is False:  # no 'else' seen: the incoming states are merged in as well
-            others_states.append(states)
-        new_states={}
-        for dic in others_states:  # concatenate the idx lists of every collected states dict, key by key
-            for key in dic:
-                if key not in new_states:
-                    new_states[key]=dic[key].copy()
-                else:
-                    new_states[key]+=dic[key]
-        for key in states:  # incoming states are folded in here regardless of tag
-            if key not in new_states:
-                new_states[key]=states[key]
-            else:
-                new_states[key]+=states[key]
-        for key in new_states:
-            new_states[key]=sorted(list(set(new_states[key])))  # dedupe and sort each idx list
-        return sorted(DFG,key=lambda x:x[1]),new_states
-    elif root_node.type in for_statement:
-        DFG=[]
-        for child in root_node.children:  # first pass: every child in order
-            temp,states=DFG_go(child,index_to_code,states)
-            DFG+=temp
-        flag=False
-        for child in root_node.children:  # second pass: the for_clause 'update' field, then every child after the for_clause (presumably to model a repeat iteration — confirm)
-            if flag:
-                temp,states=DFG_go(child,index_to_code,states)
-                DFG+=temp
-            elif child.type=="for_clause":
-                if child.child_by_field_name('update') is not None:
-                    temp,states=DFG_go(child.child_by_field_name('update'),index_to_code,states)
-                    DFG+=temp
-                flag=True
-        dic={}
-        for x in DFG:  # merge edges sharing (code,idx,relation) by unioning their source lists
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))  # source codes: deduped, order not sorted
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))  # source idxs: deduped and sorted
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    else:  # any other node type: recurse into children, do_first_statement types first
-        DFG=[]
-        for child in root_node.children:
-            if child.type in do_first_statement:
-                temp,states=DFG_go(child,index_to_code,states)
-                DFG+=temp
-        for child in root_node.children:
-            if child.type not in do_first_statement:
-                temp,states=DFG_go(child,index_to_code,states)
-                DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-def DFG_php(root_node,index_to_code,states):  # Recursively walks root_node; returns (edge tuples sorted by idx, updated states). Edge = (code, idx, relation, [source codes], [source idxs]).
-    assignment=['assignment_expression','augmented_assignment_expression']  # node-type lists below select which branch handles root_node
-    def_statement=['simple_parameter']
-    increment_statement=['update_expression']
-    if_statement=['if_statement','else_clause']
-    for_statement=['for_statement']
-    enhanced_for_statement=['foreach_statement']
-    while_statement=['while_statement']
-    do_first_statement=[]  # empty here, so the fallback branch visits every child in its second loop
-    states=states.copy()  # shallow copy: rebinding keys below does not touch the caller's dict
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':  # leaf or string-like node, comments excluded
-        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]  # index_to_code is keyed by (start_point,end_point) and yields (idx,code)
-        if root_node.type==code:  # node type equals its own text: no edge is emitted
-            return [],states
-        elif code in states:  # text already recorded: link it to the recorded idx list
-            return [(code,idx,'comesFrom',[code],states[code].copy())],states
-        else:
-            if root_node.type=='identifier':  # only nodes typed 'identifier' are recorded on first sighting
-                states[code]=[idx]
-            return [(code,idx,'comesFrom',[],[])],states
-    elif root_node.type in def_statement:
-        name=root_node.child_by_field_name('name')
-        value=root_node.child_by_field_name('default_value')
-        DFG=[]
-        if value is None:  # no 'default_value' field: each name token gets an edge with no sources
-            indexs=tree_to_variable_index(name,index_to_code)
-            for index in indexs:
-                idx,code=index_to_code[index]
-                DFG.append((code,idx,'comesFrom',[],[]))
-                states[code]=[idx]
-            return sorted(DFG,key=lambda x:x[1]),states
-        else:
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            temp,states=DFG_php(value,index_to_code,states)  # default value is processed before the name tokens are rebound
-            DFG+=temp
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))  # one edge per (name token, value token) pair
-                states[code1]=[idx1]  # the name now maps to this occurrence only
-            return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in assignment:
-        left_nodes=root_node.child_by_field_name('left')
-        right_nodes=root_node.child_by_field_name('right')
-        DFG=[]
-        temp,states=DFG_php(right_nodes,index_to_code,states)  # right side is processed before the left tokens are rebound
-        DFG+=temp
-        name_indexs=tree_to_variable_index(left_nodes,index_to_code)
-        value_indexs=tree_to_variable_index(right_nodes,index_to_code)
-        for index1 in name_indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in value_indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))  # one edge per (left token, right token) pair
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in increment_statement:
-        DFG=[]
-        indexs=tree_to_variable_index(root_node,index_to_code)
-        for index1 in indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in indexs:  # same index list on both sides, so each token is also paired with itself
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in if_statement:
-        DFG=[]
-        current_states=states.copy()  # states threaded through children seen before the first nested if/else child
-        others_states=[]  # result states of the remaining children, merged below
-        flag=False  # set once a child of an if_statement type is met; every later child then takes the else path
-        tag=False  # NOTE(review): assigned below but never read in this function
-        if 'else' in root_node.type:
-            tag=True
-        for child in root_node.children:
-            if 'else' in child.type:
-                tag=True
-            if child.type not in if_statement and flag is False:
-                temp,current_states=DFG_php(child,index_to_code,current_states)
-                DFG+=temp
-            else:
-                flag=True
-                temp,new_states=DFG_php(child,index_to_code,states)  # starts from the incoming states, not current_states
-                DFG+=temp
-                others_states.append(new_states)
-        others_states.append(current_states)
-        new_states={}
-        for dic in others_states:  # concatenate the idx lists of every collected states dict, key by key
-            for key in dic:
-                if key not in new_states:
-                    new_states[key]=dic[key].copy()
-                else:
-                    new_states[key]+=dic[key]
-        for key in states:  # incoming states are always folded into the merge
-            if key not in new_states:
-                new_states[key]=states[key]
-            else:
-                new_states[key]+=states[key]
-        for key in new_states:
-            new_states[key]=sorted(list(set(new_states[key])))  # dedupe and sort each idx list
-        return sorted(DFG,key=lambda x:x[1]),new_states
-    elif root_node.type in for_statement:
-        DFG=[]
-        for child in root_node.children:  # first pass: every child in order
-            temp,states=DFG_php(child,index_to_code,states)
-            DFG+=temp
-        flag=False
-        for child in root_node.children:  # second pass: only children after the first 'assignment_expression' child are revisited
-            if flag:
-                temp,states=DFG_php(child,index_to_code,states)
-                DFG+=temp
-            elif child.type=="assignment_expression":
-                flag=True
-        dic={}
-        for x in DFG:  # merge edges sharing (code,idx,relation) by unioning their source lists
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))  # source codes: deduped, order not sorted
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))  # source idxs: deduped and sorted
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in enhanced_for_statement:
-        name=None
-        value=None
-        for child in root_node.children:  # first 'variable_name' child becomes value, the second becomes name
-            if child.type=='variable_name' and value is None:
-                value=child
-            elif child.type=='variable_name' and name is None:
-                name=child
-                break
-        body=root_node.child_by_field_name('body')
-        DFG=[]
-        for i in range(2):  # value, name binding and body are processed twice (presumably to model a repeat iteration — confirm)
-            temp,states=DFG_php(value,index_to_code,states)
-            DFG+=temp
-            name_indexs=tree_to_variable_index(name,index_to_code)  # NOTE(review): name/value stay None if fewer than two 'variable_name' children exist — confirm helper handles None
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-                states[code1]=[idx1]
-            temp,states=DFG_php(body,index_to_code,states)
-            DFG+=temp
-        dic={}
-        for x in DFG:  # merge edges sharing (code,idx,relation) by unioning their source lists
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in while_statement:
-        DFG=[]
-        for i in range(2):  # all children are processed twice, with states carried from the first pass into the second
-            for child in root_node.children:
-                temp,states=DFG_php(child,index_to_code,states)
-                DFG+=temp
-        dic={}
-        for x in DFG:  # merge edges sharing (code,idx,relation) by unioning their source lists
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    else:  # any other node type: recurse into children, do_first_statement types first
-        DFG=[]
-        for child in root_node.children:
-            if child.type in do_first_statement:
-                temp,states=DFG_php(child,index_to_code,states)
-                DFG+=temp
-        for child in root_node.children:
-            if child.type not in do_first_statement:
-                temp,states=DFG_php(child,index_to_code,states)
-                DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
-def DFG_javascript(root_node,index_to_code,states):  # Recursively walks root_node; returns (edge tuples sorted by idx, updated states). Edge = (code, idx, relation, [source codes], [source idxs]).
-    assignment=['assignment_pattern','augmented_assignment_expression']  # node-type lists below select which branch handles root_node
-    def_statement=['variable_declarator']
-    increment_statement=['update_expression']
-    if_statement=['if_statement','else']
-    for_statement=['for_statement']
-    enhanced_for_statement=[]  # not referenced by any branch in this function
-    while_statement=['while_statement']
-    do_first_statement=[]  # empty here, so the fallback branch visits every child in its second loop
-    states=states.copy()  # shallow copy: rebinding keys below does not touch the caller's dict
-    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':  # leaf or string-like node, comments excluded
-        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]  # index_to_code is keyed by (start_point,end_point) and yields (idx,code)
-        if root_node.type==code:  # node type equals its own text: no edge is emitted
-            return [],states
-        elif code in states:  # text already recorded: link it to the recorded idx list
-            return [(code,idx,'comesFrom',[code],states[code].copy())],states
-        else:
-            if root_node.type=='identifier':  # only identifiers are recorded on first sighting
-                states[code]=[idx]
-            return [(code,idx,'comesFrom',[],[])],states
-    elif root_node.type in def_statement:
-        name=root_node.child_by_field_name('name')
-        value=root_node.child_by_field_name('value')
-        DFG=[]
-        if value is None:  # no 'value' field: each name token gets an edge with no sources
-            indexs=tree_to_variable_index(name,index_to_code)
-            for index in indexs:
-                idx,code=index_to_code[index]
-                DFG.append((code,idx,'comesFrom',[],[]))
-                states[code]=[idx]
-            return sorted(DFG,key=lambda x:x[1]),states
-        else:
-            name_indexs=tree_to_variable_index(name,index_to_code)
-            value_indexs=tree_to_variable_index(value,index_to_code)
-            temp,states=DFG_javascript(value,index_to_code,states)  # value subtree is processed before the name tokens are rebound
-            DFG+=temp
-            for index1 in name_indexs:
-                idx1,code1=index_to_code[index1]
-                for index2 in value_indexs:
-                    idx2,code2=index_to_code[index2]
-                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))  # one edge per (name token, value token) pair
-                states[code1]=[idx1]  # the name now maps to this occurrence only
-            return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in assignment:
-        left_nodes=root_node.child_by_field_name('left')
-        right_nodes=root_node.child_by_field_name('right')
-        DFG=[]
-        temp,states=DFG_javascript(right_nodes,index_to_code,states)  # right side is processed before the left tokens are rebound
-        DFG+=temp
-        name_indexs=tree_to_variable_index(left_nodes,index_to_code)
-        value_indexs=tree_to_variable_index(right_nodes,index_to_code)
-        for index1 in name_indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in value_indexs:
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))  # one edge per (left token, right token) pair
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in increment_statement:
-        DFG=[]
-        indexs=tree_to_variable_index(root_node,index_to_code)
-        for index1 in indexs:
-            idx1,code1=index_to_code[index1]
-            for index2 in indexs:  # same index list on both sides, so each token is also paired with itself
-                idx2,code2=index_to_code[index2]
-                DFG.append((code1,idx1,'computedFrom',[code2],[idx2]))
-            states[code1]=[idx1]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in if_statement:
-        DFG=[]
-        current_states=states.copy()  # states threaded through children seen before the first nested if/else child
-        others_states=[]  # result states of the remaining children, merged below
-        flag=False  # set once a child of an if_statement type is met; every later child then takes the else path
-        tag=False  # True when 'else' appears in this node's type or in a child's type
-        if 'else' in root_node.type:
-            tag=True
-        for child in root_node.children:
-            if 'else' in child.type:
-                tag=True
-            if child.type not in if_statement and flag is False:
-                temp,current_states=DFG_javascript(child,index_to_code,current_states)
-                DFG+=temp
-            else:
-                flag=True
-                temp,new_states=DFG_javascript(child,index_to_code,states)  # starts from the incoming states, not current_states
-                DFG+=temp
-                others_states.append(new_states)
-        others_states.append(current_states)
-        if tag is False:  # no 'else' seen: the incoming states are merged in as well
-            others_states.append(states)
-        new_states={}
-        for dic in others_states:  # concatenate the idx lists of every collected states dict, key by key
-            for key in dic:
-                if key not in new_states:
-                    new_states[key]=dic[key].copy()
-                else:
-                    new_states[key]+=dic[key]
-        for key in states:  # incoming states are folded in here regardless of tag
-            if key not in new_states:
-                new_states[key]=states[key]
-            else:
-                new_states[key]+=states[key]
-        for key in new_states:
-            new_states[key]=sorted(list(set(new_states[key])))  # dedupe and sort each idx list
-        return sorted(DFG,key=lambda x:x[1]),new_states
-    elif root_node.type in for_statement:
-        DFG=[]
-        for child in root_node.children:  # first pass: every child in order
-            temp,states=DFG_javascript(child,index_to_code,states)
-            DFG+=temp
-        flag=False
-        for child in root_node.children:  # second pass: only children after the first 'variable_declaration' child are revisited
-            if flag:
-                temp,states=DFG_javascript(child,index_to_code,states)
-                DFG+=temp
-            elif child.type=="variable_declaration":
-                flag=True
-        dic={}
-        for x in DFG:  # merge edges sharing (code,idx,relation) by unioning their source lists
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))  # source codes: deduped, order not sorted
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))  # source idxs: deduped and sorted
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    elif root_node.type in while_statement:
-        DFG=[]
-        for i in range(2):  # all children are processed twice, with states carried from the first pass into the second
-            for child in root_node.children:
-                temp,states=DFG_javascript(child,index_to_code,states)
-                DFG+=temp
-        dic={}
-        for x in DFG:  # merge edges sharing (code,idx,relation) by unioning their source lists
-            if (x[0],x[1],x[2]) not in dic:
-                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
-            else:
-                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
-                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
-        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
-        return sorted(DFG,key=lambda x:x[1]),states
-    else:  # any other node type: recurse into children, do_first_statement types first
-        DFG=[]
-        for child in root_node.children:
-            if child.type in do_first_statement:
-                temp,states=DFG_javascript(child,index_to_code,states)
-                DFG+=temp
-        for child in root_node.children:
-            if child.type not in do_first_statement:
-                temp,states=DFG_javascript(child,index_to_code,states)
-                DFG+=temp
-        return sorted(DFG,key=lambda x:x[1]),states
 dfg_function={
     'python':DFG_python,
@@ -1288,8 +22,9 @@ dfg_function={
 def calc_syntax_match(references, candidate, lang):  # single-sample wrapper: wraps references and candidate in one-element lists and delegates to corpus_syntax_match
     return corpus_syntax_match([references], [candidate], lang)
-def corpus_syntax_match(references, candidates, lang):
-    JAVA_LANGUAGE = Language('parser/my-languages.so', lang)
     parser = Parser()
     parser.set_language(JAVA_LANGUAGE)
     match_count = 0

 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
+from .parsercode.DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp
+from .parsercode.utils import (remove_comments_and_docstrings,
+                   tree_to_token_index,
+                   index_to_code_token,
+                   tree_to_variable_index)
 from tree_sitter import Language, Parser
+import os
 dfg_function={
     'python':DFG_python,
 def calc_syntax_match(references, candidate, lang):  # single-sample wrapper: wraps references and candidate in one-element lists and delegates to corpus_syntax_match
     return corpus_syntax_match([references], [candidate], lang)
+def corpus_syntax_match(references, candidates, lang):
+    curr_path = os.path.dirname(os.path.abspath(__file__))
+    JAVA_LANGUAGE = Language(curr_path + '/parsercode/my-languages.so', lang)
     parser = Parser()
     parser.set_language(JAVA_LANGUAGE)
     match_count = 0

weighted_ngram_match.py CHANGED Viewed

@@ -17,107 +17,9 @@ import sys
 from fractions import Fraction
 import warnings
 from collections import Counter
-import pdb
-from itertools import chain
-def pad_sequence(
-    sequence,
-    n,
-    pad_left=False,
-    pad_right=False,
-    left_pad_symbol=None,
-    right_pad_symbol=None,
-):
-    """
-    Returns a padded sequence of items before ngram extraction.
-        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
-        ['<s>', 1, 2, 3, 4, 5, '</s>']
-        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
-        ['<s>', 1, 2, 3, 4, 5]
-        >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
-        [1, 2, 3, 4, 5, '</s>']
-    :param sequence: the source data to be padded
-    :type sequence: sequence or iter
-    :param n: the degree of the ngrams
-    :type n: int
-    :param pad_left: whether the ngrams should be left-padded
-    :type pad_left: bool
-    :param pad_right: whether the ngrams should be right-padded
-    :type pad_right: bool
-    :param left_pad_symbol: the symbol to use for left padding (default is None)
-    :type left_pad_symbol: any
-    :param right_pad_symbol: the symbol to use for right padding (default is None)
-    :type right_pad_symbol: any
-    :rtype: sequence or iter
-    """
-    sequence = iter(sequence)  # normalise to an iterator; the return value is always an iterator, never the input container
-    if pad_left:
-        sequence = chain((left_pad_symbol,) * (n - 1), sequence)  # n-1 pad symbols in front; an empty tuple when n <= 1
-    if pad_right:
-        sequence = chain(sequence, (right_pad_symbol,) * (n - 1))  # n-1 pad symbols behind; an empty tuple when n <= 1
-    return sequence
-# add a flag to pad the sequence so we get peripheral ngrams?
-def ngrams(
-    sequence,
-    n,
-    pad_left=False,
-    pad_right=False,
-    left_pad_symbol=None,
-    right_pad_symbol=None,
-):
-    """
-    Return the ngrams generated from a sequence of items, as an iterator.
-    For example:
-        >>> from nltk.util import ngrams
-        >>> list(ngrams([1,2,3,4,5], 3))
-        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
-    Wrap with list for a list version of this function.  Set pad_left
-    or pad_right to true in order to get additional ngrams:
-        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
-        [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
-        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
-        [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
-        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
-        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
-        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
-        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
-    :param sequence: the source data to be converted into ngrams
-    :type sequence: sequence or iter
-    :param n: the degree of the ngrams
-    :type n: int
-    :param pad_left: whether the ngrams should be left-padded
-    :type pad_left: bool
-    :param pad_right: whether the ngrams should be right-padded
-    :type pad_right: bool
-    :param left_pad_symbol: the symbol to use for left padding (default is None)
-    :type left_pad_symbol: any
-    :param right_pad_symbol: the symbol to use for right padding (default is None)
-    :type right_pad_symbol: any
-    :rtype: sequence or iter
-    """
-    sequence = pad_sequence(
-        sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol
-    )
-    history = []  # sliding window holding the items of the ngram being built
-    while n > 1:  # pre-load the first n-1 items so the loop below can emit a full ngram per item
-        # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
-        try:
-            next_item = next(sequence)
-        except StopIteration:
-            # no more data, terminate the generator
-            return
-        history.append(next_item)
-        n -= 1
-    for item in sequence:
-        history.append(item)
-        yield tuple(history)  # tuple() snapshots the window before it is mutated again
-        del history[0]  # drop the oldest item so the window advances by one
 def sentence_bleu(
@@ -184,12 +86,12 @@ def sentence_bleu(
     :return: The sentence-level BLEU score.
     :rtype: float
     """
-    return corpus_bleu(
         [references], [hypothesis], weights, smoothing_function, auto_reweigh
     )
-def corpus_bleu(
     list_of_references,
     hypotheses,
     weights=(0.25, 0.25, 0.25, 0.25),

 from fractions import Fraction
 import warnings
 from collections import Counter
+from .utils import ngrams
+import pdb
 def sentence_bleu(
     :return: The sentence-level BLEU score.
     :rtype: float
     """
+    return corpus_weighted_ngram_match(
         [references], [hypothesis], weights, smoothing_function, auto_reweigh
     )
+def corpus_weighted_ngram_match(
     list_of_references,
     hypotheses,
     weights=(0.25, 0.25, 0.25, 0.25),