File size: 2,068 Bytes
6a62ffb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
from typing import List


logger = logging.getLogger(__name__)


def uniform(dataset_sizes: List[int]):
    """Return one weight of ``1.0`` per entry of *dataset_sizes*.

    The sizes themselves are ignored; only the number of entries matters,
    so every dataset receives the same weight.
    """
    return [1.0 for _ in range(len(dataset_sizes))]


def temperature_sampling(dataset_sizes, temp):
    """Compute temperature-scaled weights from dataset sizes.

    Each weight is ``(size / sum(dataset_sizes)) ** (1.0 / temp)``. The
    returned weights are not re-normalized, so they only sum to 1 when
    ``temp == 1``.

    Raises ``ZeroDivisionError`` for a non-empty input whose sizes sum to
    zero, or when ``temp`` is zero. An empty input yields an empty list.
    """
    total_size = sum(dataset_sizes)
    weights = []
    for size in dataset_sizes:
        proportion = size / total_size
        weights.append(proportion ** (1.0 / temp))
    return weights


def make_temperature_sampling(temp=1.0):
    """Build a one-argument sampling function with *temp* fixed.

    The returned callable takes ``dataset_sizes`` and forwards it, together
    with the captured *temp*, to ``temperature_sampling``.
    """
    return lambda dataset_sizes: temperature_sampling(dataset_sizes, temp)


def make_ratio_sampling(ratios):
    """Build a sampling function that always returns the given *ratios*.

    The returned callable accepts ``dataset_sizes`` for signature
    compatibility with the other sampling functions but ignores it, and
    hands back the same *ratios* object that was passed in (no copy).
    """
    return lambda dataset_sizes: ratios


class SamplingMethod:
    """Select a dataset sampling-weight function from command-line options.

    An instance keeps the parsed ``args`` and the ``task`` it was built
    with; ``sampling_method_selector`` maps ``args.sampling_method`` to one
    of the module's sampling functions.
    """

    @staticmethod
    def add_arguments(parser):
        """Register ``--sampling-method`` and ``--sampling-temperature``."""
        method_choices = [
            "uniform",
            "temperature",
            "concat",
            "RoundRobin",
        ]
        parser.add_argument(
            "--sampling-method",
            type=str,
            choices=method_choices,
            default="concat",
            help="The method to sample data per language pairs",
        )
        parser.add_argument(
            "--sampling-temperature",
            type=float,
            default=1.5,
            help="only work with --sampling-method temperature",
        )

    @staticmethod
    def build_sampler(args, task):
        """Create a ``SamplingMethod`` for the given *args* and *task*."""
        return SamplingMethod(args, task)

    def __init__(self, args, task):
        """Store *args* and *task* without further processing."""
        self.task = task
        self.args = args

    def is_adaptive(self):
        """Report whether sampling is adaptive; always ``False`` here."""
        return False

    def sampling_method_selector(self):
        """Return the sampling function for the configured method.

        Returns ``uniform`` for ``"uniform"``, a temperature sampler built
        from ``args.sampling_temperature`` for ``"temperature"`` (or
        whenever ``is_adaptive()`` is true), and ``None`` otherwise.
        """
        args = self.args
        logger.info(f"selected sampler: {args.sampling_method}")

        chosen = args.sampling_method
        if chosen == "uniform":
            return uniform
        if chosen == "temperature" or self.is_adaptive():
            temperature = float(args.sampling_temperature)
            return make_temperature_sampling(temperature)
        # default to concatenating all datasets together
        return None