File size: 5,928 Bytes
9e49e62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
{
  "builder_name": "common_voice_11_0",
  "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n",
  "config_name": "ta",
  "dataset_size": 6138537473,
  "description": "Common Voice is Mozilla's initiative to help teach machines how real people speak. The dataset currently consists of 16413 validated hours of speech  in 100 languages, but more voices and languages are always added.",
  "download_checksums": {
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/n_shards.json": {
      "num_bytes": 12179,
      "checksum": "584aa91a99f678abe7ad1e181e5cdc6af970d20c82bcb63f54e86c4fd2e5a2a8"
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/train/ta_train_0.tar": {
      "num_bytes": 1604730880,
      "checksum": "ad4bbc42512f1bc9efd98c188e44750ac83b71e4829f4a52d6d9cf7259c0e6cb"
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/train/ta_train_1.tar": {
      "num_bytes": 67246080,
      "checksum": "230bd2febfe9bc7012d73e52e517240c6a33ae11e3c64fb99086328e2b650fa5"
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/dev/ta_dev_0.tar": {
      "num_bytes": 423659520,
      "checksum": "80b9977ebcd1b484880dafac7e28c184440a7c2d4558d501240e069d9b4dd628"
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/test/ta_test_0.tar": {
      "num_bytes": 443709440,
      "checksum": "188a061b89e27517f06f713e5be835f7634242afe5f9cdc828d91fbf0bb6ffd7"
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/other/ta_other_0.tar": {
      "num_bytes": 1563248640,
      "checksum": "91ca2c227f2d59a9a1817b38adfca9fe14e62484d7b5ffe44ae23811442a2b97"
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/other/ta_other_1.tar": {
      "num_bytes": 1506631680,
      "checksum": "83809ecfcf1826812817e8d826d240823569aecb447a39a2a38dcb47090fa3fd"
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/other/ta_other_2.tar": {
      "num_bytes": 319129600,
      "checksum": "6df52e8d650db95f7537b7acfecafc89433a586036c3dda21939a4be87421027"
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/invalidated/ta_invalidated_0.tar": {
      "num_bytes": 227020800,
      "checksum": "33fe28d8a4e50f48bf0f7a0007179b479d11525467aa0743583e64b246799d1d"
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/ta/train.tsv": {
      "num_bytes": 15058319,
      "checksum": "f8214e19cb767e9911de46ab64672a39a77814b8c3d9330a71d8f9ff07c12d33"
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/ta/dev.tsv": {
      "num_bytes": 4205514,
      "checksum": "a3136b8f038de5e90ef79fd814ebb1f29282137d1e850b2db2a354d6cc76c345"
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/ta/test.tsv": {
      "num_bytes": 3936521,
      "checksum": "f3346901e118f481b32880dcc0dcf52f066fcb70c1355ad49d6bc532c73a6480"
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/ta/other.tsv": {
      "num_bytes": 30745385,
      "checksum": "18f1ac9e239e40d839a70c962d42e8d75a610f535c476453e6330a4cc5ec1a47"
    },
    "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/ta/invalidated.tsv": {
      "num_bytes": 2044387,
      "checksum": "9b8b0c7a216423f9070824f17fa5722779357d56e0c908db0b3dfc59968514b1"
    }
  },
  "download_size": 6211378945,
  "features": {
    "input_features": {
      "feature": {
        "feature": {
          "dtype": "float32",
          "_type": "Value"
        },
        "_type": "Sequence"
      },
      "_type": "Sequence"
    },
    "input_length": {
      "dtype": "float64",
      "_type": "Value"
    },
    "labels": {
      "feature": {
        "dtype": "int64",
        "_type": "Value"
      },
      "_type": "Sequence"
    }
  },
  "homepage": "https://commonvoice.mozilla.org/en/datasets",
  "license": "https://creativecommons.org/publicdomain/zero/1.0/",
  "size_in_bytes": 12349916418,
  "splits": {
    "train": {
      "name": "train",
      "num_bytes": 1667970011,
      "num_examples": 41710,
      "shard_lengths": [
        14000,
        13000,
        12000,
        2710
      ],
      "dataset_name": "common_voice_11_0"
    },
    "validation": {
      "name": "validation",
      "num_bytes": 422495922,
      "num_examples": 11758,
      "dataset_name": "common_voice_11_0"
    },
    "test": {
      "name": "test",
      "num_bytes": 442244039,
      "num_examples": 11815,
      "dataset_name": "common_voice_11_0"
    },
    "other": {
      "name": "other",
      "num_bytes": 3379297630,
      "num_examples": 87993,
      "shard_lengths": [
        13000,
        13000,
        14000,
        14000,
        15000,
        13000,
        5993
      ],
      "dataset_name": "common_voice_11_0"
    },
    "invalidated": {
      "name": "invalidated",
      "num_bytes": 226529871,
      "num_examples": 5575,
      "dataset_name": "common_voice_11_0"
    }
  },
  "version": {
    "version_str": "11.0.0",
    "major": 11,
    "minor": 0,
    "patch": 0
  }
}