{ "builder_name": "common_voice_11_0", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "config_name": "ta", "dataset_size": 6138537473, "description": "Common Voice is Mozilla's initiative to help teach machines how real people speak. The dataset currently consists of 16413 validated hours of speech in 100 languages, but more voices and languages are always added.", "download_checksums": { "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/n_shards.json": { "num_bytes": 12179, "checksum": "584aa91a99f678abe7ad1e181e5cdc6af970d20c82bcb63f54e86c4fd2e5a2a8" }, "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/train/ta_train_0.tar": { "num_bytes": 1604730880, "checksum": "ad4bbc42512f1bc9efd98c188e44750ac83b71e4829f4a52d6d9cf7259c0e6cb" }, "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/train/ta_train_1.tar": { "num_bytes": 67246080, "checksum": "230bd2febfe9bc7012d73e52e517240c6a33ae11e3c64fb99086328e2b650fa5" }, "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/dev/ta_dev_0.tar": { "num_bytes": 423659520, "checksum": "80b9977ebcd1b484880dafac7e28c184440a7c2d4558d501240e069d9b4dd628" }, "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/test/ta_test_0.tar": { "num_bytes": 443709440, "checksum": "188a061b89e27517f06f713e5be835f7634242afe5f9cdc828d91fbf0bb6ffd7" }, "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/other/ta_other_0.tar": { "num_bytes": 1563248640, "checksum": "91ca2c227f2d59a9a1817b38adfca9fe14e62484d7b5ffe44ae23811442a2b97" }, "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/other/ta_other_1.tar": { "num_bytes": 1506631680, "checksum": "83809ecfcf1826812817e8d826d240823569aecb447a39a2a38dcb47090fa3fd" }, "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/other/ta_other_2.tar": { "num_bytes": 319129600, "checksum": "6df52e8d650db95f7537b7acfecafc89433a586036c3dda21939a4be87421027" }, "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/ta/invalidated/ta_invalidated_0.tar": { "num_bytes": 227020800, "checksum": "33fe28d8a4e50f48bf0f7a0007179b479d11525467aa0743583e64b246799d1d" }, "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/ta/train.tsv": { "num_bytes": 15058319, "checksum": "f8214e19cb767e9911de46ab64672a39a77814b8c3d9330a71d8f9ff07c12d33" }, "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/ta/dev.tsv": { "num_bytes": 4205514, "checksum": "a3136b8f038de5e90ef79fd814ebb1f29282137d1e850b2db2a354d6cc76c345" }, "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/ta/test.tsv": { "num_bytes": 3936521, "checksum": "f3346901e118f481b32880dcc0dcf52f066fcb70c1355ad49d6bc532c73a6480" }, "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/ta/other.tsv": { "num_bytes": 30745385, "checksum": "18f1ac9e239e40d839a70c962d42e8d75a610f535c476453e6330a4cc5ec1a47" }, "https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/ta/invalidated.tsv": { "num_bytes": 2044387, "checksum": "9b8b0c7a216423f9070824f17fa5722779357d56e0c908db0b3dfc59968514b1" } }, "download_size": 6211378945, "features": { "input_features": { "feature": { "feature": { "dtype": "float32", "_type": "Value" }, "_type": "Sequence" }, "_type": "Sequence" }, "input_length": { "dtype": "float64", "_type": "Value" }, "labels": { "feature": { "dtype": "int64", "_type": "Value" }, "_type": "Sequence" } }, "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://creativecommons.org/publicdomain/zero/1.0/", "size_in_bytes": 12349916418, "splits": { "train": { "name": "train", "num_bytes": 1667970011, "num_examples": 41710, "shard_lengths": [ 14000, 13000, 12000, 2710 ], "dataset_name": "common_voice_11_0" }, "validation": { "name": "validation", "num_bytes": 422495922, "num_examples": 11758, "dataset_name": "common_voice_11_0" }, "test": { "name": "test", "num_bytes": 442244039, "num_examples": 11815, "dataset_name": "common_voice_11_0" }, "other": { "name": "other", "num_bytes": 3379297630, "num_examples": 87993, "shard_lengths": [ 13000, 13000, 14000, 14000, 15000, 13000, 5993 ], "dataset_name": "common_voice_11_0" }, "invalidated": { "name": "invalidated", "num_bytes": 226529871, "num_examples": 5575, "dataset_name": "common_voice_11_0" } }, "version": { "version_str": "11.0.0", "major": 11, "minor": 0, "patch": 0 } }