Spaces:
Runtime error
Runtime error
Anonymous
committed on
Commit
·
32eac11
1
Parent(s):
f475b49
add utils
Browse files
utils/languages_by_word_count.csv
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Unnamed: 0,Language,number of words,percentage of total words
|
2 |
+
0,English,181014683608,92.64708%
|
3 |
+
1,French,3553061536,1.81853%
|
4 |
+
2,German,2870869396,1.46937%
|
5 |
+
3,Spanish,1510070974,0.77289%
|
6 |
+
4,Italian,1187784217,0.60793%
|
7 |
+
5,Portuguese,1025413869,0.52483%
|
8 |
+
6,Dutch,669055061,0.34244%
|
9 |
+
7,Russian,368157074,0.18843%
|
10 |
+
8,Romanian,308182352,0.15773%
|
11 |
+
9,Polish,303812362,0.15550%
|
12 |
+
10,Finnish,221644679,0.11344%
|
13 |
+
11,Danish,221551540,0.11339%
|
14 |
+
12,Swedish,220920577,0.11307%
|
15 |
+
13,Japanese,217047918,0.11109%
|
16 |
+
14,Norwegian,212193299,0.10860%
|
17 |
+
15,Chinese,193517396,0.09905%
|
18 |
+
16,Czech,139918438,0.07161%
|
19 |
+
17,Hungarian,127224375,0.06512%
|
20 |
+
18,Indonesian,116930321,0.05985%
|
21 |
+
19,Turkish,116141938,0.05944%
|
22 |
+
20,Croatian,101613675,0.05201%
|
23 |
+
21,Vietnamese,83077650,0.04252%
|
24 |
+
22,Greek,61607673,0.03153%
|
25 |
+
23,Arabic,60839973,0.03114%
|
26 |
+
24,Serbian,52875283,0.02706%
|
27 |
+
25,Chinese (Traditional),38583893,0.01975%
|
28 |
+
26,Catalan,35126650,0.01798%
|
29 |
+
27,Korean,33147663,0.01697%
|
30 |
+
28,Slovak,27957963,0.01431%
|
31 |
+
29,Thai,26806557,0.01372%
|
32 |
+
30,Slovenian,26037337,0.01333%
|
33 |
+
31,Estonian,20718080,0.01060%
|
34 |
+
32,Persian,16731301,0.00856%
|
35 |
+
33,Hebrew,15027640,0.00769%
|
36 |
+
34,Ukrainian,14905898,0.00763%
|
37 |
+
35,Malay,13389340,0.00685%
|
38 |
+
36,Latvian,13290098,0.00680%
|
39 |
+
37,Bosnian,13160941,0.00674%
|
40 |
+
38,Lithuanian,12921255,0.00661%
|
41 |
+
39,Icelandic,12792837,0.00655%
|
42 |
+
40,Hindi,9434632,0.00483%
|
43 |
+
41,Albanian,9253803,0.00474%
|
44 |
+
42,Filipino,8650331,0.00443%
|
45 |
+
43,Galician,6947527,0.00356%
|
46 |
+
44,Javanese,6604056,0.00338%
|
47 |
+
45,Bulgarian,5919807,0.00303%
|
48 |
+
46,Afrikaans,5461216,0.00280%
|
49 |
+
47,Tamil,5163171,0.00264%
|
50 |
+
48,Marathi,3660217,0.00187%
|
51 |
+
49,Welsh,3459671,0.00177%
|
52 |
+
50,Malayalam,3227746,0.00165%
|
53 |
+
51,Bangla,3003033,0.00154%
|
54 |
+
52,Irish,2878943,0.00147%
|
55 |
+
53,Azerbaijani,2496202,0.00128%
|
56 |
+
54,Kannada,1913389,0.00098%
|
57 |
+
55,Burmese,1853421,0.00095%
|
58 |
+
56,Telugu,1638366,0.00084%
|
59 |
+
57,Uzbek,1458861,0.00075%
|
60 |
+
58,Kinyarwanda,1430208,0.00073%
|
61 |
+
59,Cebuano,1329456,0.00068%
|
62 |
+
60,Nepali,1120450,0.00057%
|
63 |
+
61,Kurdish,1091032,0.00056%
|
64 |
+
62,Basque,1048905,0.00054%
|
65 |
+
63,Khmer,1041164,0.00053%
|
66 |
+
64,Georgian,924256,0.00047%
|
67 |
+
65,Scottish Gaelic,841970,0.00043%
|
68 |
+
66,Armenian,840171,0.00043%
|
69 |
+
67,Maltese,748610,0.00038%
|
70 |
+
68,Sinhala,708343,0.00036%
|
71 |
+
69,Punjabi,703086,0.00036%
|
72 |
+
70,Urdu,689768,0.00035%
|
73 |
+
71,Kazakh,670231,0.00034%
|
74 |
+
72,Swahili,585858,0.00030%
|
75 |
+
73,Southern Sotho,538257,0.00028%
|
76 |
+
74,Belarusian,533405,0.00027%
|
77 |
+
75,Macedonian,529413,0.00027%
|
78 |
+
76,Malagasy,507043,0.00026%
|
79 |
+
77,Gujarati,494798,0.00025%
|
80 |
+
78,Lao,449476,0.00023%
|
81 |
+
79,Haitian Creole,430911,0.00022%
|
82 |
+
80,Ganda,261217,0.00013%
|
83 |
+
81,Yiddish,227609,0.00012%
|
84 |
+
82,Tajik,210167,0.00011%
|
85 |
+
83,Sundanese,208819,0.00011%
|
86 |
+
84,Hmong,175972,0.00009%
|
87 |
+
85,Nyanja,161994,0.00008%
|
88 |
+
86,Odia,131688,0.00007%
|
89 |
+
87,Divehi,112819,0.00006%
|
90 |
+
88,Kyrgyz,91289,0.00005%
|
91 |
+
89,Bihari languages,48094,0.00002%
|
92 |
+
90,Unknown language [xx] (Gothic),48025,0.00002%
|
93 |
+
91,Unknown language [xx] (Runic),37558,0.00002%
|
94 |
+
92,Inuktitut,31142,0.00002%
|
95 |
+
93,Syriac,21482,0.00001%
|
96 |
+
94,Mongolian,7779,0.00000%
|
97 |
+
95,Unknown language [xx] (Phoenician),4343,0.00000%
|
98 |
+
96,Unknown language [xx] (Unknown Script [Qaai]),4185,0.00000%
|
99 |
+
97,Unknown language [xx] (Egyptian hieroglyphs),3395,0.00000%
|
100 |
+
98,Unknown language [xx] (N’Ko),3338,0.00000%
|
101 |
+
99,Unknown language [xx] (Tifinagh),3277,0.00000%
|
102 |
+
100,Unknown language [xx] (Chakma),2608,0.00000%
|
103 |
+
101,Unknown language [xx] (Yi),2357,0.00000%
|
104 |
+
102,Cherokee,2315,0.00000%
|
105 |
+
103,Unknown language [xx] (Phags-pa),1750,0.00000%
|
106 |
+
104,Unknown language [xx] (Tai Viet),1622,0.00000%
|
107 |
+
105,Unknown language [xx] (Deseret),1504,0.00000%
|
108 |
+
106,Unknown language [xx] (Javanese),1448,0.00000%
|
109 |
+
107,Unknown language [xx] (Sundanese),780,0.00000%
|
110 |
+
108,Unknown language [xx] (Coptic),707,0.00000%
|
111 |
+
109,Unknown language [xx] (Glagolitic),673,0.00000%
|
112 |
+
110,Unknown language [xx] (Ol Chiki),573,0.00000%
|
113 |
+
111,Unknown language [xx] (Shavian),542,0.00000%
|
114 |
+
112,Unknown language [xx] (Samaritan),313,0.00000%
|
115 |
+
113,Unknown language [xx] (Avestan),213,0.00000%
|
116 |
+
114,Unknown language [xx] (Bopomofo),188,0.00000%
|
117 |
+
115,Unknown language [xx] (Linear B),156,0.00000%
|
118 |
+
116,Unknown language [xx] (Ogham),84,0.00000%
|
119 |
+
117,Unknown language [xx] (Cham),49,0.00000%
|