tokenizer-arena / stats /compression_rate /ClueAI.ChatYuan-large-v2 @ cc100.fa.diff.json
xu-song's picture
add compression_rate details
a4208a2
[
{
"text": "آشپزخانه کوچک من: February 2012",
"decoded_text": "<unk>ا<unk> <unk> <unk>: February 2012",
"diff": [
"replace text[0:5] --> decoded_text[0:5] 'آشپزخ' --> '<unk>'",
"replace text[6:8] --> decoded_text[6:11] 'نه' --> '<unk>'",
"replace text[9:13] --> decoded_text[12:17] 'کوچک' --> '<unk>'",
"replace text[14:16] --> decoded_text[18:23] 'من' --> '<unk>'"
],
"n_oov_chars": 13,
"oov_ratio": 0.41935483870967744,
"oov_charset": "[\"آ\", \"ش\", \"پ\", \"ز\", \"خ\", \"ن\", \"ه\", \"ک\", \"و\", \"چ\", \"م\"]"
},
{
"text": "آشپزخانه کوچک من",
"decoded_text": "<unk>ا<unk> <unk> <unk>",
"diff": [
"replace text[0:5] --> decoded_text[0:5] 'آشپزخ' --> '<unk>'",
"replace text[6:8] --> decoded_text[6:11] 'نه' --> '<unk>'",
"replace text[9:13] --> decoded_text[12:17] 'کوچک' --> '<unk>'",
"replace text[14:16] --> decoded_text[18:23] 'من' --> '<unk>'"
],
"n_oov_chars": 13,
"oov_ratio": 0.8125,
"oov_charset": "[\"آ\", \"ش\", \"پ\", \"ز\", \"خ\", \"ن\", \"ه\", \"ک\", \"و\", \"چ\", \"م\"]"
},
{
"text": "بکینگ پودر:2 قاشق چای خوری",
"decoded_text": "<unk> <unk>:2 <unk>ا<unk> <unk>ا<unk> <unk>",
"diff": [
"replace text[0:5] --> decoded_text[0:5] 'بکینگ' --> '<unk>'",
"replace text[6:10] --> decoded_text[6:11] 'پودر' --> '<unk>'",
"replace text[13:14] --> decoded_text[14:19] 'ق' --> '<unk>'",
"replace text[15:17] --> decoded_text[20:25] 'شق' --> '<unk>'",
"replace text[18:19] --> decoded_text[26:31] 'چ' --> '<unk>'",
"replace text[20:21] --> decoded_text[32:37] 'ی' --> '<unk>'",
"replace text[22:26] --> decoded_text[38:43] 'خوری' --> '<unk>'"
],
"n_oov_chars": 18,
"oov_ratio": 0.6923076923076923,
"oov_charset": "[\"ب\", \"ک\", \"ی\", \"ن\", \"گ\", \"پ\", \"و\", \"د\", \"ر\", \"ق\", \"ش\", \"چ\", \"خ\"]"
},
{
"text": "تخم مرغ:2 عدد بزرگ",
"decoded_text": "<unk> <unk>:2 <unk> <unk>",
"diff": [
"replace text[0:3] --> decoded_text[0:5] 'تخم' --> '<unk>'",
"replace text[4:7] --> decoded_text[6:11] 'مرغ' --> '<unk>'",
"replace text[10:13] --> decoded_text[14:19] 'عدد' --> '<unk>'",
"replace text[14:18] --> decoded_text[20:25] 'بزرگ' --> '<unk>'"
],
"n_oov_chars": 13,
"oov_ratio": 0.7222222222222222,
"oov_charset": "[\"ت\", \"خ\", \"م\", \"ر\", \"غ\", \"ع\", \"د\", \"ب\", \"ز\", \"گ\"]"
},
{
"text": "کره:225 گرم به دمای اتاق رسیده",
"decoded_text": "<unk>:225 <unk> <unk> <unk>ا<unk> ا<unk>ا<unk> <unk>",
"diff": [
"replace text[0:3] --> decoded_text[0:5] 'کره' --> '<unk>'",
"replace text[8:11] --> decoded_text[10:15] 'گرم' --> '<unk>'",
"replace text[12:14] --> decoded_text[16:21] 'به' --> '<unk>'",
"replace text[15:17] --> decoded_text[22:27] 'دم' --> '<unk>'",
"replace text[18:19] --> decoded_text[28:33] 'ی' --> '<unk>'",
"replace text[21:22] --> decoded_text[35:40] 'ت' --> '<unk>'",
"replace text[23:24] --> decoded_text[41:46] 'ق' --> '<unk>'",
"replace text[25:30] --> decoded_text[47:52] 'رسیده' --> '<unk>'"
],
"n_oov_chars": 18,
"oov_ratio": 0.6,
"oov_charset": "[\"ک\", \"ر\", \"ه\", \"گ\", \"م\", \"ب\", \"د\", \"ی\", \"ت\", \"ق\", \"س\"]"
},
{
"text": "شکر:1و1/2 پیمانه+ 3 قاشق غذا خوری",
"decoded_text": "<unk>:1<unk>1/2 <unk>ا<unk>+ 3 <unk>ا<unk> <unk>ا <unk>",
"diff": [
"replace text[0:3] --> decoded_text[0:5] 'شکر' --> '<unk>'",
"replace text[5:6] --> decoded_text[7:12] 'و' --> '<unk>'",
"replace text[10:13] --> decoded_text[16:21] 'پیم' --> '<unk>'",
"replace text[14:16] --> decoded_text[22:27] 'نه' --> '<unk>'",
"replace text[20:21] --> decoded_text[31:36] 'ق' --> '<unk>'",
"replace text[22:24] --> decoded_text[37:42] 'شق' --> '<unk>'",
"replace text[25:27] --> decoded_text[43:48] 'غذ' --> '<unk>'",
"replace text[29:33] --> decoded_text[50:55] 'خوری' --> '<unk>'"
],
"n_oov_chars": 18,
"oov_ratio": 0.5454545454545454,
"oov_charset": "[\"ش\", \"ک\", \"ر\", \"و\", \"پ\", \"ی\", \"م\", \"ن\", \"ه\", \"ق\", \"غ\", \"ذ\", \"خ\"]"
},
{
"text": "پودر دارچین:2 و1/2قاشق چای خوری",
"decoded_text": "<unk> <unk>ا<unk>:2 <unk>1/2<unk>ا<unk> <unk>ا<unk> <unk>",
"diff": [
"replace text[0:4] --> decoded_text[0:5] 'پودر' --> '<unk>'",
"replace text[5:6] --> decoded_text[6:11] 'د' --> '<unk>'",
"replace text[7:11] --> decoded_text[12:17] 'رچین' --> '<unk>'",
"replace text[14:15] --> decoded_text[20:25] 'و' --> '<unk>'",
"replace text[18:19] --> decoded_text[28:33] 'ق' --> '<unk>'",
"replace text[20:22] --> decoded_text[34:39] 'شق' --> '<unk>'",
"replace text[23:24] --> decoded_text[40:45] 'چ' --> '<unk>'",
"replace text[25:26] --> decoded_text[46:51] 'ی' --> '<unk>'",
"replace text[27:31] --> decoded_text[52:57] 'خوری' --> '<unk>'"
],
"n_oov_chars": 19,
"oov_ratio": 0.6129032258064516,
"oov_charset": "[\"پ\", \"و\", \"د\", \"ر\", \"چ\", \"ی\", \"ن\", \"ق\", \"ش\", \"خ\"]"
},
{
"text": "فر رو روی 350 درجه فارنهایت روشن کنید",
"decoded_text": "<unk> <unk> <unk> 350 <unk> <unk>ا<unk>ا<unk> <unk> <unk>",
"diff": [
"replace text[0:2] --> decoded_text[0:5] 'فر' --> '<unk>'",
"replace text[3:5] --> decoded_text[6:11] 'رو' --> '<unk>'",
"replace text[6:9] --> decoded_text[12:17] 'روی' --> '<unk>'",
"replace text[14:18] --> decoded_text[22:27] 'درجه' --> '<unk>'",
"replace text[19:20] --> decoded_text[28:33] 'ف' --> '<unk>'",
"replace text[21:24] --> decoded_text[34:39] 'رنه' --> '<unk>'",
"replace text[25:27] --> decoded_text[40:45] 'یت' --> '<unk>'",
"replace text[28:32] --> decoded_text[46:51] 'روشن' --> '<unk>'",
"replace text[33:37] --> decoded_text[52:57] 'کنید' --> '<unk>'"
],
"n_oov_chars": 25,
"oov_ratio": 0.6756756756756757,
"oov_charset": "[\"ف\", \"ر\", \"و\", \"ی\", \"د\", \"ج\", \"ه\", \"ن\", \"ت\", \"ش\", \"ک\"]"
},
{
"text": "کره رو با شکر هم بزنید تا یکدست و کرمی بشه تخم مرغها رو دونه دونه اضافه کنید و هم بزنید",
"decoded_text": "<unk> <unk> <unk>ا <unk> <unk> <unk> <unk>ا <unk> <unk> <unk> <unk> <unk> <unk>ا <unk> <unk> <unk> ا<unk>ا<unk> <unk> <unk> <unk> <unk>",
"diff": [
"replace text[0:3] --> decoded_text[0:5] 'کره' --> '<unk>'",
"replace text[4:6] --> decoded_text[6:11] 'رو' --> '<unk>'",
"replace text[7:8] --> decoded_text[12:17] 'ب' --> '<unk>'",
"replace text[10:13] --> decoded_text[19:24] 'شکر' --> '<unk>'",
"replace text[14:16] --> decoded_text[25:30] 'هم' --> '<unk>'",
"replace text[17:22] --> decoded_text[31:36] 'بزنید' --> '<unk>'",
"replace text[23:24] --> decoded_text[37:42] 'ت' --> '<unk>'",
"replace text[26:31] --> decoded_text[44:49] 'یکدست' --> '<unk>'",
"replace text[32:33] --> decoded_text[50:55] 'و' --> '<unk>'",
"replace text[34:38] --> decoded_text[56:61] 'کرمی' --> '<unk>'",
"replace text[39:42] --> decoded_text[62:67] 'بشه' --> '<unk>'",
"replace text[43:46] --> decoded_text[68:73] 'تخم' --> '<unk>'",
"replace text[47:51] --> decoded_text[74:79] 'مرغه' --> '<unk>'",
"replace text[53:55] --> decoded_text[81:86] 'رو' --> '<unk>'",
"replace text[56:60] --> decoded_text[87:92] 'دونه' --> '<unk>'",
"replace text[61:65] --> decoded_text[93:98] 'دونه' --> '<unk>'",
"replace text[67:68] --> decoded_text[100:105] 'ض' --> '<unk>'",
"replace text[69:71] --> decoded_text[106:111] 'فه' --> '<unk>'",
"replace text[72:76] --> decoded_text[112:117] 'کنید' --> '<unk>'",
"replace text[77:78] --> decoded_text[118:123] 'و' --> '<unk>'",
"replace text[79:81] --> decoded_text[124:129] 'هم' --> '<unk>'",
"replace text[82:87] --> decoded_text[130:135] 'بزنید' --> '<unk>'"
],
"n_oov_chars": 62,
"oov_ratio": 0.7126436781609196,
"oov_charset": "[\"ک\", \"ر\", \"ه\", \"و\", \"ب\", \"ش\", \"م\", \"ز\", \"ن\", \"ی\", \"د\", \"ت\", \"س\", \"خ\", \"غ\", \"ض\", \"ف\"]"
},
{
"text": "ارد و بکینگ پودر و نمک رو الک کنید و اضافه کنید",
"decoded_text": "ا<unk> <unk> <unk> <unk> <unk> <unk> <unk> ال<unk> <unk> <unk> ا<unk>ا<unk> <unk>",
"diff": [
"replace text[1:3] --> decoded_text[1:6] 'رد' --> '<unk>'",
"replace text[4:5] --> decoded_text[7:12] 'و' --> '<unk>'",
"replace text[6:11] --> decoded_text[13:18] 'بکینگ' --> '<unk>'",
"replace text[12:16] --> decoded_text[19:24] 'پودر' --> '<unk>'",
"replace text[17:18] --> decoded_text[25:30] 'و' --> '<unk>'",
"replace text[19:22] --> decoded_text[31:36] 'نمک' --> '<unk>'",
"replace text[23:25] --> decoded_text[37:42] 'رو' --> '<unk>'",
"replace text[28:29] --> decoded_text[45:50] 'ک' --> '<unk>'",
"replace text[30:34] --> decoded_text[51:56] 'کنید' --> '<unk>'",
"replace text[35:36] --> decoded_text[57:62] 'و' --> '<unk>'",
"replace text[38:39] --> decoded_text[64:69] 'ض' --> '<unk>'",
"replace text[40:42] --> decoded_text[70:75] 'فه' --> '<unk>'",
"replace text[43:47] --> decoded_text[76:81] 'کنید' --> '<unk>'"
],
"n_oov_chars": 31,
"oov_ratio": 0.6595744680851063,
"oov_charset": "[\"ر\", \"د\", \"و\", \"ب\", \"ک\", \"ی\", \"ن\", \"گ\", \"پ\", \"م\", \"ض\", \"ف\", \"ه\"]"
}
]