---
license: apache-2.0
language:
  - zh
  - en
pipeline_tag: text-generation
inference: false
tags:
  - chinese
---

# macrogpt-tokenizer

## Explain

- The base tokenizer comes from baichuan-7B; this tokenizer adds some math symbols and related special characters as extra tokens, listed below (a usage sketch follows the token list).
"approx": 64000,
"arccos": 64001,
"arcsin": 64002,
"arctan": 64003,
"backsim": 64004,
"begin{matrix}": 64005,
"begin{vmatrix}": 64006,
"beta": 64007,
"cdot": 64008,
"cdots": 64009,
"cong": 64010,
"delta": 64011,
"dot": 64012,
"downarrow": 64013,
"end{matrix}": 64014,
"end{vmatrix}": 64015,
"exists": 64016,
"forall": 64017,
"gamma": 64018,
"geq": 64019,
"infty": 64020,
"lambda": 64021,
"left.": 64022,
"left[": 64023,
"left{": 64024,
"leftrightarrow": 64025,
"leq": 64026,
"lg": 64027,
"neq": 64028,
"notin": 64029,
"omega": 64030,
"overline": 64031,
"overrightarrow": 64032,
"prime": 64033,
"psi": 64034,
"rho": 64035,
"right.": 64036,
"right}": 64037,
"right]": 64038,
"rightarrow": 64039,
"sigma": 64040,
"subset": 64041,
"subseteq": 64042,
"supset": 64043,
"supseteq": 64044,
"tan": 64045,
"textcircled": 64046,
"text{": 64047,
"therefore": 64048,
"theta": 64049,
"varepsilon": 64050,
"varphi": 64051,
"widehat": 64052,
"xrightarrow": 64053,
"…": 64054,
"℃": 64055,
"①": 64056,
"②": 64057,
"③": 64058,
"④": 64059,
"⑤": 64060,
"⑥": 64061,
"⑦": 64062,
"⑧": 64063,
"⑨": 64064,
"⑩": 64065,
"%": 64066,
"(": 64067,
")": 64068,
"+": 64069,
"-": 64070,
".": 64071,
";": 64072,
"<": 64073,
"=": 64074,
">": 64075