Spaces:
Running
Running
File size: 4,085 Bytes
98985f3 2a7a3b6 98985f3 ce96772 24c122a bbc0512 ce96772 f3369dd 24c122a d35776c ce96772 d35776c ce96772 d35776c a17b6c0 d35776c e3dfd55 6475fdc c89cc71 e3dfd55 6475fdc ce96772 a17b6c0 d35776c 238547c d35776c 98985f3 2789d18 ce96772 6475fdc ce96772 6475fdc 24c122a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
from transformers import AutoTokenizer
import gradio as gr
import os
# Startup diagnostic: emit a banner line, then the CPU count reported by the
# OS on the following line.
print("Check CPU count...", os.cpu_count(), sep="\n")
def formatarr(input) -> str:
    """Render an iterable as a compact bracketed list such as ``[1,2,3]``.

    Every element is converted with ``str`` and the pieces are joined by a
    bare comma (no space after it). An empty iterable yields ``"[]"``.

    Args:
        input: Iterable of values to render. The name shadows the ``input``
            builtin; it is kept unchanged so keyword callers keep working.

    Returns:
        The bracketed, comma-separated string.
    """
    return f"[{','.join(map(str, input))}]"
def tokenize(input_text):
    """Tokenize ``input_text`` with every loaded tokenizer and report the results.

    Each tokenizer is called with ``add_special_tokens=True`` and its
    ``"input_ids"`` entry is read. The tokenizers are looked up as module
    globals (``llama_tokenizer``, ``llama3_tokenizer``, ...), which this file
    assigns inside its ``if __name__ == "__main__":`` section, so they must
    exist before this function is called.

    Args:
        input_text: The text to tokenize.

    Returns:
        A single string with one entry per model, in the order listed below.
        Every entry starts with a newline and has the form
        ``"<model> gets <count> tokens: [id,id,...]"``.
    """
    # One table instead of a hand-written call per model: the display label
    # and the tokenizer it belongs to stay next to each other, and the order
    # of this table is the order of the report.
    tokenizers = (
        ("LLaMa-1/LLaMa-2", llama_tokenizer),
        ("LLaMa-3", llama3_tokenizer),
        ("Mistral", mistral_tokenizer),
        ("GPT-2/GPT-J", gpt2_tokenizer),
        ("GPT-NeoX", gpt_neox_tokenizer),
        ("Falcon", falcon_tokenizer),
        ("Phi-1/Phi-2", phi2_tokenizer),
        ("Phi-3", phi3_tokenizer),
        ("T5", t5_tokenizer),
        ("Gemma", gemma_tokenizer),
        ("Qwen/Qwen1.5", qwen_tokenizer),
        ("CodeQwen", codeqwen_tokenizer),
        ("RWKV-v4", rwkv4_tokenizer),
        ("RWKV-v5/RWKV-v6", rwkv5_tokenizer),
        ("DeepSeek", deepseek_tokenizer),
        ("InternLM", internlm_tokenizer),
        ("InternLM2", internlm2_tokenizer),
    )

    report = []
    for model, tokenizer in tokenizers:
        tokens = tokenizer(input_text, add_special_tokens=True)["input_ids"]
        # The leading newline is part of the established output format.
        report.append(f"\n{model} gets {len(tokens)} tokens: {formatarr(tokens)}")
    return "".join(report)
if __name__ == "__main__":
    # The tokenizers are bound as module globals under these exact names
    # because tokenize() looks them up by name at call time. They are only
    # created when the file is run as a script, not when it is imported.
    llama_tokenizer = AutoTokenizer.from_pretrained("TheBloke/Llama-2-7B-fp16")
    llama3_tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b")
    mistral_tokenizer = AutoTokenizer.from_pretrained("mistral-community/Mistral-7B-v0.2")
    gpt2_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    gpt_neox_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
    falcon_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
    phi2_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
    phi3_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
    t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
    gemma_tokenizer = AutoTokenizer.from_pretrained("alpindale/gemma-2b")
    qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B")
    codeqwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/CodeQwen1.5-7B")

    # NOTE(review): trust_remote_code=True lets transformers run Python code
    # shipped in these model repositories. Confirm each repo is trusted (and
    # consider pinning a revision) before deploying.
    rwkv4_tokenizer = AutoTokenizer.from_pretrained("RWKV/rwkv-4-14b-pile", trust_remote_code=True)
    rwkv5_tokenizer = AutoTokenizer.from_pretrained("RWKV/v5-EagleX-v2-7B-HF", trust_remote_code=True)
    deepseek_tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V2", trust_remote_code=True)
    internlm_tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-20b", trust_remote_code=True)
    internlm2_tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-20b", trust_remote_code=True)

    # Build the UI only after every tokenizer is loaded, so the first request
    # cannot hit a missing global. One multi-line text box in, plain text out.
    iface = gr.Interface(
        fn=tokenize,
        inputs=gr.Textbox(label="Input Text", lines=19),
        outputs="text",
    )
    iface.launch()
|