|
|
|
- Original model is [yanolja/EEVE-Korean-Instruct-10.8B-v1.0](https://huggingface.co/yanolja/EEVE-Korean-Instruct-10.8B-v1.0)

- Quantized using [llama.cpp](https://github.com/ggerganov/llama.cpp); see the sketch below to list the available GGUF files.
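A repo like this may ship several quantization levels. To check which GGUF files are actually available before downloading, you can list the repo contents. A minimal sketch using `huggingface_hub` (the repo id is the same one used throughout this card):

```
from huggingface_hub import list_repo_files

# List every file in the repo and keep only the GGUF weights,
# so you can pick the quantization level that fits your hardware.
files = list_repo_files("heegyu/EEVE-Korean-Instruct-10.8B-v1.0-GGUF")
print([f for f in files if f.endswith(".gguf")])
```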
|
|
|
|
|
### Usage |
|
Requirements:
|
``` |
|
# GPU build (cuBLAS)
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose

# CPU-only build
pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose

pip install huggingface_hub
|
``` |
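Before loading the model, you can sanity-check that the bindings installed correctly (a minimal sketch; it just prints whatever build pip installed):

```
import llama_cpp

# Confirm the bindings import cleanly and report the installed version.
print(llama_cpp.__version__)
```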
|
|
|
``` |
|
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

import time
from pprint import pprint


# Download the model from the Hugging Face Hub
model_name_or_path = "heegyu/EEVE-Korean-Instruct-10.8B-v1.0-GGUF"  # repo id

# 4-bit quantization (Q4_K_M)
model_basename = "ggml-model-Q4_K_M.gguf"  # file name

model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)
print(model_path)


# CPU
# lcpp_llm = Llama(
#     model_path=model_path,
#     n_threads=2,
# )

# To run on a GPU, use the code below
lcpp_llm = Llama(
    model_path=model_path,
    n_threads=2,      # CPU cores
    n_batch=512,      # Should be between 1 and n_ctx; consider the amount of VRAM on your GPU.
    n_gpu_layers=43,  # Change this value based on your model and your GPU VRAM pool.
    n_ctx=4096,       # Context window
)


prompt_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\nHuman: {prompt}\nAssistant:\n"

# "What is the capital of Korea? Choose from the options below.
#  (A) Gyeongseong (B) Busan (C) Pyongyang (D) Seoul (E) Jeju"
text = '한국의 수도는 어디인가요? 아래 선택지 중 골라주세요.\n\n(A) 경성\n(B) 부산\n(C) 평양\n(D) 서울\n(E) 제주'

prompt = prompt_template.format(prompt=text)

start = time.time()
response = lcpp_llm(
    prompt=prompt,
    max_tokens=256,
    temperature=0.5,
    top_p=0.95,
    top_k=50,
    stop=['</s>'],  # Stop generation when this token is produced.
    echo=True,      # Include the prompt in the returned text.
)
pprint(response)
print(time.time() - start)
|
``` |
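The call above blocks until the whole completion is finished. llama-cpp-python can also stream tokens as they are generated by passing `stream=True`, which makes the call return an iterator of partial chunks. A minimal sketch, reusing the `lcpp_llm` and `prompt` objects from the code above:

```
# Stream the completion token by token instead of waiting for the full text.
for chunk in lcpp_llm(
    prompt=prompt,
    max_tokens=256,
    temperature=0.5,
    top_p=0.95,
    top_k=50,
    stop=['</s>'],
    stream=True,  # yield partial results as they are generated
):
    print(chunk['choices'][0]['text'], end='', flush=True)
print()
```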
|
|
|
Output (Colab T4 GPU)
|
``` |
|
llama_print_timings: load time = 942.53 ms
llama_print_timings: sample time = 27.60 ms / 37 runs ( 0.75 ms per token, 1340.43 tokens per second)
llama_print_timings: prompt eval time = 942.29 ms / 83 tokens ( 11.35 ms per token, 88.08 tokens per second)
llama_print_timings: eval time = 4530.31 ms / 36 runs ( 125.84 ms per token, 7.95 tokens per second)
llama_print_timings: total time = 5648.42 ms / 119 tokens
{'choices': [{'finish_reason': 'stop',
              'index': 0,
              'logprobs': None,
              'text': 'A chat between a curious user and an artificial '
                      'intelligence assistant. The assistant gives helpful, '
                      "detailed, and polite answers to the user's questions.\n"
                      'Human: 한국의 수도는 어디인가요? 아래 선택지 중 골라주세요.\n'
                      '\n'
                      '(A) 경성\n'
                      '(B) 부산\n'
                      '(C) 평양\n'
                      '(D) 서울\n'
                      '(E) 제주\n'
                      'Assistant:\n'
                      '한국은 동아시아에 위치한 국가로 공식적으로 대한민국이라고 불립니다. '
                      '서울은 대한민국의 수도입니다. 따라서 정답은 (D) 서울입니다.'}],
 'created': 1710404368,
 'id': 'cmpl-af889267-f64e-4516-b0a3-5c8b918d0e36',
 'model': '/root/.cache/huggingface/hub/models--heegyu--EEVE-Korean-Instruct-10.8B-v1.0-GGUF/snapshots/ff014aa6d73ffa8a2857085261cb7a4e6c630bfe/ggml-model-Q4_K_M.gguf',
 'object': 'text_completion',
 'usage': {'completion_tokens': 36, 'prompt_tokens': 83, 'total_tokens': 119}}
5.662428140640259
|
``` |