aiben / tests /test_eval_models.py
abugaber's picture
Upload folder using huggingface_hub
3943768 verified
raw
history blame
4.32 kB
import os
import pytest
from tests.utils import wrap_test_forked
@pytest.mark.skipif(
    not os.getenv('BENCHMARK'),
    reason="Only valid on sufficiently large system and not normal part of testing."
           " Instead used to get eval scores for all models.",
)
@pytest.mark.parametrize(
    "base_model",
    [
        "h2oai/h2ogpt-oasst1-falcon-40b",
        "h2oai/h2ogpt-oig-oasst1-512-6_9b",
        "h2oai/h2ogpt-oig-oasst1-512-12b",
        "h2oai/h2ogpt-oig-oasst1-512-20b",
        "h2oai/h2ogpt-oasst1-512-12b",
        "h2oai/h2ogpt-oasst1-512-20b",
        "h2oai/h2ogpt-gm-oasst1-en-1024-20b",
        "databricks/dolly-v2-12b",
        "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2",
        "ehartford/WizardLM-7B-Uncensored",
        "ehartford/WizardLM-13B-Uncensored",
        "AlekseyKorshuk/vicuna-7b",
        "TheBloke/stable-vicuna-13B-HF",
        "decapoda-research/llama-7b-hf",
        "decapoda-research/llama-13b-hf",
        "decapoda-research/llama-30b-hf",
        "junelee/wizard-vicuna-13b",
        "openaccess-ai-collective/wizard-mega-13b",
    ],
)
@wrap_test_forked
def test_score_eval(base_model):
    """Run ``src.gen.main`` in eval mode for one parametrized model.

    Skipped unless the ``BENCHMARK`` environment variable is set to a
    non-empty value; per the skip reason this is a scoring run for large
    machines rather than part of the regular test suite.

    Args:
        base_model: Model identifier forwarded to ``main`` as ``base_model``.
    """
    # Imported lazily so that collecting this module does not pull in src.gen.
    from src.gen import main

    # Fixed settings shared by every model: eval on, gradio/chat/streaming off.
    eval_settings = {
        "chat": False,
        "stream_output": False,
        "eval": True,
        "gradio": False,
        "eval_prompts_only_num": 500,
        "eval_as_output": False,
        "num_beams": 2,
        "use_gpu_id": False,
    }
    main(base_model=base_model, **eval_settings)
@pytest.mark.skipif(not os.getenv('FALCONS'), reason="download purpose")
@pytest.mark.parametrize(
    "base_model",
    [
        "OpenAssistant/falcon-7b-sft-top1-696",
        "OpenAssistant/falcon-7b-sft-mix-2000",
        "h2oai/h2ogpt-oasst1-falcon-40b",
        "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1",
        "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2",
        "h2oai/h2ogpt-gm-oasst1-multilang-2048-falcon-7b",
        "OpenAssistant/falcon-40b-sft-top1-560",
        "OpenAssistant/falcon-40b-sft-mix-1226",
    ],
)
@wrap_test_forked
def test_get_falcons(base_model):
    """Load the tokenizer and the causal-LM weights for one Falcon model.

    Skipped unless the ``FALCONS`` environment variable is set to a
    non-empty value. The skip reason ("download purpose") indicates the
    test exists to fetch the models, so it only asserts that both loaders
    return an object.

    Args:
        base_model: Model identifier passed to both ``from_pretrained`` calls.
    """
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM

    # Tokenizer first: slow tokenizer, left padding, remote code allowed.
    tokenizer_options = dict(
        use_fast=False,
        padding_side="left",
        trust_remote_code=True,
        token=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(base_model, **tokenizer_options)
    assert tokenizer is not None

    # Then the model weights, loaded as float16.
    model_options = dict(
        trust_remote_code=True,
        torch_dtype=torch.float16,
        token=True,
    )
    model = AutoModelForCausalLM.from_pretrained(base_model, **model_options)
    assert model is not None
@pytest.mark.skipif(not os.getenv('LLAMA'), reason="LLaMa conversion")
@wrap_test_forked
def test_get_landmark_llama():
    """Fetch the landmark-attention LLaMa weight diff and run the conversion recipe.

    Skipped unless the ``LLAMA`` environment variable is set to a non-empty
    value. The test first loads the ``epfml/landmark-attention-llama7b-wdiff``
    model and tokenizer, then runs a shell recipe that converts raw LLaMa
    weights to HF format and applies the weight diff.

    The recipe is executed with ``bash`` because it uses the ``source``
    builtin, which is not available in every ``/bin/sh`` (e.g. dash). The
    script does not use ``set -e``, so the asserted exit status is that of
    its last command (the ``weight_diff.py recover`` step).

    Raises:
        AssertionError: If either loader returns ``None`` or the recipe's
            exit status is non-zero.
    """
    # Local import, matching the function-scope import style used in this file.
    import subprocess

    from transformers import LlamaForCausalLM, LlamaTokenizer

    m = LlamaForCausalLM.from_pretrained("epfml/landmark-attention-llama7b-wdiff")
    t = LlamaTokenizer.from_pretrained("epfml/landmark-attention-llama7b-wdiff")
    assert m is not None and t is not None

    # Kept as one script so that `cd`, `source` and `conda activate` persist
    # across the following commands.
    conversion_script = """
#
# step 1, convert llama to HF format
pip install protobuf==3.19.0
source ~/.bashrc.mamba
mamba create -n trans
conda activate trans
conda install python=3.10 -y
git clone https://github.com/epfml/landmark-attention.git
pip install fire datasets
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install .
pip install torch accelerate sentencepiece protobuf==3.19.0
# below requires LLaMa weights
python src/transformers/models/llama/convert_llama_weights_to_hf.py --input_dir /data/jon/LLaMA --model_size 7B --output_dir llama_7B
#
# step 2, make landmark model (change hash if updated)
mkdir -p epfml/landmark-attention-llama7b-wdiff
cd epfml/landmark-attention-llama7b-wdiff
ln -s ~/.cache/huggingface/hub/models--epfml--landmark-attention-llama7b-wdiff/snapshots/050562871ac72723b4ab674f0392b02cd9609842/* .
cd ../../
python ../landmark-attention/llama/weight_diff.py recover --path_raw llama_7B --path_diff epfml/landmark-attention-llama7b-wdiff --path_tuned landmark_llama_7b
"""
    # Previously the os.system() status was discarded, so a failed conversion
    # still produced a passing test.
    completed = subprocess.run(["bash", "-c", conversion_script])
    assert completed.returncode == 0, (
        "landmark LLaMa conversion script exited with status %s" % completed.returncode
    )