aiben / src /eval.py
abugaber's picture
Upload folder using huggingface_hub
3943768 verified
import os
import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from evaluate_params import eval_func_param_names, eval_extra_columns, input_args_list
from gen import evaluate, check_locals, score_qa
from prompter import Prompter
from utils import clear_torch_cache, NullContext, get_kwargs, makedirs
def run_eval(  # for local function:
        base_model=None, lora_weights=None, inference_server=None,
        regenerate_clients=None, regenerate_gradio_clients=None, validate_clients=None, fail_if_invalid_client=None,
        prompt_type=None, prompt_dict=None, chat_template=None, system_prompt=None,
        debug=None, chat=False,
        stream_output=None, enable_caching=None, async_output=None, num_async=None, stream_map=None,
        eval_filename=None, eval_prompts_only_num=None, eval_prompts_only_seed=None, eval_as_output=None,
        examples=None, memory_restriction_level=None,
        # evaluate kwargs
        n_jobs=None, llamacpp_path=None, llamacpp_dict=None, exllama_dict=None, gptq_dict=None, attention_sinks=None,
        sink_dict=None, truncation_generation=None,
        hf_model_dict=None,
        force_seq2seq_type=None, force_t5_type=None,
        load_exllama=None,
        force_streaming_on_to_handle_timeouts=None,

        use_pymupdf=None,
        use_unstructured_pdf=None,
        use_pypdf=None,
        enable_pdf_ocr=None,
        enable_pdf_doctr=None,
        enable_image=None,
        visible_image_models=None,
        image_size=None,
        image_quality=None,
        image_guidance_scale=None,
        image_num_inference_steps=None,
        try_pdf_as_html=None,
        # for evaluate args beyond what's already above, or things that are always dynamic and locally created
        load_awq='',
        temperature=None,
        top_p=None,
        top_k=None,
        penalty_alpha=None,
        num_beams=None,
        max_new_tokens=None,
        min_new_tokens=None,
        early_stopping=None,
        max_time=None,
        repetition_penalty=None,
        num_return_sequences=None,
        do_sample=None,
        seed=None,
        langchain_mode=None,
        langchain_action=None,
        # NOTE(review): mutable default; it is not mutated in this function and is kept
        # as-is so the value seen by callers and by the locals() forwarding is unchanged.
        langchain_agents=[],
        top_k_docs=None,
        chunk=None,
        chunk_size=None,
        document_subset=None,
        document_choice=None,
        document_source_substrings=None,
        document_source_substrings_op=None,
        document_content_substrings=None,
        document_content_substrings_op=None,
        pre_prompt_query=None, prompt_query=None,
        pre_prompt_summary=None, prompt_summary=None, hyde_llm_prompt=None,
        all_docs_start_prompt=None,
        all_docs_finish_prompt=None,
        user_prompt_for_fake_system_prompt=None,
        json_object_prompt=None,
        json_object_prompt_simpler=None,
        json_code_prompt=None,
        json_code_prompt_if_no_schema=None,
        json_schema_instruction=None,
        json_preserve_system_prompt=None,
        json_object_post_prompt_reminder=None,
        json_code_post_prompt_reminder=None,
        json_code2_post_prompt_reminder=None,
        image_audio_loaders=None,
        pdf_loaders=None,
        url_loaders=None,
        jq_schema=None,
        extract_frames=None,
        extract_frames0=None,
        guided_whitespace_pattern0=None,
        metadata_in_context0=None,
        llava_prompt=None,
        visible_models=None,
        h2ogpt_key=None,
        add_search_to_context=None,
        chat_conversation=None,
        text_context_list=None,
        docs_ordering_type=None,
        min_max_new_tokens=None,
        max_input_tokens=None,
        max_total_input_tokens=None,
        docs_token_handling=None,
        docs_joiner=None,
        hyde_level=None,
        hyde_template=None,
        hyde_show_only_final=None,
        hyde_show_intermediate_in_accordion=None,
        map_reduce_show_intermediate_in_accordion=None,
        doc_json_mode=None,
        metadata_in_context=None,
        chatbot_role=None,
        speaker=None,
        tts_language=None,
        tts_speed=None,
        image_file=None,
        image_control=None,
        images_num_max=None,
        image_resolution=None,
        image_format=None,
        rotate_align_resize_image=None,
        video_frame_period=None,
        image_batch_image_prompt=None,
        image_batch_final_prompt=None,
        image_batch_stream=None,
        visible_vision_models=None,
        video_file=None,

        response_format=None,
        guided_json=None,
        guided_regex=None,
        guided_choice=None,
        guided_grammar=None,
        guided_whitespace_pattern=None,
        client_metadata=None,

        # for evaluate kwargs:
        captions_model=None,
        caption_loader=None,
        doctr_loader=None,
        pix2struct_loader=None,
        llava_model=None,
        image_model_dict=None,

        asr_model=None,
        asr_loader=None,

        image_audio_loaders_options0=None,
        pdf_loaders_options0=None,
        url_loaders_options0=None,
        jq_schema0=None,
        keep_sources_in_context=None,
        gradio_errors_to_chatbot=None,
        allow_chat_system_prompt=None,
        src_lang=None, tgt_lang=None, concurrency_count=None, save_dir=None, sanitize_bot_response=None,
        model_state0=None,
        use_auth_token=None,
        trust_remote_code=None,
        score_model_state0=None,
        max_max_new_tokens=None,
        is_public=None,
        max_max_time=None,
        raise_generate_gpu_exceptions=None, load_db_if_exists=None, use_llm_if_no_docs=None,
        my_db_state0=None, selection_docs_state0=None, dbs=None, langchain_modes=None, langchain_mode_paths=None,
        detect_user_path_changes_every_query=None,
        use_openai_embedding=None, use_openai_model=None,
        hf_embedding_model=None, migrate_embedding_model=None,
        cut_distance=None,
        answer_with_sources=None,
        append_sources_to_answer=None,
        append_sources_to_chat=None,
        sources_show_text_in_accordion=None,
        top_k_docs_max_show=None,
        show_link_in_sources=None,
        langchain_instruct_mode=None,
        add_chat_history_to_context=None,
        context=None, iinput=None,
        db_type=None, first_para=None, text_limit=None, verbose=None,
        gradio=None, cli=None,
        use_cache=None,
        auto_reduce_chunks=None, max_chunks=None, headsize=None,
        model_lock=None, force_langchain_evaluate=None,
        model_state_none=None,
):
    """Run a batch of example prompts, optionally score each response, and dump the scores.

    Each example is passed either to ``evaluate`` (bound via ``functools.partial``) or, when
    ``eval_as_output`` is truthy, to a local generator that replays the reference output loaded
    alongside the prompt.  When ``score_model_state0['model']`` is truthy, each response is scored
    with ``score_qa``; the accumulated rows are rewritten to a parquet file after every scored
    response, and a histogram PNG is saved next to it whenever the latest score is not a string.

    Most parameters are not read directly here: they are forwarded by *name* through
    ``locals()`` to ``check_locals``, to ``get_kwargs(evaluate, ...)`` and to the per-example
    override of ``eval_func_param_names`` entries.  Local variable names in this function are
    therefore part of its behavior.

    Parameters read directly:
        base_model, lora_weights, inference_server: last ``/``-separated component of each is
            used in the output filename; ``.split`` is called on them unless ``eval_as_output``.
        eval_prompts_only_num: compared with ``> 0`` (so it must be a number; the ``None``
            default raises ``TypeError``).  If positive, that many prompts are sampled with
            ``np.random.randint`` (i.e. with replacement) from the JSON data, and ``examples[-1]``
            is used as the template row.
        eval_prompts_only_seed: passed to ``np.random.seed`` before sampling.
        eval_filename: JSON file of prompts.  If ``None``, a ShareGPT file is used and fetched
            with ``wget`` when missing; otherwise rows are dicts with ``instruction`` and an
            optional ``output``.
        eval_as_output: if truthy, score the reference outputs instead of calling ``evaluate``;
            requires ``eval_prompts_only_num > 0``.
        examples: list of example rows indexed according to ``eval_func_param_names``.
        chat, stream_output: both must be falsy (asserted).
        score_model_state0: mapping with ``model``, ``tokenizer`` and ``device`` keys.
        context, iinput: written into each sampled example row.

    Returns:
        Path of the parquet score file (returned even if no row was ever written).

    Raises:
        AssertionError: on the checks noted above.
    """
    from_ui = False
    # makes no sense to evaluate document content for langchain case
    answer_with_sources = False
    show_link_in_sources = False
    append_sources_to_answer = False
    append_sources_to_chat = False

    # NOTE: locals() snapshots here and below are forwarded by name; do not rename, add or
    # remove local variables in this function without checking those consumers.
    check_locals(**locals().copy())

    if not context:
        context = ''

    if eval_prompts_only_num > 0:
        np.random.seed(eval_prompts_only_seed)
        example1 = examples[-1]  # pick reference example
        examples = []
        responses = []
        if eval_filename is None:
            # override default examples with shareGPT ones for human-level eval purposes only
            eval_filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
            if not os.path.isfile(eval_filename):
                # file name is the constant above, so the shell string is not user-controlled
                os.system(
                    'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % eval_filename)
            import json
            with open(eval_filename, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # focus on data that starts with human, else likely chopped from other data
            turn_start = 0  # odd in general
            data = [x for x in data if len(x['conversations']) > turn_start + 1 and
                    x['conversations'][turn_start]['from'] == 'human' and
                    x['conversations'][turn_start + 1]['from'] == 'gpt']
            for i in sorted(np.random.randint(0, len(data), size=eval_prompts_only_num)):
                assert data[i]['conversations'][turn_start]['from'] == 'human'
                instruction = data[i]['conversations'][turn_start]['value']
                assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
                output = data[i]['conversations'][turn_start + 1]['value']
                examplenew = example1.copy()
                assert not chat, "No gradio must use chat=False, uses nochat instruct"
                examplenew[eval_func_param_names.index('instruction_nochat')] = instruction
                examplenew[eval_func_param_names.index('iinput_nochat')] = iinput
                examplenew[eval_func_param_names.index('context')] = context
                examples.append(examplenew)
                responses.append(output)
        else:
            # get data, assume in correct format: json of rows of dict of instruction and output
            # only instruction is required
            import json
            with open(eval_filename, 'r', encoding='utf-8') as f:
                data = json.load(f)
            for i in sorted(np.random.randint(0, len(data), size=eval_prompts_only_num)):
                examplenew = example1.copy()
                instruction = data[i]['instruction']
                output = data[i].get('output', '')  # not required
                assert not chat, "No gradio must use chat=False, uses nochat instruct"
                examplenew[eval_func_param_names.index('instruction_nochat')] = instruction
                examplenew[eval_func_param_names.index('iinput_nochat')] = iinput
                examplenew[eval_func_param_names.index('context')] = context
                examples.append(examplenew)
                responses.append(output)

    num_examples = len(examples)
    scoring_path = 'scoring'
    # if no permissions, assume may not want files, put into temp
    scoring_path = makedirs(scoring_path, tmp_ok=True, use_base=True)
    if eval_as_output:
        used_base_model = 'gpt35'
        used_lora_weights = ''
        used_inference_server = ''
    else:
        used_base_model = str(base_model.split('/')[-1])
        used_lora_weights = str(lora_weights.split('/')[-1])
        used_inference_server = str(inference_server.split('/')[-1])
    eval_out_filename = "df_scores_%s_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_prompts_only_num,
                                                                    eval_prompts_only_seed,
                                                                    eval_as_output,
                                                                    used_base_model,
                                                                    used_lora_weights,
                                                                    used_inference_server,
                                                                    )
    eval_out_filename = os.path.join(scoring_path, eval_out_filename)

    smodel = score_model_state0['model']
    stokenizer = score_model_state0['tokenizer']
    sdevice = score_model_state0['device']

    # torch.device("cuda") leads to cuda:x cuda:y mismatches for multi-GPU consistently
    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
    device = 'cpu' if n_gpus == 0 else 'cuda'
    # only a single-GPU setup uses torch.device as the context manager
    context_class = NullContext if n_gpus > 1 or n_gpus == 0 else torch.device

    with context_class(device):
        # ensure was set right above before examples generated
        assert not stream_output, "stream_output=True does not make sense with example loop"
        import time
        from functools import partial

        if not eval_as_output:
            requests_state0 = {}
            roles_state0 = None
            # positional state arguments of evaluate, one per entry of input_args_list
            args = (None, my_db_state0, selection_docs_state0, requests_state0, roles_state0)
            assert len(args) == len(input_args_list)
            # remaining evaluate kwargs are picked by name out of this function's locals
            fun = partial(evaluate,
                          *args,
                          **get_kwargs(evaluate, exclude_names=input_args_list + eval_func_param_names,
                                       **locals().copy()))
        else:
            assert eval_prompts_only_num > 0

            def get_response(*args, exi=0):
                # assumes same ordering of examples and responses
                # Yield the dict shape the loop below reads (res_fun['response'] and
                # res_fun.get('sources', ...)); a bare string would raise TypeError there.
                yield dict(response=responses[exi], sources='')

            fun = get_response
        t0 = time.time()
        score_dump = []
        score_avg = 0
        score_median = 0

        for exi, ex in enumerate(examples):
            clear_torch_cache(allow_skip=True)

            instruction = ex[eval_func_param_names.index('instruction_nochat')]
            iinput = ex[eval_func_param_names.index('iinput_nochat')]
            context = ex[eval_func_param_names.index('context')]
            clear_torch_cache(allow_skip=True)
            print("")
            print("START" + "=" * 100)
            print("Question: %s %s" % (instruction, ('input=%s' % iinput if iinput else '')))
            print("-" * 105)
            # fun yields as generator, so have to iterate over it
            # Also means likely do NOT want --stream_output=True, else would show all generations
            t1 = time.time()

            # grab other parameters, like langchain_mode
            # (any local whose name is in eval_func_param_names overrides the example's value)
            eval_vars = ex.copy()
            for k in eval_func_param_names:
                if k in locals().copy():
                    eval_vars[eval_func_param_names.index(k)] = locals().copy()[k]

            gener = fun(*tuple(eval_vars), exi=exi) if eval_as_output else fun(*tuple(eval_vars))
            for res_fun in gener:
                res = res_fun['response']
                sources = res_fun.get('sources', 'Failure of Generation')
                print(res)
                if smodel:
                    score_with_prompt = False
                    if score_with_prompt:
                        data_point = dict(instruction=instruction, input=iinput, context=context)
                        prompter = Prompter(prompt_type, prompt_dict,
                                            debug=debug, stream_output=stream_output, base_model=base_model)
                        prompt = prompter.generate_prompt(data_point, context_from_history=False, image_file=image_file)
                    else:
                        # just raw input and output
                        if eval_prompts_only_num > 0:
                            # only our own examples have this filled at moment
                            assert iinput in [None, ''], iinput  # should be no iinput
                        prompt = instruction
                    score = score_qa(smodel, stokenizer, prompt, res, memory_restriction_level=memory_restriction_level)
                    score_dump.append(ex + [prompt, res, score, sources])
                    # dump every score in case abort
                    df_scores = pd.DataFrame(score_dump,
                                             columns=eval_func_param_names +
                                                     eval_extra_columns)
                    df_scores.to_parquet(eval_out_filename, index=False)
                    if not isinstance(score, str):
                        # plot histogram so far
                        plt.figure(figsize=(10, 10))
                        try:
                            plt.hist(df_scores['score'], bins=20)
                            score_avg = np.mean(df_scores['score'])
                            score_median = np.median(df_scores['score'])
                            print("SCORE %s: %s So far: AVG: %s MEDIAN: %s" % (exi, score, score_avg, score_median),
                                  flush=True)
                            plt.title("Score avg: %s median: %s" % (score_avg, score_median))
                            plt.savefig(eval_out_filename.replace('.parquet', '.png'))
                        finally:
                            # always release the figure so a failed plot does not leak figures across examples
                            plt.close()

            print("END" + "=" * 102)
            print("")
            t2 = time.time()
            print("Time taken for example: %s Time taken so far: %.4f about %.4g per example" % (
                t2 - t1, t2 - t0, (t2 - t0) / (1 + exi)))
        t1 = time.time()
        # clamp the divisor so an empty example list does not raise ZeroDivisionError
        print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / max(num_examples, 1)))
        print("Score avg: %s median: %s" % (score_avg, score_median), flush=True)
    return eval_out_filename