codellama-13b-instruct-gguf

Runtime error

ffreemt

Update Info

e390745 about 1 year ago

12.1 kB

	"""Run codes."""
	# pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
	# ruff: noqa: E501
	import gc
	import os
	import platform
	import random
	import time
	from dataclasses import asdict, dataclass
	from pathlib import Path
	from typing import Optional, Sequence

	# from types import SimpleNamespace
	import gradio as gr
	import psutil
	from about_time import about_time
	from ctransformers import AutoModelForCausalLM
	from dl_hf_model import dl_hf_model
	from loguru import logger

	from examples_list import examples_list

	# Previously assigned here and immediately overwritten (a dead store); kept for reference only:
	# "https://huggingface.co/TheBloke/Code Llama-13B-Python-GGML/blob/main/code llama-13b-python.ggmlv3.Q4_K_M.bin"  # 7.87G
	# Model file used by this app.
	url = "https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GGUF/blob/main/codellama-13b-instruct.Q4_K_M.gguf"  # 8.06

	LLM = None
	gc.collect()

	# Fetch the model via dl_hf_model, which yields (model_loc, file_size).
	# Any failure is logged and terminates the process with exit status 1,
	# because nothing below can work without the model file.
	try:
	    logger.debug(f" dl {url}")
	    model_loc, file_size = dl_hf_model(url)
	    logger.info(f"done load llm {model_loc=} {file_size=}G")
	except Exception as exc_:
	    logger.error(exc_)
	    raise SystemExit(1) from exc_

	# raise SystemExit(0)

	# Prompt template: Guanaco
	# {past_history}
	# NOTE: `prompt_template` is reassigned twice more below; within this section
	# only the last assignment (the [INST] template) remains in effect.
	prompt_template = """You are a helpful assistant. Let's think step by step.
	### Human:
	{question}
	### Assistant:"""

	# Prompt template: garage-bAInd/Stable-Platypus2-13B
	# (also overwritten by the final assignment below)
	prompt_template = """
	### System:
	This is a system prompt, please behave and help the user.

	### Instruction:

	{question}

	### Response:
	"""
	# Variant of the [INST] template that uses a {prompt} placeholder; it is bound
	# to the throwaway name `_` and not to `prompt_template`.
	_ = """[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:
	{prompt}
	[/INST]"""
	# Final value of `prompt_template` in this section: an [INST] ... [/INST]
	# wrapper with a {question} placeholder filled in via str.format().
	prompt_template = """
	[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:
	{question}
	[/INST]"""

	# human_prefix = "### Instruction"
	# ai_prefix = "### Response"
	# stop_list = [f"{human_prefix}:"]

	# psutil.cpu_count(logical=False) may return None when the number of physical
	# cores cannot be determined; subtracting 1 from None would raise TypeError,
	# so fall back to a single thread in that case. Otherwise use one fewer than
	# the physical core count, but never less than one thread.
	physical_cores = psutil.cpu_count(logical=False)
	cpu_count: int = max(int(physical_cores) - 1, 1) if physical_cores else 1
	logger.debug(f"{cpu_count=}")

	logger.debug(f"{model_loc=}")
	# Load the model found at `model_loc` through ctransformers'
	# AutoModelForCausalLM, declaring it a "llama"-type model and running it
	# with `cpu_count` threads. The result replaces the earlier `LLM = None`.
	LLM = AutoModelForCausalLM.from_pretrained(
	model_loc,
	model_type="llama",
	threads=cpu_count,
	)

	# Set the process time zone through the TZ environment variable and ask the
	# C library to re-read it.
	os.environ["TZ"] = "Asia/Shanghai"
	try:
	    time.tzset()  # type: ignore # pylint: disable=no-member
	except AttributeError:
	    # time.tzset() is only available on Unix; where the attribute is missing
	    # (e.g. Windows) the lookup raises AttributeError, which is the only
	    # failure this best-effort call is meant to tolerate.
	    logger.warning("Windows, cant run time.tzset()")


	# ctransformers.Config() default
	# Config(top_k=40, top_p=0.95, temperature=0.8,
	# repetition_penalty=1.1, last_n_tokens=64, seed=-1,
	# batch_size=8, threads=-1, max_new_tokens=256,
	# stop=None, stream=False, reset=True,
	# context_length=-1, gpu_layers=0)
	@dataclass
	class GenerationConfig:
	    """Keyword settings for one generation call.

	    Instances are expanded with ``asdict()`` and forwarded as keyword
	    arguments to the model call, so field names and order are significant.
	    """

	    # --- sampling ---
	    temperature: float = 0.7
	    top_k: int = 50
	    top_p: float = 0.9
	    repetition_penalty: float = 1.0
	    # --- generation ---
	    max_new_tokens: int = 512
	    seed: int = 42
	    reset: bool = False
	    stream: bool = True
	    # --- evaluation ---
	    threads: int = cpu_count  # module-level thread count computed at import time
	    # stop: list[str] = field(default_factory=lambda: stop_list)


	# ctransformers\llm.py
	@dataclass
	class Config:
	    """Generation settings with this app's own defaults.

	    The inline comments below record the values that were replaced
	    (``max_new_tokens`` 256, ``stream`` False, ``reset`` True).
	    """

	    # --- sample ---
	    top_k: int = 40
	    top_p: float = 0.95
	    temperature: float = 0.8
	    repetition_penalty: float = 1.1
	    last_n_tokens: int = 64
	    seed: int = -1

	    # --- eval ---
	    batch_size: int = 8
	    threads: int = -1

	    # --- generate ---
	    max_new_tokens: int = 512  # 256
	    stop: Optional[Sequence[str]] = None
	    stream: bool = True  # False
	    reset: bool = False  # True

	    # --- model (left disabled) ---
	    # context_length: int = -1
	    # gpu_layers: int = 0


	def generate(
	    question: str,
	    llm=LLM,
	    # config: GenerationConfig = GenerationConfig(),
	    config: Optional[Config] = None,
	):
	    """Run model inference, will return a Generator if streaming is true.

	    Args:
	        question: Text substituted for ``{question}`` in ``prompt_template``.
	        llm: Callable model; defaults to the module-level ``LLM`` as it was
	            bound when this function was defined.
	        config: A dataclass instance whose fields are passed to ``llm`` as
	            keyword arguments. When omitted, a fresh ``Config()`` is built
	            for each call, so no default instance is shared between calls.

	    Returns:
	        Whatever ``llm(prompt, **settings)`` returns.
	    """
	    if config is None:
	        config = Config()

	    prompt = prompt_template.format(question=question)

	    return llm(
	        prompt,
	        **asdict(config),
	        # **vars(config),
	    )


	# logger.debug(f"{asdict(GenerationConfig())=}")
	# Log a Config built with stream=True twice: once via its dataclass repr and
	# once as the attribute dict returned by vars().
	logger.debug(f"{Config(stream=True)=}")
	logger.debug(f"{vars(Config(stream=True))=}")


	def user(user_message, history):
	    """Append a new, unanswered user turn and keep the message text.

	    A ``None`` history is treated as an empty list; an existing list is
	    extended in place with ``[user_message, None]``.

	    Returns:
	        ``(user_message, history)``.
	    """
	    # return user_message, history + [[user_message, None]]
	    turns = [] if history is None else history
	    turns.append([user_message, None])
	    return user_message, turns  # keep user_message


	def user1(user_message, history):
	    """Append a new, unanswered user turn and clear the message text.

	    A ``None`` history is treated as an empty list; an existing list is
	    extended in place with ``[user_message, None]``.

	    Returns:
	        ``("", history)``; the empty string replaces the message text.
	    """
	    # return user_message, history + [[user_message, None]]
	    turns = [] if history is None else history
	    turns.append([user_message, None])
	    return "", turns  # clear user_message


	def bot_(history):
	    """Yield a canned reply one character at a time (no model involved).

	    The last turn's answer slot is first filled, character by character with
	    a 0.02 s pause each, with ``"<user message>: <reply>"``; the final yield
	    replaces it with the bare reply.

	    Yields:
	        ``history`` (the same list object) after every update.
	    """
	    canned_replies = ["How are you?", "I love you", "I'm very hungry"]
	    user_message = history[-1][0]
	    resp = random.choice(canned_replies)

	    history[-1][1] = ""
	    for character in user_message + ": " + resp:
	        history[-1][1] += character
	        time.sleep(0.02)
	        yield history

	    history[-1][1] = resp
	    yield history


	def bot(history):
	    """Stream the model's answer to the latest user turn into ``history``.

	    The question is read from ``history[-1][0]``. Every chunk produced by
	    ``generate`` is appended to the answer, and the answer slot
	    ``history[-1][1]`` is rewritten after each chunk, prefixed with the time
	    it took for the first chunk to arrive. After generation the prefix is
	    replaced by a trailing summary line with elapsed time and seconds per
	    character.

	    Args:
	        history: List of ``[user_message, answer]`` pairs, updated in place.

	    Yields:
	        ``history`` after every update. For an empty or ``None`` history
	        a single empty list is yielded and nothing is generated.
	    """
	    if not history:
	        # There is no turn to answer, and every history[-1] write below would
	        # raise IndexError/TypeError, so stop before calling the model.
	        logger.error("bot() called with empty history")
	        yield history or []
	        return

	    user_message = ""
	    try:
	        user_message = history[-1][0]
	    except Exception as exc:
	        # Best effort: a malformed last turn falls back to an empty question.
	        logger.error(exc)
	    response = []

	    logger.debug(f"{user_message=}")

	    with about_time() as atime:  # type: ignore
	        awaiting_first_chunk = True
	        prefix = ""
	        then = time.time()

	        logger.debug("about to generate")

	        config = GenerationConfig(reset=True)
	        for elm in generate(user_message, config=config):
	            if awaiting_first_chunk:
	                # Record how long the first chunk took to arrive.
	                logger.debug("in the loop")
	                prefix = f"({time.time() - then:.2f}s)\n"
	                awaiting_first_chunk = False
	                print(prefix, end="", flush=True)
	                logger.debug(f"{prefix=}")
	            print(elm, end="", flush=True)
	            # logger.debug(f"{elm}")

	            response.append(elm)
	            history[-1][1] = prefix + "".join(response)
	            yield history

	    answer = "".join(response)
	    if answer:
	        rate = f"{atime.duration/len(answer):.2f}s/char"  # type: ignore
	    else:
	        # No text was produced; dividing by len(answer) == 0 would raise
	        # ZeroDivisionError, so report the absence of output instead.
	        rate = "no output"
	    _ = f"(time elapsed: {atime.duration_human}, {rate})"  # type: ignore

	    history[-1][1] = answer + f"\n{_}"
	    yield history


	def predict_api(prompt):
	    """Answer ``prompt`` in one non-streaming generation call.

	    Builds a ``GenerationConfig`` with fixed settings (``stream=False``,
	    ``reset=True``) and passes it to ``generate``.

	    Returns:
	        The value returned by ``generate``; if anything raises, the error is
	        logged and a string describing the exception is returned instead.
	    """
	    logger.debug(f"{prompt=}")
	    try:
	        # user_prompt = prompt
	        settings = {
	            "temperature": 0.2,
	            "top_k": 10,
	            "top_p": 0.9,
	            "repetition_penalty": 1.0,
	            "max_new_tokens": 512,  # adjust as needed
	            "seed": 42,
	            "reset": True,  # reset history (cache)
	            "stream": False,
	            # threads=cpu_count,
	            # stop=prompt_prefix[1:2],
	        }
	        response = generate(prompt, config=GenerationConfig(**settings))
	        logger.debug(f"api: {response=}")
	    except Exception as exc:
	        logger.error(exc)
	        return f"{exc=}"
	    # bot = {"inputs": [response]}
	    # bot = [(prompt, response)]

	    return response


	# Custom CSS handed to gr.Blocks(css=...): gradient styling for
	# `.importantButton`, plus the `.disclaimer` and `.xsmall` text classes that
	# are referenced through `elem_classes` in the UI definition.
	css = """
	.importantButton {
	background: linear-gradient(45deg, #7e0570,#5d1c99, #6e00ff) !important;
	border: none !important;
	}
	.importantButton:hover {
	background: linear-gradient(45deg, #ff00e0,#8500ff, #6e00ff) !important;
	border: none !important;
	}
	.disclaimer {font-variant-caps: all-small-caps; font-size: xx-small;}
	.xsmall {font-size: x-small;}
	"""

	logger.info("start block")

	# Build the Gradio UI.
	# NOTE(review): the indentation of the nested `with` blocks below was lost in
	# this copy of the file, so the exact Row/Column/Accordion nesting cannot be
	# read off this text; restore it from the upstream file before running.
	with gr.Blocks(
	title=f"{Path(model_loc).name}",
	# theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
	theme=gr.themes.Glass(text_size="sm", spacing_size="sm"),
	css=css,
	) as block:
	# buff_var = gr.State("")
	with gr.Accordion("🎈 Info", open=False):
	# NOTE(review): the markup below opens <h5> but closes with </h4>;
	# the closing tag presumably should be </h5> — confirm and fix.
	gr.Markdown(
	f"""<h5><center>{Path(model_loc).name}</center></h4>
	initial response typically takes around 60 to 90 seconds, in addition to any queuing time""",
	elem_classes="xsmall",
	)

	# chatbot = gr.Chatbot().style(height=700) # 500
	chatbot = gr.Chatbot(height=500)

	# buff = gr.Textbox(show_label=False, visible=True)

	# Message input plus the Submit / Stop / Clear History buttons.
	with gr.Row():
	with gr.Column(scale=5):
	msg = gr.Textbox(
	label="Chat Message Box",
	placeholder="Ask me to write some python code, or js/c/rust... codes ... (press Shift+Enter or click Submit to send)",
	show_label=False,
	# container=False,
	lines=6,
	max_lines=30,
	show_copy_button=True,
	# ).style(container=False)
	)
	with gr.Column(scale=1, min_width=50):
	with gr.Row():
	submit = gr.Button("Submit", elem_classes="xsmall")
	stop = gr.Button("Stop", visible=True)
	clear = gr.Button("Clear History", visible=True)
	# Hidden row (visible=False) with system-prompt controls; no event handlers
	# are attached to `change` or `reset` in this section.
	with gr.Row(visible=False):
	with gr.Accordion("Advanced Options:", open=False):
	with gr.Row():
	with gr.Column(scale=2):
	system = gr.Textbox(
	label="System Prompt",
	value=prompt_template,
	show_label=False,
	container=False,
	# ).style(container=False)
	)
	with gr.Column():
	with gr.Row():
	change = gr.Button("Change System Prompt")
	reset = gr.Button("Reset System Prompt")

	# Clickable examples that fill the message box.
	with gr.Accordion("Example Inputs", open=True):
	examples = gr.Examples(
	examples=examples_list,
	inputs=[msg],
	examples_per_page=40,
	)

	# with gr.Row():
	with gr.Accordion("Disclaimer", open=False):
	_ = Path(model_loc).name
	gr.Markdown(
	f"Disclaimer: {_} can produce factually incorrect output, and should not be relied on to produce "
	f"factually accurate information. {_} was trained on various public datasets; while great efforts "
	"have been taken to clean the pretraining data, it is possible that this model could generate lewd, "
	"biased, or otherwise offensive outputs.",
	elem_classes=["disclaimer"],
	)

	# Textbox submit: `user` records the turn and keeps the text, then `bot`
	# streams the reply into the chatbot.
	msg_submit_event = msg.submit(
	# fn=conversation.user_turn,
	fn=user,
	inputs=[msg, chatbot],
	outputs=[msg, chatbot],
	queue=True,
	show_progress="full",
	# api_name=None,
	).then(bot, chatbot, chatbot, queue=True)
	# Submit button: same pipeline, but `user1` clears the textbox.
	submit_click_event = submit.click(
	# fn=lambda x, y: ("",) + user(x, y)[1:], # clear msg
	fn=user1, # clear msg
	inputs=[msg, chatbot],
	outputs=[msg, chatbot],
	queue=True,
	# queue=False,
	show_progress="full",
	# api_name=None,
	).then(bot, chatbot, chatbot, queue=True)
	# Stop button: runs no function, only cancels the two events above.
	stop.click(
	fn=None,
	inputs=None,
	outputs=None,
	cancels=[msg_submit_event, submit_click_event],
	queue=False,
	)
	# Clear History: sets the chatbot value to None.
	clear.click(lambda: None, None, chatbot, queue=False)

	# Hidden (visible=False) controls wiring `predict_api` to api_name="api".
	with gr.Accordion("For Chat/Translation API", open=False, visible=False):
	input_text = gr.Text()
	api_btn = gr.Button("Go", variant="primary")
	out_text = gr.Text()

	api_btn.click(
	predict_api,
	input_text,
	out_text,
	api_name="api",
	)

	# block.load(update_buff, [], buff, every=1)
	# block.load(update_buff, [buff_var], [buff_var, buff], every=1)

	# concurrency_count=5, max_size=20
	# max_size=36, concurrency_count=14
	# CPU cpu_count=2 16G, model 7G
	# CPU UPGRADE cpu_count=8 32G, model 7G

	# does not work
	_ = """
	# _ = int(psutil.virtual_memory().total / 10**9 // file_size - 1)
	# concurrency_count = max(_, 1)
	if psutil.cpu_count(logical=False) >= 8:
	# concurrency_count = max(int(32 / file_size) - 1, 1)
	else:
	# concurrency_count = max(int(16 / file_size) - 1, 1)
	# """

	# default concurrency_count = 1
	# block.queue(concurrency_count=concurrency_count, max_size=5).launch(debug=True)

	# Serve on port 7861 when the machine's node name contains "forindo",
	# and on port 7860 everywhere else.
	server_port = 7861 if "forindo" in platform.node() else 7860
	# Queue at most 5 pending requests and listen on all interfaces.
	block.queue(max_size=5).launch(
	    debug=True, server_name="0.0.0.0", server_port=server_port
	)

	# block.queue(max_size=5).launch(debug=True, server_name="0.0.0.0")