# Uploaded model
- Developed by: AlberBshara
- License: apache-2.0
- Finetuned from model: llama-3-8b-Instruct-bnb-4bit
You can use this model for question-answering (QA) tasks; its context window is 8K tokens.
## How to Use It

Install Unsloth, Xformers (Flash Attention), and the other required packages:

```python
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install triton
```
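The class below reads a Hugging Face access token from a variable named `HUGGING_FACE_API_TOKEN`, which the original snippet does not define. A minimal sketch for supplying it, assuming you keep the token in an `HF_TOKEN` environment variable (this setup step is an assumption, not part of the original card):

```python
import os
from huggingface_hub import login  # optional: caches the token for all Hub calls

# Assumes you exported HF_TOKEN beforehand -- never hard-code or commit your token.
HUGGING_FACE_API_TOKEN = os.environ.get("HF_TOKEN", "")
if HUGGING_FACE_API_TOKEN:
    login(token=HUGGING_FACE_API_TOKEN)
```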
```python
from typing import Tuple, Dict, Any

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from unsloth.chat_templates import get_chat_template


class LLM:
    def __init__(self, load_in_4bit: bool = True,
                 load_cpu_mem_usage: bool = True,
                 hf_model_path: str = "AlberBshara/scholara_QA"):
        """
        Args:
            load_in_4bit (bool): Use 4-bit quantization. Defaults to True.
            load_cpu_mem_usage (bool): Reduce CPU memory usage. Defaults to True.
            hf_model_path (str): The path of your model on the Hugging Face Hub, like "your-user-name/model-name".
        """
        assert torch.cuda.is_available(), "CUDA is not available. An NVIDIA GPU is required."

        hf_auth_token = HUGGING_FACE_API_TOKEN  # your Hugging Face access token (defined above)

        # Specify the quantization config.
        self._bnb_config = BitsAndBytesConfig(load_in_4bit=load_in_4bit)

        # Load the model directly with the quantization config.
        self._model = AutoModelForCausalLM.from_pretrained(
            hf_model_path,
            low_cpu_mem_usage=load_cpu_mem_usage,
            quantization_config=self._bnb_config,
            use_auth_token=hf_auth_token,
        )

        # Load the tokenizer and attach the llama-3 chat template.
        self._tokenizer = AutoTokenizer.from_pretrained(
            hf_model_path,
            use_auth_token=hf_auth_token,
        )
        self._tokenizer = get_chat_template(
            self._tokenizer,
            chat_template="llama-3",
            mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
        )

        self._hf_model_path = hf_model_path
        self._EOS_TOKEN_ID = self._tokenizer.eos_token_id
        self._prompt = lambda context, question: f"""
        Answer the following question, use the given context.
        Context: [{context}]
        Question: [{question}]
        """

    def invoke(self, context: str, question: str) -> Tuple:
        if not question.strip():
            raise ValueError("question cannot be empty or None")
        if not context.strip():
            raise ValueError("context cannot be empty or None")

        inputs = self._prompt(context, question)
        messages = [{"from": "human", "value": inputs}]
        inputs = self._tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # must be added for generation
            return_tensors="pt",
        ).to("cuda")

        # Increase max_new_tokens to allow more detailed responses.
        output_ids = self._model.generate(inputs, max_new_tokens=2048, pad_token_id=self._EOS_TOKEN_ID)
        output_ids = output_ids.tolist()[0] if output_ids.size(0) == 1 else output_ids.tolist()
        output_text = self._tokenizer.decode(output_ids, skip_special_tokens=True)

        # Free GPU memory.
        del inputs
        torch.cuda.empty_cache()

        return output_text, output_ids, None

    def extract_answer(self, response: str) -> str:
        start_with: str = ".assistant"
        start_index = response.find(start_with)

        # If the marker is found, return the substring from that point onward.
        if start_index != -1:
            # Move start_index past the end of the marker.
            start_index += len(start_with)
            return response[start_index:]
        else:
            return response

    def get_metadata(self) -> Dict[str, Any]:
        return {
            "class_name": self.__class__.__name__,
            "init_params": {
                "load_in_4bit": True,
                "load_cpu_mem_usage": True,
                "hf_model_path": "AlberBshara/scholara_QA",
                "hf_auth_token": "--%$%--",
            },
            "methods": ["invoke", "extract_answer"],
        }


test_llm = LLM()
```
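A minimal usage sketch; the context passage and question below are illustrative placeholders, not part of the original card:

```python
# Hypothetical example inputs -- replace with your own context passage and question.
context = "The Eiffel Tower was completed in 1889 and is located in Paris, France."
question = "When was the Eiffel Tower completed?"

response_text, _, _ = test_llm.invoke(context, question)
print(test_llm.extract_answer(response_text))
```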
## Model tree for AlberBshara/scholara_QA

- Base model: unsloth/llama-3-8b-Instruct-bnb-4bit