#####################################################
### DOCUMENT PROCESSOR [MODELS]
#####################################################
# Jonathan Wang

# ABOUT:
# This project creates an app to chat with PDFs.
# This is the LANGUAGE MODELS
# that are used in the document reader.
#####################################################
## TODOS:
# Add support for vLLM / AWQ / GPTQ models. (probably not going to be done due to lack of attention scores)
# Add KTransformers backend?
# https://github.com/kvcache-ai/ktransformers
# https://github.com/Tada-AI/pdf_parser
#####################################################
## IMPORTS:
from __future__ import annotations

import gc
import logging
import sys
from typing import (
    Any,
    Callable,
    Dict,
    List,
    Optional,
    Protocol,
    Sequence,
    Union,
    cast,
    runtime_checkable,
)

import streamlit as st
import torch
from llama_index.core.base.embeddings.base import BaseEmbedding
from llama_index.core.base.llms.base import BaseLLM
from llama_index.core.base.llms.generic_utils import (
    messages_to_prompt as generic_messages_to_prompt,
)
from llama_index.core.base.llms.types import (
    ChatMessage,
    ChatResponse,
    ChatResponseGen,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
    MessageRole,
)
from llama_index.core.bridge.pydantic import Field, PrivateAttr, WithJsonSchema
from llama_index.core.callbacks import CallbackManager
from llama_index.core.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
from llama_index.core.llms.callbacks import (
    llm_chat_callback,
    llm_completion_callback,
)
from llama_index.core.multi_modal_llms import MultiModalLLM
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.prompts.base import PromptTemplate
from llama_index.core.schema import ImageDocument, ImageNode
from llama_index.core.types import BaseOutputParser, PydanticProgramMode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from PIL import Image as PILImage
from transformers import (
    AutoImageProcessor,
    AutoModelForVision2Seq,
    AutoTokenizer,
    LogitsProcessor,
    QuantoConfig,
    StoppingCriteria,
    StoppingCriteriaList,
)
from typing_extensions import Annotated

# from wtpsplit import SaT  # Sentence segmentation model. Dropping this. Requires adapters=0.2.1->Transformers=4.39.3 | Phi3 Vision requires Transformers 4.40.2

## NOTE: Proposal for LAZY LOADING packages for running LLMs:
# Currently not done because emphasis is on local inference w/ ability to get Attention Scores,
# which is not yet supported in non-HF Transformers methods.
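
# A minimal sketch of what that lazy-loading pattern could look like (hypothetical helper, not
# wired in anywhere): defer the heavy backend import until that backend is actually requested.
# def _lazy_llamacpp_cls():
#     """Import LlamaCPP only if/when a .gguf model is requested."""
#     from llama_index.llms.llama_cpp import LlamaCPP  # heavyweight import deferred to call time
#     return LlamaCPP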
## LLamacpp:
# from llama_index.llms.llama_cpp import LlamaCPP
# from llama_index.llms.llama_cpp.llama_utils import (
#     messages_to_prompt,
#     completion_to_prompt
# )
## HF Transformers LLM:
# from transformers import AutoTokenizer, BitsAndBytesConfig
# from llama_index.llms.huggingface import HuggingFaceLLM
## GROQ
# from llama_index.llms.groq import Groq

#####################################################
### SETTINGS:
DEFAULT_HF_MULTIMODAL_LLM = "Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5"
DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW = 1024
DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS = 1024

#####################################################
### CODE:
logger = logging.getLogger(__name__)


@st.cache_resource
def get_embedder(
    model_path: str = "mixedbread-ai/mxbai-embed-large-v1",
    device: str = "cuda",  # 'cpu' is unbearably slow
) -> BaseEmbedding:
    """Given the path to an embedding model, load it."""
    # NOTE: okay we definitely could have not made this wrapper, but shrug
    return HuggingFaceEmbedding(model_path, device=device)


@st.cache_resource
def get_reranker(
    model_path: str = "mixedbread-ai/mxbai-rerank-large-v1",
    top_n: int = 3,
    device: str = "cpu",  # 'cuda' if we were rich
) -> SentenceTransformerRerank:  # technically this is a BaseNodePostprocessor, but that seems too abstract.
    """Given the path to a reranking model, load it."""
    # NOTE: okay we definitely could have not made this wrapper, but shrug
    return SentenceTransformerRerank(model=model_path, top_n=top_n, device=device)


## LLM Options Below
# def _get_llamacpp_llm(
#     model_path: str,
#     model_seed: int = 31415926,
#     model_temperature: float = 1e-64,  # ideally 0, but the HF-style interface doesn't allow that; sys.float_info.min is the principled choice.
#     model_context_length: Optional[int] = 8192,
#     model_max_new_tokens: Optional[int] = 1024,
# ) -> BaseLLM:
#     """Load a LlamaCPP model using GPU and other sane defaults."""
#     # Lazy Loading
#     from llama_index.llms.llama_cpp import LlamaCPP
#     from llama_index.llms.llama_cpp.llama_utils import (
#         messages_to_prompt,
#         completion_to_prompt
#     )
#     # Arguments to Pass
#     llm = LlamaCPP(
#         model_path=model_path,
#         temperature=model_temperature,
#         max_new_tokens=model_max_new_tokens,
#         context_window=model_context_length,
#         # kwargs to pass to __call__()
#         generate_kwargs={'seed': model_seed},  # {'temperature': TEMPERATURE, 'top_p': 0.7, 'min_p': 0.1, 'seed': MODEL_SEED},
#         # kwargs to pass to __init__()
#         # set n_gpu_layers to at least 1 to use GPU
#         model_kwargs={'n_gpu_layers': -1, 'n_threads': os.cpu_count() - 1},  # , 'rope_freq_scale': 0.83, 'rope_freq_base': 20000},
#         # transform inputs into model format
#         messages_to_prompt=messages_to_prompt,
#         completion_to_prompt=completion_to_prompt,
#         verbose=True,
#     )
#     return llm


@st.cache_resource
def _get_hf_llm(
    model_path: str,
    model_temperature: float = sys.float_info.min,  # ideally 0, but the HF implementation doesn't allow that.
    model_context_length: int | None = 16384,
    model_max_new_tokens: int | None = 2048,
    hf_quant_level: int | None = 8,
) -> BaseLLM:
    """Load a HuggingFace-Transformers based model using sane defaults."""
    # Fix temperature if needed; the HF implementation complains about it being zero
    model_temperature = max(sys.float_info.min, model_temperature)

    # Get quantization config with Quanto
    quanto_config = None  # NOTE: by default, no quantization.
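    # The branches below map hf_quant_level onto a transformers QuantoConfig:
    # 8 -> int8 weights (roughly half the VRAM of fp16), 4 -> int4 (roughly a quarter), anything else -> full precision.
    # Quanto is used rather than BitsAndBytes since it is not CUDA-only; the old BnB configs are kept commented out for reference.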
    if hf_quant_level == 4:
        # bnb_config = BitsAndBytesConfig(
        #     # load_in_8bit=True,
        #     load_in_4bit=True,
        #     # bnb_4bit_use_double_quant=True,
        #     bnb_4bit_quant_type="nf4",
        #     bnb_4bit_compute_dtype='bfloat16',  # NOTE: Tesla T4 GPUs are too crappy for bfloat16
        #     # bnb_4bit_compute_dtype='float16'
        # )
        quanto_config = QuantoConfig(
            weights="int4"  # there's also 'int2' if you're crazy...
        )
    elif hf_quant_level == 8:
        # bnb_config = BitsAndBytesConfig(
        #     load_in_8bit=True
        # )
        quanto_config = QuantoConfig(
            weights="int8"
        )

    # Get stopping tokens for Llama3-based models, because they're /special/ and added a new one.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    stopping_ids = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    return HuggingFaceLLM(
        model_name=model_path,
        tokenizer_name=model_path,
        stopping_ids=stopping_ids,
        max_new_tokens=model_max_new_tokens or DEFAULT_NUM_OUTPUTS,
        context_window=model_context_length or DEFAULT_CONTEXT_WINDOW,
        tokenizer_kwargs={"trust_remote_code": True},
        model_kwargs={"trust_remote_code": True, "quantization_config": quanto_config},
        generate_kwargs={
            # Sample only when a meaningful temperature was supplied; otherwise decode greedily.
            "do_sample": model_temperature > sys.float_info.min,
            "temperature": model_temperature,
        },
        is_chat_model=True,
    )


@st.cache_resource
def get_llm(
    model_path: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
    model_temperature: float = 0,  # ideally 0; _get_hf_llm clamps this to sys.float_info.min for the HF implementation.
    model_context_length: int | None = 8192,
    model_max_new_tokens: int | None = 1024,
    hf_quant_level: int | None = 8,  # 4-bit / 8-bit loading for HF models
) -> BaseLLM:
    """
    Given the path to an LLM, determine the type, load it in, and convert it into a LlamaIndex-compatible LLM.

    NOTE: I chose to set some "sane" defaults, so it's probably not as flexible as some other dev would like.
    """
    # if (model_path_extension == ".gguf"):
    #     ##### LLAMA.CPP
    #     return (_get_llamacpp_llm(model_path, model_seed, model_temperature, model_context_length, model_max_new_tokens))

    # TODO(Jonathan Wang): Consider non-HF-Transformers backends
    # vLLM support for AWQ/GPTQ models
    # I guess reluctantly AutoAWQ and AutoGPTQ packages.
    # Exllamav2 is kinda dead IMO.

    # else: No extension or weird fake extension suggests a folder, i.e., the base model from HF
    return _get_hf_llm(
        model_path=model_path,
        model_temperature=model_temperature,
        model_context_length=model_context_length,
        model_max_new_tokens=model_max_new_tokens,
        hf_quant_level=hf_quant_level,
    )


# @st.cache_resource
# def get_llm() -> BaseLLM:
#     from llama_index.llms.groq import Groq
#     llm = Groq(
#         model='llama-3.1-8b-instant',  # old: 'llama3-8b-8192'
#         api_key=os.environ.get('GROQ_API_KEY'),
#     )
#     return llm


class EosLogitProcessor(LogitsProcessor):
    """Special snowflake logits processor for the Salesforce vision model."""

    def __init__(self, eos_token_id: int, end_token_id: int):
        super().__init__()
        self.eos_token_id = eos_token_id
        self.end_token_id = end_token_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        if input_ids.size(1) > 1:  # Expect at least 1 output token.
            # Build a logit vector that only allows EOS.
            forced_eos = torch.full((scores.size(1),), -float("inf"), device=input_ids.device)
            forced_eos[self.eos_token_id] = 0  # Force generation of EOS after the <|end|> token.
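            # For every sequence in the batch whose most recent token is <|end|> (end_token_id),
            # replace its scores with the forced-EOS vector so the only token it can emit next is EOS.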
            scores[input_ids[:, -1] == self.end_token_id] = forced_eos
        return scores


# NOTE: This protocol (and the Annotated alias below) is needed to appease mypy:
# https://github.com/run-llama/llama_index/blob/5238b04c183119b3035b84e2663db115e63dcfda/llama-index-core/llama_index/core/llms/llm.py#L89
@runtime_checkable
class MessagesImagesToPromptType(Protocol):
    def __call__(self, messages: Sequence[ChatMessage], images: Sequence[ImageDocument], **kwargs: Any) -> str:
        pass


MessagesImagesToPromptCallable = Annotated[
    Optional[MessagesImagesToPromptType],
    WithJsonSchema({"type": "string"}),
]


# https://huggingface.co/Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5/blob/main/batch_inference.ipynb
class HuggingFaceMultiModalLLM(MultiModalLLM):
    """A wrapper around HuggingFace vision LLMs.

    Currently only supports one model: Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5
    """

    model_name: str = Field(
        description='The multi-modal HuggingFace LLM to use. Currently only using Phi3.',
        default=DEFAULT_HF_MULTIMODAL_LLM
    )
    context_window: int = Field(
        default=DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW,
        description="The maximum number of tokens available for input.",
        gt=0,
    )
    max_new_tokens: int = Field(
        default=DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS,
        description="The maximum number of tokens to generate.",
        gt=0,
    )
    system_prompt: str = Field(
        default="",
        description=(
            "The system prompt, containing any extra instructions or context. "
            "The model card on HuggingFace should specify if this is needed."
        ),
    )
    query_wrapper_prompt: PromptTemplate = Field(
        default=PromptTemplate("{query_str}"),
        description=(
            "The query wrapper prompt, containing the query placeholder. "
            "The model card on HuggingFace should specify if this is needed. "
            "Should contain a `{query_str}` placeholder."
        ),
    )
    tokenizer_name: str = Field(
        default=DEFAULT_HF_MULTIMODAL_LLM,
        description=(
            "The name of the tokenizer to use from HuggingFace. "
            "Unused if `tokenizer` is passed in directly."
        ),
    )
    processor_name: str = Field(
        default=DEFAULT_HF_MULTIMODAL_LLM,
        description=(
            "The name of the processor to use from HuggingFace. "
            "Unused if `processor` is passed in directly."
        ),
    )
    device_map: str = Field(
        default="auto", description="The device_map to use. Defaults to 'auto'."
    )
    stopping_ids: list[int] = Field(
        default_factory=list,
        description=(
            "The stopping ids to use. "
            "Generation stops when these token IDs are predicted."
        ),
    )
    tokenizer_outputs_to_remove: list = Field(
        default_factory=list,
        description=(
            "The outputs to remove from the tokenizer. "
            "Sometimes HuggingFace tokenizers return extra inputs that cause errors."
        ),
    )
    tokenizer_kwargs: dict = Field(
        default_factory=dict, description="The kwargs to pass to the tokenizer."
    )
    processor_kwargs: dict = Field(
        default_factory=dict, description="The kwargs to pass to the processor."
    )
    model_kwargs: dict = Field(
        default_factory=dict,
        description="The kwargs to pass to the model during initialization.",
    )
    generate_kwargs: dict = Field(
        default_factory=dict,
        description="The kwargs to pass to the model during generation.",
    )
    is_chat_model: bool = Field(
        default=False,
        description=(
            "Whether the model can have multiple messages passed at once, like the OpenAI chat API."
            # LLMMetadata.__fields__["is_chat_model"].field_info.description
            # + " Be sure to verify that you either pass an appropriate tokenizer "
            # "that can convert prompts to properly formatted chat messages or a "
            # "`messages_to_prompt` that does so."
        ),
    )
    messages_images_to_prompt: MessagesImagesToPromptCallable = Field(
        default=generic_messages_to_prompt,
        description="A function that takes in a list of messages and images and returns a prompt string.",
    )

    _model: Any = PrivateAttr()
    _tokenizer: Any = PrivateAttr()
    # TODO(Jonathan Wang): We need to add a separate field for AutoProcessor as opposed to ImageProcessors.
    _processor: Any = PrivateAttr()
    _stopping_criteria: Any = PrivateAttr()

    def __init__(
        self,
        context_window: int = DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW,
        max_new_tokens: int = DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS,
        query_wrapper_prompt: Union[str, PromptTemplate] = "{query_str}",
        tokenizer_name: str = DEFAULT_HF_MULTIMODAL_LLM,
        processor_name: str = DEFAULT_HF_MULTIMODAL_LLM,
        model_name: str = DEFAULT_HF_MULTIMODAL_LLM,
        model: Any | None = None,
        tokenizer: Any | None = None,
        processor: Any | None = None,
        device_map: str = "auto",
        stopping_ids: list[int] | None = None,
        tokenizer_kwargs: dict[str, Any] | None = None,
        processor_kwargs: dict[str, Any] | None = None,
        tokenizer_outputs_to_remove: list[str] | None = None,
        model_kwargs: dict[str, Any] | None = None,
        generate_kwargs: dict[str, Any] | None = None,
        is_chat_model: bool = False,
        callback_manager: CallbackManager | None = None,
        system_prompt: str = "",
        messages_images_to_prompt: Callable[[Sequence[ChatMessage], Sequence[ImageDocument]], str] | None = None,
        # completion_to_prompt: Callable[[str], str] | None = None,
        # pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
        # output_parser: BaseOutputParser | None = None,
    ) -> None:
        logger.info(f"CUDA Memory Pre-AutoModelForVision2Seq: {torch.cuda.mem_get_info()}")
        # The Salesforce model is an AutoModelForVision2Seq, not the more common AutoModelForCausalLM.
        model = model or AutoModelForVision2Seq.from_pretrained(
            model_name,
            device_map=device_map,
            trust_remote_code=True,
            **(model_kwargs or {})
        )
        logger.info(f"CUDA Memory Post-AutoModelForVision2Seq: {torch.cuda.mem_get_info()}")

        # check context_window
        config_dict = model.config.to_dict()
        model_context_window = int(
            config_dict.get("max_position_embeddings", context_window)
        )
        if model_context_window < context_window:
            logger.warning(
                f"Supplied context_window {context_window} is greater "
                f"than the model's max input size {model_context_window}. "
                "Disable this warning by setting a lower context_window."
            )
            context_window = model_context_window

        processor_kwargs = processor_kwargs or {}
        if "max_length" not in processor_kwargs:
            processor_kwargs["max_length"] = context_window

        # NOTE: Sometimes models (phi-3) will use AutoProcessor and include the tokenizer within it.
        logger.info(f"CUDA Memory Pre-Processor: {torch.cuda.mem_get_info()}")
        processor = processor or AutoImageProcessor.from_pretrained(
            processor_name or model_name,
            trust_remote_code=True,
            **processor_kwargs
        )
        logger.info(f"CUDA Memory Post-Processor: {torch.cuda.mem_get_info()}")

        tokenizer = tokenizer or AutoTokenizer.from_pretrained(
            tokenizer_name or model_name,
            trust_remote_code=True,
            **(tokenizer_kwargs or {})
        )
        logger.info(f"CUDA Memory Post-Tokenizer: {torch.cuda.mem_get_info()}")

        # Tokenizer-Model disagreement
        if hasattr(tokenizer, "name_or_path") and tokenizer.name_or_path != model_name:  # type: ignore (checked for attribute)
            logger.warning(
                f"The model `{model_name}` and tokenizer `{getattr(tokenizer, 'name_or_path', None)}` "
                f"are different, please ensure that they are compatible."
            )

        # Processor-Model disagreement
        if hasattr(processor, "name_or_path") and getattr(processor, "name_or_path", None) != model_name:
            logger.warning(
                f"The model `{model_name}` and processor `{getattr(processor, 'name_or_path', None)}` "
                f"are different, please ensure that they are compatible."
            )

        # setup stopping criteria
        stopping_ids_list = stopping_ids or []

        class StopOnTokens(StoppingCriteria):
            def __call__(
                self,
                input_ids: torch.LongTensor,
                scores: torch.FloatTensor,
                **kwargs: Any,
            ) -> bool:
                return any(input_ids[0][-1] == stop_id for stop_id in stopping_ids_list)

        stopping_criteria = StoppingCriteriaList([StopOnTokens()])

        if isinstance(query_wrapper_prompt, str):
            query_wrapper_prompt = PromptTemplate(query_wrapper_prompt)

        messages_images_to_prompt = messages_images_to_prompt or self._processor_messages_to_prompt

        # Initiate standard LLM
        super().__init__(
            callback_manager=callback_manager or CallbackManager([]),
        )
        logger.info(f"CUDA Memory Post-SuperInit: {torch.cuda.mem_get_info()}")

        # Initiate remaining fields
        self._model = model
        self._tokenizer = tokenizer
        self._processor = processor
        logger.info(f"CUDA Memory Post-Init: {torch.cuda.mem_get_info()}")
        self._stopping_criteria = stopping_criteria

        self.model_name = model_name
        self.context_window = context_window
        self.max_new_tokens = max_new_tokens
        self.system_prompt = system_prompt
        self.query_wrapper_prompt = query_wrapper_prompt
        self.tokenizer_name = tokenizer_name
        self.processor_name = processor_name
        self.device_map = device_map
        self.stopping_ids = stopping_ids or []
        self.tokenizer_outputs_to_remove = tokenizer_outputs_to_remove or []
        self.tokenizer_kwargs = tokenizer_kwargs or {}
        self.processor_kwargs = processor_kwargs or {}
        self.model_kwargs = model_kwargs or {}
        self.generate_kwargs = generate_kwargs or {}
        self.is_chat_model = is_chat_model
        self.messages_images_to_prompt = messages_images_to_prompt
        # self.completion_to_prompt = completion_to_prompt
        # self.pydantic_program_mode = pydantic_program_mode
        # self.output_parser = output_parser

    @classmethod
    def class_name(cls) -> str:
        return "HuggingFace_MultiModal_LLM"

    @property
    def metadata(self) -> LLMMetadata:
        """LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.max_new_tokens,
            model_name=self.model_name,
            is_chat_model=self.is_chat_model,
        )

    # TODO(Jonathan Wang): Make this work generically. Currently we're building for `Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5`
    def _processor_messages_to_prompt(self, messages: Sequence[ChatMessage], images: Sequence[ImageDocument]) -> str:
        """Convert a list of messages into a prompt for the multimodal LLM.

        Uses the tokenizer's chat template when available, falling back to the generic formatter.
        NOTE: we assume for simplicity here that these images are related, and not the user bouncing
        between multiple different topics. Thus, we send them all at once.

        Args:
            messages (Sequence[ChatMessage]): A list of the messages to convert, where each message
                contains the message role and content.
            images (Sequence[ImageDocument]): The images the user is passing to the MultiModalLLM.

        Returns:
            str: The prompt.
        """
        # NOTE: For `Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5`, we actually ignore the `images`; no placeholders.
        if hasattr(self._tokenizer, "apply_chat_template"):
            messages_dict = [
                {"role": message.role.value, "content": message.content}
                for message in messages
            ]
            return self._tokenizer.apply_chat_template(
                messages_dict, tokenize=False, add_generation_prompt=True
            )
        return generic_messages_to_prompt(messages)

    @llm_completion_callback()
    def complete(
        self,
        prompt: str,
        image_documents: ImageNode | List[ImageNode] | ImageDocument | List[ImageDocument],  # ImageDocument inherits from ImageNode.
        formatted: bool = False,
        **kwargs: Any
    ) -> CompletionResponse:
        """Given a prompt and image node(s), run the multimodal model and return the completion."""
        # Handle images input
        # https://huggingface.co/Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5/blob/main/demo.ipynb
        batch_image_list = []
        batch_image_sizes = []
        batch_prompt = []

        # Fix image_documents input typing
        if not isinstance(image_documents, list):
            image_documents = [image_documents]
        image_documents = [cast(ImageDocument, image) for image in image_documents]  # we probably won't be using the Document features, so I think this is fine.

        # Convert input images into PIL images for the model.
        image_list = []
        image_sizes = []
        for image in image_documents:
            # NOTE: ImageDocument inherits from ImageNode. We'll go extract the image.
            image_io = image.resolve_image()
            image_pil = PILImage.open(image_io)
            image_list.append(self._processor([image_pil], image_aspect_ratio='anyres')['pixel_values'].to(self._model.device))
            image_sizes.append(image_pil.size)
        batch_image_list.append(image_list)
        batch_image_sizes.append(image_sizes)
        batch_prompt.append(prompt)  # only one question per image

        # Get the prompt
        if not formatted and self.query_wrapper_prompt:
            prompt = self.query_wrapper_prompt.format(query_str=prompt)

        prompt_sequence = []
        if self.system_prompt:
            prompt_sequence.append(ChatMessage(role=MessageRole.SYSTEM, content=self.system_prompt))
        prompt_sequence.append(ChatMessage(role=MessageRole.USER, content=prompt))

        prompt = self.messages_images_to_prompt(messages=prompt_sequence, images=image_documents)

        # Get the model input
        batch_inputs = {"pixel_values": batch_image_list}
        language_inputs = self._tokenizer(
            [prompt],
            return_tensors="pt",
            padding='longest',  # probably not needed.
            max_length=self._tokenizer.model_max_length,
            truncation=True
        ).to(self._model.device)
        # TODO: why does the example cookbook have this weird conversion to CUDA instead of .to(device)?
        # language_inputs = {name: tensor.cuda() for name, tensor in language_inputs.items()}
        batch_inputs.update(language_inputs)

        gc.collect()
        torch.cuda.empty_cache()

        # Remove keys from the tokenizer output if needed, to avoid HF errors.
        for key in self.tokenizer_outputs_to_remove:
            if key in batch_inputs:
                batch_inputs.pop(key, None)

        # Get output
        tokens = self._model.generate(
            **batch_inputs,
            image_sizes=batch_image_sizes,
            pad_token_id=self._tokenizer.pad_token_id,
            eos_token_id=self._tokenizer.eos_token_id,
            max_new_tokens=self.max_new_tokens,
            stopping_criteria=self._stopping_criteria,
            # NOTE: Special snowflake processor for Salesforce XGEN Phi3 Mini.
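            # 32007 is assumed to be the id of Phi-3's `<|end|>` token; EosLogitProcessor (defined above)
            # then forces a true EOS immediately after it so generation stops cleanly.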
            logits_processor=[EosLogitProcessor(eos_token_id=self._tokenizer.eos_token_id, end_token_id=32007)],
            **self.generate_kwargs
        )

        gc.collect()
        torch.cuda.empty_cache()

        # completion_tokens = tokens[:, batch_inputs['input_ids'].shape[1]:]
        completion = self._tokenizer.batch_decode(
            tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        gc.collect()
        torch.cuda.empty_cache()

        output = CompletionResponse(text=completion, raw={'model_output': tokens})

        # Clean stuff up
        del batch_image_list, batch_image_sizes, batch_inputs, tokens, completion
        gc.collect()
        torch.cuda.empty_cache()

        # Return the completion
        return output

    @llm_completion_callback()
    def stream_complete(
        self,
        prompt: str,
        formatted: bool = False,
        **kwargs: Any
    ) -> CompletionResponseGen:
        raise NotImplementedError

    @llm_chat_callback()
    def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
        raise NotImplementedError

    @llm_chat_callback()
    def stream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen:
        raise NotImplementedError

    @llm_completion_callback()
    async def acomplete(
        self,
        prompt: str,
        images: ImageNode | List[ImageNode],  # this also takes ImageDocument, which inherits from ImageNode.
        formatted: bool = False,
        **kwargs: Any
    ) -> CompletionResponse:
        raise NotImplementedError

    @llm_completion_callback()
    async def astream_complete(
        self,
        prompt: str,
        formatted: bool = False,
        **kwargs: Any
    ) -> CompletionResponseGen:
        raise NotImplementedError

    @llm_chat_callback()
    async def achat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
        raise NotImplementedError

    @llm_chat_callback()
    async def astream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen:
        raise NotImplementedError


# @st.cache_resource()
# def get_multimodal_llm(**kwargs) -> MultiModalLLM:
#     vision_llm = OpenAIMultiModal(
#         model='gpt-4o-mini',
#         temperature=0,
#         max_new_tokens=512,
#         image_detail='auto'
#     )
#     return vision_llm


@st.cache_resource
def get_multimodal_llm(
    model_name: str = DEFAULT_HF_MULTIMODAL_LLM,
    device_map: str = "cuda",  # does not support 'auto'
    processor_kwargs: dict[str, Any] | None = None,
    model_kwargs: dict[str, Any] | None = None,  # {'torch_dtype': torch.bfloat16},  # {'torch_dtype': torch.float8_e5m2}
    generate_kwargs: dict[str, Any] | None = None,  # from the example cookbook
    hf_quant_level: int | None = 8,
) -> HuggingFaceMultiModalLLM:
    """Load the multimodal (vision) LLM with sane defaults, optionally quantized with Quanto."""
    # Get default kwargs
    if model_kwargs is None:
        model_kwargs = {}
    if processor_kwargs is None:
        processor_kwargs = {}
    if generate_kwargs is None:
        generate_kwargs = {
            "temperature": sys.float_info.min,
            "top_p": None,
            "num_beams": 1,
            # NOTE: we hack in EosLogitProcessor in the HuggingFaceMultiModalLLM because it allows us to get the tokenizer.eos_token_id
        }

    # Get quantization config with Quanto
    quanto_config = None  # NOTE: by default, no quantization.
    if hf_quant_level == 4:
        # bnb_config = BitsAndBytesConfig(
        #     # load_in_8bit=True,
        #     load_in_4bit=True,
        #     # bnb_4bit_use_double_quant=True,
        #     bnb_4bit_quant_type="nf4",
        #     bnb_4bit_compute_dtype='bfloat16',  # NOTE: Tesla T4 GPUs are too crappy for bfloat16
        #     # bnb_4bit_compute_dtype='float16'
        # )
        quanto_config = QuantoConfig(
            weights="int4"  # there's also 'int2' if you're crazy...
        )
    elif hf_quant_level == 8:
        # bnb_config = BitsAndBytesConfig(
        #     load_in_8bit=True
        # )
        quanto_config = QuantoConfig(
            weights="int8"
        )

    if quanto_config is not None:
        model_kwargs["quantization_config"] = quanto_config

    return HuggingFaceMultiModalLLM(
        model_name=model_name,
        device_map=device_map,
        processor_kwargs=processor_kwargs,
        model_kwargs=model_kwargs,
        generate_kwargs=generate_kwargs,
        max_new_tokens=1024,  # from the example cookbook
    )
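

# Example wiring (a sketch, not called anywhere in this module): how app-level code might pull these
# cached loaders together. The settings shown are illustrative defaults, not requirements.
# embedder = get_embedder()                 # mixedbread-ai/mxbai-embed-large-v1 on CUDA
# reranker = get_reranker(top_n=3)          # CPU cross-encoder reranker
# llm = get_llm(hf_quant_level=8)           # Llama 3.1 8B Instruct with int8 Quanto weights
# vision_llm = get_multimodal_llm()         # Salesforce XGen-MM (Phi-3 Mini) for image-heavy pages
# response = vision_llm.complete(prompt="Describe this figure.", image_documents=[some_image_node])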