GoodiesHere committed on
Commit 9561718
Parent: 44693d6

Upload 6 files

Files changed (7)
  1. .gitattributes +1 -0
  2. app.py +386 -0
  3. example.mp4 +3 -0
  4. requirements.txt +29 -0
  5. utils/constants.py +31 -0
  6. utils/conversation.py +544 -0
  7. utils/mm_utils.py +467 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ example.mp4 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,386 @@
1
+ import os, re, sys
2
+ import spaces
3
+ import traceback
4
+ import shutil
5
+ import torch
6
+ import numpy as np
7
+ from num2words import num2words
8
+ from datetime import timedelta
9
+ import datetime
10
+ import subprocess
11
+
12
+ from utils.mm_utils import (
13
+ KeywordsStoppingCriteria,
14
+ get_model_name_from_path,
15
+ tokenizer_mm_token,
16
+ ApolloMMLoader
17
+ )
18
+
19
+ from utils.conversation import conv_templates, SeparatorStyle
20
+ from utils.constants import (
21
+ X_TOKEN,
22
+ X_TOKEN_INDEX,
+ X_START_TOKEN,
+ X_END_TOKEN,
23
+ )
24
+
25
+ from decord import cpu, VideoReader
26
+ from huggingface_hub import snapshot_download
27
+
28
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel #, BitsAndBytesConfig
29
+ import gradio as gr
30
+ import zipfile
31
+
32
+ model_url = "GoodiesHere/Apollo-LMMs-Apollo-1_5B-t32"
33
+ video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
34
+
35
+ yt_dlp_bin = os.getenv('YT_DLP')
36
+ if not yt_dlp_bin:  # os.getenv returns None (not "") when YT_DLP is unset
37
+ yt_dlp_bin = "yt-dlp"
38
+ if not os.path.exists('example.mp4'):
39
+ subprocess.run([yt_dlp_bin, '-o', 'example.mp4', '--recode-video', 'mp4', video_url])
40
+
41
+ title_markdown = """
42
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
43
+ <div>
44
+ <h1 >You are chatting with Apollo-3B</h1>
45
+ </div>
46
+ </div>
47
+ <div align="center">
48
+ <div style="display:flex; gap: 0.25rem; margin-top: 10px;" align="center">
49
+ <a href='https://apollo-lmms.github.io/Apollo/'><img src='https://img.shields.io/badge/Project-Apollo-deepskyblue'></a>
50
+ <a href='https://huggingface.co/GoodiesHere/Apollo-LMMs-Apollo-1_5B-t32'><img src='https://img.shields.io/badge/model-checkpoints-gold'></a>
51
+ </div>
52
+ </div>
53
+ """
54
+
55
+ block_css = """
56
+ #buttons button {
57
+ min-width: min(120px,100%);
58
+ color: #9C276A
59
+ }
60
+ """
61
+
62
+ plum_color = gr.themes.colors.Color(
63
+ name='plum',
64
+ c50='#F8E4EF',
65
+ c100='#E9D0DE',
66
+ c200='#DABCCD',
67
+ c300='#CBA8BC',
68
+ c400='#BC94AB',
69
+ c500='#AD809A',
70
+ c600='#9E6C89',
71
+ c700='#8F5878',
72
+ c800='#804467',
73
+ c900='#713056',
74
+ c950='#662647',
75
+ )
76
+
77
+ model_path = snapshot_download(model_url, repo_type="model")
78
+ destination_path = './tmp/data'
79
+ os.makedirs(destination_path, exist_ok=True)
80
+ shutil.copytree(model_path, destination_path, dirs_exist_ok=True)
81
+
82
+ #quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
83
+
84
+ class Chat:
85
+ def __init__(self):
86
+ self.version = "qwen_1_5"
87
+ model_name = "apollo"
88
+ device = "cuda" if torch.cuda.is_available() else "cpu"
89
+ #attn_implementation="sdpa" if torch.__version__ > "2.1.2" else "eager"
90
+
91
+ model = AutoModelForCausalLM.from_pretrained(
92
+ model_path,
93
+ trust_remote_code=True,
94
+ low_cpu_mem_usage=True,
95
+ #attn_implementation=attn_implementation,
96
+ device_map="auto",
97
+ #quantization_config=quantization_config,
98
+ #load_in_4bit=True,
99
+ ).to(device=device, dtype=torch.bfloat16).half()
100
+
101
+ self._model = model
102
+ self._tokenizer = model.tokenizer
103
+ self._vision_processors = model.vision_tower.vision_processor
104
+ self._max_length = model.config.llm_cfg['model_max_length']
105
+
106
+ self._config = self._model.config
107
+ self.num_repeat_token = self._config.mm_connector_cfg['num_output_tokens'] #todo: get from config
108
+ self.mm_use_im_start_end = self._config.use_mm_start_end
109
+
110
+ frames_per_clip = 4
111
+ clip_duration=getattr(self._config, 'clip_duration')
112
+
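+ # ApolloMMLoader (utils/mm_utils.py) handles clip sampling and vision preprocessing for both images and videos.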
113
+ self.mm_processor = ApolloMMLoader(self._vision_processors,
114
+ clip_duration,
115
+ frames_per_clip,
116
+ clip_sampling_ratio=0.65,
117
+ model_max_length = self._config.model_max_length,
118
+ device=device,
119
+ num_repeat_token=self.num_repeat_token)
120
+
121
+ self._model.config.encode_batch_size = 35
122
+ self._model.eval()
123
+
124
+ def remove_after_last_dot(self, s):
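+ # Trim any trailing, unfinished sentence from the generated text by cutting after the last period.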
125
+ last_dot_index = s.rfind('.')
126
+ if last_dot_index == -1:
127
+ return s
128
+ return s[:last_dot_index + 1]
129
+
130
+ def apply_first_prompt(self, message, replace_string, data_type):
131
+ if self.mm_use_im_start_end:
132
+ message = X_START_TOKEN[data_type] + replace_string + X_END_TOKEN[data_type] + '\n\n' + message
133
+ else:
134
+ message = (replace_string) + '\n\n' + message
135
+
136
+ return message
137
+
138
+ @spaces.GPU(duration=120)
139
+ @torch.inference_mode()
140
+ def generate(self, data: list, message, temperature, top_p, max_output_tokens):
141
+ # TODO: support multiple turns of conversation.
142
+ mm_data, replace_string, data_type = data[0]
143
+ print(message)
144
+
145
+ conv = conv_templates[self.version].copy()
146
+ if isinstance(message, str):
147
+ message = self.apply_first_prompt(message, replace_string, data_type)
148
+ conv.append_message(conv.roles[0], message)
149
+ elif isinstance(message, list):
150
+ if X_TOKEN[data_type] not in message[0]['content']:
151
+ print('applying prompt')
152
+ message[0]['content'] = self.apply_first_prompt(message[0]['content'], replace_string, data_type)
153
+
154
+ for mes in message:
155
+ conv.append_message(mes["role"], mes["content"])
156
+
157
+ conv.append_message(conv.roles[1], None)
158
+ prompt = conv.get_prompt()
159
+
160
+ print(prompt.replace(X_TOKEN['video'],'<v>'))
161
+ input_ids = tokenizer_mm_token(prompt, self._tokenizer, return_tensors="pt").unsqueeze(0).to(self._model.device)
162
+
163
+ pad_token_ids = self._tokenizer.pad_token_id if self._tokenizer.pad_token_id is not None else self._tokenizer.eos_token_id
164
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
165
+ keywords = [stop_str]
166
+ stopping_criteria = KeywordsStoppingCriteria(keywords, self._tokenizer, input_ids)
167
+ print(f'running on {input_ids.shape[1]} tokens!')
168
+
169
+ with torch.inference_mode():
170
+ output_ids = self._model.generate(input_ids,
171
+ vision_input=[mm_data],
172
+ data_types=[data_type],
173
+ do_sample=True if temperature > 0 else False,
174
+ temperature=temperature,
175
+ max_new_tokens=max_output_tokens,
176
+ top_p=top_p,
177
+ use_cache=True,
178
+ num_beams=1,
179
+ stopping_criteria=[stopping_criteria])
180
+
181
+ print(f'generated on {output_ids.shape[1]} tokens!')
182
+ print(output_ids)
183
+ pred = self._tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
184
+ print(pred)
185
+ return self.remove_after_last_dot(pred)
186
+
187
+
188
+ @spaces.GPU(duration=120)
189
+ def generate(image, video, message, chatbot, textbox_in, temperature, top_p, max_output_tokens, dtype=torch.float16):
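+ # Gradio callback: preprocesses the uploaded image/video, maintains the chat history, and delegates to handler.generate.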
190
+ print(message)
191
+ if textbox_in is None:
192
+ raise gr.Error("Chat messages cannot be empty")
193
+ return (
194
+ gr.update(value=image, interactive=True),
195
+ gr.update(value=video, interactive=True),
196
+ message,
197
+ chatbot,
198
+ None,
199
+ )
200
+ data = []
201
+
202
+ mm_processor = handler.mm_processor
203
+ try:
204
+ if image is not None:
205
+ image, prompt = mm_processor.load_image(image)
206
+ data.append((image, prompt, 'image'))
207
+ elif video is not None:
208
+ video_tensor, prompt = mm_processor.load_video(video)
209
+ data.append((video_tensor, prompt, 'video'))
210
+
211
+ elif image is None and video is None:
212
+ data.append((None, None, 'text'))
213
+ else:
214
+ raise NotImplementedError("Supplying an image and a video at the same time is not supported")
215
+
216
+ except Exception as e:
217
+ traceback.print_exc()
218
+ return gr.update(value=None, interactive=True), gr.update(value=None, interactive=True), message, chatbot, None
219
+
220
+ assert len(message) % 2 == 0, "The message history should contain alternating user/assistant pairs."
221
+
222
+ show_images = ""
223
+ if image is not None:
224
+ show_images += f'<img src="./file={image}" style="display: inline-block;width: 250px;max-height: 400px;">'
225
+ if video is not None:
226
+ show_images += f'<video controls playsinline width="300" style="display: inline-block;" src="./file={video}"></video>'
227
+
228
+ one_turn_chat = [textbox_in, None]
229
+
230
+ # 1. first run case
231
+ if len(chatbot) == 0:
232
+ one_turn_chat[0] += "\n" + show_images
233
+ # 2. not first run case
234
+ else:
235
+ # scanning the last image or video
236
+ length = len(chatbot)
237
+ for i in range(length - 1, -1, -1):
238
+ previous_image = re.findall(r'<img src="./file=(.+?)"', chatbot[i][0])
239
+ previous_video = re.findall(r'<video controls playsinline width="300" style="display: inline-block;" src="./file=(.+?)"', chatbot[i][0])
240
+
241
+ if len(previous_image) > 0:
242
+ previous_image = previous_image[-1]
243
+ # 2.1 new image append or pure text input will start a new conversation
244
+ if (video is not None) or (image is not None and os.path.basename(previous_image) != os.path.basename(image)):
245
+ message.clear()
246
+ one_turn_chat[0] += "\n" + show_images
247
+ break
248
+ elif len(previous_video) > 0:
249
+ previous_video = previous_video[-1]
250
+ # 2.2 new video append or pure text input will start a new conversation
251
+ if image is not None or (video is not None and os.path.basename(previous_video) != os.path.basename(video)):
252
+ message.clear()
253
+ one_turn_chat[0] += "\n" + show_images
254
+ break
255
+
256
+ message.append({'role': 'user', 'content': textbox_in})
257
+ text_en_out = handler.generate(data, message, temperature=temperature, top_p=top_p, max_output_tokens=max_output_tokens)
258
+ message.append({'role': 'assistant', 'content': text_en_out})
259
+
260
+ one_turn_chat[1] = text_en_out
261
+ chatbot.append(one_turn_chat)
262
+
263
+ return gr.update(value=image, interactive=True), gr.update(value=video, interactive=True), message, chatbot, None
264
+
265
+
266
+ def regenerate(message, chatbot):
267
+ message.pop(-1), message.pop(-1)
268
+ chatbot.pop(-1)
269
+ return message, chatbot
270
+
271
+
272
+ def clear_history(message, chatbot):
273
+ message.clear(), chatbot.clear()
274
+ return (gr.update(value=None, interactive=True),
275
+ gr.update(value=None, interactive=True),
276
+ message, chatbot,
277
+ gr.update(value=None, interactive=True))
278
+
279
+ handler = Chat()
280
+
281
+ textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
282
+
283
+ theme = gr.themes.Default(primary_hue=plum_color)
284
+ # theme.update_color("primary", plum_color.c500)
285
+ theme.set(slider_color="#9C276A")
286
+ theme.set(block_title_text_color="#9C276A")
287
+ theme.set(block_label_text_color="#9C276A")
288
+ theme.set(button_primary_text_color="#9C276A")
289
+
290
+ with gr.Blocks(title='Apollo-3B', theme=theme, css=block_css) as demo:
291
+ gr.Markdown(title_markdown)
292
+ message = gr.State([])
293
+
294
+ with gr.Row():
295
+ with gr.Column(scale=3):
296
+ image = gr.State(None)
297
+ video = gr.Video(label="Input Video")
298
+
299
+ with gr.Accordion("Parameters", open=True) as parameter_row:
300
+
301
+ temperature = gr.Slider(
302
+ minimum=0.1,
303
+ maximum=1.0,
304
+ value=0.4,
305
+ step=0.1,
306
+ interactive=True,
307
+ label="Temperature",
308
+ )
309
+
310
+ top_p = gr.Slider(
311
+ minimum=0.0,
312
+ maximum=1.0,
313
+ value=0.7,
314
+ step=0.1,
315
+ interactive=True,
316
+ label="Top P",
317
+ )
318
+
319
+ max_output_tokens = gr.Slider(
320
+ minimum=32,
321
+ maximum=1024,
322
+ value=256,
323
+ step=32,
324
+ interactive=True,
325
+ label="Max output tokens",
326
+ )
327
+
328
+ with gr.Column(scale=7):
329
+ chatbot = gr.Chatbot(label="Apollo", bubble_full_width=True, height=420)
330
+ with gr.Row():
331
+ with gr.Column(scale=8):
332
+ textbox.render()
333
+ with gr.Column(scale=1, min_width=50):
334
+ submit_btn = gr.Button(value="Send", variant="primary", interactive=True)
335
+ with gr.Row(elem_id="buttons") as button_row:
336
+ upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
337
+ downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
338
+ regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
339
+ clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)
340
+
341
+ with gr.Row():
342
+ with gr.Column():
343
+ gr.Examples(
344
+ examples=[
345
+ [
346
+ f"{destination_path}/../../example.mp4",
347
+ "What is this shit?",
348
+ ],
349
+ ],
350
+ inputs=[video, textbox],
351
+ )
352
+
353
+ submit_btn.click(
354
+ generate,
355
+ [image, video, message, chatbot, textbox, temperature, top_p, max_output_tokens],
356
+ [image, video, message, chatbot, textbox])
357
+
358
+ textbox.submit(
359
+ generate,
360
+ [
361
+ image,
362
+ video,
363
+ message,
364
+ chatbot,
365
+ textbox,
366
+ temperature,
367
+ top_p,
368
+ max_output_tokens,
369
+ ],
370
+ [image, video, message, chatbot, textbox],
371
+ )
372
+
373
+ regenerate_btn.click(
374
+ regenerate,
375
+ [message, chatbot],
376
+ [message, chatbot]).then(
377
+ generate,
378
+ [image, video, message, chatbot, textbox, temperature, top_p, max_output_tokens],
379
+ [image, video, message, chatbot, textbox])
380
+
381
+ clear_btn.click(
382
+ clear_history,
383
+ [message, chatbot],
384
+ [image, video, message, chatbot, textbox])
385
+
386
+ demo.launch()
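For reference, a minimal, untested sketch of how the Chat handler above could be driven without the Gradio UI, assuming the same app.py module context, a GPU, and a local clip named example.mp4:

    # hypothetical standalone driver; mirrors the flow of the generate() callback above
    handler = Chat()
    video_tensor, replace_string = handler.mm_processor.load_video("example.mp4")
    history = [{'role': 'user', 'content': "Describe what happens in this video."}]
    reply = handler.generate([(video_tensor, replace_string, 'video')], history,
                             temperature=0.4, top_p=0.7, max_output_tokens=256)
    print(reply)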
example.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a9bb7c506333b1939a755e3c995083fa7b021de250618713ff3ccf6ae20bc69
3
+ size 102956399
requirements.txt ADDED
@@ -0,0 +1,29 @@
1
+ torch==2.2.0
2
+ ezcolorlog
3
+ numpy
4
+ torchvision
5
+ transformers==4.44.0
6
+ tokenizers==0.19.1
7
+ sentencepiece==0.1.99
8
+ shortuuid
9
+ accelerate==0.33.0
10
+ pydantic<2,>=1
11
+ markdown2
12
+ scikit-learn==1.2.2
13
+ gradio==3.35.2
14
+ gradio_client==0.2.9
15
+ requests
16
+ httpx==0.24.0
17
+ uvicorn
18
+ fastapi
19
+ einops==0.6.1
20
+ einops-exts==0.0.4
21
+ timm==0.9.16
22
+ decord
23
+ ninja
24
+ protobuf
25
+ iopath
26
+ num2words
27
+ opencv-python
28
+ s2wrapper@git+https://github.com/bfshi/scaling_on_scales
29
+ easydict
utils/constants.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
17
+
18
+
19
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
20
+ WORKER_HEART_BEAT_INTERVAL = 15
21
+
22
+ LOGDIR = "."
23
+
24
+
25
+ # Model Constants
26
+ IGNORE_INDEX = -100
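+ # X_TOKEN_INDEX is the placeholder id spliced into input_ids wherever an image/video token appears (see tokenizer_mm_token in utils/mm_utils.py).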
27
+ X_TOKEN_INDEX = -200
28
+ X_TOKEN = {'image': "<|image_token|>", 'video': "<|video_token|>"}
29
+ X_PATCH_TOKEN = {'image': "<|image_patch|>", 'video': "<|video_patch|>"}
30
+ X_START_TOKEN = {'image': "<|image_start|>", 'video': "<|video_start|>"}
31
+ X_END_TOKEN = {'image': "<|image_end|>", 'video': "<|video_end|>"}
utils/conversation.py ADDED
@@ -0,0 +1,544 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
17
+
18
+
19
+ import dataclasses
20
+ from enum import auto, Enum
21
+ from typing import List, Tuple
22
+
23
+
24
+ class SeparatorStyle(Enum):
25
+ """Different separator style."""
26
+ SINGLE = auto()
27
+ TWO = auto()
28
+ MPT = auto()
29
+ PLAIN = auto()
30
+ LLAMA_2 = auto()
31
+ LLAMA_3 = auto()
32
+ MISTRAL = auto()
33
+ CHATML = auto()
34
+ QWEN = auto()
35
+ QWEN_2 = auto()
36
+ GEMMA = auto()
37
+
38
+
39
+ @dataclasses.dataclass
40
+ class Conversation:
41
+ """A class that keeps all conversation history."""
42
+ system: str
43
+ roles: List[str]
44
+ messages: List[List[str]]
45
+ offset: int
46
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
47
+ sep: str = "###"
48
+ sep2: str = None
49
+ version: str = "Unknown"
50
+
51
+ skip_next: bool = False
52
+
53
+ def get_prompt(self):
54
+ messages = self.messages
55
+ if len(messages) > 0 and type(messages[0][1]) is tuple:
56
+ messages = self.messages.copy()
57
+ init_role, init_msg = messages[0].copy()
58
+ init_msg = init_msg[0].replace("<image>", "").strip()
59
+ if 'mmtag' in self.version:
60
+ messages[0] = (init_role, init_msg)
61
+ messages.insert(0, (self.roles[0], "<Image><image></Image>"))
62
+ messages.insert(1, (self.roles[1], "Received."))
63
+ else:
64
+ messages[0] = (init_role, "<image>\n" + init_msg)
65
+
66
+ if self.sep_style == SeparatorStyle.SINGLE:
67
+ ret = self.system + self.sep
68
+ for role, message in messages:
69
+ if message:
70
+ if type(message) is tuple:
71
+ message, _, _ = message
72
+ ret += role + ": " + message + self.sep
73
+ else:
74
+ ret += role + ":"
75
+ elif self.sep_style == SeparatorStyle.TWO:
76
+ seps = [self.sep, self.sep2]
77
+ ret = self.system + seps[0]
78
+ for i, (role, message) in enumerate(messages):
79
+ if message:
80
+ if type(message) is tuple:
81
+ message, _, _ = message
82
+ ret += role + ": " + message + seps[i % 2]
83
+ else:
84
+ ret += role + ":"
85
+ elif self.sep_style == SeparatorStyle.QWEN_2:
86
+ seps = [self.sep, self.sep2]
87
+ ret = self.system + seps[0]
88
+ for i, (role, message) in enumerate(messages):
89
+ if message:
90
+ if type(message) is tuple:
91
+ message, _, _ = message
92
+ ret += role + ": " + message + seps[i % 2]
93
+ else:
94
+ ret += role + ":"
95
+ elif self.sep_style == SeparatorStyle.CHATML:
96
+ ret = "" if self.system == "" else self.system + self.sep + "\n"
97
+ for role, message in messages:
98
+ if message:
99
+ if type(message) is tuple:
100
+ #TODO! NEED to add MM support!
101
+ message, images = message
102
+ message = "<image>" * len(images) + message
103
+ ret += role + "\n" + message + self.sep + "\n"
104
+ else:
105
+ ret += role + "\n"
106
+ return ret
107
+ elif self.sep_style == SeparatorStyle.LLAMA_3:
108
+ ret = self.system + self.sep
109
+ for role, message in messages:
110
+ if message:
111
+ if type(message) is tuple:
112
+ message = message[0]
113
+ ret += role + message + self.sep
114
+ else:
115
+ ret += role
116
+ elif self.sep_style == SeparatorStyle.MPT:
117
+ ret = self.system + self.sep
118
+ for role, message in messages:
119
+ if message:
120
+ if type(message) is tuple:
121
+ message, _, _ = message
122
+ ret += role + message + self.sep
123
+ else:
124
+ ret += role
125
+ elif self.sep_style == SeparatorStyle.GEMMA:
126
+ ret = ""
127
+ for i, (role, message) in enumerate(messages):
128
+ assert role == self.roles[i % 2], "Conversation should alternate user/assistant/user/assistant/..."
129
+ if message:
130
+ if type(message) is tuple:
131
+ message, _, _ = message
132
+ ret += role + message + self.sep
133
+ else:
134
+ ret += role
135
+ elif self.sep_style == SeparatorStyle.LLAMA_2 or self.sep_style == SeparatorStyle.MISTRAL:
136
+ if self.sep_style == SeparatorStyle.LLAMA_2:
137
+ wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
138
+ else:
139
+ wrap_sys = lambda msg: f"{msg}" + ("\n" if msg else "")
140
+ wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
141
+ ret = ""
142
+ if self.sep_style == SeparatorStyle.MISTRAL:
143
+ ret += "<s>"
144
+
145
+ for i, (role, message) in enumerate(messages):
146
+ if i == 0:
147
+ assert message, "first message should not be none"
148
+ assert role == self.roles[0], "first message should come from user"
149
+ if message:
150
+ if type(message) is tuple:
151
+ message, _, _ = message
152
+ if i == 0: message = wrap_sys(self.system) + message
153
+ if i % 2 == 0:
154
+ message = wrap_inst(message)
155
+ ret += self.sep + message
156
+ else:
157
+ if self.sep_style == SeparatorStyle.LLAMA_2:
158
+ ret += " " + message + " " + self.sep2
159
+ else:
160
+ ret += message + self.sep2
161
+ else:
162
+ ret += ""
163
+ ret = ret.lstrip(self.sep)
164
+ elif self.sep_style == SeparatorStyle.PLAIN:
165
+ seps = [self.sep, self.sep2]
166
+ ret = self.system
167
+ for i, (role, message) in enumerate(messages):
168
+ if message:
169
+ if type(message) is tuple:
170
+ message, _, _ = message
171
+ ret += message + seps[i % 2]
172
+ else:
173
+ ret += ""
174
+ else:
175
+ raise ValueError(f"Invalid style: {self.sep_style}")
176
+
177
+ return ret
178
+
179
+ def append_message(self, role, message):
180
+ self.messages.append([role, message])
181
+
182
+ def get_images(self, return_pil=False):
183
+ images = []
184
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
185
+ if i % 2 == 0:
186
+ if type(msg) is tuple:
187
+ import base64
188
+ from io import BytesIO
189
+ from PIL import Image
190
+ msg, image, image_process_mode = msg
191
+ if image_process_mode == "Pad":
192
+ def expand2square(pil_img, background_color=(122, 116, 104)):
193
+ width, height = pil_img.size
194
+ if width == height:
195
+ return pil_img
196
+ elif width > height:
197
+ result = Image.new(pil_img.mode, (width, width), background_color)
198
+ result.paste(pil_img, (0, (width - height) // 2))
199
+ return result
200
+ else:
201
+ result = Image.new(pil_img.mode, (height, height), background_color)
202
+ result.paste(pil_img, ((height - width) // 2, 0))
203
+ return result
204
+ image = expand2square(image)
205
+ elif image_process_mode in ["Default", "Crop"]:
206
+ pass
207
+ elif image_process_mode == "Resize":
208
+ image = image.resize((336, 336))
209
+ else:
210
+ raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
211
+ max_hw, min_hw = max(image.size), min(image.size)
212
+ aspect_ratio = max_hw / min_hw
213
+ max_len, min_len = 800, 400
214
+ shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
215
+ longest_edge = int(shortest_edge * aspect_ratio)
216
+ W, H = image.size
217
+ if longest_edge != max(image.size):
218
+ if H > W:
219
+ H, W = longest_edge, shortest_edge
220
+ else:
221
+ H, W = shortest_edge, longest_edge
222
+ image = image.resize((W, H))
223
+ if return_pil:
224
+ images.append(image)
225
+ else:
226
+ buffered = BytesIO()
227
+ image.save(buffered, format="PNG")
228
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
229
+ images.append(img_b64_str)
230
+ return images
231
+
232
+ def to_gradio_chatbot(self):
233
+ ret = []
234
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
235
+ if i % 2 == 0:
236
+ if type(msg) is tuple:
237
+ import base64
238
+ from io import BytesIO
239
+ msg, image, image_process_mode = msg
240
+ max_hw, min_hw = max(image.size), min(image.size)
241
+ aspect_ratio = max_hw / min_hw
242
+ max_len, min_len = 800, 400
243
+ shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
244
+ longest_edge = int(shortest_edge * aspect_ratio)
245
+ W, H = image.size
246
+ if H > W:
247
+ H, W = longest_edge, shortest_edge
248
+ else:
249
+ H, W = shortest_edge, longest_edge
250
+ image = image.resize((W, H))
251
+ buffered = BytesIO()
252
+ image.save(buffered, format="JPEG")
253
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
254
+ img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
255
+ msg = img_str + msg.replace('<image>', '').strip()
256
+ ret.append([msg, None])
257
+ else:
258
+ ret.append([msg, None])
259
+ else:
260
+ ret[-1][-1] = msg
261
+ return ret
262
+
263
+ def copy(self):
264
+ return Conversation(
265
+ system=self.system,
266
+ roles=self.roles,
267
+ messages=[[x, y] for x, y in self.messages],
268
+ offset=self.offset,
269
+ sep_style=self.sep_style,
270
+ sep=self.sep,
271
+ sep2=self.sep2,
272
+ version=self.version)
273
+
274
+ def dict(self):
275
+ if len(self.get_images()) > 0:
276
+ return {
277
+ "system": self.system,
278
+ "roles": self.roles,
279
+ "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
280
+ "offset": self.offset,
281
+ "sep": self.sep,
282
+ "sep2": self.sep2,
283
+ }
284
+ return {
285
+ "system": self.system,
286
+ "roles": self.roles,
287
+ "messages": self.messages,
288
+ "offset": self.offset,
289
+ "sep": self.sep,
290
+ "sep2": self.sep2,
291
+ }
292
+
293
+
294
+ conv_vicuna_v0 = Conversation(
295
+ system="A chat between a curious human and an artificial intelligence assistant. "
296
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
297
+ roles=("Human", "Assistant"),
298
+ messages=(
299
+ ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
300
+ ("Assistant",
301
+ "Renewable energy sources are those that can be replenished naturally in a relatively "
302
+ "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
303
+ "Non-renewable energy sources, on the other hand, are finite and will eventually be "
304
+ "depleted, such as coal, oil, and natural gas. Here are some key differences between "
305
+ "renewable and non-renewable energy sources:\n"
306
+ "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
307
+ "energy sources are finite and will eventually run out.\n"
308
+ "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
309
+ "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
310
+ "and other negative effects.\n"
311
+ "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
312
+ "have lower operational costs than non-renewable sources.\n"
313
+ "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
314
+ "locations than non-renewable sources.\n"
315
+ "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
316
+ "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
317
+ "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
318
+ "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
319
+ ),
320
+ offset=2,
321
+ sep_style=SeparatorStyle.SINGLE,
322
+ sep="###",
323
+ )
324
+
325
+ conv_vicuna_v1 = Conversation(
326
+ system="A chat between a curious user and an artificial intelligence assistant. "
327
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
328
+ roles=("USER", "ASSISTANT"),
329
+ version="v1",
330
+ messages=(),
331
+ offset=0,
332
+ sep_style=SeparatorStyle.TWO,
333
+ sep=" ",
334
+ sep2="</s>",
335
+ )
336
+
337
+ # kentang-mit@: This conversation template is designed for SFT on VFLAN.
338
+ conv_vicuna_v1_nosys = Conversation(
339
+ system="",
340
+ roles=("USER", "ASSISTANT"),
341
+ version="v1_nosys",
342
+ messages=(),
343
+ offset=0,
344
+ sep_style=SeparatorStyle.TWO,
345
+ sep=" ",
346
+ sep2="</s>",
347
+ )
348
+
349
+ conv_llama_2 = Conversation(
350
+ system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
351
+
352
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
353
+ roles=("USER", "ASSISTANT"),
354
+ version="llama_v2",
355
+ messages=(),
356
+ offset=0,
357
+ sep_style=SeparatorStyle.LLAMA_2,
358
+ sep="<s>",
359
+ sep2="</s>",
360
+ )
361
+
362
+ conv_mistral = Conversation(
363
+ system="",
364
+ roles=("USER", "ASSISTANT"),
365
+ version="mistral",
366
+ messages=(),
367
+ offset=0,
368
+ sep_style=SeparatorStyle.MISTRAL,
369
+ sep="",
370
+ sep2="</s>",
371
+ )
372
+
373
+ conv_llava_llama_2 = Conversation(
374
+ system="You are a helpful language and vision assistant. "
375
+ "You are able to understand the visual content that the user provides, "
376
+ "and assist the user with a variety of tasks using natural language.",
377
+ roles=("USER", "ASSISTANT"),
378
+ version="llama_v2",
379
+ messages=(),
380
+ offset=0,
381
+ sep_style=SeparatorStyle.LLAMA_2,
382
+ sep="<s>",
383
+ sep2="</s>",
384
+ )
385
+
386
+ conv_mpt = Conversation(
387
+ system="""<|im_start|>system
388
+ A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
389
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
390
+ version="mpt",
391
+ messages=(),
392
+ offset=0,
393
+ sep_style=SeparatorStyle.MPT,
394
+ sep="<|im_end|>",
395
+ )
396
+
397
+ conv_plain = Conversation(
398
+ system="",
399
+ version="plain",
400
+ roles=("", ""),
401
+ messages=[],
402
+ offset=0,
403
+ sep_style=SeparatorStyle.PLAIN,
404
+ sep="\n",
405
+ )
406
+
407
+ conv_llava_v0 = Conversation(
408
+ system="A chat between a curious human and an artificial intelligence assistant. "
409
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
410
+ roles=("Human", "Assistant"),
411
+ messages=(
412
+ ),
413
+ offset=0,
414
+ sep_style=SeparatorStyle.SINGLE,
415
+ sep="###",
416
+ )
417
+
418
+ conv_llava_v0_mmtag = Conversation(
419
+ system="A chat between a curious user and an artificial intelligence assistant. "
420
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
421
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
422
+ roles=("Human", "Assistant"),
423
+ messages=(
424
+ ),
425
+ offset=0,
426
+ sep_style=SeparatorStyle.SINGLE,
427
+ sep="###",
428
+ version="v0_mmtag",
429
+ )
430
+
431
+ conv_llava_v1 = Conversation(
432
+ system="A chat between a curious human and an artificial intelligence assistant. "
433
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
434
+ roles=("USER", "ASSISTANT"),
435
+ version="v1",
436
+ messages=(),
437
+ offset=0,
438
+ sep_style=SeparatorStyle.TWO,
439
+ sep=" ",
440
+ sep2="</s>",
441
+ )
442
+
443
+
444
+
445
+ conv_llava_v1_mmtag = Conversation(
446
+ system="A chat between a curious user and an artificial intelligence assistant. "
447
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
448
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
449
+ roles=("USER", "ASSISTANT"),
450
+ messages=(),
451
+ offset=0,
452
+ sep_style=SeparatorStyle.TWO,
453
+ sep=" ",
454
+ sep2="</s>",
455
+ version="v1_mmtag",
456
+ )
457
+
458
+ hermes_2 = Conversation(
459
+ system='<|im_start|>system\nAnswer the questions.',
460
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
461
+ sep_style=SeparatorStyle.MPT,
462
+ sep='<|im_end|>',
463
+ messages=(
464
+ ),
465
+ offset=0,
466
+ version="hermes-2"
467
+ )
468
+
469
+
470
+ # Template added by Yukang. Note (kentang-mit@): sep is <|eot_id|> for official template.
471
+ llama_3_chat = Conversation(
472
+ system="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. "
473
+ "You are able to understand the visual content that the user provides, "
474
+ "and assist the user with a variety of tasks using natural language.",
475
+ roles=("<|start_header_id|>user<|end_header_id|>\n\n",
476
+ "<|start_header_id|>system<|end_header_id|>\n\n"),
477
+ version="llama_v3",
478
+ messages=(),
479
+ offset=0,
480
+ sep_style=SeparatorStyle.LLAMA_3,
481
+ sep="<|end_of_text|>",
482
+ )
483
+
484
+
485
+ conv_qwen = Conversation(
486
+ system="""<|im_start|>system\n\nYou are a helpful vision-language assistant.\n\n"You are able to understand the visual content that the user provides. You are to respond to the user's question while being nice, detailed, and informative.""",
487
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
488
+ version="qwen",
489
+ messages=[],
490
+ offset=0,
491
+ sep_style=SeparatorStyle.CHATML,
492
+ sep="<|im_end|>",
493
+ )
494
+
495
+ conv_qwen_2 = Conversation(
496
+ system="A chat between a curious user and an artificial intelligence assistant. "
497
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
498
+ roles=("USER", "ASSISTANT"),
499
+ version="qwen_2",
500
+ messages=(),
501
+ offset=0,
502
+ sep_style=SeparatorStyle.QWEN_2,
503
+ sep=" ",
504
+ sep2="<|endoftext|>",
505
+ )
506
+
507
+ conv_gemma_instruct = Conversation(
508
+ system="",
509
+ roles=("<start_of_turn>user\n", "<start_of_turn>model\n"),
510
+ version="gemma",
511
+ messages=[],
512
+ offset=0,
513
+ sep_style=SeparatorStyle.GEMMA,
514
+ sep="<end_of_turn>\n"
515
+ )
516
+
517
+
518
+ default_conversation = conv_plain
519
+ conv_templates = {
520
+ "default": conv_plain,
521
+ "hermes-2": hermes_2,
522
+ "v0": conv_vicuna_v0,
523
+ "v1": conv_vicuna_v1,
524
+ "vicuna_v1": conv_vicuna_v1,
525
+ "vicuna_v1_nosys": conv_vicuna_v1_nosys,
526
+ "llama_2": conv_llama_2,
527
+ "mistral": conv_mistral,
528
+ "plain": conv_plain,
529
+ "llava_v0": conv_llava_v0,
530
+ "v0_mmtag": conv_llava_v0_mmtag,
531
+ "llava_v1": conv_llava_v1,
532
+ "v1_mmtag": conv_llava_v1_mmtag,
533
+ "llava_llama_2": conv_llava_llama_2,
534
+ "mpt": conv_mpt,
535
+
536
+ "llama_3": llama_3_chat,
537
+ "qwen_1_5": conv_qwen,
538
+ "qwen_2": conv_qwen_2,
539
+ "gemma_instruct": conv_gemma_instruct,
540
+ }
541
+
542
+
543
+ if __name__ == "__main__":
544
+ print(default_conversation.get_prompt())
utils/mm_utils.py ADDED
@@ -0,0 +1,467 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ from PIL import Image
18
+ from io import BytesIO
19
+ import base64
20
+ import numpy as np
21
+ import os, math, cv2, re
22
+
23
+ import torch
24
+ from transformers import StoppingCriteria
25
+ from utils.constants import *
26
+
27
+ import tempfile
28
+ from io import BytesIO
29
+ from decord import VideoReader, cpu
30
+
31
+ from num2words import num2words
32
+ from datetime import timedelta
33
+ import datetime
34
+
35
+
36
+ def read_video_cv2(video_path, all_indices):
37
+ vidcap = cv2.VideoCapture(video_path)
38
+ frames_dict = {}
39
+ max_index = max(all_indices) # Find the maximum index to avoid unnecessary reading
40
+ count = 0
41
+ success = True
42
+ while success and count <= max_index:
43
+ success, frame = vidcap.read()
44
+ if success and count in all_indices:
45
+ img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
46
+ im_pil = Image.fromarray(img)
47
+ frames_dict[count] = im_pil
48
+ count += 1
49
+ # Now retrieve frames according to all_indices, allowing duplicates
50
+ images = [frames_dict[idx] for idx in all_indices if idx in frames_dict]
51
+ return np.stack([np.array(img) for img in images])
52
+
53
+ def read_video_decord(video_file, all_indices):
54
+ vr = VideoReader(video_file, num_threads=1, ctx=cpu(0))
55
+ return vr.get_batch(all_indices).asnumpy()
56
+
57
+
58
+ def read_video_decord_eval(video_file, all_indices):
59
+ vr = VideoReader(video_file)
60
+ return vr.get_batch(all_indices).asnumpy()
61
+
62
+ def load_frames_from_video(video_file, all_indices, video_decode_backend="decord", eval_=False):
63
+ video_ending = os.path.splitext(video_file)[1]
64
+ if video_ending in ['.gif', '.webm'] or video_decode_backend=="opencv":
65
+ buffer = read_video_cv2(video_file, all_indices)
66
+ else:
67
+ # Use decord for other video formats
68
+ if eval_:
69
+ buffer = read_video_decord_eval(video_file, all_indices)
70
+ else:
71
+ buffer = read_video_decord(video_file, all_indices)
72
+ return buffer # (T, H, W, C)
73
+
74
+ def pad_to_center_square(frames, mean_values):
75
+ """
76
+ Pad the given frame or frames numpy array to square dimensions using the mean values as the padding color.
77
+ Handles both single frames (H, W, C) and batches of frames (N, H, W, C).
78
+
79
+ Args:
80
+ frames (np.array): The input frame array of shape (H, W, C) or (N, H, W, C).
81
+ mean_values (tuple): Mean values for each channel, typically derived from dataset normalization parameters.
82
+
83
+ Returns:
84
+ np.array: The padded frame array with square dimensions.
85
+ """
86
+ if frames.ndim == 3: # Single frame
87
+ frames = frames[np.newaxis, :] # Add a batch dimension
88
+ elif frames.ndim != 4:
89
+ raise ValueError("Input array must be either of shape (H, W, C) or (N, H, W, C)")
90
+
91
+ N, height, width, channels = frames.shape
92
+ size = max(width, height)
93
+ background_color = np.array(mean_values, dtype=frames.dtype)
94
+
95
+ # Create a background array with the size and fill it with the mean values
96
+ padded_frames = np.full((N, size, size, channels), background_color, dtype=frames.dtype)
97
+
98
+ # Calculate padding offsets
99
+ top, left = (size - height) // 2, (size - width) // 2
100
+
101
+ # Place the original frames in the center of the square canvas
102
+ padded_frames[:, top:top + height, left:left + width, :] = frames
103
+ return padded_frames
104
+
105
+
106
+ def expand2square(pil_img, background_color):
107
+ width, height = pil_img.size
108
+ if width == height:
109
+ return pil_img
110
+ elif width > height:
111
+ result = Image.new(pil_img.mode, (width, width), background_color)
112
+ result.paste(pil_img, (0, (width - height) // 2))
113
+ # result.paste(pil_img, (0, 0))
114
+ return result
115
+ else:
116
+ result = Image.new(pil_img.mode, (height, height), background_color)
117
+ result.paste(pil_img, ((height - width) // 2, 0))
118
+ # result.paste(pil_img, (0, 0))
119
+ return result
120
+
121
+
122
+ def calculate_sample_indices(clip_duration, frames_per_clip, total_frames, original_fps, video_duration, clip_sampling_ratio=1):
123
+ sample_video_fps = frames_per_clip / clip_duration
124
+ num_clips = math.ceil((video_duration / clip_duration) * clip_sampling_ratio)
125
+ frame_step = original_fps / sample_video_fps
126
+ partition_len = total_frames // num_clips
127
+ all_indices, clip_indices, timestamps = [], [], []
128
+ if frame_step > 0.5:
129
+ frame_step = max(1, int(original_fps / sample_video_fps)) #was int/floor
130
+ clip_len = int(frames_per_clip * frame_step) #was int/floor
131
+ sample_len = min(clip_len, total_frames)
132
+ clip_step = (total_frames - clip_len) // max(1, (num_clips - 1)) if total_frames > clip_len else 0
133
+ for i in range(num_clips):
134
+ if partition_len > clip_len:
135
+ start_idx = (partition_len - clip_len) // 2
136
+ end_idx = start_idx + clip_len
137
+ indices = np.arange(start_idx, end_idx, frame_step)
138
+ indices = np.clip(indices, 0, partition_len-1).astype(np.int64)
139
+ indices = indices+ i * partition_len
140
+
141
+ else:
142
+
143
+ indices = np.arange(0, sample_len, frame_step)
144
+ if len(indices) < frames_per_clip:
145
+ padding = np.full(frames_per_clip - len(indices), sample_len)
146
+ indices = np.concatenate((indices, padding))
147
+
148
+ indices = np.clip(indices, 0, sample_len-1).astype(np.int64)
149
+ indices = indices + i * clip_step
150
+
151
+ clip_indices.append(indices)
152
+ all_indices.extend(list(indices))
153
+
154
+ # Calculate timestamps
155
+ start_time = (indices[0] / original_fps)
156
+ end_time = (indices[-1] / original_fps)
157
+ timestamps.append((start_time, end_time))
158
+
159
+ else:
160
+ ## original video FPS too low, we need to sample the same frame multiple times.
161
+ ## Generally should not happen.
162
+ # Calculate the number of times each frame should be sampled
163
+ num_sample = int(np.ceil(1 / frame_step))
164
+
165
+ # Compute the effective clip length considering the frame step
166
+ clip_len = int(frames_per_clip * frame_step)
167
+
168
+ # Create an expanded list of indices with each frame repeated num_sample times
169
+ indices = np.repeat(np.arange(clip_len), num_sample)
170
+
171
+ # Ensure the clip length does not exceed the total number of frames
172
+ clip_len = min(clip_len, len(indices))
173
+ clip_step = (total_frames - clip_len) // max(1, (num_clips - 1)) if total_frames > clip_len else 0
174
+
175
+ sample_len = min(clip_len, total_frames)
176
+ if len(indices) < frames_per_clip:
177
+ padding = np.full(frames_per_clip - len(indices), sample_len)
178
+ indices = np.concatenate((indices, padding))
179
+
180
+ # Distribute the indices into clips
181
+ for i in range(num_clips):
182
+ current_clip_indices = np.clip(indices, 0, sample_len-1).astype(np.int64)
183
+ current_clip_indices = current_clip_indices + i * clip_step
184
+
185
+ # Append the current clip indices to the list of all clips
186
+ clip_indices.append(current_clip_indices)
187
+ all_indices.extend(current_clip_indices)
188
+
189
+ # Calculate timestamps
190
+ start_time = (current_clip_indices[0] / original_fps)
191
+ end_time = (current_clip_indices[-1] / original_fps)
192
+ timestamps.append((start_time, end_time))
193
+
194
+ return clip_indices, all_indices, timestamps
195
+
196
+ def calculate_sample_indices_uniform(frames_per_clip, total_frames, uniform_frame_count, original_fps):
197
+
198
+ # Generate indices
199
+ if total_frames >= N:
200
+ # Sample N frames uniformly without replacement
201
+ indices = np.linspace(0, total_frames - 1, N, dtype=int)
202
+ else:
203
+ # Not enough frames; repeat frames to reach N frames
204
+ repeats = math.ceil(N / total_frames)
205
+ base_indices = np.arange(total_frames)
206
+ indices = np.tile(base_indices, repeats)[:N]
207
+
208
+ # Split indices into clips
209
+ clip_indices = [
210
+ indices[i * frames_per_clip: (i + 1) * frames_per_clip]
211
+ for i in range(num_clips)
212
+ ]
213
+
214
+ # Calculate timestamps for each clip
215
+ timestamps = []
216
+ for clip in clip_indices:
217
+ start_time = clip[0] / original_fps
218
+ end_time = clip[-1] / original_fps
219
+ timestamps.append((start_time, end_time))
220
+
221
+ all_indices = indices.tolist()
222
+ return clip_indices, all_indices, timestamps
223
+
224
+
225
+ def get_video_details(fname):
226
+ """ Load video content using Decord """
227
+ assert os.path.exists(fname), f'video path not found {fname}'
228
+ _fsize = os.path.getsize(fname)
229
+ assert _fsize >= 1 * 1024, f"video too short {fname}"
230
+ vr = VideoReader(fname, num_threads=-1, ctx=cpu(0))
231
+ # Get the total number of frames and the original fps of the video
232
+ total_frames = len(vr)
233
+ original_fps = vr.get_avg_fps()
234
+ video_duration = total_frames / original_fps
235
+ return total_frames, original_fps, video_duration
236
+
237
+
238
+ def get_video_details_cv2(fname):
239
+ """
240
+ Load video content using OpenCV (cv2) and retrieve video details.
241
+
242
+ Args:
243
+ fname (str): Path to the video file.
244
+
245
+ Returns:
246
+ tuple: A tuple containing:
247
+ - total_frames (int): Total number of frames in the video.
248
+ - original_fps (float): Frames per second of the video.
249
+ - video_duration (float): Duration of the video in seconds.
250
+
251
+ Raises:
252
+ AssertionError: If the file does not exist or is too short.
253
+ ValueError: If the video cannot be opened or FPS is zero.
254
+ """
255
+ # Check if the file exists
256
+ assert os.path.exists(fname), f'Video path not found: {fname}'
257
+
258
+ # Check if the file size is at least 1 KB
259
+ _fsize = os.path.getsize(fname)
260
+ assert _fsize >= 1 * 1024, f"Video too short: {fname}"
261
+
262
+ # Open the video file
263
+ cap = cv2.VideoCapture(fname)
264
+ if not cap.isOpened():
265
+ raise ValueError(f"Failed to open video file: {fname}")
266
+
267
+ # Retrieve the total number of frames
268
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
269
+
270
+ # Retrieve the frames per second (FPS)
271
+ original_fps = cap.get(cv2.CAP_PROP_FPS)
272
+ if original_fps == 0:
273
+ cap.release()
274
+ raise ValueError(f"Failed to get FPS for video file: {fname}")
275
+
276
+ # Calculate the video duration in seconds
277
+ video_duration = total_frames / original_fps
278
+
279
+ # Release the video capture object
280
+ cap.release()
281
+
282
+ return total_frames, original_fps, video_duration
283
+
284
+
285
+
286
+ def split_into_clips(video, frames_per_clip):
287
+ """ Split video into a list of clips """
288
+ fpc = frames_per_clip
289
+ nc = len(video) // frames_per_clip
290
+ return [video[i*fpc:(i+1)*fpc] for i in range(nc)]
291
+
292
+ def process_image(vision_processors, frames_per_clip, image):
293
+ mm_data = []
294
+ for vision_processor in vision_processors:
295
+ tmp = expand2square(image, tuple(int(x * 255) for x in vision_processor.image_mean))
296
+ tmp = np.expand_dims(np.asarray(tmp), 0)
297
+ tmp = vision_processor.preprocess(tmp, return_tensors='pt')['pixel_values'][0].unsqueeze(0)
298
+ if len(tmp.shape)==4:
299
+ ## image, need B, T, C, W, H
300
+ tmp = tmp.unsqueeze(1)
301
+ tmp = tmp.repeat_interleave(frames_per_clip, dim=1)
302
+ else:
303
+ ## video, need B, C, T, W, H
304
+ if tmp.shape[1]==1:
305
+ tmp = tmp.repeat_interleave(frames_per_clip, dim=1)
306
+ else:
307
+ tmp = tmp.repeat_interleave(frames_per_clip, dim=2)
308
+
309
+ mm_data.append(tmp)
310
+ return mm_data
311
+
312
+ def process_video(vision_processors, frames_per_clip, buffer):
313
+ mm_data=[]
314
+ for vision_processor in vision_processors:
315
+ centered_buffer = pad_to_center_square(buffer, tuple(int(x * 255) for x in vision_processor.image_mean))
316
+ processed_clips = []
317
+ for clip in split_into_clips(centered_buffer, frames_per_clip):
318
+ clip = vision_processor.preprocess(clip, return_tensors='pt')['pixel_values']
319
+ if type(clip) is list:
320
+ assert len(clip)==1, "LazyVideoDataset: error, vision processor returned clip that is list of len>1 ."
321
+ clip = clip[0]
322
+ processed_clips.append(clip)
323
+ mm_data.append(torch.stack(processed_clips))
324
+ return mm_data
325
+
326
+ def load_video(video_file, vision_processors, clip_duration, frames_per_clip, clip_sampling_ratio=1, video_decode_backend='decord', eval_=False):
327
+ total_frames, original_fps, video_duration = get_video_details(video_file)
328
+ _, all_indices, timestamps = calculate_sample_indices(clip_duration, frames_per_clip, total_frames, original_fps, video_duration, clip_sampling_ratio=clip_sampling_ratio)
329
+ buffer = load_frames_from_video(video_file, all_indices, video_decode_backend, eval_)
330
+ mm_data = process_video(vision_processors, frames_per_clip, buffer)
331
+ return mm_data, timestamps
332
+
333
+
334
+ class ApolloMMLoader:
335
+ def __init__(self, vision_processors, clip_duration, frames_per_clip, num_repeat_token, device, model_max_length = 32768, clip_sampling_ratio=1, video_decode_backend="decord"):
336
+ self.vision_processors=vision_processors
337
+ self.clip_duration=clip_duration
338
+ self.device=device
339
+ self.frames_per_clip=frames_per_clip
340
+ self.num_repeat_token = num_repeat_token
341
+ self.clip_sampling_ratio=clip_sampling_ratio
342
+ self.model_max_length=model_max_length
343
+ self.video_decode_backend=video_decode_backend
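+ # Template for the textual preamble that tells the model how many clips follow and the total video length.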
344
+ self.vidprompt = lambda num_clips, video_duration : f"You are provided the following series of {num2words(num_clips)}, {self.clip_duration} second clips from a {datetime.timedelta(seconds=video_duration)} [H:MM:SS] video.\n"
345
+
346
+ def load_video(self, video_file):
347
+ total_frames, original_fps, video_duration = get_video_details(video_file)
348
+ clip_sampling_ratio = min(1, (self.model_max_length * self.clip_sampling_ratio) / (video_duration * self.num_repeat_token / self.clip_duration))
349
+
350
+ _, all_indices, timestamps = calculate_sample_indices(self.clip_duration, self.frames_per_clip, total_frames, original_fps, video_duration, clip_sampling_ratio=clip_sampling_ratio)
351
+ video, timestamps = load_video(video_file, self.vision_processors, self.clip_duration, self.frames_per_clip, clip_sampling_ratio=clip_sampling_ratio, eval_=True)
352
+
353
+ num_clips = len(video[0])
354
+ num_tokens = num_clips * self.num_repeat_token
355
+ video = [v.to(device=self.device, dtype=torch.bfloat16) for v in video]
356
+ replace_string = self.vidprompt(num_clips, video_duration)
357
+
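+ # Each clip contributes a "start-end seconds:" label followed by num_repeat_token video placeholder tokens.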
358
+ temporal_prompt = [f"{round(clip[0], 1)}-{round(clip[1], 1)} seconds: {X_TOKEN['video'] * self.num_repeat_token}" for clip in timestamps]
359
+ temporal_prompt = ',\n'.join(temporal_prompt)
360
+ replace_string = replace_string + temporal_prompt
361
+
362
+ return video, replace_string
363
+
364
+ def load_image(self, image_file):
365
+ print('implement image loading')
366
+ return None
367
+
368
+
369
+ def expand2square(pil_img, background_color):
370
+ """
371
+ Expand the given PIL image to a square shape by adding padding.
372
+
373
+ Parameters:
374
+ - pil_img: The PIL image to be expanded.
375
+ - background_color: The color of the padding to be added.
376
+
377
+ Returns:
378
+ - The expanded PIL image.
379
+
380
+ If the image is already square, it is returned as is.
381
+ If the image is wider than it is tall, padding is added to the top and bottom.
382
+ If the image is taller than it is wide, padding is added to the left and right.
383
+ """
384
+ width, height = pil_img.size
385
+ if pil_img.mode == 'L':
386
+ background_color = background_color[0]
387
+ if width == height:
388
+ return pil_img
389
+ elif width > height:
390
+ result = Image.new(pil_img.mode, (width, width), background_color)
391
+ result.paste(pil_img, (0, (width - height) // 2))
392
+ return result
393
+ else:
394
+ result = Image.new(pil_img.mode, (height, height), background_color)
395
+ result.paste(pil_img, ((height - width) // 2, 0))
396
+ return result
397
+
398
+
399
+
400
+ def tokenizer_mm_token(prompt, tokenizer, return_tensors=None):
401
+ tokens_regex = re.compile('|'.join(re.escape(token) for token in X_TOKEN.values()))
402
+ input_ids, last_pos, start_id = [], 0, 0
403
+ for match in tokens_regex.finditer(prompt):
404
+ if match.start() > last_pos:
405
+ input_ids.extend(tokenizer(prompt[last_pos:match.start()]).input_ids)
406
+ elif match.start() == 0:
407
+ input_ids = tokenizer('').input_ids
408
+ start_id = 1
409
+ input_ids.append(X_TOKEN_INDEX)
410
+ last_pos = match.end()
411
+ if last_pos < len(prompt):
412
+ input_ids.extend(tokenizer(prompt[last_pos:]).input_ids[start_id:])
413
+ return torch.tensor(input_ids, dtype=torch.long) if return_tensors == 'pt' else input_ids
414
+
415
+
416
+ def get_model_name_from_path(model_path):
417
+ model_path = model_path.strip("/")
418
+ model_paths = model_path.split("/")
419
+ if model_paths[-1].startswith("checkpoint-"):
420
+ return model_paths[-2] + "_" + model_paths[-1]
421
+ else:
422
+ return model_paths[-1]
423
+
424
+
425
+ class KeywordsStoppingCriteria(StoppingCriteria):
426
+ def __init__(self, keywords, tokenizer, input_ids):
427
+ self.keywords = keywords
428
+ self.keyword_ids = []
429
+ self.max_keyword_len = 0
430
+ for keyword in keywords:
431
+ cur_keyword_ids = tokenizer(keyword).input_ids
432
+ if (
433
+ len(cur_keyword_ids) > 1
434
+ and cur_keyword_ids[0] == tokenizer.bos_token_id
435
+ ):
436
+ cur_keyword_ids = cur_keyword_ids[1:]
437
+ if len(cur_keyword_ids) > self.max_keyword_len:
438
+ self.max_keyword_len = len(cur_keyword_ids)
439
+ self.keyword_ids.append(torch.tensor(cur_keyword_ids))
440
+ self.tokenizer = tokenizer
441
+ self.start_len = input_ids.shape[1]
442
+
443
+ def call_for_batch(
444
+ self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
445
+ ) -> bool:
446
+ offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
447
+ self.keyword_ids = [
448
+ keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids
449
+ ]
450
+ for keyword_id in self.keyword_ids:
451
+ if (output_ids[0, -keyword_id.shape[0] :] == keyword_id).all():
452
+ return True
453
+ outputs = self.tokenizer.batch_decode(
454
+ output_ids[:, -offset:], skip_special_tokens=True
455
+ )[0]
456
+ for keyword in self.keywords:
457
+ if keyword in outputs:
458
+ return True
459
+ return False
460
+
461
+ def __call__(
462
+ self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
463
+ ) -> bool:
464
+ outputs = []
465
+ for i in range(output_ids.shape[0]):
466
+ outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
467
+ return all(outputs)