# Copyright (2024) Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import base64
import os

import torch

from models.modeling_tarsier import TarsierForConditionalGeneration, LlavaConfig
from dataset.processor import Processor

HF_TOKEN = os.environ.get('HF_TOKEN', '')


class Color:
    """Wrap strings in ANSI escape codes for colored terminal output."""

    @staticmethod
    def red(x):
        return '\033[31m' + x + '\033[0m'

    @staticmethod
    def green(x):
        return '\033[32m' + x + '\033[0m'

    @staticmethod
    def yellow(x):
        return '\033[33m' + x + '\033[0m'

    @staticmethod
    def blue(x):
        return '\033[34m' + x + '\033[0m'

    @staticmethod
    def violet(x):
        return '\033[35m' + x + '\033[0m'


def file_to_base64(img_path):
    """Read a file from disk and return its contents as a base64-encoded string."""
    with open(img_path, 'rb') as video_file:
        video_b64_str = base64.b64encode(video_file.read()).decode()
    return video_b64_str


def load_model_and_processor(model_name_or_path, max_n_frames=8):
    """Load a Tarsier checkpoint and its frame processor.

    The processor samples at most `max_n_frames` frames per video; the model is
    loaded in float16 and sharded across available devices via `device_map='auto'`.
    """
    print(Color.red(f"Load model and processor from: {model_name_or_path}; with max_n_frames={max_n_frames}"), flush=True)
    processor = Processor(
        model_name_or_path,
        max_n_frames=max_n_frames,
    )
    model_config = LlavaConfig.from_pretrained(
        model_name_or_path,
        trust_remote_code=True,
        token=HF_TOKEN,
    )
    model = TarsierForConditionalGeneration.from_pretrained(
        model_name_or_path,
        config=model_config,
        device_map='auto',
        torch_dtype=torch.float16,
        trust_remote_code=True,
        token=HF_TOKEN,
    )
    model.eval()
    return model, processor
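

# Minimal usage sketch for load_model_and_processor. The checkpoint id
# 'omni-research/Tarsier-7b' is an assumption for illustration; substitute a
# local path or another Hugging Face model id. Feeding video frames to the
# returned `model`/`processor` depends on Processor's interface defined
# elsewhere in this repo, so only the loading step is shown here.
if __name__ == '__main__':
    # Hypothetical checkpoint id; replace with your own model path.
    model, processor = load_model_and_processor('omni-research/Tarsier-7b', max_n_frames=8)
    print(Color.green("Model and processor loaded."))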