# HF Space: uform-gen2-dpo image captioning demo (status: Running)
import torch
import gradio as gr
from transformers import AutoModel
from transformers import AutoProcessor
import spaces

# Load the UForm-Gen2 vision-language model and its matching processor once at
# module import time.  trust_remote_code=True is required because the repo
# ships custom modeling code; NOTE(review): this executes remote code — only
# acceptable because the model source is pinned to a known publisher.
model3 = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
# Define a function for image captioning | |
def videochat(image3, prompt3):
    """Generate a text response for an image given a user prompt.

    Args:
        image3: Input image (PIL image or similar accepted by the processor).
        prompt3: Text prompt guiding the generation.

    Yields:
        str: The decoded model output with the end-of-message marker removed.
             Yielding (rather than returning) lets Gradio treat this as a
             streaming generator, even though only one value is produced.
    """
    # Pack prompt and image into model-ready tensors.
    inputs = processor(text=[prompt3], images=[image3], return_tensors="pt")

    # Greedy decoding (do_sample=False) for deterministic captions.
    with torch.inference_mode():
        output = model3.generate(
            **inputs,
            do_sample=False,
            use_cache=True,
            max_new_tokens=256,
            # 151645 is the id of the "<|im_end|>" token in this model's
            # (Qwen-style) vocabulary — presumably; TODO confirm against the
            # tokenizer config.
            eos_token_id=151645,
            pad_token_id=processor.tokenizer.pad_token_id,
        )

    # Strip the prompt tokens so only newly generated text is decoded.
    prompt_len = inputs["input_ids"].shape[1]
    decoded_text = processor.batch_decode(output[:, prompt_len:])[0]

    # Drop the chat end-of-message marker if the model emitted it.
    im_end = "<|im_end|>"
    if decoded_text.endswith(im_end):
        decoded_text = decoded_text[: -len(im_end)]
    yield decoded_text