Spaces: Running on Zero
# Third-party dependencies (extraction residue "| |" removed; grouped alphabetically).
import numpy as np
import supervision as sv
import torch
from PIL import Image
# Florence-2 task prompts: special task tokens the processor/model recognize.
CAPTIONING_TASK = "<DETAILED_CAPTION>"
CAPTION_TO_PHRASE_GROUNDING_TASK = "<CAPTION_TO_PHRASE_GROUNDING>"
def run_captioning(model, processor, image: np.ndarray, device: torch.device) -> str:
    """Generate a detailed caption for *image* with a Florence-2 model.

    Args:
        model: Florence-2 model exposing ``generate``.
        processor: Matching processor (handles text tokenization and image
            preprocessing, plus output post-processing).
        image: Input image as an ``np.ndarray`` (HxWxC, as accepted by
            ``PIL.Image.fromarray``).
        device: Device the model lives on; the prepared inputs are moved there.

    Returns:
        The output of ``processor.post_process_generation`` for the
        ``<DETAILED_CAPTION>`` task.
        NOTE(review): Florence-2 post-processing typically returns a dict keyed
        by task token rather than a plain ``str`` — confirm the annotation.
    """
    image = Image.fromarray(image).convert("RGB")
    # Use the shared task constant instead of re-hardcoding the literal,
    # keeping the prompt and the post-processing task in sync by construction.
    inputs = processor(text=CAPTIONING_TASK, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    # skip_special_tokens=False: post-processing needs the task/location tokens.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    return processor.post_process_generation(
        generated_text, task=CAPTIONING_TASK, image_size=image.size)
def run_caption_to_phrase_grounding(
    model,
    processor,
    caption: str,
    image: np.ndarray,
    device: torch.device
) -> sv.Detections:
    """Ground the phrases of *caption* in *image* with Florence-2.

    Runs the ``<CAPTION_TO_PHRASE_GROUNDING>`` task on the image/caption pair
    and converts the raw model response into ``supervision`` detections.

    Args:
        model: Florence-2 model exposing ``generate``.
        processor: Matching processor for input preparation and output parsing.
        caption: Caption whose phrases should be localized in the image.
        image: Input image as an ``np.ndarray`` accepted by
            ``PIL.Image.fromarray``.
        device: Device to move the prepared inputs to.

    Returns:
        ``sv.Detections`` parsed from the Florence-2 grounding response.
    """
    pil_image = Image.fromarray(image).convert("RGB")
    prompt = f"{CAPTION_TO_PHRASE_GROUNDING_TASK} {caption}"
    model_inputs = processor(text=prompt, images=pil_image, return_tensors="pt").to(device)
    output_ids = model.generate(
        input_ids=model_inputs["input_ids"],
        pixel_values=model_inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    # Keep special tokens: post-processing parses the task/location tokens.
    raw_text = processor.batch_decode(output_ids, skip_special_tokens=False)[0]
    response = processor.post_process_generation(
        raw_text, task=CAPTION_TO_PHRASE_GROUNDING_TASK, image_size=pil_image.size)
    return sv.Detections.from_lmm(sv.LMM.FLORENCE_2, response, resolution_wh=pil_image.size)