Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,705 Bytes
fef0a8d 99eb93c fef0a8d 0c034e2 45099c6 fef0a8d 5268082 45099c6 5268082 cfd4c18 fef0a8d 96f2f76 fef0a8d 45099c6 86e9851 ef4d60d 45099c6 86e9851 45099c6 5268082 e7322bf 269bc1d baefccb e7322bf 029aec2 e7322bf 32f0fe9 45099c6 32f0fe9 5268082 1b6a68d 45099c6 5268082 8a86647 32f0fe9 8a86647 32f0fe9 5268082 201c325 5268082 32f0fe9 5268082 eb3d2f3 5268082 eb3d2f3 5268082 e7322bf 5268082 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# Imports
import gradio as gr
import spaces
import torch
from PIL import Image
from decord import VideoReader, cpu
from transformers import AutoModel, AutoTokenizer
# Pre-Initialize
DEVICE = "auto"
if DEVICE == "auto":
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[SYSTEM] | Using {DEVICE} type compute device.")
# Variables
DEFAULT_INPUT = "Describe in one paragraph."
MAX_FRAMES = 64
repo_name = "openbmb/MiniCPM-V-2_6-int4" # "openbmb/MiniCPM-V-2_6"
repo = AutoModel.from_pretrained(repo_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
css = '''
.gradio-container{max-width: 560px !important}
h1{text-align:center}
footer {
visibility: hidden
}
'''
# Functions
def encode_video(video_path):
def uniform_sample(l, n):
gap = len(l) / n
idxs = [int(i * gap + gap / 2) for i in range(n)]
return [l[i] for i in idxs]
vr = VideoReader(video_path, ctx=cpu(0))
sample_fps = round(vr.get_avg_fps() / 1)
frame_idx = [i for i in range(0, len(vr), sample_fps)]
if len(frame_idx) > MAX_FRAMES:
frame_idx = uniform_sample(frame_idx, MAX_FRAMES)
frames = vr.get_batch(frame_idx).asnumpy()
frames = [Image.fromarray(v.astype('uint8')) for v in frames]
return frames
@spaces.GPU(duration=60)
def generate(image, video, instruction=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
# repo.to(DEVICE)
print(image)
print(video)
print(instruction)
if not video:
image_data = Image.fromarray(image.astype('uint8'), 'RGB')
inputs = [{"role": "user", "content": [image_data, instruction]}]
else:
video_data = encode_video(video)
inputs = [{"role": "user", "content": video_data + [instruction]}]
parameters = {
"sampling": sampling,
"temperature": temperature,
"top_p": top_p,
"top_k": top_k,
"repetition_penalty": repetition_penalty,
"max_new_tokens": max_tokens,
"use_image_id": False,
"max_slice_nums": 2,
}
output = repo.chat(image=None, msgs=inputs, tokenizer=tokenizer, **parameters)
print(output)
return output
def cloud():
print("[CLOUD] | Space maintained.")
# Initialize
with gr.Blocks(css=css) as main:
with gr.Column():
gr.Markdown("🪄 Analyze images and caption them using state-of-the-art openbmb/MiniCPM-V-2_6.")
with gr.Column():
input = gr.Image(label="Image")
input_2 = gr.Video(label="Video")
instruction = gr.Textbox(lines=1, value=DEFAULT_INPUT, label="Instruction")
sampling = gr.Checkbox(value=False, label="Sampling")
temperature = gr.Slider(minimum=0.01, maximum=1.99, step=0.01, value=0.7, label="Temperature")
top_p = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.8, label="Top P")
top_k = gr.Slider(minimum=0, maximum=1000, step=1, value=100, label="Top K")
repetition_penalty = gr.Slider(minimum=0.01, maximum=1.99, step=0.01, value=1.05, label="Repetition Penalty")
max_tokens = gr.Slider(minimum=1, maximum=4096, step=1, value=512, label="Max Tokens")
submit = gr.Button("▶")
maintain = gr.Button("☁️")
with gr.Column():
output = gr.Textbox(lines=1, value="", label="Output")
submit.click(fn=generate, inputs=[input, input_2, instruction, sampling, temperature, top_p, top_k, repetition_penalty, max_tokens], outputs=[output], queue=False)
maintain.click(cloud, inputs=[], outputs=[], queue=False)
main.launch(show_api=True) |