|
import shutil |
|
import gradio as gr |
|
import torch |
|
from fastapi import FastAPI |
|
import os |
|
import tempfile |
|
from Infer import Infer |
|
|
|
# HTML header rendered through gr.Markdown at the top of the demo page:
# a centered title/status block followed by a GitHub badge link.
title_markdown = """
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
<div>
<h1 >Temporal-guided Mixture-of-Experts for Zero-Shot Video Question Answering</h1>
<h5 style="margin: 0;">Under review.</h5>
</div>
</div>

<div align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
<a href='https://github.com/qyx1121/T-MoENet'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
</div>
</div>
"""
|
|
|
# Extra CSS handed to gr.Blocks(css=...): gives buttons inside the element
# with id "buttons" a minimum width.
block_css = """
#buttons button {
    min-width: min(120px,100%);
}
"""
|
|
|
def save_video_to_local(video_path):
    """Copy a video into the local ``temp`` directory under a fresh name.

    Args:
        video_path: Path of the existing video file to copy.

    Returns:
        The relative path ``temp/<random>.mp4`` of the newly written copy.

    Raises:
        OSError: If the source cannot be read or the copy cannot be written.
    """
    # Local import: ``uuid`` is not among the file's top-level imports.
    import uuid

    # Create the target directory on demand so the function does not depend
    # on module-level setup having run in the current working directory.
    os.makedirs('temp', exist_ok=True)

    # uuid4 replaces the private tempfile._get_candidate_names() helper,
    # which is an undocumented implementation detail of the stdlib.
    filename = os.path.join('temp', uuid.uuid4().hex + '.mp4')
    shutil.copyfile(video_path, filename)
    return filename
|
|
|
|
|
def generate(video, textbox_in, candbox_in):
    """Answer a multiple-choice question about a video via the global ``handler``.

    Args:
        video: Value of the video input; any falsy value is replaced by the
            string ``"none"`` before it is handed to the handler.
        textbox_in: Question text, passed through unchanged.
        candbox_in: Text of a Python literal holding the answer options,
            e.g. ``"['clap', 'sitting']"``.

    Returns:
        Whatever ``handler.generate(textbox_in, candidates, video)`` returns.

    Raises:
        ValueError, SyntaxError: If ``candbox_in`` is not a valid Python
            literal.
    """
    # Local import: ``ast`` is not among the file's top-level imports.
    import ast

    video = video or "none"

    # SECURITY: this text comes straight from a web form, and the app is
    # launched with share=True, so it must never reach eval(). literal_eval
    # accepts the same list literals but cannot execute code.
    candidates = ast.literal_eval(candbox_in)

    return handler.generate(textbox_in, candidates, video)
|
|
|
|
|
# Inference backend shared by every request; it is constructed once at import
# time with the device string below.
device = "cpu"
handler = Infer(device)

# Make sure the scratch directory exists. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs("temp", exist_ok=True)
|
|
|
|
|
|
|
|
|
# Question input. It is created before the Blocks layout so that gr.Examples
# can list it as an input before it is placed on the page via .render().
question_box = gr.Textbox(
    show_label=False, placeholder="Enter question", container=False
)
|
|
|
# Answer-options input; its text is parsed as a Python list literal by
# generate(). Created up front for the same reason as the question box:
# gr.Examples references it before it is placed on the page via .render().
candidates_box = gr.Textbox(
    show_label=False, placeholder="Enter a list of options", container=False
)
|
|
|
# Page layout: video input and canned examples on the left, model output and
# the question / options / send controls on the right.
with gr.Blocks(title='T-MoENet', theme=gr.themes.Default(), css=block_css) as demo:
    gr.Markdown(title_markdown)

    # NOTE(review): these State objects are not wired to any event handler
    # visible in this file; kept so the module-level names remain available.
    state = gr.State()
    state_ = gr.State()
    first_run = gr.State()
    images_tensor = gr.State()

    with gr.Row():
        with gr.Column(scale=3):
            video = gr.Video(label="Input Video")

            # Example clips are resolved relative to this script's directory.
            cur_dir = os.path.dirname(os.path.abspath(__file__))
            print(cur_dir)
            gr.Examples(
                examples=[
                    [
                        os.path.join(cur_dir, "videos", "3249402410.mp4"),
                        "What did the lady in black on the left do after she finished spreading the sauce on her pizza?",
                        "['slice the pizza', 'cut the meat', 'point', 'put cheese', 'put on plate']",
                    ],
                    [
                        os.path.join(cur_dir, "videos", "4882821564.mp4"),
                        "Why did the boy clap his hands when he ran to the christmas tree?",
                        "['adjust the tree', 'get away the dust', 'dancing', 'pressed a button to activate', 'presents']",
                    ],
                    [
                        os.path.join(cur_dir, "videos", "6233408665.mp4"),
                        "What did the people on the sofa do after the lady in pink finished singing?",
                        "['sitting', 'give it to the girl', 'take music sheet', 'clap', 'walk in circles']",
                    ],
                ],
                inputs=[video, question_box, candidates_box],
            )

        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="T-MoENet", bubble_full_width=True)
            with gr.Row():
                with gr.Column(scale=4):
                    question_box.render()
                with gr.Column(scale=4):
                    candidates_box.render()
                with gr.Column(scale=1, min_width=50):
                    submit_btn = gr.Button(
                        value="Send", variant="primary", interactive=True
                    )

    # Event wiring has to happen inside the Blocks context.
    submit_btn.click(generate, [video, question_box, candidates_box], [chatbot])
|
|
|
# Start the web server; share=True additionally requests a public Gradio
# share link, so the app is reachable from outside this machine.
demo.launch(share=True)
|
|