import ast
import os
import shutil
import tempfile

import gradio as gr

from Infer import Infer
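
# Gradio demo for T-MoENet (Temporal-guided Mixture-of-Experts) zero-shot video
# question answering. `Infer` (from this repo) is assumed to load the model and
# expose generate(question, candidates, video_path) -> answer string.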
title_markdown = ("""
# Temporal-guided Mixture-of-Experts for Zero-Shot Video Question Answering

Under review.
""")
block_css = """
#buttons button {
    min-width: min(120px, 100%);
}
"""
def save_video_to_local(video_path):
    # Copy an uploaded video into ./temp under a fresh random name.
    # tempfile.mkstemp is the public API; the original
    # tempfile._get_candidate_names() is private and may change between Python versions.
    fd, filename = tempfile.mkstemp(suffix=".mp4", dir="temp")
    os.close(fd)
    shutil.copyfile(video_path, filename)
    return filename
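
# Note: Gradio's Video component already hands `generate` a local file path,
# so this helper is only needed if the upload must be copied elsewhere.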
def generate(video, textbox_in, candbox_in):
    # The inference handler expects the literal string "none" when no video is supplied.
    video = video if video else "none"
    # The candidates box holds a Python-style list literal, e.g.
    # "['slice the pizza', 'cut the meat']". ast.literal_eval parses it safely,
    # whereas eval() would execute arbitrary user input.
    candidates = ast.literal_eval(candbox_in)
    textbox_out = handler.generate(textbox_in, candidates, video)
    print(textbox_out)
    return textbox_out
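
# Example invocation (assuming the first sample from the Examples below):
#   generate(
#       "videos/3249402410.mp4",
#       "What did the lady in black on the left do after she finished spreading the sauce on her pizza?",
#       "['slice the pizza', 'cut the meat', 'point', 'put cheese', 'put on plate']",
#   )
# returns the predicted option as a string.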
device = "cpu"
handler = Infer(device)
# handler.model.to(dtype=dtype)
if not os.path.exists("temp"):
os.makedirs("temp")
#print(torch.cuda.memory_allocated())
#print(torch.cuda.max_memory_allocated())
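
# Input components are created once so the same instances can be wired into
# both gr.Interface and gr.Examples inside the Blocks layout below.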
video = gr.Video(label="Input Video")
question_box = gr.Textbox(
    show_label=False, placeholder="Enter question", container=False
)
candidates_box = gr.Textbox(
    show_label=False, placeholder="Enter a list of options", container=False
)
with gr.Blocks(title='T-MoENet', theme=gr.themes.Default(), css=block_css) as demo:
    gr.Markdown(title_markdown)
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    with gr.Column():
        with gr.Column(scale=3):
            gr.Interface(
                fn=generate,
                inputs=[video, question_box, candidates_box],
                outputs=[gr.Textbox(label="Answer")],
            )
        with gr.Column(scale=3):
            gr.Examples(
                examples=[
                    [
                        os.path.join(cur_dir, "videos", "3249402410.mp4"),
                        "What did the lady in black on the left do after she finished spreading the sauce on her pizza?",
                        "['slice the pizza', 'cut the meat', 'point', 'put cheese', 'put on plate']",
                    ],
                    [
                        os.path.join(cur_dir, "videos", "4882821564.mp4"),
                        "Why did the boy clap his hands when he ran to the christmas tree?",
                        "['adjust the tree', 'get away the dust', 'dancing', 'pressed a button to activate', 'presents']",
                    ],
                    [
                        os.path.join(cur_dir, "videos", "6233408665.mp4"),
                        "What did the people on the sofa do after the lady in pink finished singing?",
                        "['sitting', 'give it to the girl', 'take music sheet', 'clap', 'walk in circles']",
                    ],
                ],
                inputs=[video, question_box, candidates_box],
            )
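# share=True exposes the app through a temporary public *.gradio.live URL
# in addition to the local server.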
demo.launch(share=True)