"""Gradio demo for T-MoENet zero-shot video question answering.

Presents a simple web UI: the user supplies a video, a question, and a
Python-list string of candidate answers; the model (wrapped by ``Infer``)
picks the answer.
"""
import ast
import os
import shutil
import tempfile

import gradio as gr
import torch  # kept for parity with the original file; used when CUDA paths are enabled
from Infer import Infer

title_markdown = ("""

Temporal-guided Mixture-of-Experts for Zero-Shot Video Question Answering

Under review.
""")

block_css = """
#buttons button {
    min-width: min(120px,100%);
}
"""


def save_video_to_local(video_path):
    """Copy *video_path* into ./temp under a unique name and return that path.

    Note: the original used ``tempfile._get_candidate_names()``, a private
    CPython API; ``mkstemp`` is the supported way to reserve a unique file.
    """
    fd, filename = tempfile.mkstemp(suffix=".mp4", dir="temp")
    os.close(fd)  # we only need the reserved name; copyfile reopens it
    shutil.copyfile(video_path, filename)
    return filename


def generate(video, textbox_in, candbox_in):
    """Answer *textbox_in* about *video*, choosing among *candbox_in* options.

    ``candbox_in`` is a string like "['a', 'b', ...]" typed into a public web
    form.  ``ast.literal_eval`` parses exactly such Python literals without
    executing code, unlike the original ``eval`` which allowed arbitrary code
    execution from untrusted input.
    """
    video = video if video else "none"
    candidates = ast.literal_eval(candbox_in)
    text_en_out = handler.generate(textbox_in, candidates, video)
    # torch.cuda.empty_cache()
    print(text_en_out)
    return text_en_out


device = "cpu"
handler = Infer(device)
# handler.model.to(dtype=dtype)

# Scratch directory for uploaded/copied videos.
os.makedirs("temp", exist_ok=True)

video = gr.Video(label="Input Video")
question_box = gr.Textbox(
    show_label=False, placeholder="Enter question", container=False)
candidates_box = gr.Textbox(
    show_label=False, placeholder="Enter a list of options", container=False
)

with gr.Blocks(title='T-MoENet', theme=gr.themes.Default(), css=block_css) as demo:
    gr.Markdown(title_markdown)
    cur_dir = os.path.dirname(os.path.abspath(__file__))

    with gr.Column():
        with gr.Column(scale=3):
            gr.Interface(
                generate,
                [video, question_box, candidates_box],
                ["text"]
            )
        with gr.Column(scale=3):
            gr.Examples(
                examples=[
                    [
                        cur_dir + "/videos/3249402410.mp4",
                        "What did the lady in black on the left do after she finished spreading the sauce on her pizza?",
                        "['slice the pizza', 'cut the meat', 'point', 'put cheese', 'put on plate']"
                    ],
                    [
                        cur_dir + "/videos/4882821564.mp4",
                        "Why did the boy clap his hands when he ran to the christmas tree?",
                        "['adjust the tree', 'get away the dust', 'dancing', 'pressed a button to activate', 'presents']"
                    ],
                    [
                        cur_dir + "/videos/6233408665.mp4",
                        "What did the people on the sofa do after the lady in pink finished singing?",
                        "['sitting', 'give it to the girl', 'take music sheet', 'clap', 'walk in circles']"
                    ],
                ],
                inputs=[video, question_box, candidates_box]
            )

demo.launch(share=True)