import ast
import os
import shutil
import tempfile

import gradio as gr

from Infer import Infer
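
# Gradio demo for T-MoENet (Temporal-guided Mixture-of-Experts) zero-shot video
# question answering. `Infer` (from this repo) is assumed to load the model and
# expose generate(question, candidates, video_path) -> answer string.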
title_markdown = ("""
# Temporal-guided Mixture-of-Experts for Zero-Shot Video Question Answering

Under review.
""")
block_css = """
#buttons button {
    min-width: min(120px, 100%);
}
"""
def save_video_to_local(video_path):
    # Copy an uploaded video into ./temp under a fresh random name.
    # tempfile.mkstemp is the public API; the original
    # tempfile._get_candidate_names() is private and may change between Python versions.
    fd, filename = tempfile.mkstemp(suffix=".mp4", dir="temp")
    os.close(fd)
    shutil.copyfile(video_path, filename)
    return filename
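
# Note: Gradio's Video component already hands `generate` a local file path,
# so this helper is only needed if the upload must be copied elsewhere.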
def generate(video, textbox_in, candbox_in):
    # The inference handler expects the literal string "none" when no video is supplied.
    video = video if video else "none"
    # The candidates box holds a Python-style list literal, e.g.
    # "['slice the pizza', 'cut the meat']". ast.literal_eval parses it safely,
    # whereas eval() would execute arbitrary user input.
    candidates = ast.literal_eval(candbox_in)
    textbox_out = handler.generate(textbox_in, candidates, video)
    print(textbox_out)
    return textbox_out
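
# Example invocation (assuming the first sample from the Examples below):
#   generate(
#       "videos/3249402410.mp4",
#       "What did the lady in black on the left do after she finished spreading the sauce on her pizza?",
#       "['slice the pizza', 'cut the meat', 'point', 'put cheese', 'put on plate']",
#   )
# returns the predicted option as a string.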
device = "cpu"
handler = Infer(device)
# handler.model.to(dtype=dtype)
if not os.path.exists("temp"):
os.makedirs("temp")
#print(torch.cuda.memory_allocated())
#print(torch.cuda.max_memory_allocated())
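
# Input components are created once so the same instances can be wired into
# both gr.Interface and gr.Examples inside the Blocks layout below.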
video = gr.Video(label="Input Video")
question_box = gr.Textbox(
    show_label=False, placeholder="Enter question", container=False
)
candidates_box = gr.Textbox(
    show_label=False, placeholder="Enter a list of options", container=False
)
with gr.Blocks(title='T-MoENet', theme=gr.themes.Default(), css=block_css) as demo:
    gr.Markdown(title_markdown)
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    with gr.Column():
        with gr.Column(scale=3):
            gr.Interface(
                fn=generate,
                inputs=[video, question_box, candidates_box],
                outputs=[gr.Textbox(label="Answer")],
            )
        with gr.Column(scale=3):
            gr.Examples(
                examples=[
                    [
                        os.path.join(cur_dir, "videos", "3249402410.mp4"),
                        "What did the lady in black on the left do after she finished spreading the sauce on her pizza?",
                        "['slice the pizza', 'cut the meat', 'point', 'put cheese', 'put on plate']",
                    ],
                    [
                        os.path.join(cur_dir, "videos", "4882821564.mp4"),
                        "Why did the boy clap his hands when he ran to the christmas tree?",
                        "['adjust the tree', 'get away the dust', 'dancing', 'pressed a button to activate', 'presents']",
                    ],
                    [
                        os.path.join(cur_dir, "videos", "6233408665.mp4"),
                        "What did the people on the sofa do after the lady in pink finished singing?",
                        "['sitting', 'give it to the girl', 'take music sheet', 'clap', 'walk in circles']",
                    ],
                ],
                inputs=[video, question_box, candidates_box],
            )
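# share=True exposes the app through a temporary public *.gradio.live URL
# in addition to the local server.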
demo.launch(share=True)