daarsh committed · verified
Commit 4d426dd
1 Parent(s): 74e8d01

Create app.py

Files changed (1):
app.py (+68, -0)
app.py ADDED
@@ -0,0 +1,68 @@
from decord import VideoReader
import torch
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
import gradio as gr

# run on GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# load the pretrained frame processor, tokenizer, and captioning model
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = VisionEncoderDecoderModel.from_pretrained(
    "Neleac/timesformer-gpt2-video-captioning"
).to(device)
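# the checkpoint pairs a TimeSformer video encoder with a GPT-2 text decoder,
# so model.config.encoder.num_frames below is the encoder's fixed clip length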

with gr.Blocks(title="Semantic Summarization of Videos using DLSG") as demo:
    gr.Markdown("# Semantic Summarization of Videos using DLSG, Demo by Batch_B29")
    with gr.Row():
        with gr.Column(scale=2):
            video = gr.Video(label="Upload Video", format="mp4")
            generate = gr.Button(value="Generate Caption")
        with gr.Column(scale=1):
            text = gr.Textbox(label="Caption", placeholder="Caption will appear here")
            with gr.Accordion("Settings", open=True):
                with gr.Row():
                    max_length = gr.Slider(
                        label="Max Length", minimum=10, maximum=100, value=20, step=1
                    )
                    min_length = gr.Slider(
                        label="Min Length", minimum=1, maximum=10, value=10, step=1
                    )
                beam_size = gr.Slider(label="Beam size", minimum=1, maximum=8, value=8, step=1)
                # multiplies how densely frames are sampled from the video
                throughputs = gr.Radio(label="Throughputs", choices=[1, 2, 3], value=1)

    def generate_caption(video, max_length, min_length, beam_size, throughputs):
        # read the uploaded video and sample frames evenly across it
        container = VideoReader(video)
        clip_len = model.config.encoder.num_frames
        # stride chosen so roughly throughputs * clip_len frames are sampled;
        # max(1, ...) guards against clips with fewer frames than that
        stride = max(1, len(container) // (throughputs * clip_len))
        frames = container.get_batch(range(0, len(container), stride)).asnumpy()
        # drop the trailing frame, keeping roughly throughputs * clip_len frames
        frames = [frame for frame in frames[:-1]]

        # preprocess the sampled frames and generate the caption
        gen_kwargs = {
            "min_length": min_length,
            "max_length": max_length,
            "num_beams": beam_size,
        }
        pixel_values = image_processor(frames, return_tensors="pt").pixel_values.to(device)
        tokens = model.generate(pixel_values, **gen_kwargs)
        caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
        return caption

    generate.click(
        generate_caption,
        inputs=[video, max_length, min_length, beam_size, throughputs],
        outputs=text,
    )

if __name__ == "__main__":
    demo.launch()
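
For quick local testing, here is a minimal sketch of calling the caption function directly, outside the Gradio UI; the clip path "sample.mp4" is hypothetical, and the argument values mirror the slider defaults above:

# minimal sketch: exercise generate_caption without the UI
# (assumes a local clip at "sample.mp4", a hypothetical path)
caption = generate_caption(
    "sample.mp4",   # decord's VideoReader accepts a file path, as gr.Video supplies
    max_length=20,  # slider defaults from the Settings accordion
    min_length=10,
    beam_size=8,
    throughputs=1,  # 1 samples roughly one clip's worth of frames
)
print(caption)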