daarsh committed · verified
Commit 4d426dd
1 Parent(s): 74e8d01

Create app.py

Files changed (1):
app.py (+68, -0)
app.py ADDED
@@ -0,0 +1,68 @@
from decord import VideoReader
import torch
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
import gradio as gr

# run on GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# load the pretrained frame processor, tokenizer, and captioning model
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = VisionEncoderDecoderModel.from_pretrained(
    "Neleac/timesformer-gpt2-video-captioning"
).to(device)
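# the checkpoint pairs a TimeSformer video encoder with a GPT-2 text decoder,
# so model.config.encoder.num_frames below is the encoder's fixed clip length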

with gr.Blocks(title="Semantic Summarization of Videos using DLSG") as demo:
    gr.Markdown("# Semantic Summarization of Videos using DLSG, Demo by Batch_B29")
    with gr.Row():
        with gr.Column(scale=2):
            video = gr.Video(label="Upload Video", format="mp4")
            generate = gr.Button(value="Generate Caption")
        with gr.Column(scale=1):
            text = gr.Textbox(label="Caption", placeholder="Caption will appear here")
            with gr.Accordion("Settings", open=True):
                with gr.Row():
                    max_length = gr.Slider(
                        label="Max Length", minimum=10, maximum=100, value=20, step=1
                    )
                    min_length = gr.Slider(
                        label="Min Length", minimum=1, maximum=10, value=10, step=1
                    )
                beam_size = gr.Slider(label="Beam size", minimum=1, maximum=8, value=8, step=1)
                # multiplies how densely frames are sampled from the video
                throughputs = gr.Radio(label="Throughputs", choices=[1, 2, 3], value=1)

    def generate_caption(video, max_length, min_length, beam_size, throughputs):
        # read the uploaded video and sample frames evenly across it
        container = VideoReader(video)
        clip_len = model.config.encoder.num_frames
        # stride chosen so roughly throughputs * clip_len frames are sampled;
        # max(1, ...) guards against clips with fewer frames than that
        stride = max(1, len(container) // (throughputs * clip_len))
        frames = container.get_batch(range(0, len(container), stride)).asnumpy()
        # drop the trailing frame, keeping roughly throughputs * clip_len frames
        frames = [frame for frame in frames[:-1]]

        # preprocess the sampled frames and generate the caption
        gen_kwargs = {
            "min_length": min_length,
            "max_length": max_length,
            "num_beams": beam_size,
        }
        pixel_values = image_processor(frames, return_tensors="pt").pixel_values.to(device)
        tokens = model.generate(pixel_values, **gen_kwargs)
        caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
        return caption

    generate.click(
        generate_caption,
        inputs=[video, max_length, min_length, beam_size, throughputs],
        outputs=text,
    )

if __name__ == "__main__":
    demo.launch()
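
For quick local testing, here is a minimal sketch of calling the caption function directly, outside the Gradio UI; the clip path "sample.mp4" is hypothetical, and the argument values mirror the slider defaults above:

# minimal sketch: exercise generate_caption without the UI
# (assumes a local clip at "sample.mp4", a hypothetical path)
caption = generate_caption(
    "sample.mp4",   # decord's VideoReader accepts a file path, as gr.Video supplies
    max_length=20,  # slider defaults from the Settings accordion
    min_length=10,
    beam_size=8,
    throughputs=1,  # 1 samples roughly one clip's worth of frames
)
print(caption)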