import subprocess

# Install the required libraries before importing them:
# flash-attn (built without the CUDA isolation step) and deep_translator for TR<->EN translation
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
subprocess.run("pip install deep_translator", shell=True)

import gradio as gr
from deep_translator import GoogleTranslator
import torch
from llava.model.builder import load_pretrained_model
from llava.mm_utils import tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates
from decord import VideoReader, cpu
import numpy as np
import copy

# Translator objects: Turkish -> English for the question, English -> Turkish for the answer
translator = GoogleTranslator(source='tr', target='en')
translator_reverse = GoogleTranslator(source='en', target='tr')
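# Illustrative only (not executed at startup): GoogleTranslator.translate takes a plain
# string, e.g. translator.translate("Bu videoda neler oluyor?") should return roughly
# "What is happening in this video?", depending on the Google Translate backend.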
title = "# 🙋🏻♂️🌟Tonic'in 🌋📹LLaVA-Video'suna Hoş Geldiniz!"
description1 = """**🌋📹LLaVA-Video-7B-Qwen2**, ...
"""
description2 = """
...
"""
join_us = """
## Bize Katılın:
...
"""
def load_video(video_path, max_frames_num, fps=1, force_sample=False):
    # Return a single dummy frame if no frames are requested
    if max_frames_num == 0:
        return np.zeros((1, 336, 336, 3))
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    total_frame_num = len(vr)
    # Step size (in frames) that corresponds to the requested sampling fps
    fps = round(vr.get_avg_fps() / fps)
    frame_idx = [i for i in range(0, len(vr), fps)]
    frame_time = [i / vr.get_avg_fps() for i in frame_idx]
    # If too many frames were selected (or uniform sampling is forced),
    # sample max_frames_num frames uniformly across the whole clip
    if len(frame_idx) > max_frames_num or force_sample:
        sample_fps = max_frames_num
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
        frame_idx = uniform_sampled_frames.tolist()
        frame_time = [i / vr.get_avg_fps() for i in frame_idx]
    frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
    spare_frames = vr.get_batch(frame_idx).asnumpy()
    return spare_frames, frame_time, total_frame_num / vr.get_avg_fps()
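# Illustrative call (not executed at startup), assuming a local file "sample.mp4":
#   frames, frame_time, video_time = load_video("sample.mp4", max_frames_num=64, force_sample=True)
# frames is a (64, H, W, 3) uint8 array, frame_time a string like "0.00s,1.00s,...",
# and video_time the clip duration in seconds.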
# Load the LLaVA-Video model
pretrained = "lmms-lab/LLaVA-Video-7B-Qwen2"
model_name = "llava_qwen"
device = "cuda" if torch.cuda.is_available() else "cpu"
device_map = "auto"

print("Model yükleniyor...")
tokenizer, model, image_processor, max_length = load_pretrained_model(
    pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map
)
model.eval()
print("Model başarıyla yüklendi!")
def process_video(video_path, question):
    try:
        max_frames_num = 64
        video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
        video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].to(device).bfloat16()
        video = [video]
        conv_template = "qwen_1_5"
        time_instruction = f"Video {video_time:.2f} saniye sürmektedir ve {len(video[0])} kare uniform olarak örneklenmiştir. Bu kareler {frame_time} konumlarında bulunmaktadır. Lütfen bu videoyla ilgili aşağıdaki soruları cevaplayın."

        # Translate the question into English before sending it to the model
        question_en = translator.translate(question)
        full_question = DEFAULT_IMAGE_TOKEN + f"{time_instruction}\n{question_en}"

        conv = copy.deepcopy(conv_templates[conv_template])
        conv.append_message(conv.roles[0], full_question)
        conv.append_message(conv.roles[1], None)
        prompt_question = conv.get_prompt()

        # tokenizer_image_token returns a 1-D tensor; add a batch dimension for generate()
        input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)

        with torch.no_grad():
            output = model.generate(
                input_ids,
                images=video,
                modalities=["video"],
                do_sample=False,
                temperature=0,
                max_new_tokens=4096,
            )
        response = tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()

        # Translate the answer back into Turkish
        response_tr = translator_reverse.translate(response)
        return response_tr
    except Exception as e:
        return f"Bir hata oluştu: {str(e)}"
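# Illustrative call (not executed at startup), assuming a local clip "sample.mp4"
# and a Turkish question:
#   print(process_video("sample.mp4", "Videoda neler oluyor?"))
# The question is translated to English, answered by LLaVA-Video, and the answer
# is returned in Turkish.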
def gradio_interface(video_file, question):
    if video_file is None:
        return "Lütfen bir video dosyası yükleyin."
    response = process_video(video_file, question)
    return response
with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Row():
        with gr.Group():
            gr.Markdown(description1)
        with gr.Group():
            gr.Markdown(description2)
    with gr.Accordion("Bize Katılın", open=False):
        gr.Markdown(join_us)
    with gr.Row():
        with gr.Column():
            video_input = gr.Video()
            question_input = gr.Textbox(label="🙋🏻♂️Kullanıcı Sorusu", placeholder="Video hakkında bir soru sorun...")
            submit_button = gr.Button("🌋📹LLaVA-Video'ya Sor")
        output = gr.Textbox(label="🌋📹LLaVA-Video")
    submit_button.click(
        fn=gradio_interface,
        inputs=[video_input, question_input],
        outputs=output
    )

if __name__ == "__main__":
    demo.launch(show_error=True)