import os
import subprocess

# Install runtime dependencies before importing them; pass the parent
# environment through so pip stays on PATH, and skip the CUDA build of
# flash-attn (a prebuilt wheel is used when available)
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
subprocess.run("pip install deep_translator", shell=True)

import copy

import gradio as gr
import numpy as np
import torch
from decord import VideoReader, cpu
from deep_translator import GoogleTranslator
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates
from llava.mm_utils import tokenizer_image_token
from llava.model.builder import load_pretrained_model

# Create translator objects: TR→EN for user questions, EN→TR for model answers
translator = GoogleTranslator(source='tr', target='en')
translator_reverse = GoogleTranslator(source='en', target='tr')
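# For instance (hypothetical strings), translator.translate("Videoda neler oluyor?")
# is expected to return roughly "What is happening in the video?", and
# translator_reverse maps the model's English answer back to Turkish.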

title = "# 🙋🏻‍♂️🌟Tonic'in 🌋📹LLaVA-Video'suna Hoş Geldiniz!"
description1 = """**🌋📹LLaVA-Video-7B-Qwen2**, ...
"""
description2 = """
...
"""

join_us = """
## Bize Katılın:
...
"""

def load_video(video_path, max_frames_num, fps=1, force_sample=False):
    """Sample frames from a video and return (frames, frame-time string, duration in seconds)."""
    if max_frames_num == 0:
        # Keep the return shape consistent with the normal path
        return np.zeros((1, 336, 336, 3)), "", 0

    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    total_frame_num = len(vr)
    # Step between sampled frames for the requested sampling rate (at least 1)
    step = max(1, round(vr.get_avg_fps() / fps))
    frame_idx = list(range(0, total_frame_num, step))
    frame_time = [i / vr.get_avg_fps() for i in frame_idx]

    # Fall back to uniform sampling when the clip would yield too many frames
    if len(frame_idx) > max_frames_num or force_sample:
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
        frame_idx = uniform_sampled_frames.tolist()
        frame_time = [i / vr.get_avg_fps() for i in frame_idx]

    frame_time = ",".join([f"{t:.2f}s" for t in frame_time])
    spare_frames = vr.get_batch(frame_idx).asnumpy()

    return spare_frames, frame_time, total_frame_num / vr.get_avg_fps()
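
# Illustrative usage (hypothetical path): sampling at most 64 frames uniformly —
# frames, frame_time, video_time = load_video("clip.mp4", 64, fps=1, force_sample=True)
# frames.shape -> (64, H, W, 3); frame_time -> "0.00s,0.52s,..."; video_time is in seconds.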

# Load the model
pretrained = "lmms-lab/LLaVA-Video-7B-Qwen2"
model_name = "llava_qwen"
device = "cuda" if torch.cuda.is_available() else "cpu"
device_map = "auto"

print("Model yükleniyor...")
tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map)
model.eval()
print("Model başarıyla yüklendi!")

def process_video(video_path, question):
    try:
        max_frames_num = 64
        video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
        video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].to(device).bfloat16()
        video = [video]

        conv_template = "qwen_1_5"
        time_instruction = f"Video {video_time:.2f} saniye sürmektedir ve {len(video[0])} kare uniform olarak örneklenmiştir. Bu kareler {frame_time} konumlarında bulunmaktadır. Lütfen bu videoyla ilgili aşağıdaki soruları cevaplayın."
        
        # Translate the question to English
        question_en = translator.translate(question)
        full_question = DEFAULT_IMAGE_TOKEN + f"{time_instruction}\n{question_en}"
        
        conv = copy.deepcopy(conv_templates[conv_template])
        conv.append_message(conv.roles[0], full_question)
        conv.append_message(conv.roles[1], None)
        prompt_question = conv.get_prompt()
        
        input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").to(device)

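        # Greedy decoding: with do_sample=False the temperature value is
        # effectively ignored, so generations are deterministic.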
        with torch.no_grad():
            output = model.generate(
                input_ids,
                images=video,
                modalities=["video"],
                do_sample=False,
                temperature=0,
                max_new_tokens=4096,
            )
        
        response = tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()
        
        # Translate the answer back to Turkish
        response_tr = translator_reverse.translate(response)
        return response_tr
    except Exception as e:
        return f"Bir hata oluştu: {str(e)}"

def gradio_interface(video_file, question):
    if video_file is None:
        return "Lütfen bir video dosyası yükleyin."
    response = process_video(video_file, question)
    return response

with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Row():
        with gr.Group():
            gr.Markdown(description1)
        with gr.Group():
            gr.Markdown(description2)
    with gr.Accordion("Bize Katılın", open=False):
        gr.Markdown(join_us)
    with gr.Row():
        with gr.Column():
            video_input = gr.Video()
            question_input = gr.Textbox(label="🙋🏻‍♂️Kullanıcı Sorusu", placeholder="Video hakkında bir soru sorun...")
            submit_button = gr.Button("🌋📹LLaVA-Video'ya Sor")
        output = gr.Textbox(label="🌋📹LLaVA-Video")
    
    submit_button.click(
        fn=gradio_interface,
        inputs=[video_input, question_input],
        outputs=output
    )

if __name__ == "__main__":
    demo.launch(show_error=True)