Spaces:

awacke1
/

GPT-4o-omni-text-audio-image-video

Running

File size: 5,074 Bytes

878ab12
 
 
 
 
 
 
23c8125
e400aa9
878ab12
 
e400aa9
e8848c0
 
e400aa9
 
878ab12
e8848c0
878ab12

import streamlit as st
from openai import OpenAI
import os
import base64
import cv2
from moviepy.editor import VideoFileClip

# Model used for every chat-completion request in this app.
# Models available to the GPT-4o project: gpt-4o-2024-05-13 (gpt-4o).
MODEL = "gpt-4o"

# The API key is read from the `gpt4okey` environment variable; the lookup
# yields None when that variable is not set.
API_KEY = os.environ.get("gpt4okey")

# TODO: switch to project-based access with limits, using the org id and key
# to identify the run pool.
client = OpenAI(api_key=API_KEY)


def process_text():
    """Render a text box and show the model's answer to what the user typed.

    Nothing is sent to the model while the text box is empty.
    """
    text_input = st.text_input("Enter your text:")
    if not text_input:
        return

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant. Help me with my math homework!",
    }
    user_message = {
        "role": "user",
        "content": f"Hello! Could you solve {text_input}?",
    }
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[system_message, user_message],
    )
    answer = completion.choices[0].message.content
    st.write("Assistant: " + answer)

def process_image(image_input):
    """Ask the model about an uploaded image and render its Markdown answer.

    Args:
        image_input: File-like object holding the image bytes (the result of
            ``st.file_uploader``), or a falsy value when nothing has been
            uploaded yet, in which case the function does nothing.
    """
    if not image_input:
        return

    # The uploader accepts jpg/jpeg/png, so the data URL must not claim PNG
    # unconditionally. Use the MIME type reported by the upload when it is
    # available and fall back to the previous hard-coded value otherwise.
    mime_type = getattr(image_input, "type", None) or "image/png"
    base64_image = base64.b64encode(image_input.read()).decode("utf-8")

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that responds in Markdown. Help me with my math homework!"},
            {"role": "user", "content": [
                {"type": "text", "text": "What's the area of the triangle?"},
                {"type": "image_url", "image_url": {
                    "url": f"data:{mime_type};base64,{base64_image}"}
                }
            ]}
        ],
        temperature=0.0,
    )
    st.markdown(response.choices[0].message.content)

def process_audio(audio_input):
    """Transcribe an uploaded audio file and render a Markdown summary of it.

    Args:
        audio_input: The uploaded audio file, or a falsy value when nothing
            has been uploaded yet, in which case the function does nothing.
    """
    if not audio_input:
        return

    # Step 1: speech-to-text.
    transcription = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_input,
    )

    # Step 2: summarize the transcript with the chat model.
    system_message = {
        "role": "system",
        "content": "You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown.",
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": f"The audio transcription is: {transcription.text}"},
        ],
    }
    response = client.chat.completions.create(
        model=MODEL,
        messages=[system_message, user_message],
        temperature=0,
    )
    summary = response.choices[0].message.content
    st.markdown(summary)

def process_video(video_input):
    """Summarize an uploaded video from its sampled frames and audio transcript.

    Frames and an audio file are obtained from ``process_video_frames``; the
    audio is transcribed, and the frames plus transcript are sent to the chat
    model. The Markdown summary is rendered on the page.

    Args:
        video_input: The uploaded video file, or a falsy value when nothing
            has been uploaded yet, in which case the function does nothing.
    """
    if not video_input:
        return

    base64Frames, audio_path = process_video_frames(video_input)

    # Use a context manager so the audio file handle is closed even when the
    # transcription request fails (the original left it open).
    with open(audio_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
        )

    # Every content part is an explicit typed object; the introductory text
    # was previously a bare string mixed in with the image parts.
    # JPEG frames are labelled with the registered MIME type image/jpeg.
    frame_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{frame}", "detail": "low"},
        }
        for frame in base64Frames
    ]
    user_content = [
        {"type": "text", "text": "These are the frames from the video."},
        *frame_parts,
        {"type": "text", "text": f"The audio transcription is: {transcription.text}"},
    ]

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown"},
            {"role": "user", "content": user_content},
        ],
        temperature=0,
    )
    st.markdown(response.choices[0].message.content)

def process_video_frames(video_path, seconds_per_frame=2):
    """Sample JPEG frames from a video and extract its audio track to an MP3.

    Args:
        video_path: Object with a ``name`` attribute identifying the video.
            When it also exposes ``getvalue()`` or ``read()`` (as an uploaded,
            in-memory file does), its bytes are first written to a temporary
            file, because OpenCV and MoviePy need a real path on disk and an
            upload's ``name`` is generally only the original filename.
            Otherwise ``video_path.name`` is used as the path directly.
        seconds_per_frame: Spacing, in seconds of video, between sampled
            frames.

    Returns:
        A tuple ``(base64_frames, audio_path)``: a list of base64-encoded JPEG
        frames, and the path of the MP3 written next to the name in
        ``video_path.name`` (same stem, ``.mp3`` extension).

    Raises:
        ValueError: If the video has no audio track to extract.
    """
    import tempfile  # local import keeps this function self-contained

    base_video_path, extension = os.path.splitext(video_path.name)
    audio_path = f"{base_video_path}.mp3"

    # Materialize in-memory content on disk. Prefer getvalue() because it
    # does not depend on the current read position of the buffer.
    read_bytes = getattr(video_path, "getvalue", None) or getattr(video_path, "read", None)
    temp_video_path = None
    if callable(read_bytes):
        with tempfile.NamedTemporaryFile(suffix=extension or ".mp4", delete=False) as temp_video:
            temp_video.write(read_bytes())
            temp_video_path = temp_video.name
    source_path = temp_video_path or video_path.name

    base64_frames = []
    try:
        video = cv2.VideoCapture(source_path)
        try:
            total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = video.get(cv2.CAP_PROP_FPS)
            # Always advance by at least one frame; a step of 0 (missing FPS
            # or a very small interval) would otherwise loop forever.
            frames_to_skip = max(1, int(fps * seconds_per_frame))
            for curr_frame in range(0, total_frames - 1, frames_to_skip):
                video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
                success, frame = video.read()
                if not success:
                    break
                _, buffer = cv2.imencode(".jpg", frame)
                base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
        finally:
            video.release()

        clip = VideoFileClip(source_path)
        try:
            if clip.audio is None:
                raise ValueError(f"Video {video_path.name!r} has no audio track to extract")
            clip.audio.write_audiofile(audio_path, bitrate="32k")
        finally:
            # Release the audio and video readers even if extraction failed.
            if clip.audio is not None:
                clip.audio.close()
            clip.close()
    finally:
        # The temporary copy is only needed while the readers are open.
        if temp_video_path is not None and os.path.exists(temp_video_path):
            os.remove(temp_video_path)

    return base64_frames, audio_path

def main():
    """Draw the page and route the selected input type to its handler."""
    st.title("Omni Demo")
    option = st.selectbox("Select an option", ("Text", "Image", "Audio", "Video"))

    # Text has no upload step, so it is handled on its own.
    if option == "Text":
        process_text()
        return

    # Upload-based modes: uploader label, accepted extensions, handler.
    upload_modes = {
        "Image": ("Upload an image", ["jpg", "jpeg", "png"], process_image),
        "Audio": ("Upload an audio file", ["mp3", "wav"], process_audio),
        "Video": ("Upload a video file", ["mp4"], process_video),
    }
    if option in upload_modes:
        label, extensions, handler = upload_modes[option]
        uploaded = st.file_uploader(label, type=extensions)
        handler(uploaded)

# Build the page only when this file is run as the main script, not on import.
if __name__ == "__main__":
    main()