Spaces:

NexaAIDev
/

omni-audio-demo

Running

File size: 3,172 Bytes

cfb4e8c
50ad4b3
 
22c5bdb
50ad4b3
c68be50
cfb4e8c
50ad4b3
c68be50
50ad4b3
c68be50
50ad4b3
de76a17
50ad4b3
de76a17
22c5bdb
b97cf3c
 
1bccd9f
de76a17
50ad4b3
 
 
 
22c5bdb
50ad4b3
 
 
 
 
 
 
 
 
 
1bccd9f
22c5bdb
50ad4b3
1bccd9f
50ad4b3
 
 
 
 
 
 
 
 
 
 
 
 
 
22c5bdb
de76a17
50ad4b3
ff9e518
50ad4b3
de76a17
22c5bdb
de76a17
 
 
 
 
 
1bccd9f
 
 
 
 
 
de76a17
 
22c5bdb
a2f7134
 
4201079
 
a2f7134
2e0b130
a6bd1ac
2e0b130
a2f7134
de76a17
6c152e8
 
 
d6282fe
6c152e8
de76a17
 
 
 
50ad4b3

import gradio as gr
import websockets
import asyncio
import json
import base64
import os

# The remote service needs a key on every connection, so refuse to start
# when the environment does not provide a non-empty one.
API_KEY = os.environ.get('API_KEY')
if API_KEY is None or API_KEY == "":
    raise ValueError("API_KEY must be set in environment variables")

async def process_audio_stream(audio_path, max_tokens):
    """
    Stream the model's response for one audio file over a WebSocket.

    The file is read, base64-encoded and sent as bytes in the first
    message; a JSON message carrying an empty prompt and ``max_tokens``
    follows. Every "generating" message received afterwards appends its
    token to the running text, which is yielded so partial output can be
    displayed while generation continues.

    Args:
        audio_path: Path of the audio file to send. A falsy value yields a
            hint asking for audio and stops.
        max_tokens: Forwarded unchanged in the request's "max_tokens" field.

    Yields:
        str: The accumulated response after each token, or a single
        human-readable error message.
    """
    if not audio_path:
        yield "Please upload or record an audio file first."
        return

    # Read the file in its own try block so a local I/O failure is not
    # misreported as a server connection problem.
    try:
        with open(audio_path, 'rb') as f:
            base64_bytes = base64.b64encode(f.read())
    except OSError as e:
        yield f"Error reading audio file: {e}"
        return

    try:
        async with websockets.connect(
            'wss://nexa-omni.nexa4ai.com/ws/process-audio/?api_key=' + API_KEY
        ) as websocket:
            # First message: the base64-encoded audio, sent as bytes.
            await websocket.send(base64_bytes)

            # Second message: generation parameters as a JSON string.
            await websocket.send(json.dumps({
                "prompt": "",
                "max_tokens": max_tokens
            }))

            response = ""

            async for message in websocket:
                # Skip anything that is not a JSON object instead of
                # aborting the whole stream on one malformed frame.
                try:
                    data = json.loads(message)
                except (json.JSONDecodeError, UnicodeDecodeError):
                    continue
                if not isinstance(data, dict):
                    continue

                # .get() keeps a message with a missing field from raising
                # KeyError and surfacing as a bogus connection error.
                status = data.get("status")
                if status == "generating":
                    response += data.get("token") or ""
                    yield response
                elif status == "complete":
                    break
                elif status == "error":
                    yield f"Error: {data.get('error', 'unknown error')}"
                    break

    except Exception as e:
        # The connection URL embeds the API key and some exceptions quote
        # the URL, so redact the key before showing the text to the user.
        yield f"Error connecting to server: {str(e).replace(API_KEY, '***')}"

# Build the Gradio interface: one audio input (upload or microphone) and a
# max-token slider feed process_audio_stream, whose yielded text is shown
# in a read-only textbox.
demo = gr.Interface(
    fn=process_audio_stream,
    inputs=[
        gr.Audio(
            type="filepath",  # the handler receives a path it opens itself
            label="Upload or Record Audio",
            sources=["upload", "microphone"]
        ),
        gr.Slider(
            minimum=50,
            maximum=200,
            value=50,
            step=1,
            label="Max Tokens"
        )
    ],
    outputs=gr.Textbox(label="Response", interactive=False),
    title="NEXA OmniAudio-2.6B",
    # Plain string: there are no placeholders, so no f-prefix is needed.
    # The last sentence describes the inputs this UI actually offers; there
    # is no prompt field, and the handler always sends an empty prompt.
    description="""
    OmniAudio-2.6B is a compact audio-language model optimized for edge deployment.
        
    Model Repo: <a href="https://huggingface.co/NexaAIDev/OmniAudio-2.6B">NexaAIDev/OmniAudio-2.6B</a>
    
    Blog: <a href="https://nexa.ai/blogs/omniaudio-2.6b">OmniAudio-2.6B Blog</a>
    
    Upload or record an audio file and choose the maximum number of tokens to generate.""",
    # Each example row is [audio file path, max tokens].
    examples=[
        ["example_audios/voice_qa.mp3", 200],
        ["example_audios/voice_in_conversation.mp3", 200],
        ["example_audios/creative_content_generation.mp3", 200],
        ["example_audios/record_summary.mp3", 200],
        ["example_audios/change_tone.mp3", 200],
    ]
)

if __name__ == "__main__":
    # Turn on the request queue, then serve on every network interface
    # at port 7860.
    queued_app = demo.queue()
    queued_app.launch(server_name="0.0.0.0", server_port=7860)