File size: 3,028 Bytes
cfb4e8c
22c5bdb
 
 
9759803
cfb4e8c
22c5bdb
de76a17
22c5bdb
de76a17
22c5bdb
b97cf3c
 
1bccd9f
de76a17
42234b9
22c5bdb
9759803
ad55a96
4080a13
22c5bdb
9915f17
f078c1f
 
22c5bdb
42234b9
22c5bdb
 
 
 
1bccd9f
22c5bdb
 
1bccd9f
9759803
22c5bdb
9759803
 
 
 
 
 
 
 
 
 
 
 
22c5bdb
de76a17
9759803
ff9e518
de76a17
 
22c5bdb
de76a17
 
 
 
 
 
1bccd9f
 
 
 
 
 
de76a17
 
22c5bdb
a2f7134
 
4201079
 
 
a2f7134
2e0b130
6c152e8
2e0b130
a2f7134
de76a17
6c152e8
 
 
d6282fe
6c152e8
de76a17
 
 
 
9759803
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import gradio as gr
import websockets
import asyncio
import json
import base64

async def process_audio_stream(audio_path, max_tokens):
    """
    Stream a model response for an audio file from the remote OmniAudio
    WebSocket endpoint, yielding the accumulated text after each token.

    Parameters
    ----------
    audio_path : str | None
        Filesystem path to the audio file (Gradio supplies this for an
        ``Audio(type="filepath")`` input). Falsy values short-circuit with
        a user-facing prompt.
    max_tokens : int
        Upper bound on generated tokens, forwarded to the server.

    Yields
    ------
    str
        The full response text accumulated so far (Gradio re-renders the
        textbox on every yield), or a user-facing error message.
    """
    if not audio_path:
        yield "Please upload or record an audio file first."
        return

    try:
        # Read the audio file and base64-encode it; the server expects the
        # raw base64 bytes as a binary WebSocket frame.
        with open(audio_path, 'rb') as f:
            base64_bytes = base64.b64encode(f.read())

        async with websockets.connect('wss://nexa-omni.nexa4ai.com/ws/process-audio/') as websocket:
            # First frame: the base64-encoded audio payload.
            await websocket.send(base64_bytes)

            # Second frame: generation parameters as a JSON string.
            # NOTE(review): the prompt is intentionally empty here — the UI
            # exposes no prompt input.
            await websocket.send(json.dumps({
                "prompt": "",
                "max_tokens": max_tokens
            }))

            # Accumulate streamed tokens so each yield shows the whole text.
            response = ""

            async for message in websocket:
                try:
                    data = json.loads(message)
                except json.JSONDecodeError:
                    # Skip malformed frames rather than aborting the stream.
                    continue

                # Use .get() so a frame missing "status"/"token"/"error"
                # cannot raise KeyError (which previously escaped to the
                # outer handler and surfaced as a misleading
                # "Error connecting to server" message).
                status = data.get("status")
                if status == "generating":
                    response += data.get("token", "")
                    yield response
                elif status == "complete":
                    break
                elif status == "error":
                    yield f"Error: {data.get('error', 'unknown error')}"
                    break

    except Exception as e:
        # Broad catch is deliberate at this UI boundary: any failure (file
        # read, connect, protocol) is reported to the user instead of
        # crashing the Gradio worker.
        yield f"Error connecting to server: {str(e)}"

# Create Gradio interface.
# The Audio component hands the handler a filesystem path (type="filepath");
# the slider bounds the server-side generation length. The handler is an
# async generator, so the textbox updates as tokens stream in.
demo = gr.Interface(
    fn=process_audio_stream,
    inputs=[
        gr.Audio(
            type="filepath",
            label="Upload or Record Audio",
            sources=["upload", "microphone"]
        ),
        gr.Slider(
            minimum=50,
            maximum=200,
            value=50,
            step=1,
            label="Max Tokens"
        )
    ],
    outputs=gr.Textbox(label="Response", interactive=False),
    title="NEXA OmniAudio-2.6B",
    # Plain string (was an f-string with no placeholders). The last sentence
    # previously promised an optional prompt, but the UI has no prompt input
    # and the handler sends an empty prompt, so the text is corrected.
    description="""

    OmniAudio-2.6B is a compact audio-language model optimized for edge deployment.
        
    Model Repo: <a href="https://huggingface.co/NexaAIDev/OmniAudio-2.6B">NexaAIDev/OmniAudio-2.6B</a>
    
    Blog: <a href="https://nexa.ai/blogs/OmniAudio-2.6B">OmniAudio-2.6B Blog</a>
    
    Upload or record an audio file and the model will analyze its content.""",
    examples=[
        ["example_audios/voice_qa.mp3", 200],
        ["example_audios/voice_in_conversation.mp3", 200],
        ["example_audios/creative_content_generation.mp3", 200],
        ["example_audios/record_summary.mp3", 200],
        ["example_audios/change_tone.mp3", 200],
    ]
)

# Run the app directly: enable request queuing (required for streaming
# generator outputs) and listen on all interfaces at port 7860.
if __name__ == "__main__":
    queued_app = demo.queue()
    queued_app.launch(server_name="0.0.0.0", server_port=7860)