Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

audio_streaming_client.py +44 -50
audio_streaming_test.py +143 -0
handler.py +27 -11
test_audio_handler.py +10 -0

audio_streaming_client.py CHANGED Viewed

@@ -10,7 +10,7 @@ from dataclasses import dataclass, field
 @dataclass
 class AudioStreamingClientArguments:
     sample_rate: int = field(default=16000, metadata={"help": "Audio sample rate in Hz. Default is 16000."})
-    chunk_size: int = field(default=1024, metadata={"help": "The size of audio chunks in samples. Default is 1024."})
     api_url: str = field(default="https://yxfmjcvuzgi123sw.us-east-1.aws.endpoints.huggingface.cloud", metadata={"help": "The URL of the API endpoint."})
     auth_token: str = field(default="your_auth_token", metadata={"help": "Authentication token for the API."})
@@ -26,17 +26,16 @@ class AudioStreamingClient:
             "Authorization": f"Bearer {self.args.auth_token}",
             "Content-Type": "application/json"
         }
     def start(self):
         print("Starting audio streaming...")
         send_thread = threading.Thread(target=self.send_audio)
-        recv_thread = threading.Thread(target=self.receive_audio)
         play_thread = threading.Thread(target=self.play_audio)
-        with sd.InputStream(samplerate=self.args.sample_rate, channels=1, dtype='int16', callback=self.audio_callback):
             send_thread.start()
-            recv_thread.start()
             play_thread.start()
             try:
@@ -46,7 +45,6 @@ class AudioStreamingClient:
             finally:
                 self.stop_event.set()
                 send_thread.join()
-                recv_thread.join()
                 play_thread.join()
                 print("Audio streaming stopped.")
@@ -56,28 +54,29 @@ class AudioStreamingClient:
     def send_audio(self):
         buffer = b''
         while not self.stop_event.is_set():
-            if not self.send_queue.empty():
                 chunk = self.send_queue.get().tobytes()
                 buffer += chunk
                 if len(buffer) >= self.args.chunk_size * 2:  # * 2 because of int16
                     self.send_request(buffer)
                     buffer = b''
             else:
-                time.sleep(0.01)
-    def send_request(self, audio_data):
-        if not self.session_id:
-            payload = {
-                "request_type": "start",
-                "inputs": base64.b64encode(audio_data).decode('utf-8'),
-                "input_type": "speech",
-            }
         else:
-            payload = {
-                "request_type": "continue",
-                "session_id": self.session_id,
-                "inputs": base64.b64encode(audio_data).decode('utf-8'),
-            }
         try:
             response = requests.post(self.args.api_url, headers=self.headers, json=payload)
@@ -86,53 +85,48 @@ class AudioStreamingClient:
             if "session_id" in response_data:
                 self.session_id = response_data["session_id"]
             if "output" in response_data and response_data["output"]:
                 audio_bytes = base64.b64decode(response_data["output"])
                 audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
-                self.recv_queue.put(audio_np)
         except Exception as e:
             print(f"Error sending request: {e}")
-    def receive_audio(self):
-        while not self.stop_event.is_set():
-            if self.session_id:
-                payload = {
-                    "request_type": "continue",
-                    "session_id": self.session_id
-                }
-                try:
-                    response = requests.post(self.args.api_url, headers=self.headers, json=payload)
-                    response_data = response.json()
-                    if response_data["status"] == "completed" and not response_data["output"]:
-                        break
-                    if response_data["output"]:
-                        audio_bytes = base64.b64decode(response_data["output"])
-                        audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
-                        self.recv_queue.put(audio_np)
-                except Exception as e:
-                    print(f"Error receiving audio: {e}")
-            time.sleep(0.1)
     def play_audio(self):
         def audio_callback(outdata, frames, time, status):
             if not self.recv_queue.empty():
                 chunk = self.recv_queue.get()
-                if len(chunk) < len(outdata):
-                    outdata[:len(chunk)] = chunk.reshape(-1, 1)
-                    outdata[len(chunk):] = 0
                 else:
-                    outdata[:] = chunk[:len(outdata)].reshape(-1, 1)
             else:
                 outdata[:] = 0
-        with sd.OutputStream(samplerate=self.args.sample_rate, channels=1, callback=audio_callback):
             while not self.stop_event.is_set():
-                time.sleep(0.1)
 if __name__ == "__main__":
     import argparse

 @dataclass
 class AudioStreamingClientArguments:
     sample_rate: int = field(default=16000, metadata={"help": "Audio sample rate in Hz. Default is 16000."})
+    chunk_size: int = field(default=512, metadata={"help": "The size of audio chunks in samples. Default is 1024."})
     api_url: str = field(default="https://yxfmjcvuzgi123sw.us-east-1.aws.endpoints.huggingface.cloud", metadata={"help": "The URL of the API endpoint."})
     auth_token: str = field(default="your_auth_token", metadata={"help": "Authentication token for the API."})
             "Authorization": f"Bearer {self.args.auth_token}",
             "Content-Type": "application/json"
         }
+        self.session_state = "idle"  # Possible states: idle, sending, processing, waiting
     def start(self):
         print("Starting audio streaming...")
         send_thread = threading.Thread(target=self.send_audio)
         play_thread = threading.Thread(target=self.play_audio)
+        with sd.InputStream(samplerate=self.args.sample_rate, channels=1, dtype='int16', callback=self.audio_callback, blocksize=self.args.chunk_size):
             send_thread.start()
             play_thread.start()
             try:
             finally:
                 self.stop_event.set()
                 send_thread.join()
                 play_thread.join()
                 print("Audio streaming stopped.")
     def send_audio(self):
         buffer = b''
         while not self.stop_event.is_set():
+            if self.session_state != "processing" and not self.send_queue.empty():
                 chunk = self.send_queue.get().tobytes()
                 buffer += chunk
                 if len(buffer) >= self.args.chunk_size * 2:  # * 2 because of int16
                     self.send_request(buffer)
                     buffer = b''
             else:
+                self.send_request()
+                time.sleep(0.1)
+    def send_request(self, audio_data=None):
+        payload = {}
+        if audio_data is not None:
+            print("Sending audio data")
+            payload["inputs"] = base64.b64encode(audio_data).decode('utf-8')
+            payload["input_type"] = "speech"
+        if self.session_id:
+            payload["session_id"] = self.session_id
+            payload["request_type"] = "continue"
         else:
+            payload["request_type"] = "start"
         try:
             response = requests.post(self.args.api_url, headers=self.headers, json=payload)
             if "session_id" in response_data:
                 self.session_id = response_data["session_id"]
+            if "status" in response_data and response_data["status"] == "processing":
+                print("Processing audio data")
+                self.session_state = "processing"
+            elif "status" in response_data and response_data["status"] == "completed":
+                print("Completed audio processing")
+                self.session_state = None
+                self.session_id = None
+                _ = self.send_queue.get()  # Clear the queue
             if "output" in response_data and response_data["output"]:
+                print("Received audio data")
+                self.session_state = "processing"  # Set state to processing when we start receiving audio
                 audio_bytes = base64.b64decode(response_data["output"])
                 audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
+                # Split the audio into smaller chunks for playback
+                for i in range(0, len(audio_np), self.args.chunk_size):
+                    chunk = audio_np[i:i+self.args.chunk_size]
+                    self.recv_queue.put(chunk)
         except Exception as e:
             print(f"Error sending request: {e}")
+            self.session_state = "idle"  # Reset state to idle in case of error
     def play_audio(self):
         def audio_callback(outdata, frames, time, status):
             if not self.recv_queue.empty():
                 chunk = self.recv_queue.get()
+                # Ensure chunk is int16 and clip to valid range
+                chunk_int16 = np.clip(chunk, -32768, 32767).astype(np.int16)
+                if len(chunk_int16) < len(outdata):
+                    outdata[:len(chunk_int16), 0] = chunk_int16
+                    outdata[len(chunk_int16):] = 0
                 else:
+                    outdata[:, 0] = chunk_int16[:len(outdata)]
             else:
                 outdata[:] = 0
+        with sd.OutputStream(samplerate=self.args.sample_rate, channels=1, dtype='int16', callback=audio_callback, blocksize=self.args.chunk_size):
             while not self.stop_event.is_set():
+                time.sleep(0.01)
 if __name__ == "__main__":
     import argparse

audio_streaming_test.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import threading
+from queue import Queue
+import sounddevice as sd
+import numpy as np
+import requests
+import base64
+import time
+from dataclasses import dataclass, field
+@dataclass
+class AudioStreamingClientArguments:
+    """Settings for the audio streaming client; each field keeps its CLI help text in ``metadata["help"]``."""
+    # Sample rate passed to the input and output audio streams, in Hz.
+    sample_rate: int = field(default=16000, metadata={"help": "Audio sample rate in Hz. Default is 16000."})
+    # Samples per audio block (also used as the stream blocksize); help text now matches the 512 default.
+    chunk_size: int = field(default=512, metadata={"help": "The size of audio chunks in samples. Default is 512."})
+    # Endpoint that receives the request payloads.
+    api_url: str = field(default="https://yxfmjcvuzgi123sw.us-east-1.aws.endpoints.huggingface.cloud", metadata={"help": "The URL of the API endpoint."})
+    # Bearer token placed in the Authorization header.
+    auth_token: str = field(default="your_auth_token", metadata={"help": "Authentication token for the API."})
+class AudioStreamingClient:
+    """Stream microphone audio to a request handler and play back the audio it returns.
+    Captured blocks travel through ``send_queue`` to ``handler``; decoded reply audio
+    travels through ``recv_queue`` to the output stream. ``handler`` is any callable that
+    accepts the request payload dict and returns the response dict.
+    """
+    def __init__(self, args: AudioStreamingClientArguments, handler):
+        """Store the configuration and handler, and create the queues and the stop flag."""
+        self.args = args
+        self.handler = handler
+        self.stop_event = threading.Event()
+        self.send_queue = Queue()
+        self.recv_queue = Queue()
+        self.session_id = None
+        self.headers = {
+            "Accept": "application/json",
+            "Authorization": f"Bearer {self.args.auth_token}",
+            "Content-Type": "application/json"
+        }
+        self.session_state = "idle"  # Possible states: idle, sending, processing, waiting
+    def start(self):
+        """Open the microphone stream, run the send and playback threads, and block until Enter or Ctrl-C."""
+        print("Starting audio streaming...")
+        send_thread = threading.Thread(target=self.send_audio)
+        play_thread = threading.Thread(target=self.play_audio)
+        with sd.InputStream(samplerate=self.args.sample_rate, channels=1, dtype='int16', callback=self.audio_callback, blocksize=self.args.chunk_size):
+            send_thread.start()
+            play_thread.start()
+            try:
+                input("Press Enter to stop streaming...")
+            except KeyboardInterrupt:
+                print("\nStreaming interrupted by user.")
+            finally:
+                # Always signal the workers and wait for them, however the wait ended.
+                self.stop_event.set()
+                send_thread.join()
+                play_thread.join()
+                print("Audio streaming stopped.")
+    def audio_callback(self, indata, frames, time, status):
+        """Input-stream callback: queue a copy of the captured block for the send thread."""
+        # Copy because the callback's buffer is not ours to keep after returning.
+        self.send_queue.put(indata.copy())
+    def send_audio(self):
+        """Send loop: forward captured audio while not processing, otherwise poll the handler."""
+        buffer = b''
+        while not self.stop_event.is_set():
+            if self.session_state != "processing" and not self.send_queue.empty():
+                chunk = self.send_queue.get().tobytes()
+                buffer += chunk
+                if len(buffer) >= self.args.chunk_size * 2:  # * 2 because of int16
+                    self.send_request(buffer)
+                    buffer = b''
+            else:
+                # NOTE(review): with no session open this sends a "start" request without
+                # inputs - confirm the handler tolerates that.
+                self.send_request()
+                time.sleep(0.1)
+    def _drain_send_queue(self):
+        """Discard every captured block currently queued, without ever blocking."""
+        # Only the send thread consumes send_queue, so empty() followed by get_nowait() cannot race.
+        while not self.send_queue.empty():
+            self.send_queue.get_nowait()
+    def send_request(self, audio_data=None):
+        """Send one request to the handler and apply its response.
+        ``audio_data`` (raw int16 bytes) is base64-encoded into the payload when given;
+        without it the request only polls. The request is a "continue" when a session id
+        is known and a "start" otherwise. Any exception is printed and resets the state
+        to "idle" rather than propagating.
+        """
+        payload = {}
+        if audio_data is not None:
+            print("Sending audio data")
+            payload["inputs"] = base64.b64encode(audio_data).decode('utf-8')
+            payload["input_type"] = "speech"
+        if self.session_id:
+            payload["session_id"] = self.session_id
+            payload["request_type"] = "continue"
+        else:
+            payload["request_type"] = "start"
+        try:
+            response_data = self.handler(payload)
+            if "session_id" in response_data:
+                self.session_id = response_data["session_id"]
+            status = response_data.get("status")
+            if status == "processing":
+                print("Processing audio data")
+                self.session_state = "processing"
+            elif status == "completed":
+                print("Completed audio processing")
+                # Return to a documented state and drop stale captured audio without blocking;
+                # a blocking get() here would hang forever on an empty queue.
+                self.session_state = "idle"
+                self.session_id = None
+                self._drain_send_queue()
+            output = response_data.get("output")
+            if output:
+                print("Received audio data")
+                self.session_state = "processing"  # Set state to processing when we start receiving audio
+                audio_bytes = base64.b64decode(output)
+                audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
+                # Split the audio into smaller chunks for playback
+                for i in range(0, len(audio_np), self.args.chunk_size):
+                    chunk = audio_np[i:i+self.args.chunk_size]
+                    self.recv_queue.put(chunk)
+        except Exception as e:
+            print(f"Error sending request: {e}")
+            self.session_state = "idle"  # Reset state to idle in case of error
+    def play_audio(self):
+        """Playback loop: keep an output stream open until stopped, fed from ``recv_queue``."""
+        def audio_callback(outdata, frames, time, status):
+            if not self.recv_queue.empty():
+                chunk = self.recv_queue.get()
+                # Ensure chunk is int16 and clip to valid range
+                chunk_int16 = np.clip(chunk, -32768, 32767).astype(np.int16)
+                if len(chunk_int16) < len(outdata):
+                    # Short chunk: play it and fill the rest of the block with silence.
+                    outdata[:len(chunk_int16), 0] = chunk_int16
+                    outdata[len(chunk_int16):] = 0
+                else:
+                    outdata[:, 0] = chunk_int16[:len(outdata)]
+            else:
+                # Nothing to play: output silence.
+                outdata[:] = 0
+        with sd.OutputStream(samplerate=self.args.sample_rate, channels=1, dtype='int16', callback=audio_callback, blocksize=self.args.chunk_size):
+            while not self.stop_event.is_set():
+                time.sleep(0.01)
+if __name__ == "__main__":
+    # Command-line entry point: parse the settings and stream against the HTTP endpoint.
+    import argparse
+    parser = argparse.ArgumentParser(description="Audio Streaming Client")
+    parser.add_argument("--sample_rate", type=int, default=16000, help="Audio sample rate in Hz. Default is 16000.")
+    # Default kept in step with AudioStreamingClientArguments.chunk_size (512).
+    parser.add_argument("--chunk_size", type=int, default=512, help="The size of audio chunks in samples. Default is 512.")
+    parser.add_argument("--api_url", type=str, required=True, help="The URL of the API endpoint.")
+    parser.add_argument("--auth_token", type=str, required=True, help="Authentication token for the API.")
+    args = parser.parse_args()
+    client_args = AudioStreamingClientArguments(**vars(args))
+    http_headers = {
+        "Accept": "application/json",
+        "Authorization": f"Bearer {client_args.auth_token}",
+        "Content-Type": "application/json"
+    }
+    def http_handler(payload):
+        """POST one request payload to the endpoint and return the decoded JSON reply."""
+        response = requests.post(client_args.api_url, headers=http_headers, json=payload)
+        # HTTP errors surface as exceptions, which the client's send_request reports and recovers from.
+        response.raise_for_status()
+        return response.json()
+    # AudioStreamingClient requires a handler callable; omitting it raised TypeError.
+    client = AudioStreamingClient(client_args, http_handler)
+    client.start()

handler.py CHANGED Viewed

@@ -23,7 +23,7 @@ class EndpointHandler:
             self.parler_tts_handler_kwargs,
             self.melo_tts_handler_kwargs,
             self.chat_tts_handler_kwargs,
-        ) = get_default_arguments(mode='none', log_level='DEBUG')
         setup_logger(self.module_kwargs.log_level)
         prepare_all_args(
@@ -59,6 +59,22 @@ class EndpointHandler:
         # Add a new queue for collecting the final output
         self.final_output_queue = Queue()
         self.sessions = {}  # Store session information
     def _collect_output(self, session_id):
         while True:
@@ -87,9 +103,10 @@ class EndpointHandler:
     def _handle_start_request(self, data: Dict[str, Any]) -> Dict[str, Any]:
         session_id = str(uuid.uuid4())
         self.sessions[session_id] = {
-            'status': 'processing',
             'chunks': [],
-            'last_sent_index': 0
         }
         input_type = data.get("input_type", "text")
@@ -97,17 +114,16 @@ class EndpointHandler:
         if input_type == "speech":
             audio_bytes = base64.b64decode(input_data)
-            audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
-            self.queues_and_events['recv_audio_chunks_queue'].put(audio_array.tobytes())
         elif input_type == "text":
             self.queues_and_events['text_prompt_queue'].put(input_data)
-        else:
             raise ValueError(f"Unsupported input type: {input_type}")
         # Start output collection in a separate thread
         threading.Thread(target=self._collect_output, args=(session_id,)).start()
-        return {"session_id": session_id, "status": "processing"}
     def _handle_continue_request(self, data: Dict[str, Any]) -> Dict[str, Any]:
         session_id = data.get("session_id")
@@ -116,12 +132,12 @@ class EndpointHandler:
         session = self.sessions[session_id]
-        # Handle additional input if provided
-        if "inputs" in data:
             input_data = data["inputs"]
             audio_bytes = base64.b64decode(input_data)
-            audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
-            self.queues_and_events['recv_audio_chunks_queue'].put(audio_array.tobytes())
         chunks_to_send = session['chunks'][session['last_sent_index']:]
         session['last_sent_index'] = len(session['chunks'])

             self.parler_tts_handler_kwargs,
             self.melo_tts_handler_kwargs,
             self.chat_tts_handler_kwargs,
+        ) = get_default_arguments(mode='none', log_level='DEBUG', stt='whisper-mlx', tts='melo', device='mps')
         setup_logger(self.module_kwargs.log_level)
         prepare_all_args(
         # Add a new queue for collecting the final output
         self.final_output_queue = Queue()
         self.sessions = {}  # Store session information
+        self.vad_chunk_size = 512  # Set the chunk size required by the VAD model
+        self.sample_rate = 16000  # Set the expected sample rate
+    def _process_audio_chunk(self, audio_data: bytes, session_id: str):
+        """Queue raw int16 audio as fixed-size frames of ``self.vad_chunk_size`` samples.
+        The bytes are read as int16 samples, the tail is zero-padded up to a whole frame,
+        and each frame's bytes are put on ``recv_audio_chunks_queue``. ``session_id`` is
+        accepted but not used here.
+        """
+        samples = np.frombuffer(audio_data, dtype=np.int16)
+        frame_len = self.vad_chunk_size
+        # Zero-fill the tail once so every frame below is exactly frame_len samples long.
+        shortfall = -len(samples) % frame_len
+        padded = np.pad(samples, (0, shortfall), 'constant')
+        for start in range(0, len(padded), frame_len):
+            frame = padded[start:start + frame_len]
+            self.queues_and_events['recv_audio_chunks_queue'].put(frame.tobytes())
     def _collect_output(self, session_id):
         while True:
     def _handle_start_request(self, data: Dict[str, Any]) -> Dict[str, Any]:
         session_id = str(uuid.uuid4())
         self.sessions[session_id] = {
+            'status': 'new',
             'chunks': [],
+            'last_sent_index': 0,
+            'buffer': b''  # Add a buffer to store incomplete chunks
         }
         input_type = data.get("input_type", "text")
         if input_type == "speech":
             audio_bytes = base64.b64decode(input_data)
+            self._process_audio_chunk(audio_bytes, session_id)
         elif input_type == "text":
             self.queues_and_events['text_prompt_queue'].put(input_data)
+        else:
             raise ValueError(f"Unsupported input type: {input_type}")
         # Start output collection in a separate thread
         threading.Thread(target=self._collect_output, args=(session_id,)).start()
+        return {"session_id": session_id, "status": "new"}
     def _handle_continue_request(self, data: Dict[str, Any]) -> Dict[str, Any]:
         session_id = data.get("session_id")
         session = self.sessions[session_id]
+        if not self.queues_and_events['should_listen'].is_set():
+            session['status'] = 'processing'
+        elif "inputs" in data:  # Handle additional input if provided
             input_data = data["inputs"]
             audio_bytes = base64.b64decode(input_data)
+            self._process_audio_chunk(audio_bytes, session_id)
         chunks_to_send = session['chunks'][session['last_sent_index']:]
         session['last_sent_index'] = len(session['chunks'])

test_audio_handler.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from handler import EndpointHandler
+from audio_streaming_test import AudioStreamingClientArguments, AudioStreamingClient
+# Manual smoke test: drive the endpoint handler in-process instead of over HTTP.
+my_handler = EndpointHandler()
+# Default client settings are used as-is.
+args = AudioStreamingClientArguments()
+client = AudioStreamingClient(args=args, handler=my_handler)
+# Blocks until the user presses Enter or interrupts.
+client.start()