import io import numpy as np import pydub from utils import have_pyrubberband # Keep non-native package imports out of global space def get_wave_header(frame_input=b"", channels=1, sample_width=2, sample_rate=24000): # This will create a wave header then append the frame input # It should be first on a streaming wav file # Other frames better should not have it (else you will hear some artifacts each chunk start) import wave wav_buf = io.BytesIO() with wave.open(wav_buf, "wb") as vfout: vfout.setnchannels(channels) vfout.setsampwidth(sample_width) vfout.setframerate(sample_rate) vfout.writeframes(frame_input) wav_buf.seek(0) return wav_buf.read() def prepare_speech(sr=24000): # Must set autoplay to True first return get_wave_header(sample_rate=sr) def get_no_audio(return_as_byte=True, return_nonbyte_as_file=False, sr=None): if return_as_byte: return b"" else: if return_nonbyte_as_file: return None else: assert sr is not None return sr, np.array([]).astype(np.int16) def combine_audios(audios, audio=None, channels=1, sample_width=2, sr=24000, expect_bytes=True, verbose=False): no_audio = get_no_audio(sr=sr) have_audio = any(x not in [no_audio, None, ''] for x in audios) or audio not in [no_audio, None, ''] if not have_audio: return no_audio if audio or audios: if verbose: print("begin combine audios") is_bytes = expect_bytes # force default as bytes no matter input if know should have been bytes if audios: is_bytes |= isinstance(audios[0], (bytes, bytearray)) if audio: is_bytes |= isinstance(audio, (bytes, bytearray)) assert audio is None or isinstance(audio, (bytes, bytearray)) from pydub import AudioSegment combined_wav = AudioSegment.empty() for x in audios: if x is not None: s = io.BytesIO(x) if is_bytes else x combined_wav += AudioSegment.from_raw(s, sample_width=sample_width, frame_rate=sr, channels=channels) if audio is not None: s = io.BytesIO(audio) if is_bytes else audio combined_wav += AudioSegment.from_raw(s, sample_width=sample_width, frame_rate=sr, channels=channels) if is_bytes: combined_wav = combined_wav.export(format='raw').read() if verbose: print("end1 combine audios") return combined_wav # audio just empty stream, but not None, else would nuke audio if verbose: print("end2 combine audios") return audio def chunk_speed_change(chunk, sr, tts_speed=1.0): if tts_speed == 1.0: return chunk if have_pyrubberband: import pyrubberband as pyrb chunk = pyrb.time_stretch(chunk, sr, tts_speed) chunk = (chunk * 32767).astype(np.int16) return chunk if tts_speed < 1.0: # chunk = chunk.astype(np.float32) # chunk = 0.5 * chunk / np.max(chunk) # chunk = librosa.effects.time_stretch(chunk, rate=tts_speed) return chunk # speed-up from pydub import AudioSegment from pydub.effects import speedup s = io.BytesIO(chunk) channels = 1 sample_width = 2 audio = AudioSegment.from_raw(s, sample_width=sample_width, frame_rate=sr, channels=channels) # chunk = speedup(audio, tts_speed, 150).export(format='raw').read() chunk = pydub_to_np(speedup(audio, tts_speed, 150)) # audio = audio._spawn(audio.raw_data, overrides={ # "frame_rate": int(audio.frame_rate * tts_speed) # }) # chunk = np.array(audio.get_array_of_samples()) return chunk def pydub_to_np(audio: pydub.AudioSegment) -> (np.ndarray, int): """ Converts pydub audio segment into np.int16 of shape [duration_in_seconds*sample_rate, channels], """ return np.array(audio.get_array_of_samples(), dtype=np.int16).reshape((-1, audio.channels))