import os
from IPython.display import Audio
import nltk  # we'll use this to split the text into sentences
import numpy as np
from bark.generation import (
    generate_text_semantic,  # lower-level Bark pipeline, not used below
    preload_models,
)
from bark.api import semantic_to_waveform  # lower-level Bark pipeline, not used below
from bark import generate_audio, SAMPLE_RATE
import soundfile as sf

# Pin Bark to GPU 0; this must be set before CUDA is initialized,
# i.e. before preload_models() runs.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# nltk.sent_tokenize needs the punkt tokenizer data; fetch it if missing.
nltk.download("punkt", quiet=True)

# Load the Bark models; this only needs to run once.
preload_models()


class AudioBook:
    def __init__(self, output_folder="output"):
        self.output_folder = output_folder
        # Create the output folder if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)

    def generate_audio_from_text(self, text, speaker="male", filename="output_audio"):
        # Normalize whitespace and split the text into sentences
        text = text.replace("\n", " ").strip()
        sentences = nltk.sent_tokenize(text)

        # Choose the Bark voice preset based on the requested speaker
        if speaker == "male":
            speaker_id = "v2/en_speaker_6"
        elif speaker == "female":
            speaker_id = "v2/en_speaker_9"
        else:
            raise ValueError("Invalid speaker selection. Use 'male' or 'female'.")

        silence = np.zeros(int(0.25 * SAMPLE_RATE))  # quarter second of silence

        # Synthesize each sentence separately, padding with silence between them
        pieces = []
        for sentence in sentences:
            audio_array = generate_audio(
                sentence,
                history_prompt=speaker_id,
                text_temp=0.7,
                waveform_temp=0.7,
            )
            pieces += [audio_array, silence.copy()]
        audio_data = np.concatenate(pieces)

        # Save the audio as a WAV file in the output folder
        output_path = os.path.join(self.output_folder, f"{filename}.wav")
        sf.write(output_path, audio_data, SAMPLE_RATE)
        return output_path
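

# --- Usage sketch ---
# A minimal, illustrative way to drive the class above; the sample text and
# the "sample_chapter" filename are placeholders, not from the original source.
if __name__ == "__main__":
    book = AudioBook(output_folder="output")
    sample_text = (
        "Bark narrates long passages sentence by sentence. "
        "Each sentence is synthesized on its own and then stitched together "
        "with a short pause in between."
    )
    wav_path = book.generate_audio_from_text(
        sample_text, speaker="female", filename="sample_chapter"
    )
    print(f"Saved narration to {wav_path}")
    # In a notebook, the imported Audio widget can play the result inline:
    # Audio(wav_path)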