# generate_audio.py
import spaces
import pickle
import torch
import numpy as np
from tqdm import tqdm
from transformers import BarkModel, AutoProcessor, AutoTokenizer
from parler_tts import ParlerTTSForConditionalGeneration
from scipy.io import wavfile
from pydub import AudioSegment
import io
import ast
@spaces.GPU
class TTSGenerator:
"""
A class to generate podcast-style audio from a transcript using ParlerTTS and Bark models.
"""
#@spaces.GPU
def __init__(self, transcript_file_path,output_audio_path):
"""
Initialize the TTS generator with the path to the rewritten transcript file.
Args:
transcript_file_path (str): Path to the file containing the rewritten transcript.
"""
self.transcript_file_path = transcript_file_path
self.output_audio_path = output_audio_path
# Set device
self.device = "cuda" if torch.cuda.is_available() else "cpu"
# Load Parler model and tokenizer for Speaker 1
self.parler_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1").to(self.device)
self.parler_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
self.speaker1_description = """
Laura's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise and very clear audio.
"""
self.speaker2_description = """
Gary's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise and very clear audio.
"""
# Load Bark model and processor for Speaker 2
# self.bark_processor = AutoProcessor.from_pretrained("suno/bark-small")
# self.bark_model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(self.device)
# self.bark_sampling_rate = 24000
# self.voice_preset = "v2/en_speaker_6"
#@spaces.GPU
def load_transcript(self):
"""
Loads the rewritten transcript from the specified file.
Returns:
list: The content of the transcript as a list of tuples (speaker, text).
"""
with open(self.transcript_file_path, 'rb') as f:
return ast.literal_eval(pickle.load(f))
    #@spaces.GPU(duration=30)
    def generate_speaker1_audio(self, text):
        """
        Generate audio for Speaker 1 using ParlerTTS.
        Args:
            text (str): Text to be synthesized for Speaker 1.
        Returns:
            np.array: Audio array.
            int: Sampling rate.
        """
        # input_ids = self.parler_tokenizer(self.speaker1_description, return_tensors="pt").input_ids.to(self.device)
        # prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
        # generation = self.parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
        # audio_arr = generation.cpu().numpy().squeeze()
        # return audio_arr, self.parler_model.config.sampling_rate
        with torch.no_grad():
            input_ids = self.parler_tokenizer(self.speaker1_description, return_tensors="pt", padding=True).input_ids.to(self.device)
            attention_mask_input = self.parler_tokenizer(self.speaker1_description, return_tensors="pt", padding=True).attention_mask.to(self.device)
            prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt", padding=True).input_ids.to(self.device)
            attention_mask_prompt = self.parler_tokenizer(text, return_tensors="pt", padding=True).attention_mask.to(self.device)
            # Pass all required arguments to generate() for reliable behavior
            generation = self.parler_model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask_input,  # Attention mask for the description tokens
                prompt_input_ids=prompt_input_ids,
                prompt_attention_mask=attention_mask_prompt  # Attention mask for the prompt tokens
            )
        audio_arr = generation.cpu().numpy().squeeze()
        return audio_arr, self.parler_model.config.sampling_rate
    #@spaces.GPU(duration=30)
    def generate_speaker2_audio(self, text):
        """
        Generate audio for Speaker 2 using ParlerTTS (the original Bark path is kept below as comments).
        Args:
            text (str): Text to be synthesized for Speaker 2.
        Returns:
            np.array: Audio array.
            int: Sampling rate.
        """
        with torch.no_grad():
            input_ids = self.parler_tokenizer(self.speaker2_description, return_tensors="pt", padding=True).input_ids.to(self.device)
            attention_mask_input = self.parler_tokenizer(self.speaker2_description, return_tensors="pt", padding=True).attention_mask.to(self.device)
            prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt", padding=True).input_ids.to(self.device)
            attention_mask_prompt = self.parler_tokenizer(text, return_tensors="pt", padding=True).attention_mask.to(self.device)
            # Pass all required arguments to generate() for reliable behavior
            generation = self.parler_model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask_input,  # Attention mask for the description tokens
                prompt_input_ids=prompt_input_ids,
                prompt_attention_mask=attention_mask_prompt  # Attention mask for the prompt tokens
            )
        audio_arr = generation.cpu().numpy().squeeze()
        # Original Bark implementation, kept for reference:
        # inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
        # speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
        # audio_arr = speech_output[0].cpu().numpy()
        # return audio_arr, self.bark_sampling_rate
        return audio_arr, self.parler_model.config.sampling_rate
    #@spaces.GPU
    @staticmethod
    def numpy_to_audio_segment(audio_arr, sampling_rate):
        """
        Convert a numpy array to an AudioSegment.
        Args:
            audio_arr (np.array): Numpy array of audio data.
            sampling_rate (int): Sampling rate of the audio.
        Returns:
            AudioSegment: Converted audio segment.
        """
        # Scale float audio in [-1, 1] to 16-bit PCM, write it to an in-memory WAV,
        # and load that buffer as a pydub AudioSegment.
        audio_int16 = (audio_arr * 32767).astype(np.int16)
        byte_io = io.BytesIO()
        wavfile.write(byte_io, sampling_rate, audio_int16)
        byte_io.seek(0)
        return AudioSegment.from_wav(byte_io)
    #@spaces.GPU(duration=300)
    def generate_audio(self):
        """
        Converts the transcript into audio and saves it to a file.
        Returns:
            str: Path to the saved audio file.
        """
        transcript = self.load_transcript()
        final_audio = None
        for speaker, text in tqdm(transcript, desc="Generating podcast segments", unit="segment"):
            if speaker == "Speaker 1":
                audio_arr, rate = self.generate_speaker1_audio(text)
            else:  # Speaker 2
                audio_arr, rate = self.generate_speaker2_audio(text)
            # Convert to AudioSegment
            audio_segment = self.numpy_to_audio_segment(audio_arr, rate)
            # Add segment to final audio
            if final_audio is None:
                final_audio = audio_segment
            else:
                final_audio += audio_segment
            torch.cuda.empty_cache()
        # Export final audio to MP3
        final_audio.export(self.output_audio_path, format="mp3", bitrate="192k", parameters=["-q:a", "0"])
        return self.output_audio_path
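
# Minimal usage sketch (assumption: the file names below are placeholders, not paths used
# by the Space). The transcript pickle must hold a *string* representation of a list of
# (speaker, text) tuples, since load_transcript() runs ast.literal_eval on the unpickled value.
if __name__ == "__main__":
    sample_transcript = str([
        ("Speaker 1", "Welcome to the show! Today we are talking about text-to-speech."),
        ("Speaker 2", "Thanks for having me. Let's dive right in."),
    ])
    with open("sample_transcript.pkl", "wb") as f:
        pickle.dump(sample_transcript, f)

    generator = TTSGenerator(
        transcript_file_path="sample_transcript.pkl",
        output_audio_path="podcast.mp3",
    )
    print(f"Podcast audio written to {generator.generate_audio()}")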