yasserrmd committed on
Commit
71df28b
1 Parent(s): 6b720a3

Create generate_audio.py

Browse files
Files changed (1) hide show
  1. generate_audio.py +133 -0
generate_audio.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # generate_audio.py
2
+
3
+ import pickle
4
+ import torch
5
+ import numpy as np
6
+ from tqdm import tqdm
7
+ from transformers import BarkModel, AutoProcessor, AutoTokenizer
8
+ from parler_tts import ParlerTTSForConditionalGeneration
9
+ from scipy.io import wavfile
10
+ from pydub import AudioSegment
11
+ import io
12
+ import ast
13
+
14
class TTSGenerator:
    """
    Generate podcast-style audio from a transcript using ParlerTTS and Bark models.

    Segments whose speaker label is exactly "Speaker 1" are synthesized with
    ParlerTTS; every other segment is synthesized with Bark. The per-segment
    audio is concatenated in transcript order and exported as a single MP3.
    """

    def __init__(self, transcript_file_path):
        """
        Initialize the TTS generator with the path to the rewritten transcript file.

        Loads both pretrained models, so construction is expensive.

        Args:
            transcript_file_path (str): Path to the file containing the rewritten transcript.
        """
        self.transcript_file_path = transcript_file_path
        self.output_audio_path = './resources/_podcast.mp3'

        # Set device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load Parler model and tokenizer for Speaker 1
        self.parler_model = ParlerTTSForConditionalGeneration.from_pretrained(
            "parler-tts/parler-tts-mini-v1"
        ).to(self.device)
        self.parler_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
        self.speaker1_description = (
            "\n"
            "Laura's voice is expressive and dramatic in delivery, speaking at a "
            "moderately fast pace with a very close recording that almost has no "
            "background noise."
            "\n"
        )

        # Load Bark model and processor for Speaker 2.
        # Half precision is only used on GPU: on CPU float16 kernels are slow or
        # missing, so fall back to float32 there.
        bark_dtype = torch.float16 if self.device == "cuda" else torch.float32
        self.bark_processor = AutoProcessor.from_pretrained("suno/bark")
        self.bark_model = BarkModel.from_pretrained("suno/bark", torch_dtype=bark_dtype).to(self.device)
        self.bark_sampling_rate = 24000
        self.voice_preset = "v2/en_speaker_6"

    def load_transcript(self):
        """
        Loads the rewritten transcript from the specified file.

        The file is a pickle whose payload is either the string repr of a list
        of (speaker, text) tuples, or that list itself.

        Returns:
            list: The content of the transcript as a list of tuples (speaker, text).
        """
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only point this at transcript files produced by a trusted pipeline.
        with open(self.transcript_file_path, 'rb') as f:
            payload = pickle.load(f)

        # The upstream step pickles the transcript as text; parse it safely.
        if isinstance(payload, str):
            return ast.literal_eval(payload)
        return payload

    def generate_speaker1_audio(self, text):
        """
        Generate audio for Speaker 1 using ParlerTTS.

        Args:
            text (str): Text to be synthesized for Speaker 1.

        Returns:
            np.array: Audio array.
            int: Sampling rate.
        """
        # ParlerTTS takes the voice description as `input_ids` and the words to
        # speak as `prompt_input_ids`.
        input_ids = self.parler_tokenizer(self.speaker1_description, return_tensors="pt").input_ids.to(self.device)
        prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
        generation = self.parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
        audio_arr = generation.cpu().numpy().squeeze()
        return audio_arr, self.parler_model.config.sampling_rate

    def generate_speaker2_audio(self, text):
        """
        Generate audio for Speaker 2 using Bark.

        Args:
            text (str): Text to be synthesized for Speaker 2.

        Returns:
            np.array: Audio array.
            int: Sampling rate.
        """
        inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
        speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
        audio_arr = speech_output[0].cpu().numpy()
        return audio_arr, self.bark_sampling_rate

    @staticmethod
    def _to_int16_pcm(audio_arr):
        """Convert float samples nominally in [-1, 1] to clipped 16-bit PCM."""
        # Work in float32 so half-precision model output is not scaled at
        # float16 resolution, and clip so out-of-range samples saturate instead
        # of wrapping around when cast to int16.
        samples = np.asarray(audio_arr, dtype=np.float32)
        return (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

    @staticmethod
    def numpy_to_audio_segment(audio_arr, sampling_rate):
        """
        Convert numpy array to AudioSegment.

        Args:
            audio_arr (np.array): Numpy array of audio data.
            sampling_rate (int): Sampling rate of the audio.

        Returns:
            AudioSegment: Converted audio segment.
        """
        audio_int16 = TTSGenerator._to_int16_pcm(audio_arr)
        # Round-trip through an in-memory WAV so pydub can parse the samples.
        byte_io = io.BytesIO()
        wavfile.write(byte_io, sampling_rate, audio_int16)
        byte_io.seek(0)
        return AudioSegment.from_wav(byte_io)

    def generate_audio(self):
        """
        Converts the transcript into audio and saves it to a file.

        Returns:
            str: Path to the saved audio file.

        Raises:
            ValueError: If the transcript contains no segments.
        """
        import os  # local import: only needed to prepare the output directory

        transcript = self.load_transcript()
        if not transcript:
            raise ValueError(
                f"Transcript {self.transcript_file_path!r} is empty; nothing to synthesize."
            )

        final_audio = None
        for speaker, text in tqdm(transcript, desc="Generating podcast segments", unit="segment"):
            if speaker == "Speaker 1":
                audio_arr, rate = self.generate_speaker1_audio(text)
            else:  # Speaker 2
                audio_arr, rate = self.generate_speaker2_audio(text)

            # Convert to AudioSegment
            audio_segment = self.numpy_to_audio_segment(audio_arr, rate)

            # Add segment to final audio
            if final_audio is None:
                final_audio = audio_segment
            else:
                final_audio += audio_segment

        # Make sure the destination directory exists before exporting.
        output_dir = os.path.dirname(self.output_audio_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Export final audio to MP3
        final_audio.export(self.output_audio_path, format="mp3", bitrate="192k", parameters=["-q:a", "0"])
        return self.output_audio_path