bl4dylion committed
Commit 3ee8f12 · 1 Parent(s): 3a9a0d8

add text_mod and audio normalization

src/audio_generators.py CHANGED
@@ -59,57 +59,55 @@ class AudioGeneratorWithEffects:
 
     def __init__(self):
         self.effect_generator = EffectGeneratorAsync(AI_ML_API_KEY)
+        self.semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)
+        self.temp_files = []
 
-    async def generate_audio_with_text_modification(
+    async def generate_audio(
         self,
-        annotated_text: str,
+        text_split: SplitTextOutput,
         character_to_voice: dict[str, str],
     ) -> Path:
         """Main method to generate the audiobook with TTS, emotion, and sound effects."""
-        num_lines = len(annotated_text.splitlines())
+        num_lines = len(text_split.phrases)
        lines_for_sound_effect = self._select_lines_for_sound_effect(num_lines)
 
         # Step 1: Process and modify text
         modified_texts, sound_emotion_results = await self._process_and_modify_text(
-            annotated_text, lines_for_sound_effect
+            text_split, lines_for_sound_effect
         )
 
         # Step 2: Generate TTS audio for modified text
-        tts_results, temp_files = await self._generate_tts_audio(
-            annotated_text, modified_texts, character_to_voice
+        tts_results, self.temp_files = await self._generate_tts_audio(
+            text_split, modified_texts, character_to_voice
         )
 
         # Step 3: Add sound effects to selected lines
         audio_chunks = await self._add_sound_effects(
-            tts_results, lines_for_sound_effect, sound_emotion_results, temp_files
+            tts_results, lines_for_sound_effect, sound_emotion_results, self.temp_files
         )
 
         # Step 4: Merge audio files
-        final_output = self._merge_audio_files(audio_chunks)
+        normalized_audio_chunks = self._normalize_audio_chunks(audio_chunks, self.temp_files)
+        final_output = self._merge_audio_files(normalized_audio_chunks)
 
         # Clean up temporary files
-        self._cleanup_temp_files(temp_files)
+        self._cleanup_temp_files(self.temp_files)
 
         return final_output
 
     def _select_lines_for_sound_effect(self, num_lines: int) -> list[int]:
         """Select 20% of the lines randomly for sound effect generation."""
-        return random.sample(range(num_lines), k=int(0.2 * num_lines))
+        return random.sample(range(num_lines), k=int(0.0 * num_lines))
 
     async def _process_and_modify_text(
-        self, annotated_text: str, lines_for_sound_effect: list[int]
+        self, text_split: SplitTextOutput, lines_for_sound_effect: list[int]
     ) -> tuple[list[dict], list[dict]]:
         """Process the text by modifying it and generating tasks for sound effects."""
         tasks_for_text_modification = []
         sound_emotion_tasks = []
 
-        for idx, line in enumerate(annotated_text.splitlines()):
-            cleaned_line = line.strip().lower()
-            if not cleaned_line:
-                continue
-
-            # Extract character text
-            character_text = cleaned_line[cleaned_line.rfind("]") + 1 :].lstrip()
+        for idx, character_phrase in enumerate(text_split.phrases):
+            character_text = character_phrase.text.strip().lower()
 
             # Add text emotion modification task
             tasks_for_text_modification.append(
@@ -132,38 +130,35 @@ class AudioGeneratorWithEffects:
 
     async def _generate_tts_audio(
         self,
-        annotated_text: str,
-        modified_texts: list[dict],  # TODO ? type ?
+        text_split: SplitTextOutput,
+        modified_texts: list[dict],
         character_to_voice: dict[str, str],
     ) -> tuple[list[str], list[str]]:
         """Generate TTS audio for modified text."""
         tasks_for_tts = []
         temp_files = []
-        current_character = "narrator"
 
-        for idx, (modified_text, line) in enumerate(
-            zip(modified_texts, annotated_text.splitlines())
+        async def tts_astream_with_semaphore(voice_id: str, text: str, params: dict):
+            async with self.semaphore:
+                iter_ = tts_astream(voice_id=voice_id, text=text, params=params)
+                bytes_ = await consume_aiter(iter_)
+                return bytes_
+
+        for idx, (modified_text, character_phrase) in enumerate(
+            zip(modified_texts, text_split.phrases)
         ):
-            cleaned_line = line.strip().lower()
+            voice_id = character_to_voice[character_phrase.character]
 
-            # Extract character
-            try:
-                current_character = re.findall(r"\[[\w\s]+\]", cleaned_line)[0][1:-1]
-            except IndexError:
-                pass
-
-            # Get voice ID and generate TTS
-            voice_id = character_to_voice[current_character]
-            tasks_for_tts.append(
-                tts_astream(
-                    voice_id=voice_id,
-                    text=modified_text["text"],  # TODO ? type ?
-                    params=modified_texts["params"],  # TODO ? type ?
-                )
+            # Use the semaphore-protected TTS function
+            task = tts_astream_with_semaphore(
+                voice_id=voice_id,
+                text=modified_text["modified_text"],
+                params=modified_text["params"],
             )
+            tasks_for_tts.append(task)
 
         # Gather all TTS results
-        tts_results = await asyncio.gather(*(consume_aiter(t) for t in tasks_for_tts))
+        tts_results = await asyncio.gather(*tasks_for_tts)
 
         # Save the results to temporary files
         tts_audio_files = []
@@ -189,15 +184,11 @@ class AudioGeneratorWithEffects:
         for idx, tts_filename in enumerate(tts_audio_files):
             # If the line has sound emotion data, generate sound effect and overlay
             if idx in lines_for_sound_effect:
-                sound_effect_data = sound_emotion_results.pop(
-                    0
-                )  # Get next sound effect data
+                sound_effect_data = sound_emotion_results.pop(0)  # Get next sound effect data
                 sound_effect_filename = f"sound_effect_{idx}.wav"
 
                 # Generate sound effect asynchronously
-                sound_result = await consume_aiter(
-                    sound_generation_astream(sound_effect_data)
-                )
+                sound_result = await consume_aiter(sound_generation_astream(sound_effect_data))
                 with open(sound_effect_filename, "wb") as ab:
                     for chunk in sound_result:
                         ab.write(chunk)
@@ -217,6 +208,25 @@ class AudioGeneratorWithEffects:
 
         return audio_chunks
 
+    def _normalize_audio(self, audio_segment: AudioSegment, target_dBFS: float = -20.0) -> AudioSegment:
+        """Normalize an audio segment to the target dBFS level."""
+        change_in_dBFS = target_dBFS - audio_segment.dBFS
+        return audio_segment.apply_gain(change_in_dBFS)
+
+    def _normalize_audio_chunks(self, audio_filenames: list[str], temp_files, target_dBFS: float = -20.0) -> list[str]:
+        """Normalize all audio chunks to the target volume level."""
+        normalized_files = []
+        for audio_file in audio_filenames:
+            audio_segment = AudioSegment.from_file(audio_file)
+            normalized_audio = self._normalize_audio(audio_segment, target_dBFS)
+
+            normalized_filename = f"normalized_{Path(audio_file).stem}.wav"
+            normalized_audio.export(normalized_filename, format="wav")
+            normalized_files.append(normalized_filename)
+            temp_files.append(normalized_filename)
+
+        return normalized_files
+
     def _merge_audio_files(self, audio_filenames: list[str]) -> Path:
         """Helper function to merge multiple audio files into one."""
         combined = AudioSegment.from_file(audio_filenames[0])
@@ -236,4 +246,4 @@ class AudioGeneratorWithEffects:
         try:
             os.remove(temp_file)
         except FileNotFoundError:
-            continue
+            continue
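For context on the concurrency change above: the new _generate_tts_audio fans out one coroutine per phrase and relies on an asyncio.Semaphore so that at most ELEVENLABS_MAX_PARALLEL TTS requests are in flight at once. Below is a minimal self-contained sketch of that pattern; fake_tts_astream, consume_aiter, and MAX_PARALLEL here are hypothetical stand-ins for the repo's tts_astream helper, its consume_aiter utility, and the ELEVENLABS_MAX_PARALLEL constant.

import asyncio

MAX_PARALLEL = 3  # hypothetical stand-in for ELEVENLABS_MAX_PARALLEL
semaphore = asyncio.Semaphore(MAX_PARALLEL)

async def fake_tts_astream(voice_id: str, text: str):
    # Hypothetical stand-in: pretend to stream audio bytes in small chunks.
    for start in range(0, len(text), 4):
        await asyncio.sleep(0.01)  # simulate network latency
        yield text[start : start + 4].encode()

async def consume_aiter(aiter):
    # Drain an async iterator into a list of chunks.
    return [chunk async for chunk in aiter]

async def tts_with_semaphore(voice_id: str, text: str) -> bytes:
    # At most MAX_PARALLEL coroutines pass this point concurrently;
    # the rest queue up on the semaphore instead of hammering the API.
    async with semaphore:
        chunks = await consume_aiter(fake_tts_astream(voice_id, text))
        return b"".join(chunks)

async def main():
    phrases = [f"phrase number {i}" for i in range(10)]
    results = await asyncio.gather(
        *(tts_with_semaphore("narrator-voice", p) for p in phrases)
    )
    print(f"synthesized {len(results)} audio blobs")

asyncio.run(main())

Because asyncio.gather returns results in task order, the cap changes only how many requests run at once, not the ordering of the returned audio chunks.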
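The normalization added in this commit works on dBFS, pydub's measure of average loudness relative to full scale: shifting a segment by (target_dBFS - segment.dBFS) lands it exactly on the target level, so quiet and loud chunks merge at a consistent volume. A minimal sketch against pydub's public API, assuming pydub is installed; the Sine test tones stand in for real TTS chunks.

from pydub import AudioSegment
from pydub.generators import Sine

def normalize(segment: AudioSegment, target_dBFS: float = -20.0) -> AudioSegment:
    # Apply exactly the gain needed to reach the target loudness.
    return segment.apply_gain(target_dBFS - segment.dBFS)

# Two "chunks" at very different volumes, e.g. two different voices.
quiet = Sine(220).to_audio_segment(duration=500).apply_gain(-30)
loud = Sine(440).to_audio_segment(duration=500).apply_gain(-5)

for name, segment in [("quiet", quiet), ("loud", loud)]:
    normalized = normalize(segment)
    print(f"{name}: {segment.dBFS:.1f} dBFS -> {normalized.dBFS:.1f} dBFS")
    normalized.export(f"normalized_{name}.wav", format="wav")

One design note: dBFS-based normalization equalizes the average level of each chunk but does not compress dynamics within a chunk, which suits narration where relative emphasis should survive.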
src/builder.py CHANGED
@@ -1,6 +1,6 @@
 from langchain_community.callbacks import get_openai_callback
 
-from src.audio_generators import AudioGeneratorSimple
+from src.audio_generators import AudioGeneratorSimple, AudioGeneratorWithEffects
 from src.lc_callbacks import LCMessageLoggerAsync
 from src.select_voice_chain import SelectVoiceChainOutput, VoiceSelector
 from src.text_split_chain import SplitTextOutput, create_split_text_chain
@@ -13,7 +13,7 @@ class AudiobookBuilder:
         self.voice_selector = VoiceSelector(
             csv_table_fp="data/11labs_available_tts_voices.csv"
         )
-        self.audio_generator = AudioGeneratorSimple()
+        self.audio_generator = AudioGeneratorWithEffects()
 
     async def split_text(self, text: str) -> SplitTextOutput:
         chain = create_split_text_chain(llm_model=GPTModels.GPT_4o)
src/emotions/generation.py CHANGED
@@ -3,7 +3,7 @@ import json
 from requests import HTTPError
 from abc import ABC, abstractmethod
 
-from .prompts import SOUND_EFFECT_GENERATION, SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION, TEXT_MODIFICATION
+from .prompts import SOUND_EFFECT_GENERATION, SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION, TEXT_MODIFICATION, TEXT_MODIFICATION_WITH_SSML
 from .utils import get_audio_duration
 from src.config import logger
 
@@ -25,7 +25,7 @@ class EffectGenerator(AbstractEffectGenerator):
     def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
         self.client = openai.OpenAI(api_key=api_key)
         self.sound_effect_prompt = SOUND_EFFECT_GENERATION if predict_duration else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
-        self.text_modification_prompt = TEXT_MODIFICATION
+        self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
         self.model_type = model_type
         logger.info(f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}")
 
@@ -88,7 +88,7 @@ class EffectGeneratorAsync(AbstractEffectGenerator):
     def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
         self.client = openai.AsyncOpenAI(api_key=api_key)
         self.sound_effect_prompt = SOUND_EFFECT_GENERATION if predict_duration else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
-        self.text_modification_prompt = TEXT_MODIFICATION
+        self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
         self.model_type = model_type
 
     async def generate_text_for_sound_effect(self, text: str) -> dict:
src/emotions/prompts.py CHANGED
@@ -98,4 +98,60 @@ Adjust both according to the emotional intensity of the text.
 Example of text that could be passed:
 
 Text: "I can't believe this is happening."
+"""
+
+TEXT_MODIFICATION_WITH_SSML = """
+You should help me to make an audiobook with an overabundance of emotion-based voice using TTS.
+You are tasked with transforming the provided text into a sophisticated SSML script
+that is optimized for emotionally, dramatically, and breathtakingly rich audiobook narration.
+Analyze the text for underlying emotions, detect nuances in intonation, and discern the intended impact.
+Apply suitable SSML enhancements to ensure that the final TTS output delivers
+a powerful, engaging, dramatic, and breathtaking listening experience appropriate for an audiobook context
+(more effects/emotions are better than fewer).
+
+Please use only the provided SSML tags and don't generate any other tags.
+Key SSML Tags to Utilize:
+<speak>: This is the root element. All SSML content to be synthesized must be enclosed within this tag.
+<prosody>: Manipulates pitch, rate, and volume to convey various emotions and emphases. Use this tag to adjust the voice to match the mood and tone of different parts of the narrative.
+<break>: Inserts pauses of specified durations. Use this to create natural breaks in speech, aiding in dramatic effect and better comprehension for listeners.
+<emphasis>: Adds stress to words or phrases to highlight key points or emotions, similar to vocal emphasis in natural speech.
+<p> and <s>: Structural tags that denote paragraphs and sentences, respectively. They help to manage the flow and pacing of the narrative appropriately.
+
+Input Text Example: "He stood there, gazing into the endless horizon. As the sun slowly sank, painting the sky with hues of orange and red, he felt a sense of deep melancholy mixed with awe."
+
+The modified text should be in XML format. Expected SSML-enriched output:
+
+<speak>
+    <p>
+        <s>
+            He stood there, <prosody rate="slow" volume="soft">gazing into the endless horizon.</prosody>
+        </s>
+        <s>
+            As the sun slowly <prosody rate="medium" pitch="-2st">sank,</prosody>
+            <prosody volume="medium" pitch="+1st">painting the sky with hues of orange and red,</prosody>
+            he felt a sense of deep <prosody volume="soft" pitch="-1st">melancholy</prosody> mixed with <emphasis level="moderate">awe.</emphasis>
+        </s>
+    </p>
+</speak>
+
+After modifying the text, adjust the "stability", "similarity_boost", and "style" parameters
+according to the level of emotional intensity in the modified text.
+Higher emotional intensity should lower the "stability" and raise the "similarity_boost".
+Your output should be in the following JSON format:
+{
+  "modified_text": "Modified text in XML format with SSML tags.",
+  "params": {
+    "stability": 0.7,
+    "similarity_boost": 0.5,
+    "style": 0.3
+  }
+}
+
+The "stability" parameter should range from 0 to 1,
+with lower values indicating a more expressive, less stable voice.
+The "similarity_boost" parameter should also range from 0 to 1,
+with higher values indicating more emphasis on voice similarity.
+The "style" parameter should also range from 0 to 1,
+where lower values indicate a neutral tone and higher values reflect more stylized or emotional delivery.
+Adjust all three according to the emotional intensity of the text.
 """