Spaces:
Running
Running
add text_mod and audio normalization
Browse files- src/audio_generators.py +56 -46
- src/builder.py +2 -2
- src/emotions/generation.py +3 -3
- src/emotions/prompts.py +56 -0
src/audio_generators.py
CHANGED
@@ -59,57 +59,55 @@ class AudioGeneratorWithEffects:
|
|
59 |
|
60 |
def __init__(self):
|
61 |
self.effect_generator = EffectGeneratorAsync(AI_ML_API_KEY)
|
|
|
|
|
62 |
|
63 |
-
async def
|
64 |
self,
|
65 |
-
|
66 |
character_to_voice: dict[str, str],
|
67 |
) -> Path:
|
68 |
"""Main method to generate the audiobook with TTS, emotion, and sound effects."""
|
69 |
-
num_lines = len(
|
70 |
lines_for_sound_effect = self._select_lines_for_sound_effect(num_lines)
|
71 |
|
72 |
# Step 1: Process and modify text
|
73 |
modified_texts, sound_emotion_results = await self._process_and_modify_text(
|
74 |
-
|
75 |
)
|
76 |
|
77 |
# Step 2: Generate TTS audio for modified text
|
78 |
-
tts_results, temp_files = await self._generate_tts_audio(
|
79 |
-
|
80 |
)
|
81 |
|
82 |
# Step 3: Add sound effects to selected lines
|
83 |
audio_chunks = await self._add_sound_effects(
|
84 |
-
tts_results, lines_for_sound_effect, sound_emotion_results, temp_files
|
85 |
)
|
86 |
|
87 |
# Step 4: Merge audio files
|
88 |
-
|
|
|
89 |
|
90 |
# Clean up temporary files
|
91 |
-
self._cleanup_temp_files(temp_files)
|
92 |
|
93 |
return final_output
|
94 |
|
95 |
def _select_lines_for_sound_effect(self, num_lines: int) -> list[int]:
|
96 |
"""Select 20% of the lines randomly for sound effect generation."""
|
97 |
-
return random.sample(range(num_lines), k=int(0.
|
98 |
|
99 |
async def _process_and_modify_text(
|
100 |
-
self,
|
101 |
) -> tuple[list[dict], list[dict]]:
|
102 |
"""Process the text by modifying it and generating tasks for sound effects."""
|
103 |
tasks_for_text_modification = []
|
104 |
sound_emotion_tasks = []
|
105 |
|
106 |
-
for idx,
|
107 |
-
|
108 |
-
if not cleaned_line:
|
109 |
-
continue
|
110 |
-
|
111 |
-
# Extract character text
|
112 |
-
character_text = cleaned_line[cleaned_line.rfind("]") + 1 :].lstrip()
|
113 |
|
114 |
# Add text emotion modification task
|
115 |
tasks_for_text_modification.append(
|
@@ -132,38 +130,35 @@ class AudioGeneratorWithEffects:
|
|
132 |
|
133 |
async def _generate_tts_audio(
|
134 |
self,
|
135 |
-
|
136 |
-
modified_texts: list[dict],
|
137 |
character_to_voice: dict[str, str],
|
138 |
) -> tuple[list[str], list[str]]:
|
139 |
"""Generate TTS audio for modified text."""
|
140 |
tasks_for_tts = []
|
141 |
temp_files = []
|
142 |
-
current_character = "narrator"
|
143 |
|
144 |
-
|
145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
):
|
147 |
-
|
148 |
|
149 |
-
#
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
# Get voice ID and generate TTS
|
156 |
-
voice_id = character_to_voice[current_character]
|
157 |
-
tasks_for_tts.append(
|
158 |
-
tts_astream(
|
159 |
-
voice_id=voice_id,
|
160 |
-
text=modified_text["text"], # TODO ? type ?
|
161 |
-
params=modified_texts["params"], # TODO ? type ?
|
162 |
-
)
|
163 |
)
|
|
|
164 |
|
165 |
# Gather all TTS results
|
166 |
-
tts_results = await asyncio.gather(*
|
167 |
|
168 |
# Save the results to temporary files
|
169 |
tts_audio_files = []
|
@@ -189,15 +184,11 @@ class AudioGeneratorWithEffects:
|
|
189 |
for idx, tts_filename in enumerate(tts_audio_files):
|
190 |
# If the line has sound emotion data, generate sound effect and overlay
|
191 |
if idx in lines_for_sound_effect:
|
192 |
-
sound_effect_data = sound_emotion_results.pop(
|
193 |
-
0
|
194 |
-
) # Get next sound effect data
|
195 |
sound_effect_filename = f"sound_effect_{idx}.wav"
|
196 |
|
197 |
# Generate sound effect asynchronously
|
198 |
-
sound_result = await consume_aiter(
|
199 |
-
sound_generation_astream(sound_effect_data)
|
200 |
-
)
|
201 |
with open(sound_effect_filename, "wb") as ab:
|
202 |
for chunk in sound_result:
|
203 |
ab.write(chunk)
|
@@ -217,6 +208,25 @@ class AudioGeneratorWithEffects:
|
|
217 |
|
218 |
return audio_chunks
|
219 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
def _merge_audio_files(self, audio_filenames: list[str]) -> Path:
|
221 |
"""Helper function to merge multiple audio files into one."""
|
222 |
combined = AudioSegment.from_file(audio_filenames[0])
|
@@ -236,4 +246,4 @@ class AudioGeneratorWithEffects:
|
|
236 |
try:
|
237 |
os.remove(temp_file)
|
238 |
except FileNotFoundError:
|
239 |
-
continue
|
|
|
59 |
|
60 |
def __init__(self):
    """Prepare temp-file tracking, the TTS concurrency limiter and the effect generator."""
    # Paths of intermediate audio files; generate_audio rebinds this on every
    # run and hands it to _cleanup_temp_files when it finishes.
    self.temp_files = []
    # Entered around each TTS request so that no more than
    # ELEVENLABS_MAX_PARALLEL of them are in flight at the same time.
    self.semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)
    self.effect_generator = EffectGeneratorAsync(AI_ML_API_KEY)
|
64 |
|
65 |
+
async def generate_audio(
    self,
    text_split: SplitTextOutput,
    character_to_voice: dict[str, str],
) -> Path:
    """Generate the audiobook with TTS, emotion, and sound effects.

    Args:
        text_split: Split text whose ``phrases`` are voiced one by one.
        character_to_voice: Maps each phrase's character to a TTS voice id.

    Returns:
        Whatever ``_merge_audio_files`` returns for the merged audiobook.

    Raises:
        Any exception raised by one of the pipeline steps is propagated
        unchanged, after the temporary files known so far have been cleaned up.
    """
    # Track this run's temporary files in a local list so that a failure
    # before TTS never makes the cleanup revisit a previous run's files.
    # The list is mirrored on ``self.temp_files`` for backward compatibility.
    temp_files: list[str] = []
    self.temp_files = temp_files

    try:
        num_lines = len(text_split.phrases)
        lines_for_sound_effect = self._select_lines_for_sound_effect(num_lines)

        # Step 1: Process and modify text
        modified_texts, sound_emotion_results = await self._process_and_modify_text(
            text_split, lines_for_sound_effect
        )

        # Step 2: Generate TTS audio for modified text
        tts_results, temp_files = await self._generate_tts_audio(
            text_split, modified_texts, character_to_voice
        )
        self.temp_files = temp_files

        # Step 3: Add sound effects to selected lines
        audio_chunks = await self._add_sound_effects(
            tts_results, lines_for_sound_effect, sound_emotion_results, temp_files
        )

        # Step 4: Normalize the volume of every chunk, then merge them
        normalized_audio_chunks = self._normalize_audio_chunks(audio_chunks, temp_files)
        final_output = self._merge_audio_files(normalized_audio_chunks)
    finally:
        # Clean up temporary files even when a step above raised, so a failed
        # run does not leave intermediate audio files behind.
        self._cleanup_temp_files(temp_files)

    return final_output
|
97 |
|
98 |
def _select_lines_for_sound_effect(self, num_lines: int, fraction: float = 0.0) -> list[int]:
    """Randomly pick the indices of the lines that should receive a sound effect.

    Args:
        num_lines: Total number of lines; indices are drawn from
            ``range(num_lines)``.
        fraction: Share of the lines to select, between 0 and 1. The default
            of 0.0 selects nothing, so no line gets a sound effect; pass 0.2
            to select 20% of the lines.

    Returns:
        Distinct line indices in random order. Their count is
        ``int(fraction * num_lines)``, i.e. rounded down.

    Raises:
        ValueError: If ``fraction`` is not within ``[0, 1]``.
    """
    if not 0.0 <= fraction <= 1.0:
        raise ValueError(f"fraction must be between 0 and 1, got {fraction!r}")
    how_many = int(fraction * num_lines)
    return random.sample(range(num_lines), k=how_many)
|
101 |
|
102 |
async def _process_and_modify_text(
|
103 |
+
self, text_split: SplitTextOutput, lines_for_sound_effect: list[int]
|
104 |
) -> tuple[list[dict], list[dict]]:
|
105 |
"""Process the text by modifying it and generating tasks for sound effects."""
|
106 |
tasks_for_text_modification = []
|
107 |
sound_emotion_tasks = []
|
108 |
|
109 |
+
for idx, character_phrase in enumerate(text_split.phrases):
|
110 |
+
character_text = character_phrase.text.strip().lower()
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
# Add text emotion modification task
|
113 |
tasks_for_text_modification.append(
|
|
|
130 |
|
131 |
async def _generate_tts_audio(
|
132 |
self,
|
133 |
+
text_split: SplitTextOutput,
|
134 |
+
modified_texts: list[dict],
|
135 |
character_to_voice: dict[str, str],
|
136 |
) -> tuple[list[str], list[str]]:
|
137 |
"""Generate TTS audio for modified text."""
|
138 |
tasks_for_tts = []
|
139 |
temp_files = []
|
|
|
140 |
|
141 |
+
async def tts_astream_with_semaphore(voice_id: str, text: str, params: dict):
|
142 |
+
async with self.semaphore:
|
143 |
+
iter_ = tts_astream(voice_id=voice_id, text=text, params=params)
|
144 |
+
bytes_ = await consume_aiter(iter_)
|
145 |
+
return bytes_
|
146 |
+
|
147 |
+
for idx, (modified_text, character_phrase) in enumerate(
|
148 |
+
zip(modified_texts, text_split.phrases)
|
149 |
):
|
150 |
+
voice_id = character_to_voice[character_phrase.character]
|
151 |
|
152 |
+
# Use the semaphore-protected TTS function
|
153 |
+
task = tts_astream_with_semaphore(
|
154 |
+
voice_id=voice_id,
|
155 |
+
text=modified_text["modified_text"],
|
156 |
+
params=modified_text["params"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
)
|
158 |
+
tasks_for_tts.append(task)
|
159 |
|
160 |
# Gather all TTS results
|
161 |
+
tts_results = await asyncio.gather(*tasks_for_tts)
|
162 |
|
163 |
# Save the results to temporary files
|
164 |
tts_audio_files = []
|
|
|
184 |
for idx, tts_filename in enumerate(tts_audio_files):
|
185 |
# If the line has sound emotion data, generate sound effect and overlay
|
186 |
if idx in lines_for_sound_effect:
|
187 |
+
sound_effect_data = sound_emotion_results.pop(0) # Get next sound effect data
|
|
|
|
|
188 |
sound_effect_filename = f"sound_effect_{idx}.wav"
|
189 |
|
190 |
# Generate sound effect asynchronously
|
191 |
+
sound_result = await consume_aiter(sound_generation_astream(sound_effect_data))
|
|
|
|
|
192 |
with open(sound_effect_filename, "wb") as ab:
|
193 |
for chunk in sound_result:
|
194 |
ab.write(chunk)
|
|
|
208 |
|
209 |
return audio_chunks
|
210 |
|
211 |
+
def _normalize_audio(self, audio_segment: AudioSegment, target_dBFS: float = -20.0) -> AudioSegment:
    """Normalize an audio segment to the target dBFS level.

    Args:
        audio_segment: Segment whose loudness should be adjusted.
        target_dBFS: Desired average loudness in dBFS.

    Returns:
        The result of applying the gain that moves the segment's ``dBFS`` to
        ``target_dBFS``. A completely silent segment is returned unchanged.
    """
    current_dBFS = audio_segment.dBFS
    # Silence is reported as -inf dBFS; the gain needed to reach the target
    # would then be +inf, which cannot be applied. There is nothing to
    # amplify anyway, so hand the segment back as it is.
    if current_dBFS == float("-inf"):
        return audio_segment
    return audio_segment.apply_gain(target_dBFS - current_dBFS)
|
215 |
+
|
216 |
+
def _normalize_audio_chunks(self, audio_filenames: list[str], temp_files, target_dBFS: float = -20.0) -> list[str]:
    """Write a volume-normalized WAV copy of every chunk.

    Args:
        audio_filenames: Paths of the audio chunks to normalize, in order.
        temp_files: List that receives the name of every file written here,
            so the caller can delete them later. It is mutated in place.
        target_dBFS: Loudness level passed on to ``_normalize_audio``.

    Returns:
        The names of the normalized files, in the same order as the input.
        Each is ``normalized_<stem>.wav`` and is written relative to the
        current working directory.
    """
    outputs = []
    for source_path in audio_filenames:
        leveled = self._normalize_audio(AudioSegment.from_file(source_path), target_dBFS)

        out_name = f"normalized_{Path(source_path).stem}.wav"
        leveled.export(out_name, format="wav")

        # Register the copy as temporary as well as returning it.
        temp_files.append(out_name)
        outputs.append(out_name)

    return outputs
|
229 |
+
|
230 |
def _merge_audio_files(self, audio_filenames: list[str]) -> Path:
|
231 |
"""Helper function to merge multiple audio files into one."""
|
232 |
combined = AudioSegment.from_file(audio_filenames[0])
|
|
|
246 |
try:
|
247 |
os.remove(temp_file)
|
248 |
except FileNotFoundError:
|
249 |
+
continue
|
src/builder.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
from langchain_community.callbacks import get_openai_callback
|
2 |
|
3 |
-
from src.audio_generators import AudioGeneratorSimple
|
4 |
from src.lc_callbacks import LCMessageLoggerAsync
|
5 |
from src.select_voice_chain import SelectVoiceChainOutput, VoiceSelector
|
6 |
from src.text_split_chain import SplitTextOutput, create_split_text_chain
|
@@ -13,7 +13,7 @@ class AudiobookBuilder:
|
|
13 |
self.voice_selector = VoiceSelector(
|
14 |
csv_table_fp="data/11labs_available_tts_voices.csv"
|
15 |
)
|
16 |
-
self.audio_generator =
|
17 |
|
18 |
async def split_text(self, text: str) -> SplitTextOutput:
|
19 |
chain = create_split_text_chain(llm_model=GPTModels.GPT_4o)
|
|
|
1 |
from langchain_community.callbacks import get_openai_callback
|
2 |
|
3 |
+
from src.audio_generators import AudioGeneratorSimple, AudioGeneratorWithEffects
|
4 |
from src.lc_callbacks import LCMessageLoggerAsync
|
5 |
from src.select_voice_chain import SelectVoiceChainOutput, VoiceSelector
|
6 |
from src.text_split_chain import SplitTextOutput, create_split_text_chain
|
|
|
13 |
self.voice_selector = VoiceSelector(
|
14 |
csv_table_fp="data/11labs_available_tts_voices.csv"
|
15 |
)
|
16 |
+
self.audio_generator = AudioGeneratorWithEffects()
|
17 |
|
18 |
async def split_text(self, text: str) -> SplitTextOutput:
|
19 |
chain = create_split_text_chain(llm_model=GPTModels.GPT_4o)
|
src/emotions/generation.py
CHANGED
@@ -3,7 +3,7 @@ import json
|
|
3 |
from requests import HTTPError
|
4 |
from abc import ABC, abstractmethod
|
5 |
|
6 |
-
from .prompts import SOUND_EFFECT_GENERATION, SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION, TEXT_MODIFICATION
|
7 |
from .utils import get_audio_duration
|
8 |
from src.config import logger
|
9 |
|
@@ -25,7 +25,7 @@ class EffectGenerator(AbstractEffectGenerator):
|
|
25 |
def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
|
26 |
self.client = openai.OpenAI(api_key=api_key)
|
27 |
self.sound_effect_prompt = SOUND_EFFECT_GENERATION if predict_duration else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
|
28 |
-
self.text_modification_prompt =
|
29 |
self.model_type = model_type
|
30 |
logger.info(f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}")
|
31 |
|
@@ -88,7 +88,7 @@ class EffectGeneratorAsync(AbstractEffectGenerator):
|
|
88 |
def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
|
89 |
self.client = openai.AsyncOpenAI(api_key=api_key)
|
90 |
self.sound_effect_prompt = SOUND_EFFECT_GENERATION if predict_duration else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
|
91 |
-
self.text_modification_prompt =
|
92 |
self.model_type = model_type
|
93 |
|
94 |
async def generate_text_for_sound_effect(self, text: str) -> dict:
|
|
|
3 |
from requests import HTTPError
|
4 |
from abc import ABC, abstractmethod
|
5 |
|
6 |
+
from .prompts import SOUND_EFFECT_GENERATION, SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION, TEXT_MODIFICATION, TEXT_MODIFICATION_WITH_SSML
|
7 |
from .utils import get_audio_duration
|
8 |
from src.config import logger
|
9 |
|
|
|
25 |
def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
    """Create the synchronous effect generator.

    Args:
        api_key: Key handed to the ``openai.OpenAI`` client.
        predict_duration: When true, the sound-effect prompt variant with
            duration prediction is used; otherwise the variant without it.
        model_type: Stored as ``self.model_type``.
    """
    if predict_duration:
        sound_prompt = SOUND_EFFECT_GENERATION
    else:
        sound_prompt = SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION

    self.client = openai.OpenAI(api_key=api_key)
    self.sound_effect_prompt = sound_prompt
    self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
    self.model_type = model_type
    logger.info(f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}")
|
31 |
|
|
|
88 |
def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
    """Create the asynchronous effect generator.

    Args:
        api_key: Key handed to the ``openai.AsyncOpenAI`` client.
        predict_duration: When true, the sound-effect prompt variant with
            duration prediction is used; otherwise the variant without it.
        model_type: Stored as ``self.model_type``.
    """
    if predict_duration:
        sound_prompt = SOUND_EFFECT_GENERATION
    else:
        sound_prompt = SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION

    self.client = openai.AsyncOpenAI(api_key=api_key)
    self.sound_effect_prompt = sound_prompt
    self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
    self.model_type = model_type
|
93 |
|
94 |
async def generate_text_for_sound_effect(self, text: str) -> dict:
|
src/emotions/prompts.py
CHANGED
@@ -98,4 +98,60 @@ Adjust both according to the emotional intensity of the text.
|
|
98 |
Example of text that could be passed:
|
99 |
|
100 |
Text: "I can't believe this is happening."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
"""
|
|
|
98 |
Example of text that could be passed:
|
99 |
|
100 |
Text: "I can't believe this is happening."
|
101 |
+
"""
|
102 |
+
|
103 |
+
# Prompt that asks the model to rewrite a passage as SSML (restricted to the
# tags listed in the prompt) and to answer with a JSON object holding
# "modified_text" plus the "stability", "similarity_boost" and "style" params.
TEXT_MODIFICATION_WITH_SSML = """
You should help me to make an audiobook with overabundant emotion-based voice using TTS.
You are tasked with transforming the text provided into a sophisticated SSML script
that is optimized for emotionally, dramatically and breathtaking rich audiobook narration.
Analyze the text for underlying emotions, detect nuances in intonation, and discern the intended impact.
Apply suitable SSML enhancements to ensure that the final TTS output delivers
a powerful, engaging, dramatic and breathtaking listening experience appropriate for an audiobook context
(more effects/emotions are better than less).

Please, use only provided SSML tags and don't generate any other tags.
Key SSML Tags to Utilize:
<speak>: This is the root element. All SSML content to be synthesized must be enclosed within this tag.
<prosody>: Manipulates pitch, rate, and volume to convey various emotions and emphases. Use this tag to adjust the voice to match the mood and tone of different parts of the narrative.
<break>: Inserts pauses of specified durations. Use this to create natural breaks in speech, aiding in dramatic effect and better comprehension for listeners.
<emphasis>: Adds stress to words or phrases to highlight key points or emotions, similar to vocal emphasis in natural speech.
<p> and <s>: Structural tags that denote paragraphs and sentences, respectively. They help to manage the flow and pacing of the narrative appropriately.

Input Text Example: "He stood there, gazing into the endless horizon. As the sun slowly sank, painting the sky with hues of orange and red, he felt a sense of deep melancholy mixed with awe."

Modified text should be in the XML format. Expected SSML-enriched Output:

<speak>
<p>
<s>
He stood there, <prosody rate="slow" volume="soft">gazing into the endless horizon.</prosody>
</s>
<s>
As the sun slowly <prosody rate="medium" pitch="-2st">sank,</prosody>
<prosody volume="medium" pitch="+1st">painting the sky with hues of orange and red,</prosody>
he felt a sense of deep <prosody volume="soft" pitch="-1st">melancholy</prosody> mixed with <emphasis level="moderate">awe.</emphasis>
</s>
</p>
</speak>

After modifying the text, adjust the "stability", "similarity_boost" and "style" parameters
according to the level of emotional intensity in the modified text.
Higher emotional intensity should lower the "stability" and raise the "similarity_boost".
Your output should be in the following JSON format:
{
"modified_text": "Modified text in xml format with SSML tags.",
"params": {
"stability": 0.7,
"similarity_boost": 0.5,
"style": 0.3
}
}

The "stability" parameter should range from 0 to 1,
with lower values indicating a more expressive, less stable voice.
The "similarity_boost" parameter should also range from 0 to 1,
with higher values indicating more emphasis on the voice similarity.
The "style" parameter should also range from 0 to 1,
where lower values indicate a neutral tone and higher values reflect more stylized or emotional delivery.
Adjust all three according to the emotional intensity of the text.
"""
|