Spaces:

ales
/

ai-audio-books

Running

App Files Files Community

bl4dylion committed on Oct 10, 2024

Commit

3ee8f12

1 Parent(s): 3a9a0d8

add text_mod and audio normalization

Browse files

Files changed (4) hide show

src/audio_generators.py +56 -46
src/builder.py +2 -2
src/emotions/generation.py +3 -3
src/emotions/prompts.py +56 -0

src/audio_generators.py CHANGED Viewed

@@ -59,57 +59,55 @@ class AudioGeneratorWithEffects:
     def __init__(self):
         self.effect_generator = EffectGeneratorAsync(AI_ML_API_KEY)
-    async def generate_audio_with_text_modification(
         self,
-        annotated_text: str,
         character_to_voice: dict[str, str],
     ) -> Path:
         """Main method to generate the audiobook with TTS, emotion, and sound effects."""
-        num_lines = len(annotated_text.splitlines())
         lines_for_sound_effect = self._select_lines_for_sound_effect(num_lines)
         # Step 1: Process and modify text
         modified_texts, sound_emotion_results = await self._process_and_modify_text(
-            annotated_text, lines_for_sound_effect
         )
         # Step 2: Generate TTS audio for modified text
-        tts_results, temp_files = await self._generate_tts_audio(
-            annotated_text, modified_texts, character_to_voice
         )
         # Step 3: Add sound effects to selected lines
         audio_chunks = await self._add_sound_effects(
-            tts_results, lines_for_sound_effect, sound_emotion_results, temp_files
         )
         # Step 4: Merge audio files
-        final_output = self._merge_audio_files(audio_chunks)
         # Clean up temporary files
-        self._cleanup_temp_files(temp_files)
         return final_output
     def _select_lines_for_sound_effect(self, num_lines: int) -> list[int]:
         """Select 20% of the lines randomly for sound effect generation."""
-        return random.sample(range(num_lines), k=int(0.2 * num_lines))
     async def _process_and_modify_text(
-        self, annotated_text: str, lines_for_sound_effect: list[int]
     ) -> tuple[list[dict], list[dict]]:
         """Process the text by modifying it and generating tasks for sound effects."""
         tasks_for_text_modification = []
         sound_emotion_tasks = []
-        for idx, line in enumerate(annotated_text.splitlines()):
-            cleaned_line = line.strip().lower()
-            if not cleaned_line:
-                continue
-            # Extract character text
-            character_text = cleaned_line[cleaned_line.rfind("]") + 1 :].lstrip()
             # Add text emotion modification task
             tasks_for_text_modification.append(
@@ -132,38 +130,35 @@ class AudioGeneratorWithEffects:
     async def _generate_tts_audio(
         self,
-        annotated_text: str,
-        modified_texts: list[dict],  # TODO ? type ?
         character_to_voice: dict[str, str],
     ) -> tuple[list[str], list[str]]:
         """Generate TTS audio for modified text."""
         tasks_for_tts = []
         temp_files = []
-        current_character = "narrator"
-        for idx, (modified_text, line) in enumerate(
-            zip(modified_texts, annotated_text.splitlines())
         ):
-            cleaned_line = line.strip().lower()
-            # Extract character
-            try:
-                current_character = re.findall(r"\[[\w\s]+\]", cleaned_line)[0][1:-1]
-            except IndexError:
-                pass
-            # Get voice ID and generate TTS
-            voice_id = character_to_voice[current_character]
-            tasks_for_tts.append(
-                tts_astream(
-                    voice_id=voice_id,
-                    text=modified_text["text"],  # TODO ? type ?
-                    params=modified_texts["params"],  # TODO ? type ?
-                )
             )
         # Gather all TTS results
-        tts_results = await asyncio.gather(*(consume_aiter(t) for t in tasks_for_tts))
         # Save the results to temporary files
         tts_audio_files = []
@@ -189,15 +184,11 @@ class AudioGeneratorWithEffects:
         for idx, tts_filename in enumerate(tts_audio_files):
             # If the line has sound emotion data, generate sound effect and overlay
             if idx in lines_for_sound_effect:
-                sound_effect_data = sound_emotion_results.pop(
-                    0
-                )  # Get next sound effect data
                 sound_effect_filename = f"sound_effect_{idx}.wav"
                 # Generate sound effect asynchronously
-                sound_result = await consume_aiter(
-                    sound_generation_astream(sound_effect_data)
-                )
                 with open(sound_effect_filename, "wb") as ab:
                     for chunk in sound_result:
                         ab.write(chunk)
@@ -217,6 +208,25 @@ class AudioGeneratorWithEffects:
         return audio_chunks
     def _merge_audio_files(self, audio_filenames: list[str]) -> Path:
         """Helper function to merge multiple audio files into one."""
         combined = AudioSegment.from_file(audio_filenames[0])
@@ -236,4 +246,4 @@ class AudioGeneratorWithEffects:
             try:
                 os.remove(temp_file)
             except FileNotFoundError:
-                continue

     def __init__(self):
         """Create the effect generator and the per-instance TTS bookkeeping."""
         # Paths of intermediate audio files; generate_audio() passes this
         # list to _cleanup_temp_files().
         self.temp_files = []
         # Acquired around every TTS request to cap how many run in parallel.
         self.semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)
         self.effect_generator = EffectGeneratorAsync(AI_ML_API_KEY)
+    async def generate_audio(
         self,
+        text_split: SplitTextOutput,
         character_to_voice: dict[str, str],
     ) -> Path:
         """Main method to generate the audiobook with TTS, emotion, and sound effects."""
+        num_lines = len(text_split.phrases)
         lines_for_sound_effect = self._select_lines_for_sound_effect(num_lines)
         # Step 1: Process and modify text
         modified_texts, sound_emotion_results = await self._process_and_modify_text(
+            text_split, lines_for_sound_effect
         )
         # Step 2: Generate TTS audio for modified text
+        tts_results, self.temp_files = await self._generate_tts_audio(
+            text_split, modified_texts, character_to_voice
         )
         # Step 3: Add sound effects to selected lines
         audio_chunks = await self._add_sound_effects(
+            tts_results, lines_for_sound_effect, sound_emotion_results, self.temp_files
         )
         # Step 4: Merge audio files
+        normalized_audio_chunks = self._normalize_audio_chunks(audio_chunks, self.temp_files)
+        final_output = self._merge_audio_files(normalized_audio_chunks)
         # Clean up temporary files
+        self._cleanup_temp_files(self.temp_files)
         return final_output
     def _select_lines_for_sound_effect(self, num_lines: int) -> list[int]:
         """Select 20% of the lines randomly for sound effect generation."""
+        return random.sample(range(num_lines), k=int(0.0 * num_lines))
     async def _process_and_modify_text(
+        self, text_split: SplitTextOutput, lines_for_sound_effect: list[int]
     ) -> tuple[list[dict], list[dict]]:
         """Process the text by modifying it and generating tasks for sound effects."""
         tasks_for_text_modification = []
         sound_emotion_tasks = []
+        for idx, character_phrase in enumerate(text_split.phrases):
+            character_text = character_phrase.text.strip().lower()
             # Add text emotion modification task
             tasks_for_text_modification.append(
     async def _generate_tts_audio(
         self,
+        text_split: SplitTextOutput,
+        modified_texts: list[dict],
         character_to_voice: dict[str, str],
     ) -> tuple[list[str], list[str]]:
         """Generate TTS audio for modified text."""
         tasks_for_tts = []
         temp_files = []
+        async def tts_astream_with_semaphore(voice_id: str, text: str, params: dict):
+            async with self.semaphore:
+                iter_ = tts_astream(voice_id=voice_id, text=text, params=params)
+                bytes_ = await consume_aiter(iter_)
+                return bytes_
+        for idx, (modified_text, character_phrase) in enumerate(
+            zip(modified_texts, text_split.phrases)
         ):
+            voice_id = character_to_voice[character_phrase.character]
+            # Use the semaphore-protected TTS function
+            task = tts_astream_with_semaphore(
+                voice_id=voice_id,
+                text=modified_text["modified_text"],
+                params=modified_text["params"],
             )
+            tasks_for_tts.append(task)
         # Gather all TTS results
+        tts_results = await asyncio.gather(*tasks_for_tts)
         # Save the results to temporary files
         tts_audio_files = []
         for idx, tts_filename in enumerate(tts_audio_files):
             # If the line has sound emotion data, generate sound effect and overlay
             if idx in lines_for_sound_effect:
+                sound_effect_data = sound_emotion_results.pop(0)  # Get next sound effect data
                 sound_effect_filename = f"sound_effect_{idx}.wav"
                 # Generate sound effect asynchronously
+                sound_result = await consume_aiter(sound_generation_astream(sound_effect_data))
                 with open(sound_effect_filename, "wb") as ab:
                     for chunk in sound_result:
                         ab.write(chunk)
         return audio_chunks
+    def _normalize_audio(self, audio_segment: AudioSegment, target_dBFS: float = -20.0) -> AudioSegment:
+        """Normalize an audio segment to the target dBFS level."""
+        change_in_dBFS = target_dBFS - audio_segment.dBFS
+        return audio_segment.apply_gain(change_in_dBFS)
+    def _normalize_audio_chunks(self, audio_filenames: list[str], temp_files, target_dBFS: float = -20.0) -> list[str]:
+        """Normalize all audio chunks to the target volume level."""
+        normalized_files = []
+        for audio_file in audio_filenames:
+            audio_segment = AudioSegment.from_file(audio_file)
+            normalized_audio = self._normalize_audio(audio_segment, target_dBFS)
+            normalized_filename = f"normalized_{Path(audio_file).stem}.wav"
+            normalized_audio.export(normalized_filename, format="wav")
+            normalized_files.append(normalized_filename)
+            temp_files.append(normalized_filename)
+        return normalized_files
     def _merge_audio_files(self, audio_filenames: list[str]) -> Path:
         """Helper function to merge multiple audio files into one."""
         combined = AudioSegment.from_file(audio_filenames[0])
             try:
                 os.remove(temp_file)
             except FileNotFoundError:
+                continue

src/builder.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from langchain_community.callbacks import get_openai_callback
-from src.audio_generators import AudioGeneratorSimple
 from src.lc_callbacks import LCMessageLoggerAsync
 from src.select_voice_chain import SelectVoiceChainOutput, VoiceSelector
 from src.text_split_chain import SplitTextOutput, create_split_text_chain
@@ -13,7 +13,7 @@ class AudiobookBuilder:
         self.voice_selector = VoiceSelector(
             csv_table_fp="data/11labs_available_tts_voices.csv"
         )
-        self.audio_generator = AudioGeneratorSimple()
     async def split_text(self, text: str) -> SplitTextOutput:
         chain = create_split_text_chain(llm_model=GPTModels.GPT_4o)

 from langchain_community.callbacks import get_openai_callback
+from src.audio_generators import AudioGeneratorSimple, AudioGeneratorWithEffects
 from src.lc_callbacks import LCMessageLoggerAsync
 from src.select_voice_chain import SelectVoiceChainOutput, VoiceSelector
 from src.text_split_chain import SplitTextOutput, create_split_text_chain
         self.voice_selector = VoiceSelector(
             csv_table_fp="data/11labs_available_tts_voices.csv"
         )
+        self.audio_generator = AudioGeneratorWithEffects()
     async def split_text(self, text: str) -> SplitTextOutput:
         chain = create_split_text_chain(llm_model=GPTModels.GPT_4o)

src/emotions/generation.py CHANGED Viewed

@@ -3,7 +3,7 @@ import json
 from requests import HTTPError
 from abc import ABC, abstractmethod
-from .prompts import SOUND_EFFECT_GENERATION, SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION, TEXT_MODIFICATION
 from .utils import get_audio_duration
 from src.config import logger
@@ -25,7 +25,7 @@ class EffectGenerator(AbstractEffectGenerator):
     def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
         self.client = openai.OpenAI(api_key=api_key)
         self.sound_effect_prompt = SOUND_EFFECT_GENERATION if predict_duration else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
-        self.text_modification_prompt = TEXT_MODIFICATION
         self.model_type = model_type
         logger.info(f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}")
@@ -88,7 +88,7 @@ class EffectGeneratorAsync(AbstractEffectGenerator):
     def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
         self.client = openai.AsyncOpenAI(api_key=api_key)
         self.sound_effect_prompt = SOUND_EFFECT_GENERATION if predict_duration else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
-        self.text_modification_prompt = TEXT_MODIFICATION
         self.model_type = model_type
     async def generate_text_for_sound_effect(self, text: str) -> dict:

 from requests import HTTPError
 from abc import ABC, abstractmethod
+from .prompts import SOUND_EFFECT_GENERATION, SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION, TEXT_MODIFICATION, TEXT_MODIFICATION_WITH_SSML
 from .utils import get_audio_duration
 from src.config import logger
     def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
         self.client = openai.OpenAI(api_key=api_key)
         self.sound_effect_prompt = SOUND_EFFECT_GENERATION if predict_duration else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
+        self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
         self.model_type = model_type
         logger.info(f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}")
     def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
         """Create the async OpenAI client and pick the prompts used by this generator.

         Args:
             api_key: Key passed to ``openai.AsyncOpenAI``.
             predict_duration: Selects the sound-effect prompt variant; when
                 false the "without duration prediction" prompt is used.
             model_type: Stored as ``self.model_type``.
         """
         self.client = openai.AsyncOpenAI(api_key=api_key)
         self.model_type = model_type
         if predict_duration:
             self.sound_effect_prompt = SOUND_EFFECT_GENERATION
         else:
             self.sound_effect_prompt = SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
         # Text modification always uses the SSML-producing prompt.
         self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
     async def generate_text_for_sound_effect(self, text: str) -> dict:

src/emotions/prompts.py CHANGED Viewed

@@ -98,4 +98,60 @@ Adjust both according to the emotional intensity of the text.
 Example of text that could be passed:
 Text: "I can't believe this is happening."
 """

 Example of text that could be passed:
 Text: "I can't believe this is happening."
+"""
+TEXT_MODIFICATION_WITH_SSML = """
+You should help me to make an audiobook with overabundant emotion-based voice using TTS.
+You are tasked with transforming the text provided into a sophisticated SSML script
+that is optimized for emotionally rich, dramatic and breathtaking audiobook narration.
+Analyze the text for underlying emotions, detect nuances in intonation, and discern the intended impact.
+Apply suitable SSML enhancements to ensure that the final TTS output delivers
+a powerful, engaging, dramatic and breathtaking listening experience appropriate for an audiobook context
+(more effects/emotions are better than fewer).
+Please, use only provided SSML tags and don't generate any other tags.
+Key SSML Tags to Utilize:
+<speak>: This is the root element. All SSML content to be synthesized must be enclosed within this tag.
+<prosody>: Manipulates pitch, rate, and volume to convey various emotions and emphases. Use this tag to adjust the voice to match the mood and tone of different parts of the narrative.
+<break>: Inserts pauses of specified durations. Use this to create natural breaks in speech, aiding in dramatic effect and better comprehension for listeners.
+<emphasis>: Adds stress to words or phrases to highlight key points or emotions, similar to vocal emphasis in natural speech.
+<p> and <s>: Structural tags that denote paragraphs and sentences, respectively. They help to manage the flow and pacing of the narrative appropriately.
+Input Text Example: "He stood there, gazing into the endless horizon. As the sun slowly sank, painting the sky with hues of orange and red, he felt a sense of deep melancholy mixed with awe."
+Modified text should be in the XML format. Expected SSML-enriched Output:
+<speak>
+    <p>
+        <s>
+            He stood there, <prosody rate="slow" volume="soft">gazing into the endless horizon.</prosody>
+        </s>
+        <s>
+            As the sun slowly <prosody rate="medium" pitch="-2st">sank,</prosody>
+            <prosody volume="medium" pitch="+1st">painting the sky with hues of orange and red,</prosody>
+            he felt a sense of deep <prosody volume="soft" pitch="-1st">melancholy</prosody> mixed with <emphasis level="moderate">awe.</emphasis>
+        </s>
+    </p>
+</speak>
+After modifying the text, adjust the "stability", "similarity_boost" and "style" parameters
+according to the level of emotional intensity in the modified text.
+Higher emotional intensity should lower the "stability" and raise the "similarity_boost".
+Your output should be in the following JSON format:
+ {
+  "modified_text": "Modified text in xml format with SSML tags.",
+  "params": {
+    "stability": 0.7,
+    "similarity_boost": 0.5,
+    "style": 0.3
+  }
+}
+The "stability" parameter should range from 0 to 1,
+with lower values indicating a more expressive, less stable voice.
+The "similarity_boost" parameter should also range from 0 to 1,
+with higher values indicating more emphasis on the voice similarity.
+The "style" parameter should also range from 0 to 1,
+where lower values indicate a neutral tone and higher values reflect more stylized or emotional delivery.
+Adjust all three according to the emotional intensity of the text.
 """