trying to add sentence chunking

#1
by drewThomasson - opened
Files changed (1)
  1. app.py +146 -27
app.py CHANGED
@@ -3,9 +3,10 @@ import base64
 import time
 import uuid
 import shutil
+import hashlib
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Tuple
 import subprocess

 import ebooklib
@@ -74,14 +75,99 @@ def clone_voice(audio_path: str):
         audio_data = base64.b64encode(f.read()).decode('utf-8')
     return audio_data

-def process_text_and_generate(input_text, ref_audio_files, speed, enhance_speech, temperature, top_p, top_k, repetition_penalty, language, *args):
+def chunk_text(text: str, max_words: int = 300) -> List[str]:
+    """
+    Splits the input text into chunks with a maximum of `max_words` per chunk.
+    """
+    words = text.split()
+    chunks = []
+    for i in range(0, len(words), max_words):
+        chunk = ' '.join(words[i:i + max_words])
+        chunks.append(chunk)
+    return chunks
+
+def generate_audio_from_chunks(
+    chunks: List[str],
+    ref_audio_files: List[str],
+    speed: float,
+    enhance_speech: bool,
+    temperature: float,
+    top_p: float,
+    top_k: int,
+    repetition_penalty: float,
+    language: str
+) -> Tuple[Optional[str], str]:
+    """
+    Generates audio for each text chunk and combines them into a single audio file.
+    Returns the path to the combined audio file and a log message.
+    """
+    audio_files = []
+    log_messages = ""
+
+    for idx, chunk in enumerate(chunks):
+        result, log = process_text_and_generate(
+            chunk, ref_audio_files, speed, enhance_speech, temperature,
+            top_p, top_k, repetition_penalty, language
+        )
+        if result:
+            sample_rate, audio_array = result
+            # Save audio array to temp file
+            audio_path = temp_dir / f"chunk_{uuid.uuid4().hex[:8]}_{idx}.wav"
+            audio_tensor = torch.from_numpy(audio_array)
+            torchaudio.save(str(audio_path), audio_tensor.unsqueeze(0), sample_rate)
+            audio_files.append(str(audio_path))
+            log_messages += f"✅ Generated audio for chunk {idx + 1}/{len(chunks)}\n"
+        else:
+            logger.error(f"Failed to generate audio for chunk {idx}: {log}")
+            log_messages += f"❌ Failed to generate audio for chunk {idx + 1}: {log}\n"
+            return None, log_messages
+
+    # Create a list file for ffmpeg
+    list_file = temp_dir / f"list_{uuid.uuid4().hex[:8]}.txt"
+    with open(list_file, 'w') as f:
+        for audio_file in audio_files:
+            f.write(f"file '{audio_file}'\n")
+
+    # Define the output combined audio path
+    combined_audio_path = temp_dir / f"combined_{uuid.uuid4().hex[:8]}.wav"
+
+    try:
+        subprocess.run(
+            [
+                'ffmpeg', '-y', '-f', 'concat', '-safe', '0',
+                '-i', str(list_file),
+                '-c', 'copy',
+                str(combined_audio_path)
+            ],
+            check=True,
+            capture_output=True,
+            text=True
+        )
+        log_messages += "✅ Successfully combined all audio chunks."
+        return str(combined_audio_path), log_messages
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Failed to combine audio files: {e.stderr}")
+        log_messages += f"❌ Failed to combine audio files: {e.stderr}"
+        return None, log_messages
+
+def process_text_and_generate(
+    input_text: str,
+    ref_audio_files: List[str],
+    speed: float,
+    enhance_speech: bool,
+    temperature: float,
+    top_p: float,
+    top_k: int,
+    repetition_penalty: float,
+    language: str
+) -> Tuple[Optional[Tuple[int, np.ndarray]], str]:
     """Process text and generate audio."""
     log_messages = ""
     if not ref_audio_files:
         log_messages += "Please provide at least one reference audio!\n"
         return None, log_messages

-    # clone voices from all file paths (shorten them)
+    # Clone voices from all file paths (shorten them)
     base64_voices = ref_audio_files[:5]

     request = TTSRequest(
@@ -109,7 +195,7 @@ def process_text_and_generate(input_text, ref_audio_files, speed, enhance_speech
                 return None, log_messages
     except Exception as e:
         logger.error(f"Error: {e}")
-        log_messages += f"❌ An Error occured: {e}\n"
+        log_messages += f"❌ An Error occurred: {e}\n"
         return None, log_messages

 def build_gradio_ui():
@@ -187,26 +273,37 @@ def build_gradio_ui():
                     generate_button = gr.Button("Generate Speech")
                 with gr.Column():
                     audio_output = gr.Audio(label="Generated Audio")
-                    log_output = gr.Text(label="Log Output")
+                    log_output = gr.Textbox(label="Log Output", lines=10)

             def process_file_and_generate(
                 file_input, ref_audio_files, speed, enhance_speech,
                 temperature, top_p, top_k, repetition_penalty, language
             ):
                 if not file_input:
-                    return None, "Please provide an input file!"
+                    return None, "❌ Please provide an input file!"

                 try:
                     # Convert input file to text
                     input_text = text_from_file(file_input.name)

-                    return process_text_and_generate(
-                        input_text, ref_audio_files, speed, enhance_speech,
-                        temperature, top_p, top_k, repetition_penalty, language
+                    # Chunk the text
+                    chunks = chunk_text(input_text, max_words=300)
+
+                    # Generate audio from chunks and combine
+                    combined_audio_path, log = generate_audio_from_chunks(
+                        chunks, ref_audio_files, speed, enhance_speech, temperature, top_p,
+                        top_k, repetition_penalty, language
                     )
+
+                    if combined_audio_path:
+                        # Read the combined audio file to return as audio output
+                        waveform, sr = torchaudio.load(combined_audio_path)
+                        return (sr, waveform.numpy()), log
+                    else:
+                        return None, log
                 except Exception as e:
                     logger.error(f"Error processing file: {e}")
-                    return None, f"Error processing file: {str(e)}"
+                    return None, f"❌ Error processing file: {str(e)}"

             generate_button.click(
                 process_file_and_generate,
@@ -229,7 +326,8 @@ def build_gradio_ui():
                     )
                     mic_ref_audio = gr.Audio(
                         label="Record Reference Audio",
-                        sources=["microphone"]
+                        source="microphone",
+                        type="numpy"
                     )

                     with gr.Accordion("Advanced settings", open=False):
@@ -283,16 +381,16 @@ def build_gradio_ui():
                     generate_button_mic = gr.Button("Generate Speech")
                 with gr.Column():
                     audio_output_mic = gr.Audio(label="Generated Audio")
-                    log_output_mic = gr.Text(label="Log Output")
+                    log_output_mic = gr.Textbox(label="Log Output", lines=10)

             def process_mic_and_generate(
                 file_input, mic_ref_audio, speed_mic, enhance_speech_mic,
                 temperature_mic, top_p_mic, top_k_mic, repetition_penalty_mic, language_mic
             ):
-                if not mic_ref_audio:
-                    return None, "Please record an audio!"
+                if mic_ref_audio is None:
+                    return None, "❌ Please record an audio!"
                 if not file_input:
-                    return None, "Please provide an input file!"
+                    return None, "❌ Please provide an input file!"

                 try:
                     # Convert input file to text
@@ -303,21 +401,42 @@ def build_gradio_ui():
                     hash = hashlib.sha1(data).hexdigest()[:10]
                     output_path = temp_dir / (f"mic_{hash}.wav")

-                    torch_audio = torch.from_numpy(mic_ref_audio[1].astype(float))
-                    torchaudio.save(
-                        str(output_path),
-                        torch_audio.unsqueeze(0),
-                        mic_ref_audio[0]
-                    )
+                    # Ensure mic_ref_audio is in the correct format
+                    if isinstance(mic_ref_audio, tuple):
+                        mic_waveform, mic_sr = mic_ref_audio
+                        torch_audio = torch.from_numpy(mic_waveform.astype(float))
+                        torchaudio.save(
+                            str(output_path),
+                            torch_audio.unsqueeze(0),
+                            mic_sr
+                        )
+                    else:
+                        # If mic_ref_audio is not a tuple, handle accordingly
+                        logger.error("Invalid microphone audio format.")
+                        return None, "❌ Invalid microphone audio format."
+
+                    # Clone voice from the saved mic audio
+                    ref_audio_files = [str(output_path)]

-                    return process_text_and_generate(
-                        input_text, [Path(output_path)], speed_mic,
-                        enhance_speech_mic, temperature_mic, top_p_mic,
-                        top_k_mic, repetition_penalty_mic, language_mic
+                    # Chunk the text
+                    chunks = chunk_text(input_text, max_words=300)
+
+                    # Generate audio from chunks and combine
+                    combined_audio_path, log = generate_audio_from_chunks(
+                        chunks, ref_audio_files, speed_mic, enhance_speech_mic,
+                        temperature_mic, top_p_mic, top_k_mic, repetition_penalty_mic,
+                        language_mic
                     )
+
+                    if combined_audio_path:
+                        # Read the combined audio file to return as audio output
+                        waveform, sr = torchaudio.load(combined_audio_path)
+                        return (sr, waveform.numpy()), log
+                    else:
+                        return None, log
                 except Exception as e:
                     logger.error(f"Error processing input: {e}")
-                    return None, f"Error processing input: {str(e)}"
+                    return None, f"❌ Error processing input: {str(e)}"

             generate_button_mic.click(
                 process_mic_and_generate,
@@ -333,4 +452,4 @@ def build_gradio_ui():

 if __name__ == "__main__":
     ui = build_gradio_ui()
-    ui.launch(debug=True, server_name="0.0.0.0", server_port=7860)
+    ui.launch(debug=True, server_name="0.0.0.0", server_port=7860)
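
Note on the chunking behaviour: despite the title, the chunk_text helper added here splits on raw word counts rather than sentence boundaries. A quick standalone check of what it produces (the function body is copied from the diff above; the sample text is made up):

# Standalone check of the chunking added in this diff.
# chunk_text is copied from app.py above; the sample input is invented.
from typing import List

def chunk_text(text: str, max_words: int = 300) -> List[str]:
    """
    Splits the input text into chunks with a maximum of `max_words` per chunk.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_words):
        chunk = ' '.join(words[i:i + max_words])
        chunks.append(chunk)
    return chunks

sample = "word " * 650                  # 650 whitespace-separated words
chunks = chunk_text(sample, max_words=300)
print(len(chunks))                      # 3 -> chunks of 300, 300 and 50 words
print(len(chunks[-1].split()))          # 50

So a chunk boundary can fall in the middle of a sentence; each chunk is simply the next block of up to 300 words.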
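If true sentence-level chunking is wanted later, a minimal sketch of one way to do it is below. This is not part of this diff: the function name and the naive regex are illustrative assumptions, using only the standard library, and it packs whole sentences into chunks of roughly max_words words so no sentence is cut in half.

# Hypothetical sketch, NOT part of this diff: sentence-aware chunking.
import re
from typing import List

def chunk_text_by_sentence(text: str, max_words: int = 300) -> List[str]:
    # Naive sentence split on '.', '!' or '?' followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    chunks, current, current_words = [], [], 0
    for sentence in sentences:
        n_words = len(sentence.split())
        # Close the current chunk if adding this sentence would exceed the budget.
        if current and current_words + n_words > max_words:
            chunks.append(' '.join(current))
            current, current_words = [], 0
        current.append(sentence)
        current_words += n_words
    if current:
        chunks.append(' '.join(current))
    return chunks

A helper like this could drop in where chunk_text is called, since it returns the same List[str] shape that generate_audio_from_chunks consumes.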