Spaces:

Remsky
/

Kokoro-TTS-Zero

Running on Zero

Remsky committed 3 days ago

Commit

9a2b6d1

1 Parent(s): 7cffcd0

Add .gitignore, update requirements, and implement book processing utilities

- Added .gitignore to exclude specific files and directories.
- Updated SDK version in README.md.
- Created new utility functions for processing book XML files and retrieving chapter information.
- Added mock TTS implementation for local development.
- Updated UI content with additional information and warnings.

Files changed (12) hide show

.gitignore +3 -0
README.md +3 -5
app.py +168 -15
lib/book_utils.py +66 -0
lib/mock_tts.py +41 -0
lib/ui_content.py +17 -4
parse_chapters.py +32 -0
requirements.txt +1 -1
texts/processed/dorian_grey.xml +0 -0
texts/processed/time_machine.xml +0 -0
texts/processor.py +80 -0
voices/voices/mock_voice.pt +3 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+dorian_grey.txt
+texts/time_machine.txt
+*.pyc

README.md CHANGED Viewed

@@ -4,10 +4,10 @@ emoji: 🎴
 colorFrom: gray
 colorTo: purple
 sdk: gradio
-sdk_version: 5.9.1
 app_file: app.py
 pinned: true
-short_description: Accelerated Text-To-Speech on Kokoro-82M
 models:
 - hexgrad/Kokoro-82M
 ---
@@ -42,6 +42,4 @@ Main dependencies:
 - Transformers 4.47.1
 - HuggingFace Hub ≥0.25.1
-For a complete list, see requirements.txt.

 colorFrom: gray
 colorTo: purple
 sdk: gradio
+sdk_version: 5.10.0
 app_file: app.py
 pinned: true
+short_description: Accelerated Text-To-Speech on Kokoro-82M
 models:
 - hexgrad/Kokoro-82M
 ---
 - Transformers 4.47.1
 - HuggingFace Hub ≥0.25.1
+For a complete list, see requirements.txt.

app.py CHANGED Viewed

@@ -1,14 +1,16 @@
 import os
 import gradio as gr
-import spaces
 import time
 import matplotlib.pyplot as plt
 import numpy as np
-import torch
-import os
-from tts_model import TTSModel
 from lib import format_audio_output
 from lib.ui_content import header_html, demo_text_info
 # Set HF_HOME for faster restarts with cached models/voices
 os.environ["HF_HOME"] = "/data/.huggingface"
@@ -16,6 +18,16 @@ os.environ["HF_HOME"] = "/data/.huggingface"
 # Create TTS model instance
 model = TTSModel()
 def initialize_model():
     """Initialize model and get voices"""
     if model.model is None:
@@ -163,6 +175,7 @@ def create_performance_plot(metrics, voice_names):
     return fig, metrics_text
 # Create Gradio interface
 with gr.Blocks(title="Kokoro TTS Demo", css="""
     .equal-height {
@@ -170,20 +183,157 @@ with gr.Blocks(title="Kokoro TTS Demo", css="""
         display: flex;
         flex-direction: column;
     }
 """) as demo:
     gr.HTML(header_html)
     with gr.Row():
-        # Column 1: Text Input
-        with open("the_time_machine_hgwells.txt") as f:
-            text = f.readlines()[:200]
-            text = "".join(text)
         with gr.Column(elem_classes="equal-height"):
             text_input = gr.TextArea(
-                label="Text to speak",
-                placeholder="Enter text here or upload a .txt file",
                 lines=10,
-                value=text
             )
         # Column 2: Controls
@@ -196,16 +346,19 @@ with gr.Blocks(title="Kokoro TTS Demo", css="""
             def load_text_from_file(file_bytes):
                 if file_bytes is None:
-                    return None
                 try:
-                    return file_bytes.decode('utf-8')
                 except Exception as e:
                     raise gr.Error(f"Failed to read file: {str(e)}")
             file_input.change(
                 fn=load_text_from_file,
                 inputs=[file_input],
-                outputs=[text_input]
             )
             with gr.Group():
@@ -231,7 +384,7 @@ with gr.Blocks(title="Kokoro TTS Demo", css="""
                     label="GPU Timeout (seconds)",
                     minimum=15,
                     maximum=120,
-                    value=60,
                     step=1,
                     info="Maximum time allowed for GPU processing"
                 )

 import os
 import gradio as gr
 import time
+import math
+import logging
 import matplotlib.pyplot as plt
 import numpy as np
+# from lib.mock_tts import MockTTSModel
 from lib import format_audio_output
 from lib.ui_content import header_html, demo_text_info
+from lib.book_utils import get_available_books, get_book_info, get_chapter_text
+from lib.text_utils import count_tokens
+from tts_model import TTSModel
 # Set HF_HOME for faster restarts with cached models/voices
 os.environ["HF_HOME"] = "/data/.huggingface"
 # Create TTS model instance
 model = TTSModel()
+# Configure logging
+logging.basicConfig(level=logging.DEBUG)
+# Suppress matplotlib debug messages
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+logger = logging.getLogger(__name__)
+logger.debug("Starting app initialization...")
+model = TTSModel()
 def initialize_model():
     """Initialize model and get voices"""
     if model.model is None:
     return fig, metrics_text
 # Create Gradio interface
 with gr.Blocks(title="Kokoro TTS Demo", css="""
     .equal-height {
         display: flex;
         flex-direction: column;
     }
+    .token-label {
+        font-size: 1rem;
+        margin-bottom: 0.5rem;
+    }
+    .token-count {
+        color: #4169e1;
+    }
+    .centered-label {
+        display: flex;
+        justify-content: center;
+        align-items: center;
+        text-align: center;
+        margin: 10px 0;
+    }
 """) as demo:
     gr.HTML(header_html)
     with gr.Row():
+        # Column 1: Text Input and Book Selection
         with gr.Column(elem_classes="equal-height"):
+            # Book selection
+            books = get_available_books()
+            book_dropdown = gr.Dropdown(
+                label="Select Book",
+                choices=[book['label'] for book in books],
+                value=books[0]['label'] if books else None,
+                type="value",
+                allow_custom_value=True
+            )
+            # Initialize chapters for first book
+            initial_book = books[0]['value'] if books else None
+            initial_chapters = []
+            if initial_book:
+                book_path = os.path.join("texts/processed", initial_book)
+                _, chapters = get_book_info(book_path)
+                initial_chapters = [ch['title'] for ch in chapters]
+            # Chapter selection with initial chapters
+            chapter_dropdown = gr.Dropdown(
+                label="Select Chapter",
+                choices=initial_chapters,
+                value=initial_chapters[0] if initial_chapters else None,
+                type="value",
+                allow_custom_value=True
+            )
+            lab_tps = 175
+            lab_rts = 50
+            # Text input area with initial chapter text
+            initial_text = ""
+            if initial_chapters and initial_book:
+                book_path = os.path.join("texts/processed", initial_book)
+                _, chapters = get_book_info(book_path)
+                if chapters:
+                    initial_text = get_chapter_text(book_path, chapters[0]['id'])
+                    tokens = count_tokens(initial_text)
+                    time_estimate = math.ceil(tokens / lab_tps)
+                    output_estimate = (time_estimate * lab_rts)//60
+                    initial_label = f'<div class="token-label">Text to speak <span class="token-count">Estimated {output_estimate} minutes in ~{time_estimate}s</span></div>'
+                else:
+                    initial_label = '<div class="token-label">Text to speak</div>'
+            else:
+                initial_label = '<div class="token-label">Text to speak</div>'
+            def update_text_label(text):
+                """Build the HTML label shown with the text box for the given text.
+                Returns the plain "Text to speak" label when `text` is empty; otherwise
+                appends an estimate derived from the token count of `text`.
+                """
+                if text:
+                    # lab_tps / lab_rts are the closure constants defined above
+                    # (presumably tokens-per-second and a realtime-speed factor - TODO confirm)
+                    seconds = math.ceil(count_tokens(text) / lab_tps)
+                    minutes = (seconds * lab_rts) // 60
+                    return f'<div class="token-label">Text to speak <span class="token-count">Estimated {minutes} minutes in ~{seconds}s</span></div>'
+                return '<div class="token-label">Text to speak</div>'
             text_input = gr.TextArea(
+                label=None,
+                placeholder="Enter text here, select a chapter, or upload a .txt file",
+                value=initial_text,
                 lines=10,
+                show_label=False,
+                show_copy_button=True  # Add copy button for convenience
+            )
+            with gr.Row(equal_height=True):
+                with gr.Column():
+                    label_html = gr.HTML(initial_label, elem_classes="centered-label")
+                    # Update label whenever text changes
+                    text_input.change(
+                        fn=update_text_label,
+                        inputs=[text_input],
+                        outputs=[label_html],
+                        trigger_mode="always_last"
+                    )
+            clear_btn = gr.Button("Clear Text", variant="secondary")
+            def clear_text():
+                return "", '<div class="token-label">Text to speak</div>'
+            clear_btn.click(
+                fn=clear_text,
+                outputs=[text_input, label_html]
+            )
+            def update_chapters(book_name):
+                """Refresh the chapter dropdown, text box and label after a book is selected.
+                Args:
+                    book_name: Display label of the selected book (may be empty or a
+                        custom value that matches no known book).
+                Returns:
+                    Tuple of (chapter dropdown update, first chapter text, label HTML).
+                    When no book matches, the dropdown is emptied and the text is "".
+                """
+                # Map the display label back to the XML filename
+                book_file = None
+                if book_name:
+                    book_file = next((book['value'] for book in books if book['label'] == book_name), None)
+                if not book_file:
+                    return gr.update(choices=[], value=None), "", update_text_label("")
+                book_path = os.path.join("texts/processed", book_file)
+                _, chapters = get_book_info(book_path)
+                # Create simple choices list of chapter titles
+                chapter_choices = [ch['title'] for ch in chapters]
+                # get_book_info already returns each chapter's text, so reuse it
+                # instead of parsing the XML a second time via get_chapter_text
+                initial_text = chapters[0]['text'] if chapters else ""
+                # Use the shared label builder so the estimate matches the one the
+                # text box change handler produces right after this update
+                return (
+                    gr.update(choices=chapter_choices, value=chapter_choices[0] if chapter_choices else None),
+                    initial_text,
+                    update_text_label(initial_text),
+                )
+            def load_chapter_text(book_name, chapter_title):
+                """Load the text of the selected chapter and build its label.
+                Args:
+                    book_name: Display label of the selected book.
+                    chapter_title: Title of the selected chapter.
+                Returns:
+                    Tuple of (chapter text, label HTML). The text is "" when either
+                    argument is empty or no matching book/chapter is found.
+                """
+                text = ""
+                book_file = None
+                if book_name and chapter_title:
+                    # Find the corresponding book file
+                    book_file = next((book['value'] for book in books if book['label'] == book_name), None)
+                if book_file:
+                    book_path = os.path.join("texts/processed", book_file)
+                    # One parse is enough: get_book_info returns the text of every chapter
+                    _, chapters = get_book_info(book_path)
+                    text = next((ch['text'] for ch in chapters if ch['title'] == chapter_title), "")
+                # Shared label builder keeps the estimate consistent with the
+                # text box change handler
+                return text, update_text_label(text)
+            # Set up event handlers for book/chapter selection
+            book_dropdown.change(
+                fn=update_chapters,
+                inputs=[book_dropdown],
+                outputs=[chapter_dropdown, text_input, label_html]
+            )
+            chapter_dropdown.change(
+                fn=load_chapter_text,
+                inputs=[book_dropdown, chapter_dropdown],
+                outputs=[text_input, label_html]
             )
         # Column 2: Controls
             def load_text_from_file(file_bytes):
                 if file_bytes is None:
+                    return None, '<div class="token-label">Text to speak</div>'
                 try:
+                    text = file_bytes.decode('utf-8')
+                    tokens = count_tokens(text)
+                    time_estimate = math.ceil(tokens / 150 / 10) * 10  # Round up to nearest 10 seconds
+                    return text, f'<div class="token-label">Text to speak <span class="token-count">({tokens} tokens, ~{time_estimate}s generation time)</span></div>'
                 except Exception as e:
                     raise gr.Error(f"Failed to read file: {str(e)}")
             file_input.change(
                 fn=load_text_from_file,
                 inputs=[file_input],
+                outputs=[text_input, label_html]
             )
             with gr.Group():
                     label="GPU Timeout (seconds)",
                     minimum=15,
                     maximum=120,
+                    value=90,
                     step=1,
                     info="Maximum time allowed for GPU processing"
                 )

lib/book_utils.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import xml.etree.ElementTree as ET
+import os
+from typing import Dict, List, Tuple
+from .text_utils import count_tokens
+import logging
+logger = logging.getLogger(__name__)
+def get_available_books() -> List[Dict[str, str]]:
+    """Get list of available book XML files
+    Scans the ``texts/processed`` directory (relative to the working directory)
+    for files ending in ``.xml``.
+    Returns:
+        List of dicts, sorted by filename, with keys:
+        - value: filename with extension (for internal use)
+        - label: display name without extension
+        An empty list is returned when the directory does not exist.
+    """
+    processed_dir = "texts/processed"
+    logger.info("Checking directory: %s", processed_dir)
+    try:
+        # os.listdir returns entries in arbitrary order; sort so the first
+        # (default) book is the same on every run
+        files = sorted(os.listdir(processed_dir))
+    except FileNotFoundError:
+        logger.warning("Book directory not found: %s", processed_dir)
+        return []
+    books = []
+    for file in files:
+        logger.info("Found file: %s", file)
+        if file.endswith('.xml'):
+            books.append({
+                'value': file,
+                'label': file[:-4]  # Remove .xml extension for display
+            })
+    return books
+def get_book_info(xml_path: str) -> Tuple[str, List[Dict]]:
+    """Read a book XML file and return its title together with its chapters
+    Args:
+        xml_path: Path to the XML file. The root element's ``title`` attribute is
+            the book title and each ``chapter`` child supplies ``id``/``title``.
+    Returns:
+        Tuple containing:
+        - Book title (str)
+        - List of chapter dicts with keys: id, title, text
+    """
+    root = ET.parse(xml_path).getroot()
+    chapters = []
+    for node in root.findall('chapter'):
+        raw = (node.text or "").strip()
+        # Drop everything up to the first newline (the first line); text with
+        # no newline at all is kept whole
+        _, newline, remainder = raw.partition("\n")
+        body = remainder.strip() if newline else raw
+        chapters.append({
+            'id': node.get('id'),
+            'title': node.get('title'),
+            'text': body,
+        })
+    return root.get('title'), chapters
+def get_chapter_text(xml_path: str, chapter_id: str) -> str:
+    """Return the text of the first chapter whose id equals `chapter_id`
+    The book at `xml_path` is parsed with get_book_info; an empty string is
+    returned when no chapter has the requested id.
+    """
+    chapters = get_book_info(xml_path)[1]
+    matching_texts = (entry['text'] for entry in chapters if entry['id'] == chapter_id)
+    return next(matching_texts, "")
+def get_book_chapters(xml_path: str) -> List[Dict]:
+    """Return the id and title of every chapter in the book, in file order
+    The chapter text returned by get_book_info is left out, so the result is
+    suitable for populating a dropdown.
+    """
+    summaries = []
+    for entry in get_book_info(xml_path)[1]:
+        summaries.append({'id': entry['id'], 'title': entry['title']})
+    return summaries

lib/mock_tts.py ADDED Viewed

	@@ -0,0 +1,41 @@

+# """Mock TTS implementation for local development"""
+# import numpy as np
+# class MockTTSModel:
+#     def __init__(self):
+#         self.model = None
+#     def initialize(self):
+#         """Mock initialization"""
+#         self.model = "mock_model"
+#         return True
+#     def list_voices(self):
+#         """Return mock list of voices"""
+#         return ["mock_voice_1", "mock_voice_2"]
+#     def generate_speech(self, text, voice_names, speed, gpu_timeout=90, progress_callback=None, progress_state=None, progress=None):
+#         """Generate mock audio data"""
+#         # Create mock audio data (1 second of silence)
+#         sample_rate = 22050
+#         duration = 1.0
+#         t = np.linspace(0, duration, int(sample_rate * duration))
+#         audio_array = np.zeros_like(t)
+#         # Mock metrics
+#         metrics = {
+#             "tokens_per_sec": [10.5, 11.2, 10.8],
+#             "rtf": [0.5, 0.48, 0.52],
+#             "total_time": 3,
+#             "total_tokens": 100
+#         }
+#         # Simulate progress updates
+#         if progress_callback and progress_state and progress:
+#             for i in range(3):
+#                 progress_callback(i+1, 3, metrics["tokens_per_sec"][i],
+#                                 metrics["rtf"][i], progress_state,
+#                                 progress_state.get("start_time", 0),
+#                                 gpu_timeout, progress)
+#         return audio_array, duration, metrics

lib/ui_content.py CHANGED Viewed

@@ -1,5 +1,18 @@
 # HTML content for the header section
-header_html = """
 <div>
     <!-- Top badges bar -->
     <div style="display: flex; justify-content: flex-end; padding: 4px; gap: 8px; height: 32px; align-items: center;">
@@ -13,16 +26,16 @@ header_html = """
     <div style="text-align: center; margin-bottom: 1rem;">
         <h1 style="font-size: 1.75rem; font-weight: bold; color: #ffffff; margin-bottom: 0.5rem;">Kokoro TTS Demo</h1>
-        <p style="color: #d1d5db;">Rapidly convert text to natural speech using various and blended voices.</p>
     </div>
     <div style="display: flex; gap: 1rem;">
         <div style="flex: 1; background: rgba(30, 58, 138, 0.3); border: 1px solid rgba(59, 130, 246, 0.3); padding: 0.5rem 1rem; border-radius: 6px; display: flex; align-items: center; justify-content: center;">
-            <span style="font-weight: 500; color: #60a5fa; text-align: center;">⏱️ Small requests/Initial chunks can be slower due to warm-up</span>
         </div>
         <div style="flex: 1; background: rgba(147, 51, 234, 0.3); border: 1px solid rgba(168, 85, 247, 0.3); padding: 0.5rem 1rem; border-radius: 6px; display: flex; align-items: center; justify-content: center;">
-            <span style="font-weight: 500; color: #e879f9; text-align: center;">⚠️ 120-second timeout per request (~1+ hour of output audio)</span>
         </div>
     </div>
 </div>

 # HTML content for the header section
+header_title = """
+Generates about 1 hour of audio per minute, with unexpected quality
+""".strip()
+time_button = """
+⏱️ Small requests/Initial chunks can be slower due to warm-up
+"""
+warning_button = """
+⚠️ 120-second maximum timeout per request
+"""
+header_html = f"""
 <div>
     <!-- Top badges bar -->
     <div style="display: flex; justify-content: flex-end; padding: 4px; gap: 8px; height: 32px; align-items: center;">
     <div style="text-align: center; margin-bottom: 1rem;">
         <h1 style="font-size: 1.75rem; font-weight: bold; color: #ffffff; margin-bottom: 0.5rem;">Kokoro TTS Demo</h1>
+        <p style="color: #d1d5db;">{header_title}</p>
     </div>
     <div style="display: flex; gap: 1rem;">
         <div style="flex: 1; background: rgba(30, 58, 138, 0.3); border: 1px solid rgba(59, 130, 246, 0.3); padding: 0.5rem 1rem; border-radius: 6px; display: flex; align-items: center; justify-content: center;">
+            <span style="font-weight: 500; color: #60a5fa; text-align: center;">{time_button}</span>
         </div>
         <div style="flex: 1; background: rgba(147, 51, 234, 0.3); border: 1px solid rgba(168, 85, 247, 0.3); padding: 0.5rem 1rem; border-radius: 6px; display: flex; align-items: center; justify-content: center;">
+            <span style="font-weight: 500; color: #e879f9; text-align: center;">{warning_button}</span>
         </div>
     </div>
 </div>

parse_chapters.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import xml.etree.ElementTree as ET
+def parse_chapters(xml_path):
+    """Print the book title and a short preview of each chapter in an XML book file.
+    For every ``chapter`` element the first line of its text is dropped and at
+    most the first 100 characters of the remainder are printed.
+    """
+    book = ET.parse(xml_path).getroot()
+    print(f"\nBook: {book.get('title')}\n")
+    for node in book.findall('chapter'):
+        raw = (node.text or "").strip()
+        # Cut off the top line; text without any newline is kept as it is
+        _, newline, remainder = raw.partition("\n")
+        body = remainder.strip() if newline else raw
+        # Limit the preview to the first 100 chars, marking truncation with "..."
+        if len(body) > 100:
+            preview = body[:100] + "..."
+        else:
+            preview = body
+        print(f"=== {node.get('title')} ({node.get('id')}) ===")
+        print(f"{preview}\n")
+if __name__ == "__main__":
+    xml_path = "texts/processed/dorian_grey.xml"
+    parse_chapters(xml_path)

requirements.txt CHANGED Viewed

@@ -9,4 +9,4 @@ regex==2024.11.6
 tiktoken==0.8.0
 transformers==4.47.1
 munch==4.0.0
-matplotlib==3.4.3

 tiktoken==0.8.0
 transformers==4.47.1
 munch==4.0.0
+matplotlib==3.4.3

texts/processed/dorian_grey.xml ADDED Viewed

The diff for this file is too large to render. See raw diff

texts/processed/time_machine.xml ADDED Viewed

The diff for this file is too large to render. See raw diff

texts/processor.py ADDED Viewed

	@@ -0,0 +1,80 @@

+# import re
+# import os
+# from xml.etree import ElementTree as ET
+# from xml.dom import minidom
+# def process_dorian_grey():
+#     # Create processed directory if it doesn't exist
+#     os.makedirs('texts/processed', exist_ok=True)
+#     # Read the file
+#     with open('texts/dorian_grey.txt', 'r', encoding='utf-8') as f:
+#         text = f.read()
+#     # Create root XML element
+#     root = ET.Element("book")
+#     root.set("title", "The Picture of Dorian Gray")
+#     # Split into chapters using regex
+#     # Look for chapter markers and keep them with the content
+#     chapter_pattern = r'(CHAPTER [IVXLC\d]+\..*?)(?=CHAPTER [IVXLC\d]+\.|$)'
+#     chapters = re.findall(chapter_pattern, text, re.DOTALL)
+#     # Process chapters
+#     for i, content in enumerate(chapters):
+#         # Create chapter element
+#         chapter = ET.SubElement(root, "chapter")
+#         chapter.set("id", f"chapter_{i}")
+#         chapter.set("title", f"Chapter {i}")
+#         chapter.text = content.strip()
+#     # Pretty print XML
+#     xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent="  ")
+#     # Save as XML
+#     output_path = 'texts/processed/dorian_grey.xml'
+#     with open(output_path, 'w', encoding='utf-8') as f:
+#         f.write(xml_str)
+#     print(f"Processed and saved to {output_path}")
+# def process_time_machine():
+#     # Create processed directory if it doesn't exist
+#     os.makedirs('texts/processed', exist_ok=True)
+#     # Read the file
+#     with open('texts/time_machine.txt', 'r', encoding='utf-8') as f:
+#         text = f.read()
+#     # Create root XML element
+#     root = ET.Element("book")
+#     root.set("title", "The Time Machine")
+#     # Split into chapters using 4 or more newlines as separator
+#     chapters = re.split(r'\n{4,}', text)
+#     # Track actual chapter number (no skipping)
+#     chapter_num = 1
+#     # Process chapters
+#     for content in chapters:
+#         if content.strip():  # Only process non-empty chapters
+#             # Create chapter element
+#             chapter = ET.SubElement(root, "chapter")
+#             chapter.set("id", f"chapter_{chapter_num-1}")  # Keep 0-based ids
+#             chapter.set("title", f"Chapter {chapter_num}")
+#             chapter.text = content.strip()
+#             chapter_num += 1
+#     # Pretty print XML
+#     xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent="  ")
+#     # Save as XML
+#     output_path = 'texts/processed/time_machine.xml'
+#     with open(output_path, 'w', encoding='utf-8') as f:
+#         f.write(xml_str)
+#     print(f"Processed and saved to {output_path}")
+# if __name__ == "__main__":
+#     process_time_machine()

voices/voices/mock_voice.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:912f5af0b31abadd4c60aae1d295f9f2b05bf925b35bb1bdc8b928fbf0dc052b
+size 15