Spaces:

hu-po
/

speech2speech

Build error

App Files Files Community

hu-po committed on Mar 18, 2023

Commit

a86e62a

1 Parent(s): 64c61b2

release 0.3

Browse files

Files changed (14) hide show

app.py +20 -3
requirements.txt +3 -3
src/src/__pycache__/elevenlabs.cpython-310.pyc +0 -0
src/src/__pycache__/elevenlabs.cpython-39.pyc +0 -0
src/src/__pycache__/openailib.cpython-310.pyc +0 -0
src/src/__pycache__/openailib.cpython-39.pyc +0 -0
src/src/__pycache__/tube.cpython-310.pyc +0 -0
src/src/__pycache__/tube.cpython-39.pyc +0 -0
src/src/__pycache__/utils.cpython-310.pyc +0 -0
src/src/__pycache__/utils.cpython-39.pyc +0 -0
src/src/elevenlabs.py +136 -0
src/src/openailib.py +58 -0
src/src/tube.py +64 -0
src/src/utils.py +16 -0

app.py CHANGED Viewed

@@ -197,7 +197,26 @@ def make_voices(voices_yaml: str):
 # Define the main GradIO UI
 with gr.Blocks() as demo:
-    gr.HTML('''<center><h1>Speech2Speech</h1></center>''')
     with gr.Tab("Conversation"):
         gr_convo_output = gr.HTML()
         with gr.Row():
@@ -249,8 +268,6 @@ with gr.Blocks() as demo:
     gr.HTML('''<center>
     Created by <a href="https://youtube.com/@hu-po">Hu Po</a> GitHub: <a href="https://github.com/hu-po/speech2speech">speech2speech</a>
-    <br>
-    Duplicate this space:<a href="https://huggingface.co/spaces/hu-po/speech2speech?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
     </center>
     ''')

 # Define the main GradIO UI
 with gr.Blocks() as demo:
+    gr.HTML('''
+    <center>
+    <h1>Speech2Speech</h1>
+    Make a private copy of this space to paste your API keys.
+    <br>
+    <a href="https://huggingface.co/spaces/hu-po/speech2speech?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+    </center>''')
+    with gr.Row():
+        openai_api_key_textbox = gr.Textbox(
+            placeholder="Paste your OpenAI API key here",
+            show_label=False,
+            lines=1,
+            type="password",
+        )
+        elevenlabs_api_key_textbox = gr.Textbox(
+            placeholder="Paste your ElevenLabs API key here",
+            show_label=False,
+            lines=1,
+            type="password",
+        )
     with gr.Tab("Conversation"):
         gr_convo_output = gr.HTML()
         with gr.Row():
     gr.HTML('''<center>
     Created by <a href="https://youtube.com/@hu-po">Hu Po</a> GitHub: <a href="https://github.com/hu-po/speech2speech">speech2speech</a>
     </center>
     ''')

requirements.txt CHANGED Viewed

@@ -5,7 +5,7 @@ soundfile==0.12.1
 gradio==3.19.1
 scipy==1.10.1
 SpeechRecognition==3.9.0
-# pytube==12.1.2
 # git+https://github.com/pytube/pytube@master#egg=pytube
-librosa
-pytube

 gradio==3.19.1
 scipy==1.10.1
 SpeechRecognition==3.9.0
 # git+https://github.com/pytube/pytube@master#egg=pytube
+pytube==12.1.2
+# librosa
+# torchlibrosa

src/src/__pycache__/elevenlabs.cpython-310.pyc ADDED Viewed

Binary file (4.65 kB). View file

src/src/__pycache__/elevenlabs.cpython-39.pyc ADDED Viewed

Binary file (4.64 kB). View file

src/src/__pycache__/openailib.cpython-310.pyc ADDED Viewed

Binary file (1.59 kB). View file

src/src/__pycache__/openailib.cpython-39.pyc ADDED Viewed

Binary file (1.59 kB). View file

src/src/__pycache__/tube.cpython-310.pyc ADDED Viewed

Binary file (1.82 kB). View file

src/src/__pycache__/tube.cpython-39.pyc ADDED Viewed

Binary file (1.81 kB). View file

src/src/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (639 Bytes). View file

src/src/__pycache__/utils.cpython-39.pyc ADDED Viewed

Binary file (637 Bytes). View file

src/src/elevenlabs.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import asyncio
+import io
+import logging
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from typing import List, Union, Tuple
+import sounddevice as sd
+import soundfile as sf
+from elevenlabslib import ElevenLabsUser, ElevenLabsVoice
+from .utils import timeit
+logging.basicConfig(level=logging.INFO)  # configure the root logger at INFO level when this module is imported
+log = logging.getLogger(__name__)  # module-level logger used by the functions below
+USER = None  # ElevenLabsUser handle; assigned by set_elevenlabs_key(), None while no API key is available
+def set_elevenlabs_key(elevenlabs_api_key_textbox=None):
+    global USER
+    log.info(f"Setting ElevenLabs key.")
+    if elevenlabs_api_key_textbox is not None:
+        os.environ["ELEVENLABS_API_KEY"] = elevenlabs_api_key_textbox
+    try:
+        USER = ElevenLabsUser(os.environ["ELEVENLABS_API_KEY"])
+    except KeyError as e:
+        USER = None
+        log.warning("ELEVENLABS_API_KEY not found in environment variables.")
+        pass
+set_elevenlabs_key()
+@dataclass
+class Speaker:
+    name: str
+    voice: ElevenLabsVoice
+    color: str
+    description: str = None
+async def text_to_speechbytes_async(text, speaker, loop):
+    with ThreadPoolExecutor() as executor:
+        speech_bytes = await loop.run_in_executor(executor, text_to_speechbytes, text, speaker.voice)
+    return speech_bytes
+async def play_history(history: List[Tuple[Speaker, str]]):
+    loop = asyncio.get_event_loop()
+    # Create a list of tasks for all text_to_speechbytes function calls
+    tasks = [text_to_speechbytes_async(
+        text, speaker, loop) for speaker, text in history]
+    # Run tasks concurrently, waiting for the first one to complete
+    for speech_bytes in await asyncio.gather(*tasks):
+        audioFile = io.BytesIO(speech_bytes)
+        soundFile = sf.SoundFile(audioFile)
+        sd.play(soundFile.read(), samplerate=soundFile.samplerate, blocking=True)
+async def save_history(history: List[Tuple[Speaker, str]], audio_savepath: str):
+    loop = asyncio.get_event_loop()
+    # Create a list of tasks for all text_to_speechbytes function calls
+    tasks = [text_to_speechbytes_async(
+        text, speaker, loop) for speaker, text in history]
+    # Run tasks concurrently, waiting for the first one to complete
+    all_speech_bytes = await asyncio.gather(*tasks)
+    # Combine all audio bytes into a single audio file
+    concatenated_audio = io.BytesIO(b''.join(all_speech_bytes))
+    # Save the combined audio file to disk
+    with sf.SoundFile(concatenated_audio, mode='r') as soundFile:
+        with sf.SoundFile(
+            audio_savepath, mode='w',
+            samplerate=soundFile.samplerate,
+            channels=soundFile.channels,
+        ) as outputFile:
+            outputFile.write(soundFile.read())
+def check_voice_exists(voice: Union[ElevenLabsVoice, str]) -> Union[ElevenLabsVoice, None]:
+    if USER is None:
+        log.warning(
+            "No ElevenLabsUser found, have you set the ELEVENLABS_API_KEY environment variable?")
+        return None
+    log.info(f"Getting voice {voice}...")
+    _available_voices = USER.get_voices_by_name(voice)
+    if _available_voices:
+        log.info(f"Voice {voice} already exists, found {_available_voices}.")
+        return _available_voices[0]
+    return None
+@timeit
+def get_make_voice(voice: Union[ElevenLabsVoice, str], audio_path: List[str] = None) -> ElevenLabsVoice:
+    if USER is None:
+        log.warning(
+            "No ElevenLabsUser found, have you set the ELEVENLABS_API_KEY environment variable?")
+        return None
+    _voice = check_voice_exists(voice)
+    if _voice is not None:
+        return _voice
+    else:
+        if USER.get_voice_clone_available():
+            assert audio_path is not None, "audio_path must be provided"
+            assert isinstance(audio_path, list), "audio_path must be a list"
+            log.info(f"Cloning voice {voice}...")
+            _audio_source_dict = {
+                # Audio path is a PosixPath
+                _.name: open(_, "rb").read() for _ in audio_path
+            }
+            newVoice = USER.clone_voice_bytes(voice, _audio_source_dict)
+            return newVoice
+    raise ValueError(
+        f"Voice {voice} does not exist and cloning is not available.")
+@timeit
+def text_to_speech(text: str, voice: ElevenLabsVoice):
+    log.info(f"Generating audio using voice {voice}...")
+    time_start = time.time()
+    voice.generate_and_play_audio(text, playInBackground=False)
+    duration = time.time() - time_start
+    return duration
+@timeit
+def text_to_speechbytes(text: str, voice: ElevenLabsVoice):
+    log.info(f"Generating audio for voice {voice} text {text}...")
+    audio_bytes = voice.generate_audio_bytes(text)
+    return audio_bytes

src/src/openailib.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import logging
+import os
+from .utils import timeit
+import openai
+logging.basicConfig(level=logging.INFO)  # configure the root logger at INFO level when this module is imported
+log = logging.getLogger(__name__)  # module-level logger used by the functions below
+def set_openai_key(openai_api_key_textbox = None):
+    log.info(f"Setting OpenAI key.")
+    if openai_api_key_textbox is not None:
+        os.environ["OPENAI_API_KEY"] = openai_api_key_textbox
+    try:
+        openai.api_key = os.getenv("OPENAI_API_KEY")
+    except KeyError as e:
+        log.warning("OPENAI_API_KEY not found in environment variables.")
+        pass
+set_openai_key()
+@timeit
+def speech_to_text(audio_path):
+    log.info("Transcribing audio...")
+    transcript = openai.Audio.transcribe("whisper-1", open(audio_path, "rb"))
+    text = transcript["text"]
+    log.info(f"Transcript: \n\t{text}")
+    return text
+@timeit
+def top_response(prompt, system=None, model="gpt-3.5-turbo", max_tokens=20, temperature=0.8):
+    _prompt = [
+        {
+            "role": "user",
+            "content": prompt,
+        },
+    ]
+    if system:
+        _prompt = [
+            {
+                "role": "system",
+                "content": system,
+            },
+        ] + _prompt
+    log.info(f"API call to {model} with prompt: \n\n\t{_prompt}\n\n")
+    _response = openai.ChatCompletion.create(
+        model=model,
+        messages=_prompt,
+        temperature=temperature,
+        n=1,
+        max_tokens=max_tokens,
+    )
+    log.info(f"API reponse: \n\t{_response}")
+    response: str = _response['choices'][0]['message']['content']
+    return response

src/src/tube.py ADDED Viewed

	@@ -0,0 +1,64 @@

+'''
+Extract audio from a YouTube video
+Usage:
+    tube.py <url> <person> [-s <start_time>] [-d <duration>]
+'''
+import subprocess
+from pathlib import Path
+import datetime
+import argparse
+import os
+from pytube import YouTube
+# Define argparse arguments
+parser = argparse.ArgumentParser(description='Extract audio from a YouTube video')
+parser.add_argument('url', type=str, help='the YouTube video URL')
+parser.add_argument('person', type=str, help='the name of the person speaking')
+parser.add_argument('-s', '--start-time', type=float, default=0, help='the start time in minutes for the extracted audio (default: 0)')
+parser.add_argument('-d', '--duration', type=int, help='the duration in seconds for the extracted audio (default: 60)')
+# 200 seconds seems to be max duration for single clips
+def extract_audio(url: str, label: str, start_minute: float = 0, duration: int = 200):
+    # Download the YouTube video
+    youtube_object = YouTube(url)
+    stream = youtube_object.streams.first()
+    video_path = Path(stream.download(skip_existing=True))
+    # Convert start time to seconds
+    start_time_seconds = int(start_minute * 60)
+    # Format the start time in HH:MM:SS.mmm format
+    start_time_formatted = str(datetime.timedelta(seconds=start_time_seconds))
+    start_time_formatted = start_time_formatted[:11] + start_time_formatted[12:]
+    # Set the output path using the audio file name
+    output_path = video_path.parent / f"{label}.wav"
+    # Run ffmpeg to extract the audio
+    cmd = ['ffmpeg', '-y', '-i', str(video_path), '-ss', start_time_formatted]
+    if duration is not None:
+        # Format the duration in HH:MM:SS.mmm format
+        duration_formatted = str(datetime.timedelta(seconds=duration))
+        duration_formatted = duration_formatted[:11] + duration_formatted[12:]
+        cmd += ['-t', duration_formatted]
+    cmd += ['-q:a', '0', '-map', 'a', str(output_path)]
+    subprocess.run(cmd)
+    # remove the extra .3gpp file that is created:
+    for file in os.listdir(video_path.parent):
+        if file.endswith(".3gpp"):
+            os.remove(os.path.join(video_path.parent, file))
+    return output_path
+if __name__ == '__main__':
+    # Parse the arguments
+    args = parser.parse_args()
+    # Extract the audio
+    extract_audio(args.url, args.person, args.start_time, args.duration)

src/src/utils.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import time
+import logging
+log = logging.getLogger(__name__)  # module-level logger that timeit writes its duration messages to
+# Decorator to time a function
+def timeit(func):
+    def timed(*args, **kwargs):
+        time_start = time.time()
+        result = func(*args, **kwargs)
+        _yellow = "\x1b[33;20m"
+        _reset = "\x1b[0m"
+        _msg = f"{_yellow}{func.__name__} duration: {time.time() - time_start:.2f} seconds{_reset}"
+        log.info(_msg)
+        return result
+    return timed