Spaces:

hu-po
/

speech2speech

Build error

App Files Files Community

hu-po committed on Mar 17, 2023

Commit

d2d6865

1 Parent(s): 924b0a9

release 0.1

Browse files

Files changed (15) hide show

app.py +289 -0
requirements.txt +8 -0
src/__pycache__/elevenlabs.cpython-310.pyc +0 -0
src/__pycache__/elevenlabs.cpython-39.pyc +0 -0
src/__pycache__/openailib.cpython-310.pyc +0 -0
src/__pycache__/openailib.cpython-39.pyc +0 -0
src/__pycache__/tube.cpython-310.pyc +0 -0
src/__pycache__/tube.cpython-39.pyc +0 -0
src/__pycache__/utils.cpython-310.pyc +0 -0
src/__pycache__/utils.cpython-39.pyc +0 -0
src/elevenlabs.py +115 -0
src/openailib.py +47 -0
src/tube.py +64 -0
src/utils.py +16 -0
voices.yaml +30 -0

app.py ADDED Viewed

	@@ -0,0 +1,289 @@

+import asyncio
+import logging
+import os
+import random
+from typing import Dict, List, Tuple
+import gradio as gr
+import yaml
+from src.elevenlabs import (Speaker, check_voice_exists, get_make_voice,
+                            play_history, save_history)
+from src.openailib import top_response, speech_to_text
+from src.tube import extract_audio
+# Configure logging at INFO level and create this module's logger.
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+def set_openai_key(openai_api_key_textbox):
+    log.info(f"Setting OpenAI key.")
+    os.environ["OPENAI_API_KEY"] = openai_api_key_textbox
+    import openai
+    openai.api_key = os.getenv("OPENAI_API_KEY")
+def set_elevenlabs_key(elevenlabs_api_key_textbox):
+    log.info(f"Setting ElevenLabs key.")
+    os.environ["ELEVENLABS_API_KEY"] = elevenlabs_api_key_textbox
+    import elevenlabs
+    elevenlabs.api_key = os.getenv("ELEVENLABS_API_KEY")
+class ConversationState:
+    COLORS: list = ['#FFA07A', '#F08080', '#AFEEEE', '#B0E0E6', '#DDA0DD',
+                    '#FFFFE0', '#F0E68C', '#90EE90', '#87CEFA', '#FFB6C1']
+    YAML_FILEPATH: str = os.path.join(os.path.dirname(__file__), 'voices.yaml')
+    AUDIO_SAVEDIR: str = os.path.join(
+        os.path.dirname(__file__), 'audio_export')
+    def __init__(self,
+                 names: list = None,
+                 iam: str = None,
+                 model: str = "gpt-3.5-turbo",
+                 max_tokens: int = 30,
+                 temperature: float = 0.5,
+                 history: list = None):
+        self.model = model
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+        # Make sure save dir exists, make any necessary directories
+        os.makedirs(self.AUDIO_SAVEDIR, exist_ok=True)
+        self.audio_savepath = os.path.join(
+            self.AUDIO_SAVEDIR, 'conversation.wav')
+        log.info(f"Resetting conversation")
+        with open(self.YAML_FILEPATH, 'r') as file:
+            self.characters_yaml = file.read()
+            file.seek(0)
+            self.characters_dict = yaml.safe_load(file)
+            self.all_characters = [
+                name for name in self.characters_dict.keys()]
+        self.names = names or random.choices(self.all_characters, k=2)
+        self.iam = iam or random.choice(self.names)
+        assert self.iam in self.names, f"{self.iam} not in {self.names}"
+        log.info(f"Loading voices")
+        self.speakers: Dict[str, Speaker] = {}
+        self.speakers_descriptions: str = ''
+        for i, name in enumerate(self.names):
+            if check_voice_exists(name) is None:
+                log.warning(f"Voice {name} does not exist")
+                continue
+            _speaker = Speaker(
+                name=name,
+                voice=get_make_voice(name),
+                color=self.COLORS[i % len(self.COLORS)],
+                description=self.characters_dict[name].get(
+                    "description", None),
+            )
+            self.speakers[name] = _speaker
+            if _speaker.description is not None:
+                self.speakers_descriptions += f"{_speaker.name}: {_speaker.description}.\n"
+        # System is fed into OpenAI to condition the prompt
+        self.system = f"You create funny conversation dialogues."
+        self.system += f"This conversation is between {', '.join(self.names)}."
+        self.system += "Do not introduce new characters."
+        self.system += "Descriptions for each of the characters are:\n"
+        for speaker in self.speakers.values():
+            self.system += f"{speaker.name}: {speaker.description}\n"
+        self.system += "Only return one person's response at a time."
+        self.system += "Each response must start with the character name, then a colon, then their response in a single line."
+        self.system += "Keep the responses short and witty."
+        self.system += "Make sure the responses are only one sentence long."
+        self.system += "Do not continue a previous response. Always start a new response."
+        # History is fed in at every step
+        self.step = 0
+        if history is None:
+            self.history: List[Tuple[Speaker, str]] = []
+    def add_to_history(self, text: str, speaker: Speaker = None):
+        if speaker is None:
+            speaker = self.speakers[self.iam]
+        self.history.append((speaker, text))
+    def history_to_prompt(self) -> str:
+        prompt: str = ''
+        for speaker, text in self.history:
+            prompt += f"{speaker.name}:{text}\n"
+        return prompt
+    def html_history(self) -> str:
+        history_html: str = ''
+        for speaker, text in self.history:
+            _bubble = f"<div style='background-color: {speaker.color}; border-radius: 5px; padding: 5px; margin: 5px;'>{speaker.name}: {text}</div>"
+            history_html += _bubble
+        return history_html
+# Conversation state shared by the callbacks below through ``global STATE``.
+# Module-level state is not ideal, but it is a pragmatic way to share data
+# between the gradio callbacks. Constructing it here runs at import time and
+# therefore reads voices.yaml and looks up voices immediately.
+STATE = ConversationState()
+def reset(names, iam, model, max_tokens, temperature):
+    # Push new global state to the global scope
+    global STATE
+    STATE = ConversationState(
+        names=names,
+        iam=iam,
+        model=model,
+        max_tokens=max_tokens,
+        temperature=temperature,
+    )
+    return STATE.html_history()
+def step_mic(audio):
+    global STATE
+    try:
+        request = speech_to_text(audio)
+        STATE.add_to_history(request)
+    except TypeError as e:
+        log.warning(e)
+        pass
+    return STATE.html_history()
+def step_continue():
+    global STATE
+    response = top_response(STATE.history_to_prompt(),
+                            system=STATE.system,
+                            model=STATE.model,
+                            max_tokens=STATE.max_tokens,
+                            temperature=STATE.temperature,
+                            )
+    for line in response.splitlines():
+        try:
+            # TODO: Add any filters here as assertion errors
+            if not line:
+                continue
+            assert ":" in line, f"Line {line} does not have a colon"
+            name, text = line.split(":")
+            assert name in STATE.all_characters, f"Name {name} is not in {STATE.all_characters}"
+            speaker = STATE.speakers[name]
+            assert len(text) > 0, f"Text {text} is empty"
+            STATE.add_to_history(text, speaker=speaker)
+        except AssertionError as e:
+            log.warning(e)
+            continue
+    return STATE.html_history()
+def save_audio():
+    global STATE
+    log.info(f"Saving audio")
+    asyncio.run(save_history(STATE.history, STATE.audio_savepath))
+    return STATE.audio_savepath
+def play_audio():
+    global STATE
+    log.info(f"Playing audio")
+    asyncio.run(play_history(STATE.history))
+    return STATE.html_history()
+def make_voices(voices_yaml: str):
+    global STATE
+    try:
+        STATE.characters_dict = yaml.safe_load(voices_yaml)
+        for name, metadata in STATE.characters_dict.items():
+            videos = metadata['references']
+            assert isinstance(name, str), f"Name {name} is not a string"
+            assert isinstance(videos, list), f"Videos {videos} is not a list"
+            if check_voice_exists(name):
+                continue
+            audio_paths = []
+            for i, video in enumerate(videos):
+                assert isinstance(video, Dict), f"Video {video} is not a dict"
+                assert 'url' in video, f"Video {video} does not have a url"
+                url = video['url']
+                start_minute = video.get('start_minute', 0)
+                duration = video.get('duration_seconds', 120)
+                label = os.path.join(STATE.AUDIO_SAVEDIR, f"audio.{name}.{i}")
+                output_path = extract_audio(url, label, start_minute, duration)
+                audio_paths.append(output_path)
+            get_make_voice(name, audio_paths)
+    except Exception as e:
+        raise e
+        # return f"Error: {e}"
+    return "Success"
+# Define the main GradIO UI
+with gr.Blocks() as demo:
+    gr.HTML('''<center><h1>Speech2Speech</h1></center>''')
+    # First tab: the conversation view plus its controls.
+    with gr.Tab("Conversation"):
+        # Receives the HTML chat bubbles returned by the callbacks.
+        gr_convo_output = gr.HTML()
+        with gr.Row():
+            # Left column: microphone input and audio actions.
+            with gr.Column():
+                gr_mic = gr.Audio(
+                    label="Record audio into conversation",
+                    source="microphone",
+                    type="filepath",
+                )
+                gr_add_button = gr.Button(value="Add to conversation")
+                gr_playaudio_button = gr.Button(value="Play audio")
+                gr_saveaudio_button = gr.Button(value="Export audio")
+                gr_outputaudio = gr.Audio(
+                    label="Audio output",
+                    source="upload",
+                    type="filepath",
+                )
+            # Right column: character selection and settings, initialised
+            # from the module-level STATE.
+            with gr.Column():
+                gr_iam = gr.Dropdown(
+                    choices=STATE.all_characters, label="I am", value=STATE.iam)
+                gr_chars = gr.CheckboxGroup(
+                    STATE.all_characters, label="Characters", value=STATE.names)
+                gr_reset_button = gr.Button(value="Reset conversation")
+                with gr.Accordion("Settings", open=False):
+                    openai_api_key_textbox = gr.Textbox(
+                        placeholder="Paste your OpenAI API key here",
+                        show_label=False,
+                        lines=1,
+                        type="password",
+                    )
+                    elevenlabs_api_key_textbox = gr.Textbox(
+                        placeholder="Paste your ElevenLabs API key here",
+                        show_label=False,
+                        lines=1,
+                        type="password",
+                    )
+                    gr_model = gr.Dropdown(choices=["gpt-3.5-turbo", "gpt-4"],
+                                           label='GPT Model behind conversation', value=STATE.model)
+                    gr_max_tokens = gr.Slider(minimum=1, maximum=500, value=STATE.max_tokens,
+                                              label="Max tokens", step=1)
+                    gr_temperature = gr.Slider(
+                        minimum=0.0, maximum=1.0, value=STATE.temperature, label="Temperature (randomness in conversation)")
+    # Second tab: edit the character YAML and create the missing voices.
+    with gr.Tab("New Characters"):
+        gr_make_voice_button = gr.Button(value="Update Characters")
+        gr_voice_data = gr.Textbox(
+            lines=25, label="Character YAML config", value=STATE.characters_yaml)
+        gr_make_voice_output = gr.Textbox(
+            lines=2, label="Character creation logs...")
+    gr.HTML('''<center>
+    Created by <a href="https://youtube.com/@hu-po">Hu Po</a> GitHub: <a href="https://github.com/hu-po/speech2speech">speech2speech</a>
+    <br>
+    Duplicate this space:<a href="https://huggingface.co/spaces/hu-po/speech2speech?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+    </center>
+    ''')
+    # Buttons and actions
+    # Each line connects a component event to one of the callbacks above.
+    gr_mic.change(step_mic, gr_mic, gr_convo_output)
+    openai_api_key_textbox.change(set_openai_key, openai_api_key_textbox, None)
+    elevenlabs_api_key_textbox.change(
+        set_elevenlabs_key, elevenlabs_api_key_textbox, None)
+    gr_add_button.click(step_continue, None, gr_convo_output)
+    gr_reset_button.click(
+        reset,
+        inputs=[gr_chars, gr_iam, gr_model, gr_max_tokens, gr_temperature],
+        outputs=[gr_convo_output],
+    )
+    gr_saveaudio_button.click(save_audio, None, gr_outputaudio)
+    gr_playaudio_button.click(play_audio, None, None)
+    gr_make_voice_button.click(
+        make_voices, inputs=gr_voice_data, outputs=gr_make_voice_output,
+    )
+# Launch the app only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+openai==0.27.0
+elevenlabslib
+sounddevice==0.4.6
+soundfile==0.12.1
+gradio==3.19.1
+scipy==1.10.1
+SpeechRecognition==3.9.0
+pytube==12.1.2

src/__pycache__/elevenlabs.cpython-310.pyc ADDED Viewed

Binary file (4.12 kB). View file

src/__pycache__/elevenlabs.cpython-39.pyc ADDED Viewed

Binary file (4.11 kB). View file

src/__pycache__/openailib.cpython-310.pyc ADDED Viewed

Binary file (1.23 kB). View file

src/__pycache__/openailib.cpython-39.pyc ADDED Viewed

Binary file (1.23 kB). View file

src/__pycache__/tube.cpython-310.pyc ADDED Viewed

Binary file (1.82 kB). View file

src/__pycache__/tube.cpython-39.pyc ADDED Viewed

Binary file (1.81 kB). View file

src/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (639 Bytes). View file

src/__pycache__/utils.cpython-39.pyc ADDED Viewed

Binary file (637 Bytes). View file

src/elevenlabs.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import asyncio
+import io
+import logging
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from typing import Dict, List, Union, Tuple
+import sounddevice as sd
+import soundfile as sf
+from elevenlabslib import ElevenLabsUser, ElevenLabsVoice
+from .utils import timeit
+# Configure logging at INFO level and create this module's logger.
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+# ElevenLabs client used by the voice helpers below. It is created at import
+# time, so importing this module raises KeyError when ELEVENLABS_API_KEY is
+# not set in the environment.
+USER = ElevenLabsUser(os.environ["ELEVENLABS_API_KEY"])
+@dataclass
+class Speaker:
+    """A conversation participant: a name, an ElevenLabs voice and display data."""
+    # Character name.
+    name: str
+    # Voice object whose audio is generated for this speaker's lines.
+    voice: ElevenLabsVoice
+    # Color string for this speaker.
+    color: str
+    # Optional free-text character description; None when not provided.
+    description: str = None
+async def text_to_speechbytes_async(text, speaker, loop):
+    with ThreadPoolExecutor() as executor:
+        speech_bytes = await loop.run_in_executor(executor, text_to_speechbytes, text, speaker.voice)
+    return speech_bytes
+async def play_history(history: List[Tuple[Speaker, str]]):
+    loop = asyncio.get_event_loop()
+    # Create a list of tasks for all text_to_speechbytes function calls
+    tasks = [text_to_speechbytes_async(
+        text, speaker, loop) for speaker, text in history]
+    # Run tasks concurrently, waiting for the first one to complete
+    for speech_bytes in await asyncio.gather(*tasks):
+        audioFile = io.BytesIO(speech_bytes)
+        soundFile = sf.SoundFile(audioFile)
+        sd.play(soundFile.read(), samplerate=soundFile.samplerate, blocking=True)
+async def save_history(history: List[Tuple[Speaker, str]], audio_savepath: str):
+    loop = asyncio.get_event_loop()
+    # Create a list of tasks for all text_to_speechbytes function calls
+    tasks = [text_to_speechbytes_async(
+        text, speaker, loop) for speaker, text in history]
+    # Run tasks concurrently, waiting for the first one to complete
+    all_speech_bytes = await asyncio.gather(*tasks)
+    # Combine all audio bytes into a single audio file
+    concatenated_audio = io.BytesIO(b''.join(all_speech_bytes))
+    # Save the combined audio file to disk
+    with sf.SoundFile(concatenated_audio, mode='r') as soundFile:
+        with sf.SoundFile(
+            audio_savepath, mode='w',
+            samplerate=soundFile.samplerate,
+            channels=soundFile.channels,
+        ) as outputFile:
+            outputFile.write(soundFile.read())
+def check_voice_exists(voice: Union[ElevenLabsVoice, str]) -> Union[ElevenLabsVoice, None]:
+    log.info(f"Getting voice {voice}...")
+    _available_voices = USER.get_voices_by_name(voice)
+    if _available_voices:
+        log.info(f"Voice {voice} already exists, found {_available_voices}.")
+        return _available_voices[0]
+    return None
+@timeit
+def get_make_voice(voice: Union[ElevenLabsVoice, str], audio_path: List[str] = None) -> ElevenLabsVoice:
+    _voice = check_voice_exists(voice)
+    if _voice is not None:
+        return _voice
+    else:
+        if USER.get_voice_clone_available():
+            assert audio_path is not None, "audio_path must be provided"
+            assert isinstance(audio_path, list), "audio_path must be a list"
+            log.info(f"Cloning voice {voice}...")
+            _audio_source_dict = {
+                # Audio path is a PosixPath
+                _.name: open(_, "rb").read() for _ in audio_path
+            }
+            newVoice = USER.clone_voice_bytes(voice, _audio_source_dict)
+            return newVoice
+    raise ValueError(
+        f"Voice {voice} does not exist and cloning is not available.")
+@timeit
+def text_to_speech(text: str, voice: ElevenLabsVoice):
+    log.info(f"Generating audio using voice {voice}...")
+    time_start = time.time()
+    voice.generate_and_play_audio(text, playInBackground=False)
+    duration = time.time() - time_start
+    return duration
+@timeit
+def text_to_speechbytes(text: str, voice: ElevenLabsVoice):
+    log.info(f"Generating audio for voice {voice} text {text}...")
+    audio_bytes = voice.generate_audio_bytes(text)
+    return audio_bytes

src/openailib.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import logging
+import os
+from .utils import timeit
+import openai
+# Read the API key from the environment at import time; os.getenv returns
+# None when OPENAI_API_KEY is not set.
+openai.api_key = os.getenv("OPENAI_API_KEY")
+# Configure logging at INFO level and create this module's logger.
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+@timeit
+def speech_to_text(audio_path):
+    log.info("Transcribing audio...")
+    transcript = openai.Audio.transcribe("whisper-1", open(audio_path, "rb"))
+    text = transcript["text"]
+    log.info(f"Transcript: \n\t{text}")
+    return text
+@timeit
+def top_response(prompt, system=None, model="gpt-3.5-turbo", max_tokens=20, temperature=0.8):
+    _prompt = [
+        {
+            "role": "user",
+            "content": prompt,
+        },
+    ]
+    if system:
+        _prompt = [
+            {
+                "role": "system",
+                "content": system,
+            },
+        ] + _prompt
+    log.info(f"API call to {model} with prompt: \n\n\t{_prompt}\n\n")
+    _response = openai.ChatCompletion.create(
+        model=model,
+        messages=_prompt,
+        temperature=temperature,
+        n=1,
+        max_tokens=max_tokens,
+    )
+    log.info(f"API reponse: \n\t{_response}")
+    response: str = _response['choices'][0]['message']['content']
+    return response

src/tube.py ADDED Viewed

	@@ -0,0 +1,64 @@

+'''
+Extract audio from a YouTube video
+Usage:
+    tube.py <url> <person> [-s <start_time>] [-d <duration>]
+'''
+import subprocess
+from pathlib import Path
+import datetime
+import argparse
+import os
+from pytube import YouTube
+# Define argparse arguments
+parser = argparse.ArgumentParser(description='Extract audio from a YouTube video')
+parser.add_argument('url', type=str, help='the YouTube video URL')
+parser.add_argument('person', type=str, help='the name of the person speaking')
+parser.add_argument('-s', '--start-time', type=float, default=0, help='the start time in minutes for the extracted audio (default: 0)')
+parser.add_argument('-d', '--duration', type=int, help='the duration in seconds for the extracted audio (default: 60)')
+# 200 seconds seems to be max duration for single clips
+def extract_audio(url: str, label: str, start_minute: float = 0, duration: int = 200):
+    # Download the YouTube video
+    youtube_object = YouTube(url)
+    stream = youtube_object.streams.first()
+    video_path = Path(stream.download(skip_existing=True))
+    # Convert start time to seconds
+    start_time_seconds = int(start_minute * 60)
+    # Format the start time in HH:MM:SS.mmm format
+    start_time_formatted = str(datetime.timedelta(seconds=start_time_seconds))
+    start_time_formatted = start_time_formatted[:11] + start_time_formatted[12:]
+    # Set the output path using the audio file name
+    output_path = video_path.parent / f"{label}.wav"
+    # Run ffmpeg to extract the audio
+    cmd = ['ffmpeg', '-y', '-i', str(video_path), '-ss', start_time_formatted]
+    if duration is not None:
+        # Format the duration in HH:MM:SS.mmm format
+        duration_formatted = str(datetime.timedelta(seconds=duration))
+        duration_formatted = duration_formatted[:11] + duration_formatted[12:]
+        cmd += ['-t', duration_formatted]
+    cmd += ['-q:a', '0', '-map', 'a', str(output_path)]
+    subprocess.run(cmd)
+    # remove the extra .3gpp file that is created:
+    for file in os.listdir(video_path.parent):
+        if file.endswith(".3gpp"):
+            os.remove(os.path.join(video_path.parent, file))
+    return output_path
+# Command-line entry point: runs only when this file is executed as a script.
+if __name__ == '__main__':
+    # Parse the arguments
+    args = parser.parse_args()
+    # Extract the audio; the person's name is used as the output file label
+    extract_audio(args.url, args.person, args.start_time, args.duration)

src/utils.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import time
+import logging
+log = logging.getLogger(__name__)
+# Decorator to time a function
+def timeit(func):
+    def timed(*args, **kwargs):
+        time_start = time.time()
+        result = func(*args, **kwargs)
+        _yellow = "\x1b[33;20m"
+        _reset = "\x1b[0m"
+        _msg = f"{_yellow}{func.__name__} duration: {time.time() - time_start:.2f} seconds{_reset}"
+        log.info(_msg)
+        return result
+    return timed

voices.yaml ADDED Viewed

	@@ -0,0 +1,30 @@

+JoeBiden:
+  description: "Stumbles and stutters over words, uses old timey phrases."
+  references:
+    - url: "https://youtu.be/uhjVn-J_cVs"
+      start_minute: 0
+      duration_seconds: 120
+DonaldTrump:
+  description: "Bombastic quarrelsome narcissist who talks up his ideas."
+  references:
+    - url: "https://youtu.be/f0UB06v7yLY"
+      start_minute: 0
+      duration_seconds: 120
+ElonMusk:
+  description: "Visionary entrepreneur who loves low quality memes."
+  references:
+    - url: "https://youtu.be/DxREm3s1scA"
+      start_minute: 1.7
+      duration_seconds: 27
+    - url: "https://youtu.be/DxREm3s1scA"
+      start_minute: 18.5
+      duration_seconds: 60
+LexFridman:
+  description: "Depressing and lonely thinker, makes references to classic literature"
+  references:
+    - url: "https://youtu.be/DxREm3s1scA"
+      start_minute: 1
+      duration_seconds: 30
+    - url: "https://youtu.be/DxREm3s1scA"
+      start_minute: 41.4
+      duration_seconds: 30