speech2speech / app.py
hu-po's picture
release 0.1
d2d6865
raw
history blame
11.7 kB
import asyncio
import logging
import os
import random
from typing import Dict, List, Tuple
import gradio as gr
import yaml
from src.elevenlabs import (Speaker, check_voice_exists, get_make_voice,
play_history, save_history)
from src.openailib import top_response, speech_to_text
from src.tube import extract_audio
# Configure the root logger at INFO level and create this module's logger,
# which the handlers below use for progress and warning messages.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)
def set_openai_key(openai_api_key_textbox):
    """Apply the OpenAI API key typed into the settings textbox.

    The value is exported as the ``OPENAI_API_KEY`` environment variable and
    then copied onto ``openai.api_key``.

    Args:
        openai_api_key_textbox: Current content of the key textbox.
    """
    env_var = "OPENAI_API_KEY"
    log.info("Setting OpenAI key.")
    os.environ[env_var] = openai_api_key_textbox
    # Imported here, after the environment variable has been written.
    import openai
    openai.api_key = os.environ.get(env_var)
def set_elevenlabs_key(elevenlabs_api_key_textbox):
    """Apply the ElevenLabs API key typed into the settings textbox.

    The value is exported as the ``ELEVENLABS_API_KEY`` environment variable
    and then copied onto ``elevenlabs.api_key``.

    Args:
        elevenlabs_api_key_textbox: Current content of the key textbox.
    """
    env_var = "ELEVENLABS_API_KEY"
    log.info("Setting ElevenLabs key.")
    os.environ[env_var] = elevenlabs_api_key_textbox
    # Imported here, after the environment variable has been written.
    import elevenlabs
    elevenlabs.api_key = os.environ.get(env_var)
class ConversationState:
    """Settings, loaded speakers, system prompt and history of one conversation.

    Constructing an instance reads the character YAML file, creates a
    ``Speaker`` for every requested character whose voice exists, and builds
    the system prompt that is sent along with the history on every step.
    """

    # Background colours for the chat bubbles, assigned by character index.
    COLORS: list = ['#FFA07A', '#F08080', '#AFEEEE', '#B0E0E6', '#DDA0DD',
                    '#FFFFE0', '#F0E68C', '#90EE90', '#87CEFA', '#FFB6C1']
    # Character configuration file, located next to this module.
    YAML_FILEPATH: str = os.path.join(os.path.dirname(__file__), 'voices.yaml')
    # Directory that receives exported audio, located next to this module.
    AUDIO_SAVEDIR: str = os.path.join(
        os.path.dirname(__file__), 'audio_export')

    def __init__(self,
                 names: list = None,
                 iam: str = None,
                 model: str = "gpt-3.5-turbo",
                 max_tokens: int = 30,
                 temperature: float = 0.5,
                 history: list = None):
        """Load the characters and prepare a conversation.

        Args:
            names: Characters taking part. When empty or ``None``, up to two
                distinct characters are drawn at random from the YAML file.
            iam: Character the user speaks as. When empty or ``None``, one of
                ``names`` is picked at random.
            model: Model name forwarded to the completion call.
            max_tokens: Token limit forwarded to the completion call.
            temperature: Sampling temperature forwarded to the completion call.
            history: Optional initial list of ``(speaker, text)`` pairs. A
                copy is stored; ``None`` starts an empty history.

        Raises:
            AssertionError: If ``iam`` is not one of ``names``.
        """
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature

        # Make sure save dir exists, make any necessary directories
        os.makedirs(self.AUDIO_SAVEDIR, exist_ok=True)
        self.audio_savepath = os.path.join(
            self.AUDIO_SAVEDIR, 'conversation.wav')

        log.info("Resetting conversation")
        with open(self.YAML_FILEPATH, 'r') as file:
            self.characters_yaml = file.read()
        # Parse the text already in memory instead of rewinding the file.
        self.characters_dict = yaml.safe_load(self.characters_yaml)
        self.all_characters = list(self.characters_dict)

        if not names:
            # Sample WITHOUT replacement so the default cast never contains
            # the same character twice.
            names = random.sample(
                self.all_characters, k=min(2, len(self.all_characters)))
        self.names = names
        self.iam = iam or random.choice(self.names)
        assert self.iam in self.names, f"{self.iam} not in {self.names}"

        log.info("Loading voices")
        self.speakers: Dict[str, Speaker] = {}
        self.speakers_descriptions: str = ''
        for i, name in enumerate(self.names):
            if check_voice_exists(name) is None:
                log.warning(f"Voice {name} does not exist")
                continue
            _speaker = Speaker(
                name=name,
                voice=get_make_voice(name),
                color=self.COLORS[i % len(self.COLORS)],
                description=self.characters_dict[name].get(
                    "description", None),
            )
            self.speakers[name] = _speaker
            if _speaker.description is not None:
                self.speakers_descriptions += (
                    f"{_speaker.name}: {_speaker.description}.\n")

        # System is fed into OpenAI to condition the prompt. The pieces are
        # joined without separators, matching the established prompt text.
        prompt_parts = [
            "You create funny conversation dialogues.",
            f"This conversation is between {', '.join(self.names)}.",
            "Do not introduce new characters.",
            "Descriptions for each of the characters are:\n",
        ]
        prompt_parts.extend(
            f"{speaker.name}: {speaker.description}\n"
            for speaker in self.speakers.values())
        prompt_parts.extend([
            "Only return one person's response at a time.",
            "Each response must start with the character name, then a colon, then their response in a single line.",
            "Keep the responses short and witty.",
            "Make sure the responses are only one sentence long.",
            "Do not continue a previous response. Always start a new response.",
        ])
        self.system = "".join(prompt_parts)

        # History is fed in at every step
        self.step = 0
        # Always define the attribute, including when a history is supplied.
        self.history: List[Tuple[Speaker, str]] = (
            [] if history is None else list(history))

    def add_to_history(self, text: str, speaker: "Speaker" = None):
        """Append ``text`` to the history.

        Args:
            text: What was said.
            speaker: Who said it; defaults to the speaker registered under
                ``self.iam``.
        """
        if speaker is None:
            speaker = self.speakers[self.iam]
        self.history.append((speaker, text))

    def history_to_prompt(self) -> str:
        """Return the history as ``name:text`` lines, one per entry."""
        return "".join(
            f"{speaker.name}:{text}\n" for speaker, text in self.history)

    def html_history(self) -> str:
        """Return the history as a sequence of coloured HTML ``<div>`` bubbles.

        Speaker names and texts are HTML-escaped, because the text comes from
        model output or speech transcription and must not be interpreted as
        markup.
        """
        import html  # local import: the module header does not import it

        bubbles = []
        for speaker, text in self.history:
            safe_name = html.escape(speaker.name)
            safe_text = html.escape(text)
            bubbles.append(
                f"<div style='background-color: {speaker.color}; border-radius: 5px; padding: 5px; margin: 5px;'>{safe_name}: {safe_text}</div>")
        return "".join(bubbles)
# Module-level conversation state. The handlers below read and mutate this
# single instance, and `reset` replaces it with a fresh one. Keeping state in
# a global is a pragmatic shortcut here, not a recommended pattern.
STATE = ConversationState()
def reset(names, iam, model, max_tokens, temperature):
    """Start a new conversation and return its (empty) HTML rendering.

    Builds a fresh ``ConversationState`` from the given settings and installs
    it as the module-level ``STATE``.
    """
    global STATE
    settings = {
        "names": names,
        "iam": iam,
        "model": model,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    # Swap in the new state for every handler that reads the global.
    STATE = ConversationState(**settings)
    return STATE.html_history()
def step_mic(audio):
    """Transcribe ``audio`` into the conversation and return the HTML history.

    A ``TypeError`` raised while transcribing or storing the text is logged
    as a warning and otherwise ignored, so the current history is still
    returned.
    """
    try:
        STATE.add_to_history(speech_to_text(audio))
    except TypeError as err:
        log.warning(err)
    return STATE.html_history()
def step_continue():
    """Ask the model to continue the conversation and return the HTML history.

    The current history and system prompt are sent to ``top_response``. Each
    non-blank line of the reply is expected to look like ``Name: text``.
    Lines that lack a colon, name a character without a loaded speaker, or
    carry no text are logged as warnings and skipped.
    """
    response = top_response(
        STATE.history_to_prompt(),
        system=STATE.system,
        model=STATE.model,
        max_tokens=STATE.max_tokens,
        temperature=STATE.temperature,
    )
    for line in response.splitlines():
        if not line.strip():
            continue
        # Split on the FIRST colon only: the spoken text may itself contain
        # colons ("Bob: Here's the thing: ...").
        name, separator, text = line.partition(":")
        if not separator:
            log.warning(f"Line {line} does not have a colon")
            continue
        name = name.strip()
        # Look the name up where it is actually used; a character can be
        # configured without having a loaded speaker in this conversation.
        speaker = STATE.speakers.get(name)
        if speaker is None:
            log.warning(f"Name {name} is not in {list(STATE.speakers)}")
            continue
        if not text.strip():
            log.warning(f"Text {text} is empty")
            continue
        STATE.add_to_history(text, speaker=speaker)
    return STATE.html_history()
def save_audio():
    """Export the conversation audio and return the path it was saved under.

    Runs the ``save_history`` coroutine to completion with the current
    history and the state's export path.
    """
    log.info("Saving audio")
    export_job = save_history(STATE.history, STATE.audio_savepath)
    asyncio.run(export_job)
    return STATE.audio_savepath
def play_audio():
    """Play back the conversation and return the HTML history.

    Runs the ``play_history`` coroutine to completion with the current
    history.
    """
    log.info("Playing audio")
    playback_job = play_history(STATE.history)
    asyncio.run(playback_job)
    return STATE.html_history()
def make_voices(voices_yaml: str):
    """Create voices for the characters described in ``voices_yaml``.

    The YAML text must be a mapping of character name to metadata, where
    ``metadata['references']`` is a list of mappings with a ``url`` and
    optional ``start_minute`` (default 0) and ``duration_seconds``
    (default 120). The parsed mapping replaces ``STATE.characters_dict``.
    For every character whose voice does not exist yet, audio is extracted
    from each reference and passed to ``get_make_voice``.

    Args:
        voices_yaml: Character configuration as YAML text.

    Returns:
        The string ``"Success"`` when every character was processed.

    Raises:
        ValueError: If the configuration does not have the expected shape.
    """
    characters = yaml.safe_load(voices_yaml)
    # Empty or scalar YAML parses to a non-mapping; reject it up front
    # instead of failing later on `.items()`.
    if not isinstance(characters, dict):
        raise ValueError(
            f"Character config must be a mapping, got {type(characters).__name__}")
    STATE.characters_dict = characters

    for name, metadata in characters.items():
        if not isinstance(name, str):
            raise ValueError(f"Name {name} is not a string")
        videos = metadata.get('references') if isinstance(metadata, dict) else None
        if not isinstance(videos, list):
            raise ValueError(f"Videos {videos} is not a list")
        if check_voice_exists(name):
            continue
        audio_paths = []
        for i, video in enumerate(videos):
            if not isinstance(video, dict):
                raise ValueError(f"Video {video} is not a dict")
            if 'url' not in video:
                raise ValueError(f"Video {video} does not have a url")
            start_minute = video.get('start_minute', 0)
            duration = video.get('duration_seconds', 120)
            label = os.path.join(STATE.AUDIO_SAVEDIR, f"audio.{name}.{i}")
            audio_paths.append(
                extract_audio(video['url'], label, start_minute, duration))
        get_make_voice(name, audio_paths)
    return "Success"
# Define the main GradIO UI
# NOTE(review): the nesting of the layout containers below was reconstructed
# from statement order — confirm it matches the intended layout.
with gr.Blocks() as demo:
    gr.HTML('''<center><h1>Speech2Speech</h1></center>''')
    # Tab 1: the running conversation, its controls and the settings.
    with gr.Tab("Conversation"):
        # Rendered conversation history (HTML returned by the handlers).
        gr_convo_output = gr.HTML()
        with gr.Row():
            # Left column: audio input, conversation actions and audio export.
            with gr.Column():
                gr_mic = gr.Audio(
                    label="Record audio into conversation",
                    source="microphone",
                    type="filepath",
                )
                gr_add_button = gr.Button(value="Add to conversation")
                gr_playaudio_button = gr.Button(value="Play audio")
                gr_saveaudio_button = gr.Button(value="Export audio")
                gr_outputaudio = gr.Audio(
                    label="Audio output",
                    source="upload",
                    type="filepath",
                )
            # Right column: character selection, reset and settings.
            with gr.Column():
                gr_iam = gr.Dropdown(
                    choices=STATE.all_characters, label="I am", value=STATE.iam)
                gr_chars = gr.CheckboxGroup(
                    STATE.all_characters, label="Characters", value=STATE.names)
                gr_reset_button = gr.Button(value="Reset conversation")
                # API keys and model parameters, collapsed by default.
                with gr.Accordion("Settings", open=False):
                    openai_api_key_textbox = gr.Textbox(
                        placeholder="Paste your OpenAI API key here",
                        show_label=False,
                        lines=1,
                        type="password",
                    )
                    elevenlabs_api_key_textbox = gr.Textbox(
                        placeholder="Paste your ElevenLabs API key here",
                        show_label=False,
                        lines=1,
                        type="password",
                    )
                    gr_model = gr.Dropdown(choices=["gpt-3.5-turbo", "gpt-4"],
                                           label='GPT Model behind conversation', value=STATE.model)
                    gr_max_tokens = gr.Slider(minimum=1, maximum=500, value=STATE.max_tokens,
                                              label="Max tokens", step=1)
                    gr_temperature = gr.Slider(
                        minimum=0.0, maximum=1.0, value=STATE.temperature, label="Temperature (randomness in conversation)")
    # Tab 2: edit the character YAML and create the corresponding voices.
    with gr.Tab("New Characters"):
        gr_make_voice_button = gr.Button(value="Update Characters")
        # Pre-filled with the YAML text read when STATE was constructed.
        gr_voice_data = gr.Textbox(
            lines=25, label="Character YAML config", value=STATE.characters_yaml)
        gr_make_voice_output = gr.Textbox(
            lines=2, label="Character creation logs...")
    # Footer with credits and a "duplicate this space" link.
    gr.HTML('''<center>
Created by <a href="https://youtube.com/@hu-po">Hu Po</a> GitHub: <a href="https://github.com/hu-po/speech2speech">speech2speech</a>
<br>
Duplicate this space:<a href="https://huggingface.co/spaces/hu-po/speech2speech?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
</center>
''')
    # Buttons and actions
    # A change of the microphone component is passed to step_mic, which
    # returns the refreshed conversation HTML.
    gr_mic.change(step_mic, gr_mic, gr_convo_output)
    # Key textboxes apply their value on every change; no output component.
    openai_api_key_textbox.change(set_openai_key, openai_api_key_textbox, None)
    elevenlabs_api_key_textbox.change(
        set_elevenlabs_key, elevenlabs_api_key_textbox, None)
    # "Add to conversation" asks the model for the next line(s).
    gr_add_button.click(step_continue, None, gr_convo_output)
    # Input order must match the parameter order of reset().
    gr_reset_button.click(
        reset,
        inputs=[gr_chars, gr_iam, gr_model, gr_max_tokens, gr_temperature],
        outputs=[gr_convo_output],
    )
    # save_audio returns the exported file path, shown in the output player.
    gr_saveaudio_button.click(save_audio, None, gr_outputaudio)
    gr_playaudio_button.click(play_audio, None, None)
    gr_make_voice_button.click(
        make_voices, inputs=gr_voice_data, outputs=gr_make_voice_output,
    )
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()