Spaces:
Build error
Build error
File size: 11,264 Bytes
d2d6865 3274df2 d2d6865 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 |
import asyncio
import logging
import os
import random
from typing import Dict, List, Tuple
import gradio as gr
import yaml
from src.elevenlabs import (Speaker, check_voice_exists, get_make_voice,
play_history, save_history, set_elevenlabs_key)
from src.openailib import top_response, speech_to_text, set_openai_key
from src.tube import extract_audio
# Configure the root logger so INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by every function and class in this file.
log = logging.getLogger(__name__)
class ConversationState:
    """Settings, speakers and running transcript of one conversation.

    Constructing an instance reads the character definitions from
    ``YAML_FILEPATH``, chooses the participating characters, builds a
    ``Speaker`` for every participant whose voice exists and assembles the
    system prompt that conditions the language model.
    """

    # Bubble background colours, handed out to speakers round-robin.
    COLORS: list = ['#FFA07A', '#F08080', '#AFEEEE', '#B0E0E6', '#DDA0DD',
                    '#FFFFE0', '#F0E68C', '#90EE90', '#87CEFA', '#FFB6C1']
    # YAML file (next to this module) holding the character definitions.
    YAML_FILEPATH: str = os.path.join(os.path.dirname(__file__), 'voices.yaml')
    # Directory (next to this module) that audio files are written to.
    AUDIO_SAVEDIR: str = os.path.join(
        os.path.dirname(__file__), 'audio_export')

    def __init__(self,
                 names: list = None,
                 iam: str = None,
                 model: str = "gpt-3.5-turbo",
                 max_tokens: int = 30,
                 temperature: float = 0.5,
                 history: list = None):
        """Load the characters and prepare a conversation.

        Args:
            names: Names of the participating characters. When empty or
                ``None``, up to two distinct characters are picked at random.
            iam: Name of the character the user speaks as. When empty or
                ``None``, one of ``names`` is picked at random.
            model: Model name passed on to ``top_response``.
            max_tokens: Token limit passed on to ``top_response``.
            temperature: Sampling temperature passed on to ``top_response``.
            history: Optional list of ``(speaker, text)`` pairs to start from.
                It is copied; ``None`` starts with an empty history.

        Raises:
            AssertionError: If ``iam`` is not one of ``names``.
        """
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature

        # Make sure save dir exists, make any necessary directories
        os.makedirs(self.AUDIO_SAVEDIR, exist_ok=True)
        self.audio_savepath = os.path.join(
            self.AUDIO_SAVEDIR, 'conversation.wav')

        log.info("Resetting conversation")
        with open(self.YAML_FILEPATH, 'r', encoding='utf-8') as file:
            # The raw text is kept so the UI can show it for editing.
            self.characters_yaml = file.read()
        # Parse the text already in memory rather than rewinding the file.
        self.characters_dict = yaml.safe_load(self.characters_yaml)
        self.all_characters = list(self.characters_dict)

        if names:
            self.names = names
        else:
            # sample() draws *distinct* characters; choices() draws with
            # replacement and could pair a character with itself.
            how_many = min(2, len(self.all_characters))
            self.names = random.sample(self.all_characters, how_many)
        self.iam = iam or random.choice(self.names)
        assert self.iam in self.names, f"{self.iam} not in {self.names}"

        log.info("Loading voices")
        self.speakers: Dict[str, "Speaker"] = {}
        self.speakers_descriptions: str = ''
        for i, name in enumerate(self.names):
            if check_voice_exists(name) is None:
                # Participants without a voice are skipped, not fatal.
                log.warning(f"Voice {name} does not exist")
                continue
            _speaker = Speaker(
                name=name,
                voice=get_make_voice(name),
                color=self.COLORS[i % len(self.COLORS)],
                description=self.characters_dict[name].get(
                    "description", None),
            )
            self.speakers[name] = _speaker
            if _speaker.description is not None:
                self.speakers_descriptions += f"{_speaker.name}: {_speaker.description}.\n"

        # System is fed into OpenAI to condition the prompt.
        # The pieces are joined without separators, exactly as before.
        system_parts = [
            "You create funny conversation dialogues.",
            f"This conversation is between {', '.join(self.names)}.",
            "Do not introduce new characters.",
            "Descriptions for each of the characters are:\n",
        ]
        system_parts.extend(
            f"{speaker.name}: {speaker.description}\n"
            for speaker in self.speakers.values())
        system_parts.extend([
            "Only return one person's response at a time.",
            "Each response must start with the character name, then a colon, then their response in a single line.",
            "Keep the responses short and witty.",
            "Make sure the responses are only one sentence long.",
            "Do not continue a previous response. Always start a new response.",
        ])
        self.system = "".join(system_parts)

        # History is fed in at every step
        self.step = 0
        # Always define the attribute, also when a history is supplied.
        self.history: List[Tuple["Speaker", str]] = (
            [] if history is None else list(history))

    def add_to_history(self, text: str, speaker: "Speaker" = None):
        """Append one utterance to the history.

        Args:
            text: What was said.
            speaker: Who said it; defaults to the speaker registered under
                the user's own character (``self.iam``).
        """
        if speaker is None:
            speaker = self.speakers[self.iam]
        self.history.append((speaker, text))

    def history_to_prompt(self) -> str:
        """Return the history as ``name:text`` lines, one per utterance."""
        return "".join(
            f"{speaker.name}:{text}\n" for speaker, text in self.history)

    def html_history(self) -> str:
        """Return the history as a string of coloured HTML ``<div>`` bubbles.

        Speaker names and utterances are HTML-escaped because they come from
        speech transcription and model output and must not be able to inject
        markup into the page.
        """
        from html import escape

        bubbles = []
        for speaker, text in self.history:
            _bubble = (
                f"<div style='background-color: {speaker.color}; border-radius: 5px; padding: 5px; margin: 5px;'>"
                f"{escape(str(speaker.name))}: {escape(str(text))}</div>"
            )
            bubbles.append(_bubble)
        return "".join(bubbles)
# The active conversation is kept in a module-level global that the UI
# callbacks below read and that reset() replaces. Global state is not ideal,
# but it keeps the callbacks simple.
# Note that building the initial state here runs ConversationState.__init__
# at import time.
STATE = ConversationState()
def reset(names, iam, model, max_tokens, temperature):
    """Start a fresh conversation using the settings chosen in the UI.

    Builds a new ``ConversationState``, installs it as the module-level
    ``STATE`` and returns its history rendered as HTML.
    """
    global STATE
    settings = dict(
        names=names,
        iam=iam,
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    # The global is only rebound once the new state was built successfully.
    fresh_state = ConversationState(**settings)
    STATE = fresh_state
    return fresh_state.html_history()
def step_mic(audio):
    """Transcribe a recording and add it to the conversation.

    The transcript is appended with the default speaker, i.e. the user's own
    character. A ``TypeError`` raised along the way is logged as a warning
    and otherwise ignored, so the current history is always returned.

    Returns:
        The conversation history rendered as HTML.
    """
    try:
        STATE.add_to_history(speech_to_text(audio))
    except TypeError as err:
        log.warning(err)
    return STATE.html_history()
def step_continue():
    """Ask the language model to continue and record its reply.

    The reply is processed line by line. Every line is expected to look like
    ``<character name>:<text>``; lines that are blank, have no colon, name
    someone who is not a speaker in this conversation, or carry no text are
    logged as warnings and skipped.

    Returns:
        The conversation history rendered as HTML.
    """
    response = top_response(STATE.history_to_prompt(),
                            system=STATE.system,
                            model=STATE.model,
                            max_tokens=STATE.max_tokens,
                            temperature=STATE.temperature,
                            )
    for line in response.splitlines():
        # TODO: Add any further filters here
        if not line.strip():
            continue
        # Split on the FIRST colon only: the spoken text may itself contain
        # colons ("Bob: it's 10:30"), which a plain split(":") cannot unpack.
        name, colon, text = line.partition(":")
        if not colon:
            log.warning(f"Line {line} does not have a colon")
            continue
        name = name.strip()
        # Look the name up among the loaded speakers rather than among all
        # known characters, so a character without a loaded voice is skipped
        # instead of raising KeyError.
        speaker = STATE.speakers.get(name)
        if speaker is None:
            log.warning(f"Name {name} is not in {list(STATE.speakers)}")
            continue
        if not text.strip():
            log.warning(f"Text {text} is empty")
            continue
        STATE.add_to_history(text, speaker=speaker)
    return STATE.html_history()
def save_audio():
    """Export the conversation audio and return the path it was saved to.

    Runs the ``save_history`` coroutine to completion on the current history,
    writing to the state's ``audio_savepath``.
    """
    log.info("Saving audio")
    export_path = STATE.audio_savepath
    asyncio.run(save_history(STATE.history, export_path))
    return export_path
def play_audio():
    """Play the conversation so far, then return its HTML rendering.

    Runs the ``play_history`` coroutine to completion on the current history.
    """
    log.info("Playing audio")
    playback = play_history(STATE.history)
    asyncio.run(playback)
    return STATE.html_history()
def make_voices(voices_yaml: str):
    """Create a voice for every configured character that does not have one.

    The YAML text must be a mapping from character name to metadata, where
    the metadata has a ``references`` list. Each reference is a mapping with
    a ``url`` and optionally ``start_minute`` (default 0) and
    ``duration_seconds`` (default 120). For every character without an
    existing voice, audio is extracted from each reference and handed to
    ``get_make_voice``.

    Args:
        voices_yaml: Character configuration as YAML text.

    Returns:
        The string ``"Success"``.

    Raises:
        ValueError: If the configuration does not have the shape described
            above. Errors from YAML parsing, audio extraction or voice
            creation propagate unchanged.
    """
    # safe_load builds plain Python objects only, which matters here because
    # the text comes straight from a UI textbox.
    characters = yaml.safe_load(voices_yaml)
    if not isinstance(characters, dict):
        # Covers an empty textbox (None) and top-level lists or scalars.
        raise ValueError(f"Character config {characters} is not a mapping")
    STATE.characters_dict = characters

    for name, metadata in characters.items():
        if not isinstance(name, str):
            raise ValueError(f"Name {name} is not a string")
        videos = metadata.get('references') if isinstance(metadata, dict) else None
        if not isinstance(videos, list):
            raise ValueError(f"Videos {videos} is not a list")
        if check_voice_exists(name):
            # Voice already available, nothing to create.
            continue
        audio_paths = []
        for i, video in enumerate(videos):
            if not isinstance(video, dict):
                raise ValueError(f"Video {video} is not a dict")
            if 'url' not in video:
                raise ValueError(f"Video {video} does not have a url")
            url = video['url']
            start_minute = video.get('start_minute', 0)
            duration = video.get('duration_seconds', 120)
            # One output label per (character, reference index) pair.
            label = os.path.join(STATE.AUDIO_SAVEDIR, f"audio.{name}.{i}")
            audio_paths.append(
                extract_audio(url, label, start_minute, duration))
        get_make_voice(name, audio_paths)
    return "Success"
# Define the main GradIO UI
# NOTE(review): the nesting of the layout contexts below was reconstructed
# from statement order; confirm it against the intended layout.
with gr.Blocks() as demo:
    gr.HTML('''<center><h1>Speech2Speech</h1></center>''')
    with gr.Tab("Conversation"):
        # Conversation transcript, rendered from ConversationState.html_history()
        gr_convo_output = gr.HTML()
        with gr.Row():
            with gr.Column():
                # Recording and playback/export controls
                gr_mic = gr.Audio(
                    label="Record audio into conversation",
                    source="microphone",
                    type="filepath",
                )
                gr_add_button = gr.Button(value="Add to conversation")
                gr_playaudio_button = gr.Button(value="Play audio")
                gr_saveaudio_button = gr.Button(value="Export audio")
                gr_outputaudio = gr.Audio(
                    label="Audio output",
                    source="upload",
                    type="filepath",
                )
            with gr.Column():
                # Character selection; initial values come from the current STATE
                gr_iam = gr.Dropdown(
                    choices=STATE.all_characters, label="I am", value=STATE.iam)
                gr_chars = gr.CheckboxGroup(
                    STATE.all_characters, label="Characters", value=STATE.names)
                gr_reset_button = gr.Button(value="Reset conversation")
                with gr.Accordion("Settings", open=False):
                    # API keys are entered as password fields
                    openai_api_key_textbox = gr.Textbox(
                        placeholder="Paste your OpenAI API key here",
                        show_label=False,
                        lines=1,
                        type="password",
                    )
                    elevenlabs_api_key_textbox = gr.Textbox(
                        placeholder="Paste your ElevenLabs API key here",
                        show_label=False,
                        lines=1,
                        type="password",
                    )
                    # Generation settings; they are applied by reset()
                    gr_model = gr.Dropdown(choices=["gpt-3.5-turbo", "gpt-4"],
                                           label='GPT Model behind conversation', value=STATE.model)
                    gr_max_tokens = gr.Slider(minimum=1, maximum=500, value=STATE.max_tokens,
                                              label="Max tokens", step=1)
                    gr_temperature = gr.Slider(
                        minimum=0.0, maximum=1.0, value=STATE.temperature, label="Temperature (randomness in conversation)")
    with gr.Tab("New Characters"):
        # Editable character YAML, pre-filled with the text read from voices.yaml
        gr_make_voice_button = gr.Button(value="Update Characters")
        gr_voice_data = gr.Textbox(
            lines=25, label="Character YAML config", value=STATE.characters_yaml)
        gr_make_voice_output = gr.Textbox(
            lines=2, label="Character creation logs...")
    gr.HTML('''<center>
Created by <a href="https://youtube.com/@hu-po">Hu Po</a> GitHub: <a href="https://github.com/hu-po/speech2speech">speech2speech</a>
<br>
Duplicate this space:<a href="https://huggingface.co/spaces/hu-po/speech2speech?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
</center>
''')
    # Buttons and actions
    # A changed recording is passed to step_mic; the transcript is refreshed
    gr_mic.change(step_mic, gr_mic, gr_convo_output)
    # Key textboxes forward their value to the respective library setter
    openai_api_key_textbox.change(set_openai_key, openai_api_key_textbox, None)
    elevenlabs_api_key_textbox.change(
        set_elevenlabs_key, elevenlabs_api_key_textbox, None)
    # "Add to conversation" calls step_continue, which queries the model
    gr_add_button.click(step_continue, None, gr_convo_output)
    gr_reset_button.click(
        reset,
        inputs=[gr_chars, gr_iam, gr_model, gr_max_tokens, gr_temperature],
        outputs=[gr_convo_output],
    )
    # save_audio returns the exported file path, shown in the output player
    gr_saveaudio_button.click(save_audio, None, gr_outputaudio)
    gr_playaudio_button.click(play_audio, None, None)
    gr_make_voice_button.click(
        make_voices, inputs=gr_voice_data, outputs=gr_make_voice_output,
    )
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|