"""Gradio front end for a multi-character spoken conversation demo.

The module keeps one global ``ConversationState`` and wires Gradio widgets
to callbacks that append to the conversation (from microphone input or from
the language model), play or export the audio, and register new voices.
"""
import asyncio
import html
import logging
import os
import random
from typing import Dict, List, Optional, Tuple

import gradio as gr
import yaml

from src.elevenlabs import (Speaker, check_voice_exists, get_make_voice,
                            play_history, save_history)
from src.openailib import speech_to_text, top_response
from src.tube import extract_audio

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


def set_openai_key(openai_api_key_textbox):
    """Store the OpenAI API key in the environment and on the client module."""
    log.info("Setting OpenAI key.")
    os.environ["OPENAI_API_KEY"] = openai_api_key_textbox
    # Imported lazily, as in the original, so the module loads without it.
    import openai
    openai.api_key = openai_api_key_textbox


def set_elevenlabs_key(elevenlabs_api_key_textbox):
    """Store the ElevenLabs API key in the environment and on the client module."""
    log.info("Setting ElevenLabs key.")
    os.environ["ELEVENLABS_API_KEY"] = elevenlabs_api_key_textbox
    import elevenlabs
    elevenlabs.api_key = elevenlabs_api_key_textbox


class ConversationState:
    """Everything needed to run one conversation.

    Holds the model settings, the characters loaded from ``voices.yaml``,
    the ``Speaker`` objects for the active characters, the system prompt
    and the running history of ``(speaker, text)`` turns.
    """

    # Bubble colours, assigned to speakers round-robin.
    COLORS: list = ['#FFA07A', '#F08080', '#AFEEEE', '#B0E0E6', '#DDA0DD',
                    '#FFFFE0', '#F0E68C', '#90EE90', '#87CEFA', '#FFB6C1']
    YAML_FILEPATH: str = os.path.join(os.path.dirname(__file__), 'voices.yaml')
    AUDIO_SAVEDIR: str = os.path.join(
        os.path.dirname(__file__), 'audio_export')

    def __init__(self,
                 names: Optional[list] = None,
                 iam: Optional[str] = None,
                 model: str = "gpt-3.5-turbo",
                 max_tokens: int = 30,
                 temperature: float = 0.5,
                 history: Optional[list] = None):
        """Load the character config and build speakers and system prompt.

        Args:
            names: Characters taking part; two distinct random characters
                are chosen when empty or ``None``.
            iam: Character the local user speaks as; a random member of
                ``names`` when empty or ``None``.
            model: Model name passed to ``top_response``.
            max_tokens: Token limit passed to ``top_response``.
            temperature: Sampling temperature passed to ``top_response``.
            history: Existing ``(speaker, text)`` turns to continue from;
                a fresh empty history is used when ``None``.

        Raises:
            AssertionError: If ``iam`` is not one of ``names``.
        """
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature

        # Make sure save dir exists, make any necessary directories
        os.makedirs(self.AUDIO_SAVEDIR, exist_ok=True)
        self.audio_savepath = os.path.join(
            self.AUDIO_SAVEDIR, 'conversation.wav')

        log.info("Resetting conversation")
        with open(self.YAML_FILEPATH, 'r') as file:
            self.characters_yaml = file.read()
        # Parse the text already read instead of rewinding the file handle.
        self.characters_dict = yaml.safe_load(self.characters_yaml)
        self.all_characters = list(self.characters_dict)

        # random.sample draws without replacement, so the default cast
        # never contains the same character twice.
        self.names = names or random.sample(
            self.all_characters, k=min(2, len(self.all_characters)))
        self.iam = iam or random.choice(self.names)
        assert self.iam in self.names, f"{self.iam} not in {self.names}"

        log.info("Loading voices")
        self.speakers: Dict[str, Speaker] = {}
        self.speakers_descriptions: str = ''
        for i, name in enumerate(self.names):
            if check_voice_exists(name) is None:
                log.warning(f"Voice {name} does not exist")
                continue
            _speaker = Speaker(
                name=name,
                voice=get_make_voice(name),
                color=self.COLORS[i % len(self.COLORS)],
                description=self.characters_dict[name].get(
                    "description", None),
            )
            self.speakers[name] = _speaker
            if _speaker.description is not None:
                self.speakers_descriptions += (
                    f"{_speaker.name}: {_speaker.description}.\n")

        # System is fed into OpenAI to condition the prompt
        self.system = self._build_system_prompt()

        # History is fed in at every step. Always bind the attribute, even
        # when the caller supplies an existing history.
        self.step = 0
        self.history: List[Tuple[Speaker, str]] = (
            [] if history is None else history)

    def _build_system_prompt(self) -> str:
        """Assemble the system prompt from the cast and fixed instructions."""
        parts = [
            "You create funny conversation dialogues.",
            f"This conversation is between {', '.join(self.names)}.",
            "Do not introduce new characters.",
            "Descriptions for each of the characters are:\n",
        ]
        parts.extend(
            f"{speaker.name}: {speaker.description}\n"
            for speaker in self.speakers.values())
        parts.extend([
            "Only return one person's response at a time.",
            "Each response must start with the character name, then a colon, then their response in a single line.",
            "Keep the responses short and witty.",
            "Make sure the responses are only one sentence long.",
            "Do not continue a previous response. Always start a new response.",
        ])
        return "".join(parts)

    def add_to_history(self, text: str, speaker: Speaker = None):
        """Append a turn; defaults to the speaker the user is playing."""
        if speaker is None:
            speaker = self.speakers[self.iam]
        self.history.append((speaker, text))

    def history_to_prompt(self) -> str:
        """Render the history as ``name:text`` lines for the model prompt."""
        return "".join(
            f"{speaker.name}:{text}\n" for speaker, text in self.history)

    def html_history(self) -> str:
        """Render the history for the conversation ``gr.HTML`` component.

        Speaker names and text are HTML-escaped because the text comes from
        speech recognition and model output.
        """
        # NOTE(review): this template contains no markup and never uses
        # speaker.color, although the result feeds a gr.HTML component --
        # confirm whether styling tags were lost from this file.
        return "".join(
            f"\n{html.escape(str(speaker.name), quote=False)}: "
            f"{html.escape(str(text), quote=False)}\n"
            for speaker, text in self.history)


# Conversation state lives in module scope and is shared by every callback
# below; a known compromise that keeps the Gradio wiring simple.
STATE = ConversationState()


def reset(names, iam, model, max_tokens, temperature):
    """Replace the global state with a fresh conversation and render it."""
    # Push new global state to the global scope
    global STATE
    STATE = ConversationState(
        names=names,
        iam=iam,
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return STATE.html_history()


def step_mic(audio):
    """Transcribe a recording and add it as the user's turn."""
    try:
        request = speech_to_text(audio)
        STATE.add_to_history(request)
    except TypeError as e:
        # Best effort, as in the original: log and leave the history as is.
        log.warning(e)
    return STATE.html_history()


def _parse_response_line(line: str,
                         state: ConversationState) -> Tuple[Speaker, str]:
    """Split a ``Name: text`` model line into its speaker and text.

    Raises:
        ValueError: If the line has no colon, names an unknown character or
            one without a loaded voice, or carries no text.
    """
    if ":" not in line:
        raise ValueError(f"Line {line} does not have a colon")
    # Split on the first colon only so colons inside the text are kept.
    raw_name, _, text = line.partition(":")
    name = raw_name.strip()
    if name not in state.all_characters:
        raise ValueError(f"Name {name} is not in {state.all_characters}")
    if name not in state.speakers:
        raise ValueError(f"Name {name} has no loaded voice")
    if not text.strip():
        raise ValueError(f"Text {text} is empty")
    return state.speakers[name], text


def step_continue():
    """Ask the model for the next turn(s) and append every valid line."""
    response = top_response(
        STATE.history_to_prompt(),
        system=STATE.system,
        model=STATE.model,
        max_tokens=STATE.max_tokens,
        temperature=STATE.temperature,
    )
    for line in response.splitlines():
        if not line:
            continue
        # TODO: Add any further filters inside _parse_response_line
        try:
            speaker, text = _parse_response_line(line, STATE)
        except ValueError as e:
            log.warning(e)
            continue
        STATE.add_to_history(text, speaker=speaker)
    return STATE.html_history()


def save_audio():
    """Export the conversation audio and return the path it was saved to."""
    log.info("Saving audio")
    asyncio.run(save_history(STATE.history, STATE.audio_savepath))
    return STATE.audio_savepath


def play_audio():
    """Play the conversation audio and return the rendered history."""
    log.info("Playing audio")
    asyncio.run(play_history(STATE.history))
    return STATE.html_history()


def make_voices(voices_yaml: str):
    """Create voices for every character in the YAML that lacks one.

    Args:
        voices_yaml: YAML text mapping each character name to metadata with
            a ``references`` list of dicts holding a ``url`` and optional
            ``start_minute`` / ``duration_seconds``.

    Returns:
        The string ``"Success"``. Any failure propagates as an exception.
    """
    STATE.characters_dict = yaml.safe_load(voices_yaml)
    for name, metadata in STATE.characters_dict.items():
        videos = metadata['references']
        assert isinstance(name, str), f"Name {name} is not a string"
        assert isinstance(videos, list), f"Videos {videos} is not a list"
        if check_voice_exists(name):
            continue
        audio_paths = []
        for i, video in enumerate(videos):
            assert isinstance(video, dict), f"Video {video} is not a dict"
            assert 'url' in video, f"Video {video} does not have a url"
            url = video['url']
            start_minute = video.get('start_minute', 0)
            duration = video.get('duration_seconds', 120)
            label = os.path.join(STATE.AUDIO_SAVEDIR, f"audio.{name}.{i}")
            output_path = extract_audio(url, label, start_minute, duration)
            audio_paths.append(output_path)
        get_make_voice(name, audio_paths)
    return "Success"


# Define the main GradIO UI
with gr.Blocks() as demo:
    gr.HTML('''

Speech2Speech

''')
    with gr.Tab("Conversation"):
        gr_convo_output = gr.HTML()
        with gr.Row():
            with gr.Column():
                gr_mic = gr.Audio(
                    label="Record audio into conversation",
                    source="microphone",
                    type="filepath",
                )
                gr_add_button = gr.Button(value="Add to conversation")
                gr_playaudio_button = gr.Button(value="Play audio")
                gr_saveaudio_button = gr.Button(value="Export audio")
                gr_outputaudio = gr.Audio(
                    label="Audio output",
                    source="upload",
                    type="filepath",
                )
            with gr.Column():
                gr_iam = gr.Dropdown(
                    choices=STATE.all_characters, label="I am",
                    value=STATE.iam)
                gr_chars = gr.CheckboxGroup(
                    STATE.all_characters, label="Characters",
                    value=STATE.names)
                gr_reset_button = gr.Button(value="Reset conversation")
                with gr.Accordion("Settings", open=False):
                    openai_api_key_textbox = gr.Textbox(
                        placeholder="Paste your OpenAI API key here",
                        show_label=False,
                        lines=1,
                        type="password",
                    )
                    elevenlabs_api_key_textbox = gr.Textbox(
                        placeholder="Paste your ElevenLabs API key here",
                        show_label=False,
                        lines=1,
                        type="password",
                    )
                    gr_model = gr.Dropdown(
                        choices=["gpt-3.5-turbo", "gpt-4"],
                        label='GPT Model behind conversation',
                        value=STATE.model)
                    gr_max_tokens = gr.Slider(
                        minimum=1, maximum=500, value=STATE.max_tokens,
                        label="Max tokens", step=1)
                    gr_temperature = gr.Slider(
                        minimum=0.0, maximum=1.0, value=STATE.temperature,
                        label="Temperature (randomness in conversation)")
    with gr.Tab("New Characters"):
        gr_make_voice_button = gr.Button(value="Update Characters")
        gr_voice_data = gr.Textbox(
            lines=25, label="Character YAML config",
            value=STATE.characters_yaml)
        gr_make_voice_output = gr.Textbox(
            lines=2, label="Character creation logs...")
    gr.HTML('''
Created by Hu Po GitHub: speech2speech
Duplicate this space:Duplicate Space
''')

    # Buttons and actions
    gr_mic.change(step_mic, gr_mic, gr_convo_output)
    openai_api_key_textbox.change(
        set_openai_key, openai_api_key_textbox, None)
    elevenlabs_api_key_textbox.change(
        set_elevenlabs_key, elevenlabs_api_key_textbox, None)
    gr_add_button.click(step_continue, None, gr_convo_output)
    gr_reset_button.click(
        reset,
        inputs=[gr_chars, gr_iam, gr_model, gr_max_tokens, gr_temperature],
        outputs=[gr_convo_output],
    )
    gr_saveaudio_button.click(save_audio, None, gr_outputaudio)
    gr_playaudio_button.click(play_audio, None, None)
    gr_make_voice_button.click(
        make_voices,
        inputs=gr_voice_data,
        outputs=gr_make_voice_output,
    )

if __name__ == "__main__":
    demo.launch()