File size: 11,264 Bytes
d2d6865
 
 
 
 
 
 
 
 
 
3274df2
 
d2d6865
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
import asyncio
import html
import logging
import os
import random
from typing import Dict, List, Tuple

import gradio as gr
import yaml

from src.elevenlabs import (Speaker, check_voice_exists, get_make_voice,
                            play_history, save_history, set_elevenlabs_key)
from src.openailib import top_response, speech_to_text, set_openai_key
from src.tube import extract_audio

# Configure root logging at INFO level and create this module's logger,
# which the conversation state and the UI callbacks below write to.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


class ConversationState:
    """Configuration, speakers and transcript of a single conversation.

    Class attributes:
        COLORS: background colors cycled through for the speakers' chat bubbles.
        YAML_FILEPATH: path of the ``voices.yaml`` character config that sits
            next to this file.
        AUDIO_SAVEDIR: directory next to this file where audio is exported.
    """
    COLORS: list = ['#FFA07A', '#F08080', '#AFEEEE', '#B0E0E6', '#DDA0DD',
                    '#FFFFE0', '#F0E68C', '#90EE90', '#87CEFA', '#FFB6C1']
    YAML_FILEPATH: str = os.path.join(os.path.dirname(__file__), 'voices.yaml')
    AUDIO_SAVEDIR: str = os.path.join(
        os.path.dirname(__file__), 'audio_export')

    def __init__(self,
                 names: list = None,
                 iam: str = None,
                 model: str = "gpt-3.5-turbo",
                 max_tokens: int = 30,
                 temperature: float = 0.5,
                 history: list = None):
        """Load the character config and build a speaker per participant.

        Args:
            names: Characters taking part. When empty or None, up to two
                distinct characters are drawn at random from the config.
            iam: Character the user speaks as. When empty or None, one of
                ``names`` is picked at random.
            model: Stored on the instance as the text-generation model name.
            max_tokens: Stored on the instance as the generation token limit.
            temperature: Stored on the instance as the sampling temperature.
            history: Existing list of ``(speaker, text)`` pairs to continue
                from. A fresh empty list is used when None.

        Raises:
            ValueError: If no participant could be determined (no ``names``
                given and the character config is empty).
            AssertionError: If ``iam`` is not one of ``names``.
        """
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature
        # Make sure save dir exists, make any necessary directories
        os.makedirs(self.AUDIO_SAVEDIR, exist_ok=True)
        self.audio_savepath = os.path.join(
            self.AUDIO_SAVEDIR, 'conversation.wav')
        log.info("Resetting conversation")
        # Keep the raw YAML text as well as its parsed form.
        with open(self.YAML_FILEPATH, 'r') as file:
            self.characters_yaml = file.read()
        # safe_load returns None for an empty document; treat that as "no characters".
        self.characters_dict = yaml.safe_load(self.characters_yaml) or {}
        self.all_characters = list(self.characters_dict)
        if not names:
            # sample() draws without replacement, so the same character is
            # never picked twice (choices() could return a duplicate pair).
            names = random.sample(
                self.all_characters, k=min(2, len(self.all_characters)))
        if not names:
            raise ValueError(
                f"No characters available in {self.YAML_FILEPATH}")
        self.names = names
        self.iam = iam or random.choice(self.names)
        assert self.iam in self.names, f"{self.iam} not in {self.names}"
        log.info("Loading voices")
        self.speakers: Dict[str, Speaker] = {}
        self.speakers_descriptions: str = ''
        for i, name in enumerate(self.names):
            # Participants without an existing voice are skipped, so
            # self.speakers may hold fewer entries than self.names.
            if check_voice_exists(name) is None:
                log.warning(f"Voice {name} does not exist")
                continue
            _speaker = Speaker(
                name=name,
                voice=get_make_voice(name),
                color=self.COLORS[i % len(self.COLORS)],
                description=self.characters_dict[name].get("description"),
            )
            self.speakers[name] = _speaker
            if _speaker.description is not None:
                self.speakers_descriptions += f"{_speaker.name}: {_speaker.description}.\n"
        # System is fed into OpenAI to condition the prompt
        # NOTE(review): the sentences are concatenated without separating
        # spaces; left unchanged so the prompt text stays exactly as before.
        self.system = "You create funny conversation dialogues."
        self.system += f"This conversation is between {', '.join(self.names)}."
        self.system += "Do not introduce new characters."
        self.system += "Descriptions for each of the characters are:\n"
        for speaker in self.speakers.values():
            self.system += f"{speaker.name}: {speaker.description}\n"
        self.system += "Only return one person's response at a time."
        self.system += "Each response must start with the character name, then a colon, then their response in a single line."
        self.system += "Keep the responses short and witty."
        self.system += "Make sure the responses are only one sentence long."
        self.system += "Do not continue a previous response. Always start a new response."
        # History is fed in at every step
        self.step = 0
        # Always define self.history, including when a history is supplied.
        self.history: List[Tuple[Speaker, str]] = (
            [] if history is None else history)

    def add_to_history(self, text: str, speaker: "Speaker" = None):
        """Append ``text`` to the transcript.

        Args:
            text: What was said.
            speaker: Who said it. Defaults to the speaker registered under
                ``self.iam``.

        Raises:
            KeyError: If ``speaker`` is omitted and ``self.iam`` has no entry
                in ``self.speakers``.
        """
        if speaker is None:
            speaker = self.speakers[self.iam]
        self.history.append((speaker, text))

    def history_to_prompt(self) -> str:
        """Return the transcript as ``name:text`` lines, one per entry."""
        return ''.join(
            f"{speaker.name}:{text}\n" for speaker, text in self.history)

    def html_history(self) -> str:
        """Return the transcript as one colored HTML ``<div>`` per entry.

        Speaker names and texts are HTML-escaped: they come from the YAML
        config, speech transcription and the language model, and must not be
        able to inject markup into the page.
        """
        bubbles = []
        for speaker, text in self.history:
            safe_name = html.escape(str(speaker.name))
            safe_text = html.escape(str(text))
            bubbles.append(
                f"<div style='background-color: {speaker.color}; border-radius: 5px; padding: 5px; margin: 5px;'>{safe_name}: {safe_text}</div>")
        return ''.join(bubbles)


# NOTE: the active conversation is kept in a module-level global that the
# callbacks below read and that reset() replaces. This is a deliberate
# shortcut to keep the Gradio wiring simple, not a pattern to copy.
# Constructing it here runs at import time with default arguments.
STATE = ConversationState()


def reset(names, iam, model, max_tokens, temperature):
    """Start a new conversation from the given settings.

    Builds a new ConversationState from the arguments, installs it as the
    module-level STATE and returns its HTML transcript.
    """
    global STATE
    settings = dict(
        names=names,
        iam=iam,
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    # Swap the module-level state for the freshly built conversation
    STATE = ConversationState(**settings)
    return STATE.html_history()


def step_mic(audio):
    """Transcribe ``audio`` and append the text to the conversation.

    The transcription is added via STATE.add_to_history with no explicit
    speaker. A TypeError raised along the way is logged as a warning and
    otherwise ignored. Returns the HTML transcript either way.
    """
    try:
        STATE.add_to_history(speech_to_text(audio))
    except TypeError as err:
        log.warning(err)
    return STATE.html_history()


def step_continue():
    """Ask the language model to continue and append its lines to the history.

    The current transcript and system prompt are sent to ``top_response``.
    Every non-empty line of the reply is expected to look like
    ``<name>:<text>``; lines that fail a check are logged as warnings and
    skipped.

    Returns:
        The HTML transcript after the accepted lines were appended.
    """
    response = top_response(STATE.history_to_prompt(),
                            system=STATE.system,
                            model=STATE.model,
                            max_tokens=STATE.max_tokens,
                            temperature=STATE.temperature,
                            )
    for line in response.splitlines():
        if not line:
            continue
        try:
            # TODO: Add any filters here as assertion errors
            assert ":" in line, f"Line {line} does not have a colon"
            # Split on the first colon only: the spoken text may itself
            # contain colons (e.g. a time like "3:00").
            name, text = line.split(":", 1)
            name = name.strip()
            # Check against the speakers of THIS conversation; a character
            # that exists in the config but is not taking part has no
            # speaker entry and would otherwise raise KeyError.
            assert name in STATE.speakers, f"Name {name} is not in {list(STATE.speakers)}"
            assert len(text.strip()) > 0, f"Text {text} is empty"
            STATE.add_to_history(text, speaker=STATE.speakers[name])
        except AssertionError as e:
            log.warning(e)
    return STATE.html_history()


def save_audio():
    """Run ``save_history`` for the conversation and return the export path.

    The history and STATE.audio_savepath are handed to ``save_history``,
    which is driven to completion with ``asyncio.run``.
    """
    log.info("Saving audio")
    export_job = save_history(STATE.history, STATE.audio_savepath)
    asyncio.run(export_job)
    return STATE.audio_savepath


def play_audio():
    """Run ``play_history`` for the conversation and return the HTML transcript.

    The history is handed to ``play_history``, which is driven to completion
    with ``asyncio.run``.
    """
    log.info("Playing audio")
    playback_job = play_history(STATE.history)
    asyncio.run(playback_job)
    return STATE.html_history()


def make_voices(voices_yaml: str):
    """Create voices for the characters described in a YAML document.

    Args:
        voices_yaml: YAML text mapping each character name to its metadata.
            The metadata must contain a ``references`` list; each reference
            is a mapping with a ``url`` and optional ``start_minute``
            (default 0) and ``duration_seconds`` (default 120).

    Returns:
        The string ``"Success"`` once every character has been processed.

    Raises:
        AssertionError: If the document or one of its entries is malformed.
        Exception: Anything raised by the YAML parser or the audio/voice
            helpers propagates unchanged.
    """
    characters = yaml.safe_load(voices_yaml)
    # safe_load yields None for an empty document and arbitrary types for
    # malformed ones; refuse those before touching the shared state.
    assert isinstance(characters, dict), \
        f"Character config {characters} is not a mapping"
    for name, metadata in characters.items():
        assert isinstance(name, str), f"Name {name} is not a string"
        assert isinstance(metadata, dict) and 'references' in metadata, \
            f"Character {name} does not have references"
        assert isinstance(metadata['references'], list), \
            f"Videos {metadata['references']} is not a list"
    # Publish the new config only after its top-level structure checked out.
    STATE.characters_dict = characters
    for name, metadata in characters.items():
        if check_voice_exists(name):
            # Voice already available, nothing to download or create.
            continue
        audio_paths = []
        for i, video in enumerate(metadata['references']):
            assert isinstance(video, dict), f"Video {video} is not a dict"
            assert 'url' in video, f"Video {video} does not have a url"
            url = video['url']
            start_minute = video.get('start_minute', 0)
            duration = video.get('duration_seconds', 120)
            label = os.path.join(STATE.AUDIO_SAVEDIR, f"audio.{name}.{i}")
            output_path = extract_audio(url, label, start_minute, duration)
            audio_paths.append(output_path)
        get_make_voice(name, audio_paths)
    return "Success"


# Define the main GradIO UI
# Components are created in layout order inside nested context managers;
# the event wiring at the bottom connects them to the callbacks above.
with gr.Blocks() as demo:
    gr.HTML('''<center><h1>Speech2Speech</h1></center>''')
    # Tab 1: the running conversation plus its controls
    with gr.Tab("Conversation"):
        # Receives the HTML transcript returned by the callbacks
        gr_convo_output = gr.HTML()
        with gr.Row():
            # Left column: audio input/output and conversation actions
            with gr.Column():
                gr_mic = gr.Audio(
                    label="Record audio into conversation",
                    source="microphone",
                    type="filepath",
                )
                gr_add_button = gr.Button(value="Add to conversation")
                gr_playaudio_button = gr.Button(value="Play audio")
                gr_saveaudio_button = gr.Button(value="Export audio")
                gr_outputaudio = gr.Audio(
                    label="Audio output",
                    source="upload",
                    type="filepath",
                )
            # Right column: participants, reset and settings; initial values
            # come from the STATE built at import time
            with gr.Column():
                gr_iam = gr.Dropdown(
                    choices=STATE.all_characters, label="I am", value=STATE.iam)
                gr_chars = gr.CheckboxGroup(
                    STATE.all_characters, label="Characters", value=STATE.names)
                gr_reset_button = gr.Button(value="Reset conversation")
                with gr.Accordion("Settings", open=False):
                    # API keys are entered as password fields
                    openai_api_key_textbox = gr.Textbox(
                        placeholder="Paste your OpenAI API key here",
                        show_label=False,
                        lines=1,
                        type="password",
                    )
                    elevenlabs_api_key_textbox = gr.Textbox(
                        placeholder="Paste your ElevenLabs API key here",
                        show_label=False,
                        lines=1,
                        type="password",
                    )
                    # Generation settings; they are passed to reset() below
                    gr_model = gr.Dropdown(choices=["gpt-3.5-turbo", "gpt-4"],
                                           label='GPT Model behind conversation', value=STATE.model)
                    gr_max_tokens = gr.Slider(minimum=1, maximum=500, value=STATE.max_tokens,
                                              label="Max tokens", step=1)
                    gr_temperature = gr.Slider(
                        minimum=0.0, maximum=1.0, value=STATE.temperature, label="Temperature (randomness in conversation)")
    # Tab 2: edit the character YAML and create voices from it
    with gr.Tab("New Characters"):
        gr_make_voice_button = gr.Button(value="Update Characters")
        gr_voice_data = gr.Textbox(
            lines=25, label="Character YAML config", value=STATE.characters_yaml)
        gr_make_voice_output = gr.Textbox(
            lines=2, label="Character creation logs...")

    # Footer with credits and a "duplicate this space" link
    gr.HTML('''<center>
    Created by <a href="https://youtube.com/@hu-po">Hu Po</a> GitHub: <a href="https://github.com/hu-po/speech2speech">speech2speech</a>
    <br>
    Duplicate this space:<a href="https://huggingface.co/spaces/hu-po/speech2speech?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
    </center>
    ''')

    # Buttons and actions
    # A change of the microphone value is passed to step_mic
    gr_mic.change(step_mic, gr_mic, gr_convo_output)
    # Key textboxes forward their value to the key setters on every change
    openai_api_key_textbox.change(set_openai_key, openai_api_key_textbox, None)
    elevenlabs_api_key_textbox.change(
        set_elevenlabs_key, elevenlabs_api_key_textbox, None)
    # "Add to conversation" calls step_continue and shows the returned HTML
    gr_add_button.click(step_continue, None, gr_convo_output)
    gr_reset_button.click(
        reset,
        inputs=[gr_chars, gr_iam, gr_model, gr_max_tokens, gr_temperature],
        outputs=[gr_convo_output],
    )
    # The path returned by save_audio is loaded into the output audio player
    gr_saveaudio_button.click(save_audio, None, gr_outputaudio)
    # play_audio's return value is not bound to any component
    gr_playaudio_button.click(play_audio, None, None)
    gr_make_voice_button.click(
        make_voices, inputs=gr_voice_data, outputs=gr_make_voice_output,
    )

# Launch the app only when this file is run as a script
if __name__ == "__main__":
    demo.launch()