hu-po committed on
Commit
d2d6865
1 Parent(s): 924b0a9

release 0.1

app.py ADDED
@@ -0,0 +1,289 @@
+ import asyncio
+ import logging
+ import os
+ import random
+ from typing import Dict, List, Tuple
+
+ import gradio as gr
+ import yaml
+
+ from src.elevenlabs import (Speaker, check_voice_exists, get_make_voice,
+                             play_history, save_history)
+ from src.openailib import top_response, speech_to_text
+ from src.tube import extract_audio
+
+ logging.basicConfig(level=logging.INFO)
+ log = logging.getLogger(__name__)
+
+
+ def set_openai_key(openai_api_key_textbox):
+     log.info("Setting OpenAI key.")
+     os.environ["OPENAI_API_KEY"] = openai_api_key_textbox
+     import openai
+     openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+ def set_elevenlabs_key(elevenlabs_api_key_textbox):
+     log.info("Setting ElevenLabs key.")
+     os.environ["ELEVENLABS_API_KEY"] = elevenlabs_api_key_textbox
+     import elevenlabs
+     elevenlabs.api_key = os.getenv("ELEVENLABS_API_KEY")
+
+
+ class ConversationState:
+     COLORS: list = ['#FFA07A', '#F08080', '#AFEEEE', '#B0E0E6', '#DDA0DD',
+                     '#FFFFE0', '#F0E68C', '#90EE90', '#87CEFA', '#FFB6C1']
+     YAML_FILEPATH: str = os.path.join(os.path.dirname(__file__), 'voices.yaml')
+     AUDIO_SAVEDIR: str = os.path.join(
+         os.path.dirname(__file__), 'audio_export')
+
+     def __init__(self,
+                  names: list = None,
+                  iam: str = None,
+                  model: str = "gpt-3.5-turbo",
+                  max_tokens: int = 30,
+                  temperature: float = 0.5,
+                  history: list = None):
+         self.model = model
+         self.max_tokens = max_tokens
+         self.temperature = temperature
+         # Make sure the save dir exists, creating any necessary directories
+         os.makedirs(self.AUDIO_SAVEDIR, exist_ok=True)
+         self.audio_savepath = os.path.join(
+             self.AUDIO_SAVEDIR, 'conversation.wav')
+         log.info("Resetting conversation")
+         with open(self.YAML_FILEPATH, 'r') as file:
+             self.characters_yaml = file.read()
+             file.seek(0)
+             self.characters_dict = yaml.safe_load(file)
+         self.all_characters = list(self.characters_dict.keys())
+         # Sample without replacement so the same character is not picked twice
+         self.names = names or random.sample(self.all_characters, k=2)
+         self.iam = iam or random.choice(self.names)
+         assert self.iam in self.names, f"{self.iam} not in {self.names}"
+         log.info("Loading voices")
+         self.speakers: Dict[str, Speaker] = {}
+         self.speakers_descriptions: str = ''
+         for i, name in enumerate(self.names):
+             if check_voice_exists(name) is None:
+                 log.warning(f"Voice {name} does not exist")
+                 continue
+             _speaker = Speaker(
+                 name=name,
+                 voice=get_make_voice(name),
+                 color=self.COLORS[i % len(self.COLORS)],
+                 description=self.characters_dict[name].get(
+                     "description", None),
+             )
+             self.speakers[name] = _speaker
+             if _speaker.description is not None:
+                 self.speakers_descriptions += f"{_speaker.name}: {_speaker.description}.\n"
+         # The system message is fed into OpenAI to condition the prompt
+         self.system = "You create funny conversation dialogues. "
+         self.system += f"This conversation is between {', '.join(self.names)}. "
+         self.system += "Do not introduce new characters. "
+         self.system += "Descriptions for each of the characters are:\n"
+         for speaker in self.speakers.values():
+             self.system += f"{speaker.name}: {speaker.description}\n"
+         self.system += "Only return one person's response at a time. "
+         self.system += "Each response must start with the character name, then a colon, then their response in a single line. "
+         self.system += "Keep the responses short and witty. "
+         self.system += "Make sure the responses are only one sentence long. "
+         self.system += "Do not continue a previous response. Always start a new response."
+         # History is fed in at every step
+         self.step = 0
+         self.history: List[Tuple[Speaker, str]] = history or []
+
+     def add_to_history(self, text: str, speaker: Speaker = None):
+         if speaker is None:
+             speaker = self.speakers[self.iam]
+         self.history.append((speaker, text))
+
+     def history_to_prompt(self) -> str:
+         prompt: str = ''
+         for speaker, text in self.history:
+             prompt += f"{speaker.name}: {text}\n"
+         return prompt
+
+     def html_history(self) -> str:
+         history_html: str = ''
+         for speaker, text in self.history:
+             _bubble = f"<div style='background-color: {speaker.color}; border-radius: 5px; padding: 5px; margin: 5px;'>{speaker.name}: {text}</div>"
+             history_html += _bubble
+         return history_html
+
+
+ # Storing state in the global scope like this is not ideal, but
+ # perfect is the enemy of good enough given gradio's state handling
+ STATE = ConversationState()
+
+
+ def reset(names, iam, model, max_tokens, temperature):
+     # Push a fresh conversation state to the global scope
+     global STATE
+     STATE = ConversationState(
+         names=names,
+         iam=iam,
+         model=model,
+         max_tokens=max_tokens,
+         temperature=temperature,
+     )
+     return STATE.html_history()
+
+
+ def step_mic(audio):
+     global STATE
+     try:
+         request = speech_to_text(audio)
+         STATE.add_to_history(request)
+     except TypeError as e:
+         log.warning(e)
+     return STATE.html_history()
+
+
+ def step_continue():
+     global STATE
+     response = top_response(STATE.history_to_prompt(),
+                             system=STATE.system,
+                             model=STATE.model,
+                             max_tokens=STATE.max_tokens,
+                             temperature=STATE.temperature,
+                             )
+     for line in response.splitlines():
+         try:
+             # TODO: Add any filters here as assertion errors
+             if not line:
+                 continue
+             assert ":" in line, f"Line {line} does not have a colon"
+             # Split on the first colon only, the text may itself contain colons
+             name, text = line.split(":", 1)
+             assert name in STATE.speakers, f"Name {name} is not in {list(STATE.speakers)}"
+             speaker = STATE.speakers[name]
+             assert len(text) > 0, f"Text {text} is empty"
+             STATE.add_to_history(text, speaker=speaker)
+         except AssertionError as e:
+             log.warning(e)
+             continue
+     return STATE.html_history()
+
+
+ def save_audio():
+     global STATE
+     log.info("Saving audio")
+     asyncio.run(save_history(STATE.history, STATE.audio_savepath))
+     return STATE.audio_savepath
+
+
+ def play_audio():
+     global STATE
+     log.info("Playing audio")
+     asyncio.run(play_history(STATE.history))
+     return STATE.html_history()
+
+
+ def make_voices(voices_yaml: str):
+     global STATE
+     try:
+         STATE.characters_dict = yaml.safe_load(voices_yaml)
+         for name, metadata in STATE.characters_dict.items():
+             videos = metadata['references']
+             assert isinstance(name, str), f"Name {name} is not a string"
+             assert isinstance(videos, list), f"Videos {videos} is not a list"
+             if check_voice_exists(name):
+                 continue
+             audio_paths = []
+             for i, video in enumerate(videos):
+                 assert isinstance(video, dict), f"Video {video} is not a dict"
+                 assert 'url' in video, f"Video {video} does not have a url"
+                 url = video['url']
+                 start_minute = video.get('start_minute', 0)
+                 duration = video.get('duration_seconds', 120)
+                 label = os.path.join(STATE.AUDIO_SAVEDIR, f"audio.{name}.{i}")
+                 output_path = extract_audio(url, label, start_minute, duration)
+                 audio_paths.append(output_path)
+             get_make_voice(name, audio_paths)
+     except Exception as e:
+         # Surface the error in the logs textbox instead of crashing the UI
+         return f"Error: {e}"
+     return "Success"
+
+
+ # Define the main Gradio UI
+ with gr.Blocks() as demo:
+     gr.HTML('''<center><h1>Speech2Speech</h1></center>''')
+     with gr.Tab("Conversation"):
+         gr_convo_output = gr.HTML()
+         with gr.Row():
+             with gr.Column():
+                 gr_mic = gr.Audio(
+                     label="Record audio into conversation",
+                     source="microphone",
+                     type="filepath",
+                 )
+                 gr_add_button = gr.Button(value="Add to conversation")
+                 gr_playaudio_button = gr.Button(value="Play audio")
+                 gr_saveaudio_button = gr.Button(value="Export audio")
+                 gr_outputaudio = gr.Audio(
+                     label="Audio output",
+                     source="upload",
+                     type="filepath",
+                 )
+             with gr.Column():
+                 gr_iam = gr.Dropdown(
+                     choices=STATE.all_characters, label="I am", value=STATE.iam)
+                 gr_chars = gr.CheckboxGroup(
+                     STATE.all_characters, label="Characters", value=STATE.names)
+                 gr_reset_button = gr.Button(value="Reset conversation")
+                 with gr.Accordion("Settings", open=False):
+                     openai_api_key_textbox = gr.Textbox(
+                         placeholder="Paste your OpenAI API key here",
+                         show_label=False,
+                         lines=1,
+                         type="password",
+                     )
+                     elevenlabs_api_key_textbox = gr.Textbox(
+                         placeholder="Paste your ElevenLabs API key here",
+                         show_label=False,
+                         lines=1,
+                         type="password",
+                     )
+                     gr_model = gr.Dropdown(choices=["gpt-3.5-turbo", "gpt-4"],
+                                            label='GPT Model behind conversation', value=STATE.model)
+                     gr_max_tokens = gr.Slider(minimum=1, maximum=500, value=STATE.max_tokens,
+                                               label="Max tokens", step=1)
+                     gr_temperature = gr.Slider(
+                         minimum=0.0, maximum=1.0, value=STATE.temperature, label="Temperature (randomness in conversation)")
+     with gr.Tab("New Characters"):
+         gr_make_voice_button = gr.Button(value="Update Characters")
+         gr_voice_data = gr.Textbox(
+             lines=25, label="Character YAML config", value=STATE.characters_yaml)
+         gr_make_voice_output = gr.Textbox(
+             lines=2, label="Character creation logs...")
+
+     gr.HTML('''<center>
+     Created by <a href="https://youtube.com/@hu-po">Hu Po</a> GitHub: <a href="https://github.com/hu-po/speech2speech">speech2speech</a>
+     <br>
+     Duplicate this space: <a href="https://huggingface.co/spaces/hu-po/speech2speech?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+     </center>
+     ''')
+
+     # Buttons and actions
+     gr_mic.change(step_mic, gr_mic, gr_convo_output)
+     openai_api_key_textbox.change(set_openai_key, openai_api_key_textbox, None)
+     elevenlabs_api_key_textbox.change(
+         set_elevenlabs_key, elevenlabs_api_key_textbox, None)
+     gr_add_button.click(step_continue, None, gr_convo_output)
+     gr_reset_button.click(
+         reset,
+         inputs=[gr_chars, gr_iam, gr_model, gr_max_tokens, gr_temperature],
+         outputs=[gr_convo_output],
+     )
+     gr_saveaudio_button.click(save_audio, None, gr_outputaudio)
+     gr_playaudio_button.click(play_audio, None, None)
+     gr_make_voice_button.click(
+         make_voices, inputs=gr_voice_data, outputs=gr_make_voice_output,
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
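
For context on the parsing contract enforced by step_continue above: the system prompt forces each model turn into a single "Name: text" line, which is then split on the first colon. A minimal sketch with a hypothetical response string:

response = "ElonMusk: Mars memes are the future.\nLexFridman: And yet, all futures end."
for line in response.splitlines():
    name, text = line.split(":", 1)  # split once, the text may contain colons
    print(name, "->", text.strip())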
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ openai==0.27.0
+ elevenlabslib
+ sounddevice==0.4.6
+ soundfile==0.12.1
+ gradio==3.19.1
+ scipy==1.10.1
+ SpeechRecognition==3.9.0
+ pytube==12.1.2
src/__pycache__/elevenlabs.cpython-310.pyc ADDED
Binary file (4.12 kB)
src/__pycache__/elevenlabs.cpython-39.pyc ADDED
Binary file (4.11 kB)
src/__pycache__/openailib.cpython-310.pyc ADDED
Binary file (1.23 kB)
src/__pycache__/openailib.cpython-39.pyc ADDED
Binary file (1.23 kB)
src/__pycache__/tube.cpython-310.pyc ADDED
Binary file (1.82 kB)
src/__pycache__/tube.cpython-39.pyc ADDED
Binary file (1.81 kB)
src/__pycache__/utils.cpython-310.pyc ADDED
Binary file (639 Bytes)
src/__pycache__/utils.cpython-39.pyc ADDED
Binary file (637 Bytes)
src/elevenlabs.py ADDED
@@ -0,0 +1,115 @@
+ import asyncio
+ import io
+ import logging
+ import os
+ import time
+ from concurrent.futures import ThreadPoolExecutor
+ from dataclasses import dataclass
+ from typing import List, Tuple, Union
+
+ import sounddevice as sd
+ import soundfile as sf
+ from elevenlabslib import ElevenLabsUser, ElevenLabsVoice
+
+ from .utils import timeit
+
+ logging.basicConfig(level=logging.INFO)
+ log = logging.getLogger(__name__)
+
+ USER = ElevenLabsUser(os.environ["ELEVENLABS_API_KEY"])
+
+
+ @dataclass
+ class Speaker:
+     name: str
+     voice: ElevenLabsVoice
+     color: str
+     description: str = None
+
+
+ async def text_to_speechbytes_async(text, speaker, loop):
+     # Run the blocking TTS call in a thread so coroutines can overlap
+     with ThreadPoolExecutor() as executor:
+         speech_bytes = await loop.run_in_executor(
+             executor, text_to_speechbytes, text, speaker.voice)
+     return speech_bytes
+
+
+ async def play_history(history: List[Tuple[Speaker, str]]):
+     loop = asyncio.get_event_loop()
+
+     # Create a list of tasks for all text_to_speechbytes function calls
+     tasks = [text_to_speechbytes_async(
+         text, speaker, loop) for speaker, text in history]
+
+     # Generate all audio concurrently, then play the clips back in order
+     for speech_bytes in await asyncio.gather(*tasks):
+         audioFile = io.BytesIO(speech_bytes)
+         soundFile = sf.SoundFile(audioFile)
+         sd.play(soundFile.read(), samplerate=soundFile.samplerate, blocking=True)
+
+
+ async def save_history(history: List[Tuple[Speaker, str]], audio_savepath: str):
+     loop = asyncio.get_event_loop()
+
+     # Create a list of tasks for all text_to_speechbytes function calls
+     tasks = [text_to_speechbytes_async(
+         text, speaker, loop) for speaker, text in history]
+
+     # Run all tasks concurrently and wait for every one to complete
+     all_speech_bytes = await asyncio.gather(*tasks)
+
+     # Combine all audio bytes into a single audio file
+     concatenated_audio = io.BytesIO(b''.join(all_speech_bytes))
+
+     # Save the combined audio file to disk
+     with sf.SoundFile(concatenated_audio, mode='r') as soundFile:
+         with sf.SoundFile(
+             audio_savepath, mode='w',
+             samplerate=soundFile.samplerate,
+             channels=soundFile.channels,
+         ) as outputFile:
+             outputFile.write(soundFile.read())
+
+
+ def check_voice_exists(voice: Union[ElevenLabsVoice, str]) -> Union[ElevenLabsVoice, None]:
+     log.info(f"Getting voice {voice}...")
+     _available_voices = USER.get_voices_by_name(voice)
+     if _available_voices:
+         log.info(f"Voice {voice} already exists, found {_available_voices}.")
+         return _available_voices[0]
+     return None
+
+
+ @timeit
+ def get_make_voice(voice: Union[ElevenLabsVoice, str], audio_path: List[str] = None) -> ElevenLabsVoice:
+     _voice = check_voice_exists(voice)
+     if _voice is not None:
+         return _voice
+     if USER.get_voice_clone_available():
+         assert audio_path is not None, "audio_path must be provided"
+         assert isinstance(audio_path, list), "audio_path must be a list"
+         log.info(f"Cloning voice {voice}...")
+         _audio_source_dict = {
+             # Each audio path is a PosixPath
+             _.name: open(_, "rb").read() for _ in audio_path
+         }
+         newVoice = USER.clone_voice_bytes(voice, _audio_source_dict)
+         return newVoice
+     raise ValueError(
+         f"Voice {voice} does not exist and cloning is not available.")
+
+
+ @timeit
+ def text_to_speech(text: str, voice: ElevenLabsVoice):
+     log.info(f"Generating audio using voice {voice}...")
+     time_start = time.time()
+     voice.generate_and_play_audio(text, playInBackground=False)
+     duration = time.time() - time_start
+     return duration
+
+
+ @timeit
+ def text_to_speechbytes(text: str, voice: ElevenLabsVoice):
+     log.info(f"Generating audio for voice {voice} text {text}...")
+     audio_bytes = voice.generate_audio_bytes(text)
+     return audio_bytes
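
A rough usage sketch for the helpers above (the character name and line are made up, and ELEVENLABS_API_KEY must be set before import):

import asyncio
from src.elevenlabs import Speaker, get_make_voice, play_history

voice = get_make_voice("ElonMusk")  # returns the existing voice, no cloning needed
speaker = Speaker(name="ElonMusk", voice=voice, color="#AFEEEE")
asyncio.run(play_history([(speaker, "Mars, here we come.")]))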
src/openailib.py ADDED
@@ -0,0 +1,47 @@
+ import logging
+ import os
+
+ import openai
+
+ from .utils import timeit
+
+ openai.api_key = os.getenv("OPENAI_API_KEY")
+
+ logging.basicConfig(level=logging.INFO)
+ log = logging.getLogger(__name__)
+
+
+ @timeit
+ def speech_to_text(audio_path):
+     log.info("Transcribing audio...")
+     # Use a context manager so the audio file handle is closed after the call
+     with open(audio_path, "rb") as audio_file:
+         transcript = openai.Audio.transcribe("whisper-1", audio_file)
+     text = transcript["text"]
+     log.info(f"Transcript: \n\t{text}")
+     return text
+
+
+ @timeit
+ def top_response(prompt, system=None, model="gpt-3.5-turbo", max_tokens=20, temperature=0.8):
+     _prompt = [
+         {
+             "role": "user",
+             "content": prompt,
+         },
+     ]
+     if system:
+         _prompt = [
+             {
+                 "role": "system",
+                 "content": system,
+             },
+         ] + _prompt
+     log.info(f"API call to {model} with prompt: \n\n\t{_prompt}\n\n")
+     _response = openai.ChatCompletion.create(
+         model=model,
+         messages=_prompt,
+         temperature=temperature,
+         n=1,
+         max_tokens=max_tokens,
+     )
+     log.info(f"API response: \n\t{_response}")
+     response: str = _response['choices'][0]['message']['content']
+     return response
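
A minimal sketch of calling top_response with a system message (the prompt text here is illustrative):

from src.openailib import top_response

reply = top_response(
    "JoeBiden: Back in my day we had rotary phones.\n",
    system="You create funny conversation dialogues. Only return one person's response at a time.",
    max_tokens=30,
)
print(reply)  # expected shape: "DonaldTrump: <one witty sentence>"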
src/tube.py ADDED
@@ -0,0 +1,64 @@
+ '''
+ Extract audio from a YouTube video
+
+ Usage:
+     tube.py <url> <person> [-s <start_time>] [-d <duration>]
+ '''
+
+ import argparse
+ import datetime
+ import os
+ import subprocess
+ from pathlib import Path
+
+ from pytube import YouTube
+
+ # Define argparse arguments
+ parser = argparse.ArgumentParser(description='Extract audio from a YouTube video')
+ parser.add_argument('url', type=str, help='the YouTube video URL')
+ parser.add_argument('person', type=str, help='the name of the person speaking')
+ parser.add_argument('-s', '--start-time', type=float, default=0,
+                     help='the start time in minutes for the extracted audio (default: 0)')
+ parser.add_argument('-d', '--duration', type=int, default=200,
+                     help='the duration in seconds for the extracted audio (default: 200)')
+
+
+ # 200 seconds seems to be the max duration for single clips
+ def extract_audio(url: str, label: str, start_minute: float = 0, duration: int = 200):
+
+     # Download the YouTube video
+     youtube_object = YouTube(url)
+     stream = youtube_object.streams.first()
+     video_path = Path(stream.download(skip_existing=True))
+
+     # Convert the start time to seconds
+     start_time_seconds = int(start_minute * 60)
+
+     # Format the start time in HH:MM:SS.mmm format
+     start_time_formatted = str(datetime.timedelta(seconds=start_time_seconds))
+     start_time_formatted = start_time_formatted[:11] + start_time_formatted[12:]
+
+     # Set the output path using the audio file name
+     output_path = video_path.parent / f"{label}.wav"
+
+     # Run ffmpeg to extract the audio
+     cmd = ['ffmpeg', '-y', '-i', str(video_path), '-ss', start_time_formatted]
+     if duration is not None:
+         # Format the duration in HH:MM:SS.mmm format
+         duration_formatted = str(datetime.timedelta(seconds=duration))
+         duration_formatted = duration_formatted[:11] + duration_formatted[12:]
+         cmd += ['-t', duration_formatted]
+     cmd += ['-q:a', '0', '-map', 'a', str(output_path)]
+     subprocess.run(cmd, check=True)
+
+     # Remove the extra .3gpp file that is created
+     for file in os.listdir(video_path.parent):
+         if file.endswith(".3gpp"):
+             os.remove(os.path.join(video_path.parent, file))
+
+     return output_path
+
+
+ if __name__ == '__main__':
+
+     # Parse the arguments
+     args = parser.parse_args()
+
+     # Extract the audio
+     extract_audio(args.url, args.person, args.start_time, args.duration)
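
A quick sketch of calling extract_audio directly from Python (the URL and label are placeholders):

from src.tube import extract_audio

wav_path = extract_audio(
    url="https://youtu.be/uhjVn-J_cVs",
    label="audio.JoeBiden.0",  # output lands next to the download as <label>.wav
    start_minute=1.5,
    duration=60,  # in seconds; single clips longer than ~200s may fail
)
print(wav_path)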
src/utils.py ADDED
@@ -0,0 +1,16 @@
+ import functools
+ import logging
+ import time
+
+ log = logging.getLogger(__name__)
+
+
+ # Decorator to time a function and log its duration
+ def timeit(func):
+     @functools.wraps(func)
+     def timed(*args, **kwargs):
+         time_start = time.time()
+         result = func(*args, **kwargs)
+         _yellow = "\x1b[33;20m"
+         _reset = "\x1b[0m"
+         _msg = f"{_yellow}{func.__name__} duration: {time.time() - time_start:.2f} seconds{_reset}"
+         log.info(_msg)
+         return result
+     return timed
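
For reference, a tiny sketch of how the timeit decorator above is applied (the decorated function is hypothetical):

import time
from src.utils import timeit

@timeit
def slow_add(a, b):
    time.sleep(0.1)  # stand-in for real work
    return a + b

slow_add(1, 2)  # logs: "slow_add duration: 0.10 seconds" in yellow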
voices.yaml ADDED
@@ -0,0 +1,30 @@
+ JoeBiden:
+   description: "Stumbles and stutters over words, uses old-timey phrases."
+   references:
+     - url: "https://youtu.be/uhjVn-J_cVs"
+       start_minute: 0
+       duration_seconds: 120
+ DonaldTrump:
+   description: "Bombastic, quarrelsome narcissist who talks up his ideas."
+   references:
+     - url: "https://youtu.be/f0UB06v7yLY"
+       start_minute: 0
+       duration_seconds: 120
+ ElonMusk:
+   description: "Visionary entrepreneur who loves low-quality memes."
+   references:
+     - url: "https://youtu.be/DxREm3s1scA"
+       start_minute: 1.7
+       duration_seconds: 27
+     - url: "https://youtu.be/DxREm3s1scA"
+       start_minute: 18.5
+       duration_seconds: 60
+ LexFridman:
+   description: "Depressing and lonely thinker, makes references to classic literature."
+   references:
+     - url: "https://youtu.be/DxREm3s1scA"
+       start_minute: 1
+       duration_seconds: 30
+     - url: "https://youtu.be/DxREm3s1scA"
+       start_minute: 41.4
+       duration_seconds: 30