Spaces:

hu-po
/

speech2speech

Build error

App Files Files Community

hu-po committed on Mar 17, 2023

Commit

00ed2b4

•

1 Parent(s): d03c703

release 0.2

Browse files

Files changed (5) hide show

src/elevenlabs.py +1 -0
src/src/elevenlabs.py +0 -128
src/src/openailib.py +0 -52
src/src/tube.py +0 -64
src/src/utils.py +0 -16

src/elevenlabs.py CHANGED Viewed

@@ -19,6 +19,7 @@ log = logging.getLogger(__name__)
 try:
     USER = ElevenLabsUser(os.environ["ELEVENLABS_API_KEY"])
 except KeyError as e:
     log.warning("ELEVENLABS_API_KEY not found in environment variables.")
     pass

 try:
     USER = ElevenLabsUser(os.environ["ELEVENLABS_API_KEY"])
 except KeyError as e:
+    USER = None
     log.warning("ELEVENLABS_API_KEY not found in environment variables.")
     pass

src/src/elevenlabs.py DELETED Viewed

@@ -1,128 +0,0 @@
-import asyncio
-import io
-import logging
-import os
-import time
-from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass
-from typing import List, Union, Tuple
-import sounddevice as sd
-import soundfile as sf
-from elevenlabslib import ElevenLabsUser, ElevenLabsVoice
-from .utils import timeit
-logging.basicConfig(level=logging.INFO)
-log = logging.getLogger(__name__)
-try:
-    USER = ElevenLabsUser(os.environ["ELEVENLABS_API_KEY"])
-except KeyError as e:
-    USER = None
-    log.warning("ELEVENLABS_API_KEY not found in environment variables.")
-    pass
-@dataclass
-class Speaker:
-    name: str
-    voice: ElevenLabsVoice
-    color: str
-    description: str = None
-async def text_to_speechbytes_async(text, speaker, loop):
-    with ThreadPoolExecutor() as executor:
-        speech_bytes = await loop.run_in_executor(executor, text_to_speechbytes, text, speaker.voice)
-    return speech_bytes
-async def play_history(history: List[Tuple[Speaker, str]]):
-    loop = asyncio.get_event_loop()
-    # Create a list of tasks for all text_to_speechbytes function calls
-    tasks = [text_to_speechbytes_async(
-        text, speaker, loop) for speaker, text in history]
-    # Run tasks concurrently and wait for all of them to complete (results keep input order)
-    for speech_bytes in await asyncio.gather(*tasks):
-        audioFile = io.BytesIO(speech_bytes)
-        soundFile = sf.SoundFile(audioFile)
-        sd.play(soundFile.read(), samplerate=soundFile.samplerate, blocking=True)
-async def save_history(history: List[Tuple[Speaker, str]], audio_savepath: str):
-    loop = asyncio.get_event_loop()
-    # Create a list of tasks for all text_to_speechbytes function calls
-    tasks = [text_to_speechbytes_async(
-        text, speaker, loop) for speaker, text in history]
-    # Run tasks concurrently and wait for all of them to complete (results keep input order)
-    all_speech_bytes = await asyncio.gather(*tasks)
-    # Combine all audio bytes into a single audio file
-    concatenated_audio = io.BytesIO(b''.join(all_speech_bytes))
-    # Save the combined audio file to disk
-    with sf.SoundFile(concatenated_audio, mode='r') as soundFile:
-        with sf.SoundFile(
-            audio_savepath, mode='w',
-            samplerate=soundFile.samplerate,
-            channels=soundFile.channels,
-        ) as outputFile:
-            outputFile.write(soundFile.read())
-def check_voice_exists(voice: Union[ElevenLabsVoice, str]) -> Union[ElevenLabsVoice, None]:
-    if USER is None:
-        log.warning(
-            "No ElevenLabsUser found, have you set the ELEVENLABS_API_KEY environment variable?")
-        return None
-    log.info(f"Getting voice {voice}...")
-    _available_voices = USER.get_voices_by_name(voice)
-    if _available_voices:
-        log.info(f"Voice {voice} already exists, found {_available_voices}.")
-        return _available_voices[0]
-    return None
-@timeit
-def get_make_voice(voice: Union[ElevenLabsVoice, str], audio_path: List[str] = None) -> ElevenLabsVoice:
-    if USER is None:
-        log.warning(
-            "No ElevenLabsUser found, have you set the ELEVENLABS_API_KEY environment variable?")
-        return None
-    _voice = check_voice_exists(voice)
-    if _voice is not None:
-        return _voice
-    else:
-        if USER.get_voice_clone_available():
-            assert audio_path is not None, "audio_path must be provided"
-            assert isinstance(audio_path, list), "audio_path must be a list"
-            log.info(f"Cloning voice {voice}...")
-            _audio_source_dict = {
-                # Audio path is a PosixPath
-                _.name: open(_, "rb").read() for _ in audio_path
-            }
-            newVoice = USER.clone_voice_bytes(voice, _audio_source_dict)
-            return newVoice
-    raise ValueError(
-        f"Voice {voice} does not exist and cloning is not available.")
-@timeit
-def text_to_speech(text: str, voice: ElevenLabsVoice):
-    log.info(f"Generating audio using voice {voice}...")
-    time_start = time.time()
-    voice.generate_and_play_audio(text, playInBackground=False)
-    duration = time.time() - time_start
-    return duration
-@timeit
-def text_to_speechbytes(text: str, voice: ElevenLabsVoice):
-    log.info(f"Generating audio for voice {voice} text {text}...")
-    audio_bytes = voice.generate_audio_bytes(text)
-    return audio_bytes

src/src/openailib.py DELETED Viewed

@@ -1,52 +0,0 @@
-import logging
-import os
-from .utils import timeit
-import openai
-logging.basicConfig(level=logging.INFO)
-log = logging.getLogger(__name__)
-try:
-    openai.api_key = os.getenv("OPENAI_API_KEY")
-except KeyError as e:
-    log.warning("OPENAI_API_KEY not found in environment variables.")
-    pass
-@timeit
-def speech_to_text(audio_path):
-    log.info("Transcribing audio...")
-    transcript = openai.Audio.transcribe("whisper-1", open(audio_path, "rb"))
-    text = transcript["text"]
-    log.info(f"Transcript: \n\t{text}")
-    return text
-@timeit
-def top_response(prompt, system=None, model="gpt-3.5-turbo", max_tokens=20, temperature=0.8):
-    _prompt = [
-        {
-            "role": "user",
-            "content": prompt,
-        },
-    ]
-    if system:
-        _prompt = [
-            {
-                "role": "system",
-                "content": system,
-            },
-        ] + _prompt
-    log.info(f"API call to {model} with prompt: \n\n\t{_prompt}\n\n")
-    _response = openai.ChatCompletion.create(
-        model=model,
-        messages=_prompt,
-        temperature=temperature,
-        n=1,
-        max_tokens=max_tokens,
-    )
-    log.info(f"API reponse: \n\t{_response}")
-    response: str = _response['choices'][0]['message']['content']
-    return response

src/src/tube.py DELETED Viewed

@@ -1,64 +0,0 @@
-'''
-Extract audio from a YouTube video
-Usage:
-    tube.py <url> <person> [-s <start_time>] [-d <duration>]
-'''
-import subprocess
-from pathlib import Path
-import datetime
-import argparse
-import os
-from pytube import YouTube
-# Define argparse arguments
-parser = argparse.ArgumentParser(description='Extract audio from a YouTube video')
-parser.add_argument('url', type=str, help='the YouTube video URL')
-parser.add_argument('person', type=str, help='the name of the person speaking')
-parser.add_argument('-s', '--start-time', type=float, default=0, help='the start time in minutes for the extracted audio (default: 0)')
-parser.add_argument('-d', '--duration', type=int, help='the duration in seconds for the extracted audio (default: 60)')
-# 200 seconds seems to be max duration for single clips
-def extract_audio(url: str, label: str, start_minute: float = 0, duration: int = 200):
-    # Download the YouTube video
-    youtube_object = YouTube(url)
-    stream = youtube_object.streams.first()
-    video_path = Path(stream.download(skip_existing=True))
-    # Convert start time to seconds
-    start_time_seconds = int(start_minute * 60)
-    # Format the start time in HH:MM:SS.mmm format
-    start_time_formatted = str(datetime.timedelta(seconds=start_time_seconds))
-    start_time_formatted = start_time_formatted[:11] + start_time_formatted[12:]
-    # Set the output path using the audio file name
-    output_path = video_path.parent / f"{label}.wav"
-    # Run ffmpeg to extract the audio
-    cmd = ['ffmpeg', '-y', '-i', str(video_path), '-ss', start_time_formatted]
-    if duration is not None:
-        # Format the duration in HH:MM:SS.mmm format
-        duration_formatted = str(datetime.timedelta(seconds=duration))
-        duration_formatted = duration_formatted[:11] + duration_formatted[12:]
-        cmd += ['-t', duration_formatted]
-    cmd += ['-q:a', '0', '-map', 'a', str(output_path)]
-    subprocess.run(cmd)
-    # remove the extra .3gpp file that is created:
-    for file in os.listdir(video_path.parent):
-        if file.endswith(".3gpp"):
-            os.remove(os.path.join(video_path.parent, file))
-    return output_path
-if __name__ == '__main__':
-    # Parse the arguments
-    args = parser.parse_args()
-    # Extract the audio
-    extract_audio(args.url, args.person, args.start_time, args.duration)

src/src/utils.py DELETED Viewed

@@ -1,16 +0,0 @@
-import time
-import logging
-log = logging.getLogger(__name__)
-# Decorator to time a function
-def timeit(func):
-    def timed(*args, **kwargs):
-        time_start = time.time()
-        result = func(*args, **kwargs)
-        _yellow = "\x1b[33;20m"
-        _reset = "\x1b[0m"
-        _msg = f"{_yellow}{func.__name__} duration: {time.time() - time_start:.2f} seconds{_reset}"
-        log.info(_msg)
-        return result
-    return timed