import os
from io import BytesIO
from typing import IO, Optional
import time
import uuid
from pathlib import Path
from pydub import AudioSegment
import gradio as gr
from elevenlabs import Voice, VoiceSettings, save
from elevenlabs.client import ElevenLabs


def generate_random_filename(parent: str, extension: str = "txt") -> str:
    """Build a unique file path under *parent* from a UUID and a timestamp.

    Args:
        parent (str): Directory the generated filename is joined onto.
        extension (str): File extension (without the dot). Default is 'txt'.

    Returns:
        str: ``<parent>/<uuid4>_<unix_timestamp>.<extension>``.
    """
    # uuid4 alone guarantees uniqueness; the timestamp adds rough ordering.
    random_uuid = uuid.uuid4()
    timestamp = int(time.time())
    filename = f"{random_uuid}_{timestamp}.{extension}"
    return os.path.join(parent, filename)


# TTS model; overridable via the ELEVEN_LABS_MODEL environment variable.
ELEVEN_LABS_MODEL = os.getenv("ELEVEN_LABS_MODEL", "eleven_multilingual_v2")

# Languages supported by the multilingual model (UI-facing list).
ELEVEN_LABS_LANGUAGE_SUPPORTS = [
    "English",
    "Chinese",
    "Spanish",
    "Hindi",
    "Portuguese",
    "French",
    "German",
    "Japanese",
    "Arabic",
    "Korean",
    "Indonesian",
    "Italian",
    "Dutch",
    "Turkish",
    "Polish",
    "Swedish",
    "Filipino",
    "Malay",
    "Russian",
    "Romanian",
    "Ukrainian",
    "Greek",
    "Czech",
    "Danish",
    "Finnish",
    "Bulgarian",
    "Croatian",
    "Slovak",
    "Tamil",
]


class ElevenLabsPipeline:
    """Thin wrapper around the ElevenLabs SDK: voice cloning + TTS generation."""

    def __init__(self):
        # Fail fast with a clear message when the API token is missing.
        eleven_labs_api_key = os.getenv("ELEVENLABS_TOKEN")
        if eleven_labs_api_key is None:
            # Fix: the message previously named ELEVENLABS_API_KEY, but the
            # environment variable actually read is ELEVENLABS_TOKEN.
            raise Exception("ELEVENLABS_TOKEN 환경변수를 설정해주세요.")
        self.client = ElevenLabs(
            api_key=eleven_labs_api_key,  # Defaults to ELEVEN_API_KEY
        )
        # Scratch directory for exported mp3 files.
        os.makedirs("/tmp/elevenlabs", exist_ok=True)

    def clone_voice(self, audio, name, description=None):
        """Create a cloned voice named *name* from *audio*, unless one exists.

        Args:
            audio: Path to the reference audio file passed to the clone API.
            name: Voice name to register (and to look up for reuse).
            description: Optional human-readable description.

        Returns:
            str: A (Korean) status message. Errors are returned as text rather
            than raised so the Gradio UI can display them directly.
        """
        response = self.client.voices.get_all()
        for voice in response.voices:
            if voice.name == name:
                # Already cloned — reuse it instead of creating a duplicate.
                return "존재하는 음성입니다. 음성 생성을 시작해주세요."
        try:
            self.client.clone(
                name=name,
                description=description,  # Optional
                files=[audio],
            )
            return "Voice Clone을 성공적으로 생성했습니다."
        except Exception as e:
            # Best-effort: surface the SDK error text to the UI.
            return str(e)

    def _get_voice(self, name: str):
        """Return the first registered voice whose name equals *name*, or None."""
        response = self.client.voices.get_all()
        for voice in response.voices:
            if voice.name == name:
                return voice
        return None

    def generate_voice(
        self,
        text: str,
        audio: Optional[str] = None,
        language: str = "ko",
        mute_before_ms: Optional[int] = 0,
        mute_after_ms: Optional[int] = 0,
        stability: float = 0.5,
        similarity_boost: float = 0.75,
        style: float = 0.0,
        use_speaker_boost: bool = True,
    ) -> str:
        """Synthesize *text* to speech and export it as an mp3 file.

        If *audio* is given, a voice cloned from that file (named after its
        stem) is used; otherwise the default "Laura" voice is used. Optional
        silence is prepended/appended before export.

        Args:
            text: Text to synthesize.
            audio: Optional path to a reference audio file for voice cloning.
            language: Language code forwarded to the voice settings.
            mute_before_ms: Milliseconds of leading silence.
            mute_after_ms: Milliseconds of trailing silence.
            stability / similarity_boost / style / use_speaker_boost:
                ElevenLabs VoiceSettings knobs.

        Returns:
            str: Path of the exported mp3 under /tmp/elevenlabs.

        Raises:
            RuntimeError: If no usable voice can be resolved.
        """
        if audio is not None:
            # Clone (or reuse) a voice named after the audio file's stem.
            name = Path(audio).stem
            self.clone_voice(audio, name)
        else:
            gr.Info("음성이 안주어졌습니다. 기본 음성으로 생성하겠습니다.", duration=2)
            name = "Laura"

        current_voice = self._get_voice(name)
        if current_voice is None:
            # Fix: the original retried the identical lookup (a no-op) and
            # would then crash on current_voice.voice_id. Fall back to the
            # default voice instead.
            current_voice = self._get_voice("Laura")
        if current_voice is None:
            raise RuntimeError(f"Voice '{name}' not found and no default voice available.")

        response = self.client.generate(
            text=text,
            model=ELEVEN_LABS_MODEL,
            voice=Voice(
                voice_id=current_voice.voice_id,
                settings=VoiceSettings(
                    stability=stability,
                    similarity_boost=similarity_boost,
                    style=style,
                    use_speaker_boost=use_speaker_boost,
                    # NOTE(review): `language` is not a documented VoiceSettings
                    # field in the ElevenLabs SDK — confirm it is accepted.
                    language=language,
                ),
            ),
        )

        # The SDK streams the audio; accumulate the chunks in memory.
        audio_stream = BytesIO()
        for chunk in response:
            if chunk:
                audio_stream.write(chunk)
        audio_stream.seek(0)

        # Decode the mp3 and pad with the requested leading/trailing silence.
        audio_segment = AudioSegment.from_file(audio_stream, format="mp3")
        mute_before = AudioSegment.silent(duration=mute_before_ms)
        mute_after = AudioSegment.silent(duration=mute_after_ms)
        combined_segment = mute_before + audio_segment + mute_after

        # Export to a uniquely-named temp file and hand the path to the caller.
        tmp_file = generate_random_filename("/tmp/elevenlabs", "mp3")
        combined_segment.export(tmp_file, format="mp3", bitrate="128k")
        return tmp_file