Spaces:
Running
on
Zero
Running
on
Zero
File size: 4,982 Bytes
d98c79a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
import os
from io import BytesIO
from typing import IO, Optional
import time
import uuid
from pathlib import Path
from pydub import AudioSegment
import gradio as gr
from elevenlabs import Voice, VoiceSettings, save
from elevenlabs.client import ElevenLabs
def generate_random_filename(parent, extension="txt"):
    """
    Build a unique file path inside ``parent`` from a UUID and a timestamp.

    Args:
        parent (str): Directory the generated filename is joined onto. It is
            not created or checked here.
        extension (str): File extension for the generated filename, with or
            without a leading dot. Default is 'txt'.

    Returns:
        str: ``<parent>/<uuid4>_<unix-seconds>.<extension>``.
    """
    # Accept both "mp3" and ".mp3" so callers never end up with "..mp3".
    suffix = extension.lstrip(".")
    # The UUID alone makes the name unique; the timestamp records when the
    # file was requested.
    filename = f"{uuid.uuid4()}_{int(time.time())}.{suffix}"
    return os.path.join(parent, filename)
# Model identifier used for synthesis; the ELEVEN_LABS_MODEL environment
# variable overrides the "eleven_multilingual_v2" default.
ELEVEN_LABS_MODEL = os.environ.get("ELEVEN_LABS_MODEL", "eleven_multilingual_v2")

# Language names, in their original order.
# NOTE(review): presumably the languages the multilingual model supports —
# confirm against the provider's current list.
ELEVEN_LABS_LANGUAGE_SUPPORTS = [
    "English", "Chinese", "Spanish", "Hindi", "Portuguese",
    "French", "German", "Japanese", "Arabic", "Korean",
    "Indonesian", "Italian", "Dutch", "Turkish", "Polish",
    "Swedish", "Filipino", "Malay", "Russian", "Romanian",
    "Ukrainian", "Greek", "Czech", "Danish", "Finnish",
    "Bulgarian", "Croatian", "Slovak", "Tamil",
]
class ElevenLabsPipeline:
def __init__(self):
eleven_labs_api_key = os.getenv("ELEVENLABS_API_KEY", "sk_f4f7d77bc8065b15824cf52ea46c7d99e0e5db2a0f93b673")
if eleven_labs_api_key is None:
raise Exception("ELEVENLABS_API_KEY ํ๊ฒฝ๋ณ์๋ฅผ ์ค์ ํด์ฃผ์ธ์.")
self.client = ElevenLabs(
api_key=eleven_labs_api_key, # Defaults to ELEVEN_API_KEY
)
os.makedirs("./tmp", exist_ok=True)
def clone_voice(self, audio, name, description=None):
response = self.client.voices.get_all()
for voice in response.voices:
if voice.name == name:
return "์กด์ฌํ๋ ์์ฑ์
๋๋ค. ์์ฑ ์์ฑ์ ์์ํด์ฃผ์ธ์."
try:
voice = self.client.clone(
name=name,
description=description, # Optional
files=[audio],
)
return "Voice Clone์ ์ฑ๊ณต์ ์ผ๋ก ์์ฑํ์ต๋๋ค."
except Exception as e:
return str(e)
def _get_voice(self, name: str):
response = self.client.voices.get_all()
current_voice = None
for voice in response.voices:
if voice.name == name:
current_voice = voice
break
return current_voice
def generate_voice(
self,
text: str,
audio: str = None,
language: str = "ko",
mute_before_ms: Optional[int] = 0,
mute_after_ms: Optional[int] = 0,
stability: float = 0.5,
similarity_boost: float = 0.75,
style: float = 0.0,
use_speaker_boost=True,
) -> str:
if audio is not None:
name = Path(audio).stem
self.clone_voice(audio, name)
else:
gr.Info("์์ฑ์ด ์์ฃผ์ด์ก์ต๋๋ค. ๊ธฐ๋ณธ ์์ฑ์ผ๋ก ์์ฑํ๊ฒ ์ต๋๋ค.", duration=2)
name = "Laura"
current_voice = self._get_voice(name)
if current_voice is None:
current_voice = self._get_voice(name)
response = self.client.generate(
text=text,
model=ELEVEN_LABS_MODEL,
voice=Voice(
voice_id=current_voice.voice_id,
settings=VoiceSettings(
stability=stability,
similarity_boost=similarity_boost,
style=style,
use_speaker_boost=use_speaker_boost,
language=language,
),
),
)
# Create a BytesIO object to hold the audio data in memory
audio_stream = BytesIO()
# Write each chunk of audio data to the stream
for chunk in response:
if chunk:
audio_stream.write(chunk)
# Reset stream position to the beginning
audio_stream.seek(0)
# Load the audio stream into an AudioSegment
audio_segment = AudioSegment.from_file(audio_stream, format="mp3")
# Create silent segments for before and after
mute_before = AudioSegment.silent(duration=mute_before_ms)
mute_after = AudioSegment.silent(duration=mute_after_ms)
# Concatenate the segments
combined_segment = mute_before + audio_segment + mute_after
tmp_file = generate_random_filename("./tmp", "mp3")
# Export the combined audio to the specified file
combined_segment.export(tmp_file, format="mp3", bitrate="128k")
return tmp_file |