mantrakp committed on
Commit • acde4c3
1 Parent(s): 84df228
Refactor app.py to add audio tab and update gradio UI
- app.py +5 -2
- config.py +4 -0
- requirements.txt +2 -0
- tabs/audios/events.py +65 -0
- tabs/audios/load_models.py +17 -0
- tabs/audios/ui.py +49 -0
- tabs/images/load_models.py +1 -0
app.py
CHANGED
@@ -2,6 +2,8 @@ import gradio as gr
 
 from config import css
 from tabs.images.ui import image_tab
+from tabs.audios.ui import audio_tab
+
 
 with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
     # Header
@@ -16,13 +18,14 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
     with gr.Tabs():
         with gr.Tab(label="🖼️ Image"):
             image_tab()
-
-
+        with gr.Tab(label="🎵 Audio"):
+            audio_tab()
         # with gr.Tab(label="🎥 Video"):
         #     video_tab()
         # with gr.Tab(label="📝 Text"):
         #     text_tab()
 
+
 demo.launch(
     share=False,
     debug=True,
config.py
CHANGED
@@ -73,3 +73,7 @@ class Config:
         "compute_type": torch.bfloat16,
     }
 ]
+
+
+# Audios
+AUDIOS_MODELS = [{"repo_id": "fal/AuraSR-v2"}]
requirements.txt
CHANGED
@@ -22,3 +22,5 @@ git+https://github.com/mantrakp04/BasicSR-fix.git
 git+https://github.com/TencentARC/GFPGAN.git
 git+https://github.com/xinntao/Real-ESRGAN.git
 aura_sr
+deepfilternet
+styletts2
tabs/audios/events.py
ADDED
@@ -0,0 +1,65 @@
+import os
+import gc
+import tempfile
+from uuid import uuid4
+
+import spaces
+import gradio as gr
+import numpy as np
+from df.enhance import enhance, load_audio, save_audio
+
+from config import Config
+from .load_models import *
+
+
+# Helper functions
+def create_temp_file():
+    return tempfile.NamedTemporaryFile(delete=False)
+
+
+@spaces.GPU(duration=10)
+def clear_audio(audio: np.ndarray):
+    # Save the audio file
+    audio_file = create_temp_file()
+    np.save(audio_file.name, audio)
+
+    # Load the audio file
+    audio, _ = load_audio(audio_file.name, sr=df_state.sr())
+    enhanced = enhance(df_model, df_state, audio)
+
+    # Save the enhanced audio file
+    save_audio(audio_file.name, enhanced, df_state.sr())
+
+    return gr.update(  # speaker_audio, output_audio
+        value=audio_file.name,
+    )
+
+
+@spaces.GPU(duration=20)
+def gen_audio(
+    text,
+    language,
+    speaker_audio: np.ndarray,
+    tts_alpha,
+    tts_beta,
+    tts_diffusion_steps,
+    tts_embedding_scale,
+):
+    # Save the speaker audio file
+    speaker_audio_file = create_temp_file()
+    np.save(speaker_audio_file.name, speaker_audio)
+
+    # Generate the audio
+    output = styletts2_model.inference(
+        text=text,
+        target_voice_path=speaker_audio_file.name,
+        output_wav_file=create_temp_file().name,
+        alpha=float(tts_alpha),
+        beta=float(tts_beta),
+        diffusion_steps=int(tts_diffusion_steps),
+        embedding_scale=int(tts_embedding_scale),
+    )
+
+    return gr.update(  # output_audio
+        value=output,
+    )
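Note on the temp-file round-trip in clear_audio: np.save writes NumPy's .npy container, while df.enhance.load_audio expects a real, decodable audio file, and gr.Audio(type='numpy') hands the callback a (sample_rate, data) tuple rather than a bare ndarray. A minimal sketch of the same denoise step written around those two details, assuming soundfile is available (clear_audio_sketch is an illustrative name, not part of this commit):

import tempfile

import soundfile as sf
from df.enhance import enhance, load_audio, save_audio


def clear_audio_sketch(audio, df_model, df_state) -> str:
    """Hypothetical variant: write a real WAV, then run DeepFilterNet on it."""
    sample_rate, data = audio  # Gradio numpy audio: (sample_rate, samples)
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    sf.write(tmp.name, data, sample_rate)  # a decodable WAV, unlike np.save's .npy

    # Resample to the model's expected rate and enhance
    wav, _ = load_audio(tmp.name, sr=df_state.sr())
    enhanced = enhance(df_model, df_state, wav)
    save_audio(tmp.name, enhanced, df_state.sr())
    return tmp.name  # usable as a gr.update(value=...) payload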
tabs/audios/load_models.py
ADDED
@@ -0,0 +1,17 @@
+import torch
+from df.enhance import init_df
+from styletts2 import tts
+
+from config import Config
+
+
+def init_sys():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    df_model, df_state, _ = init_df()
+
+    styletts2_model = tts.StyleTTS2()
+
+    return device, df_model, df_state, styletts2_model
+
+device, df_model, df_state, styletts2_model = init_sys()
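This module runs init_sys() at import time, so the wildcard import in events.py loads DeepFilterNet and StyleTTS2 once per process as a side effect of the first import. A hedged sketch of the same initialization deferred until first use, should import-time loading ever become a startup cost (get_models is an illustrative name, not from this commit):

from functools import lru_cache

import torch
from df.enhance import init_df
from styletts2 import tts


@lru_cache(maxsize=1)
def get_models():
    # Load both models on the first call only; lru_cache returns the
    # same tuple on every subsequent call.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    df_model, df_state, _ = init_df()
    styletts2_model = tts.StyleTTS2()
    return device, df_model, df_state, styletts2_model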
tabs/audios/ui.py
ADDED
@@ -0,0 +1,49 @@
+import gradio as gr
+
+from config import Config
+from .events import *
+
+
+def audio_tab():
+    with gr.Row():
+        with gr.Column():
+            with gr.Group():
+                with gr.Group():
+                    text = gr.Textbox(lines=5, label="Enter text")
+                    language = gr.Dropdown(
+                        label="Language",
+                        choices=["en"],
+                        value="en",
+                    )
+
+                with gr.Accordion('Voice Clone', open=True):
+                    speaker_audio = gr.Audio(label="Upload Audio", type='numpy')
+                    clear_speaker_audio = gr.Button(label="Clear Audio")
+
+        with gr.Column():
+            output_audio = gr.Audio(label="Output Audio", interactive=False, show_download_button=True)
+            clear_output_audio = gr.Button(label="Clear Audio")
+            generate_audio = gr.Button(label="Generate Audio")
+
+            with gr.Accordion('Advance Settings', open=True):
+                settings = [
+                    ('Alpha', 'tts_alpha', 'float', 0.0, 1.0, 0.3, 0.1,),
+                    ('Beta', 'tts_beta', 'float', 0.0, 1.0, 0.7, 0.1,),
+                    ('Diffusion Steps', 'tts_diffusion_steps', 'int', 1, 100, 10, 1,),
+                    ('Embedding Scale', 'tts_embedding_scale', 'int', 0, 10, 1, 1,),
+                ]
+                for label, key, type_, min_, max_, value, step in settings:
+                    globals()[key] = gr.Slider(label=label, minimum=min_, maximum=max_, value=value, step=step)
+
+
+    # Events
+    # Clear Audio
+    clear_speaker_audio.click(clear_audio, speaker_audio, speaker_audio)
+    clear_output_audio.click(clear_audio, output_audio, output_audio)
+
+    # Generate Audio
+    generate_audio.click(
+        gen_audio,
+        [text, language, speaker_audio, tts_alpha, tts_beta, tts_diffusion_steps, tts_embedding_scale],  # type: ignore
+        [output_audio]
+    )
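The settings loop above stores each slider through globals(), which is why the generate_audio.click inputs carry a # type: ignore: names like tts_alpha are not locals and get resolved from module globals at runtime. A sketch of the same loop collecting the components in a dict instead, dropping the unused type field (run inside the same Accordion context; sliders is an illustrative name, not from this commit):

settings = [
    ('Alpha', 'tts_alpha', 0.0, 1.0, 0.3, 0.1),
    ('Beta', 'tts_beta', 0.0, 1.0, 0.7, 0.1),
    ('Diffusion Steps', 'tts_diffusion_steps', 1, 100, 10, 1),
    ('Embedding Scale', 'tts_embedding_scale', 0, 10, 1, 1),
]
# Keep the components addressable without mutating module globals
sliders = {
    key: gr.Slider(label=label, minimum=min_, maximum=max_, value=value, step=step)
    for label, key, min_, max_, value, step in settings
}
# .click inputs can then reference sliders['tts_alpha'] etc., no type: ignore needed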
tabs/images/load_models.py
CHANGED
@@ -10,6 +10,7 @@ from diffusers.schedulers import *
 
 from config import Config
 
+
 def init_sys():
     device = "cuda" if torch.cuda.is_available() else "cpu"
 