Update app.py
Browse files
app.py
CHANGED
@@ -8,7 +8,7 @@ import tempfile
|
|
8 |
import numpy as np
|
9 |
import os
|
10 |
|
11 |
-
# Set MPS
|
12 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
13 |
|
14 |
# Title and Description
|
@@ -22,7 +22,7 @@ Only the first 10 seconds of the audio will be processed.
|
|
22 |
# Upload audio file
|
23 |
uploaded_file = st.file_uploader("Upload an audio file (WAV format)", type=["wav"])
|
24 |
|
25 |
-
# Model Parameters
|
26 |
st.sidebar.title("Model Parameters")
|
27 |
model_name = st.sidebar.selectbox("Select Model", ["basic", "speech"], index=0)
|
28 |
ddim_steps = st.sidebar.slider("DDIM Steps", min_value=10, max_value=100, value=50)
|
@@ -30,18 +30,27 @@ guidance_scale = st.sidebar.slider("Guidance Scale", min_value=1.0, max_value=10
|
|
30 |
random_seed = st.sidebar.number_input("Random Seed", min_value=0, value=42, step=1)
|
31 |
latent_t_per_second = 12.8
|
32 |
|
33 |
-
# Helper function
|
34 |
def plot_spectrogram(waveform, sample_rate, title):
|
35 |
plt.figure(figsize=(10, 4))
|
36 |
-
spectrogram =
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
plt.colorbar(format="%+2.0f dB")
|
42 |
plt.title(title)
|
43 |
-
plt.xlabel("Time")
|
44 |
-
plt.ylabel("Frequency")
|
45 |
plt.tight_layout()
|
46 |
st.pyplot(plt)
|
47 |
|
@@ -49,7 +58,6 @@ def plot_spectrogram(waveform, sample_rate, title):
|
|
49 |
if uploaded_file and st.button("Enhance Audio"):
|
50 |
st.write("Processing audio...")
|
51 |
|
52 |
-
# Create temp directory for saving files
|
53 |
with tempfile.TemporaryDirectory() as temp_dir:
|
54 |
input_path = os.path.join(temp_dir, "input.wav")
|
55 |
truncated_path = os.path.join(temp_dir, "truncated.wav")
|
@@ -59,7 +67,7 @@ if uploaded_file and st.button("Enhance Audio"):
|
|
59 |
with open(input_path, "wb") as f:
|
60 |
f.write(uploaded_file.read())
|
61 |
|
62 |
-
# Load
|
63 |
waveform, sample_rate = torchaudio.load(input_path)
|
64 |
max_samples = sample_rate * 10 # First 10 seconds
|
65 |
if waveform.size(1) > max_samples:
|
@@ -85,11 +93,12 @@ if uploaded_file and st.button("Enhance Audio"):
|
|
85 |
)
|
86 |
|
87 |
# Save enhanced audio
|
88 |
-
|
|
|
89 |
|
90 |
-
# Plot
|
91 |
st.write("Enhanced Audio Spectrogram:")
|
92 |
-
plot_spectrogram(
|
93 |
|
94 |
# Display audio players and download link
|
95 |
st.audio(truncated_path, format="audio/wav")
|
|
|
8 |
import numpy as np
|
9 |
import os
|
10 |
|
11 |
+
# Set device (MPS for Mac, CUDA for other GPUs, otherwise CPU)
|
12 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
13 |
|
14 |
# Title and Description
|
|
|
22 |
# Upload audio file
|
23 |
uploaded_file = st.file_uploader("Upload an audio file (WAV format)", type=["wav"])
|
24 |
|
25 |
+
# Sidebar: Model Parameters
|
26 |
st.sidebar.title("Model Parameters")
|
27 |
model_name = st.sidebar.selectbox("Select Model", ["basic", "speech"], index=0)
|
28 |
ddim_steps = st.sidebar.slider("DDIM Steps", min_value=10, max_value=100, value=50)
|
|
|
30 |
random_seed = st.sidebar.number_input("Random Seed", min_value=0, value=42, step=1)
|
31 |
latent_t_per_second = 12.8
|
32 |
|
33 |
+
# Helper function: Plot linear STFT spectrogram
|
34 |
def plot_spectrogram(waveform, sample_rate, title):
|
35 |
plt.figure(figsize=(10, 4))
|
36 |
+
spectrogram = torch.stft(
|
37 |
+
torch.tensor(waveform),
|
38 |
+
n_fft=2048,
|
39 |
+
hop_length=512,
|
40 |
+
win_length=2048,
|
41 |
+
return_complex=True,
|
42 |
+
).abs().numpy()
|
43 |
+
plt.imshow(
|
44 |
+
np.log1p(spectrogram),
|
45 |
+
aspect="auto",
|
46 |
+
origin="lower",
|
47 |
+
extent=[0, waveform.shape[-1] / sample_rate, 0, sample_rate / 2],
|
48 |
+
cmap="viridis",
|
49 |
+
)
|
50 |
plt.colorbar(format="%+2.0f dB")
|
51 |
plt.title(title)
|
52 |
+
plt.xlabel("Time (s)")
|
53 |
+
plt.ylabel("Frequency (Hz)")
|
54 |
plt.tight_layout()
|
55 |
st.pyplot(plt)
|
56 |
|
|
|
58 |
if uploaded_file and st.button("Enhance Audio"):
|
59 |
st.write("Processing audio...")
|
60 |
|
|
|
61 |
with tempfile.TemporaryDirectory() as temp_dir:
|
62 |
input_path = os.path.join(temp_dir, "input.wav")
|
63 |
truncated_path = os.path.join(temp_dir, "truncated.wav")
|
|
|
67 |
with open(input_path, "wb") as f:
|
68 |
f.write(uploaded_file.read())
|
69 |
|
70 |
+
# Load and truncate the first 10 seconds
|
71 |
waveform, sample_rate = torchaudio.load(input_path)
|
72 |
max_samples = sample_rate * 10 # First 10 seconds
|
73 |
if waveform.size(1) > max_samples:
|
|
|
93 |
)
|
94 |
|
95 |
# Save enhanced audio
|
96 |
+
output_waveform = waveform_sr.detach().numpy()
|
97 |
+
save_wave(torch.tensor(output_waveform), inputpath=truncated_path, savepath=temp_dir, name="output", samplerate=48000)
|
98 |
|
99 |
+
# Plot enhanced spectrogram
|
100 |
st.write("Enhanced Audio Spectrogram:")
|
101 |
+
plot_spectrogram(output_waveform, 48000, title="Enhanced Audio Spectrogram")
|
102 |
|
103 |
# Display audio players and download link
|
104 |
st.audio(truncated_path, format="audio/wav")
|