Commit d8d0b7f by Aspik101
Parent(s): Duplicate from Lajonbot/Marketplace-audio

Changed files:
- .gitattributes +37 -0
- README.md +14 -0
- app.py +300 -0
- model/__pycache__/bart.cpython-310.pyc +0 -0
- model/__pycache__/modules.cpython-310.pyc +0 -0
- model/bart.py +151 -0
- model/modules.py +95 -0
- muzyka_AI.mp4 +3 -0
- prompt.txt +1 -0
- requirements.txt +13 -0
- temp_audio.wav +0 -0
- temp_file.wav +0 -0
- utils/__pycache__/audio_utils.cpython-310.pyc +0 -0
- utils/audio_utils.py +247 -0
- voice_cloning_fraud.mp4 +3 -0
.gitattributes
ADDED
@@ -0,0 +1,37 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
muzyka_AI.mp4 filter=lfs diff=lfs merge=lfs -text
voice_cloning_fraud.mp4 filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,14 @@
---
title: Lp Music Caps
emoji: 🎵🎵🎵
colorFrom: purple
colorTo: indigo
sdk: gradio
sdk_version: 3.33.1
app_file: app.py
pinned: false
license: mit
duplicated_from: Lajonbot/Marketplace-audio
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,300 @@
from transformers import VitsModel, AutoTokenizer
import soundfile as sf
import torch
from datetime import datetime
import random
import time
from ctransformers import AutoModelForCausalLM
import whisper
from transformers import MusicgenForConditionalGeneration, AutoProcessor, set_seed
import numpy as np
import os
import argparse
import gradio as gr
from timeit import default_timer as timer
import pandas as pd
from huggingface_hub import hf_hub_download
from model.bart import BartCaptionModel
from utils.audio_utils import load_audio, STR_CH_FIRST
from diffusers import DiffusionPipeline

from PIL import Image

def image_grid(imgs, rows, cols):
    """Paste a list of PIL images into a single rows x cols grid image."""
    assert len(imgs) == rows * cols

    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols * w, rows * h))
    grid_w, grid_h = grid.size

    for i, img in enumerate(imgs):
        grid.paste(img, box=(i % cols * w, i // cols * h))
    return grid


def save_to_txt(text_to_save):
    """Persist the last music-generation prompt so captioning() can reuse it."""
    with open('prompt.txt', 'w', encoding='utf-8') as f:
        f.write(text_to_save)

def read_txt():
    with open('prompt.txt', encoding='utf-8') as f:
        lines = f.readlines()
    return lines

##### Chat with LLAMA #####
params = {
    "max_new_tokens": 512,
    "stop": ["<end>", "<|endoftext|>", "[", "<user>"],
    "temperature": 0.7,
    "top_p": 0.8,
    "stream": True,
    "batch_size": 8}


whisper_model = whisper.load_model("medium").to("cuda")
print("Whisper Loaded!")
llm = AutoModelForCausalLM.from_pretrained("Aspik101/trurl-2-7b-pl-instruct_GGML", model_type="llama")
print("LLM Loaded!")
tts_model = VitsModel.from_pretrained("facebook/mms-tts-pol")
tts_model.to("cuda")
print("TTS Loaded!")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-pol")

pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0",
                                         torch_dtype=torch.float16,
                                         use_safetensors=True,
                                         variant="fp16").to("cuda")
print("DiffusionPipeline Loaded!")

model_audio_gen = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small").to("cuda")
processor_audio_gen = AutoProcessor.from_pretrained("facebook/musicgen-small")

with gr.Blocks() as chat_demo:
    chatbot = gr.Chatbot()
    audio_input = gr.Audio(source="microphone", type="filepath", show_label=False)
    submit_audio = gr.Button("Submit Audio")
    clear = gr.Button("Clear")
    audio_output = gr.Audio('temp_file.wav', label="Generated Audio (wav)", type='filepath', autoplay=False)

    def translate(audio):
        # Send the recording to Whisper for Polish speech recognition
        print("__Sending the recording to Whisper!")
        transcription = whisper_model.transcribe(audio, language="pl")
        return transcription["text"]

    def read_text(text):
        # Read the latest assistant answer aloud with MMS-TTS
        print("Text to read aloud:", text[-1][-1])
        inputs = tokenizer(text[-1][-1], return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = tts_model(**inputs).waveform.squeeze().cpu().numpy()
        sf.write('temp_file.wav', output, tts_model.config.sampling_rate)
        return 'temp_file.wav'

    def user(audio_data, history):
        if audio_data:
            user_message = translate(audio_data)
            print("USER!:")
            print("", history + [[user_message, None]])
            return history + [[user_message, None]]

    def parse_history(hist):
        history_ = ""
        for q, a in hist:
            history_ += f"<user>: {q} \n"
            if a:
                history_ += f"<assistant>: {a} \n"
        return history_

    def bot(history):
        print(f"When: {datetime.today().strftime('%Y-%m-%d %H:%M:%S')}")
        # Polish system prompt: "You are an AI assistant. Answer briefly and in Polish."
        prompt = f"Jesteś AI asystentem. Odpowiadaj krótko i po polsku. {parse_history(history)}. <assistant>:"
        stream = llm(prompt, **params)
        history[-1][1] = ""
        answer_save = ""
        for character in stream:
            history[-1][1] += character
            answer_save += character
            time.sleep(0.005)
            yield history

    submit_audio.click(user, [audio_input, chatbot], [chatbot], queue=False).then(bot, chatbot, chatbot).then(read_text, chatbot, audio_output)
    clear.click(lambda: None, None, chatbot, queue=False)


##### Audio Gen #####

sampling_rate = model_audio_gen.audio_encoder.config.sampling_rate
frame_rate = model_audio_gen.audio_encoder.config.frame_rate
text_encoder = model_audio_gen.get_text_encoder()

def generate_audio(decade, genre, instrument, guidance_scale=8, audio_length_in_s=20, seed=0):
    prompt = " ".join([decade, genre, 'track with ', instrument])
    save_to_txt(prompt)
    # `device` is defined below, before the interfaces are launched
    inputs = processor_audio_gen(
        text=[prompt, "drums"],
        padding=True,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        encoder_outputs = text_encoder(**inputs)

    max_new_tokens = int(frame_rate * audio_length_in_s)

    set_seed(seed)
    audio_values = model_audio_gen.generate(inputs.input_ids[0][None, :], attention_mask=inputs.attention_mask, encoder_outputs=encoder_outputs, do_sample=True, guidance_scale=guidance_scale, max_new_tokens=max_new_tokens)
    sf.write('generated_audio.wav', audio_values.cpu()[0][0], 32_000)
    audio_values = (audio_values.cpu().numpy() * 32767).astype(np.int16)
    return (sampling_rate, audio_values)


audio_gen = gr.Interface(
    fn=generate_audio,
    inputs=[
        # gr.Text(label="Negative prompt", value="drums"),
        gr.Radio(["50s", "60s", "70s", "80s", "90s"], label="decade", info=""),
        gr.Radio(["classic", "rock", "pop", "metal", "jazz", "synth"], label="genre", info=""),
        gr.Radio(["acoustic guitar", "electric guitar", "drums", "saxophone", "keyboard", "accordion", "fiddle"], label="instrument", info=""),
        gr.Slider(1.5, 10, value=8, step=0.5, label="Guidance scale"),
        gr.Slider(5, 30, value=20, step=5, label="Audio length in s"),
        # gr.Slider(0, 10, value=0, step=1, label="Seed"),
    ],
    outputs=[
        gr.Audio(label="Generated Music", type="numpy"),
    ]#,
    # examples=EXAMPLES,
)

#### Audio desc and Stable ####

if not os.path.isfile("transfer.pth"):
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth', 'transfer.pth')
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/folk.wav', 'folk.wav')
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/electronic.mp3', 'electronic.mp3')
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/orchestra.wav', 'orchestra.wav')

device = "cuda:0" if torch.cuda.is_available() else "cpu"

example_list = ['folk.wav', 'electronic.mp3', 'orchestra.wav']
model = BartCaptionModel(max_length=128)
pretrained_object = torch.load('./transfer.pth', map_location='cpu')
state_dict = pretrained_object['state_dict']
model.load_state_dict(state_dict)
if torch.cuda.is_available():
    torch.cuda.set_device(device)
    model = model.cuda(device)
model.eval()


def get_audio(audio_path, duration=10, target_sr=16000):
    """Load an audio file and split it into 10-second, 16 kHz mono chunks."""
    n_samples = int(duration * target_sr)
    audio, sr = load_audio(
        path=audio_path,
        ch_format=STR_CH_FIRST,
        sample_rate=target_sr,
        downmix_to_mono=True,
    )
    if len(audio.shape) == 2:
        audio = audio.mean(0, False)  # to mono
    input_size = int(n_samples)
    if audio.shape[-1] < input_size:  # pad sequence
        pad = np.zeros(input_size)
        pad[: audio.shape[-1]] = audio
        audio = pad
    ceil = int(audio.shape[-1] // n_samples)
    audio = torch.from_numpy(np.stack(np.split(audio[:ceil * n_samples], ceil)).astype('float32'))
    return audio

def captioning(audio_path):
    audio_tensor = get_audio(audio_path=audio_path)
    if torch.cuda.is_available():
        audio_tensor = audio_tensor.to(device)
    with torch.no_grad():
        output = model.generate(
            samples=audio_tensor,
            num_beams=5,
        )
    inference = ""
    number_of_chunks = range(audio_tensor.shape[0])
    for chunk, text in zip(number_of_chunks, output):
        time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
        inference += f"{time}\n{text} \n \n"
    return inference

title = ""
description = ""

article = ""

# NOTE: redefines captioning() above; this zero-argument version, which captions the
# last generated track and renders images for it, is the one wired into the Interface below.
def captioning():
    audio_path = 'generated_audio.wav'
    audio_tensor = get_audio(audio_path=audio_path)

    if torch.cuda.is_available():
        audio_tensor = audio_tensor.to(device)

    with torch.no_grad():
        output = model.generate(
            samples=audio_tensor,
            num_beams=5)

    inference = ""
    number_of_chunks = range(audio_tensor.shape[0])
    for chunk, text in zip(number_of_chunks, output):
        time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
        inference += f"{time}\n{text} \n \n"

    prompt = read_txt()
    print(prompt[0])
    # Generate an image from the text
    # generated_images = pipe(prompt=prompt[0]*5 + inference + prompt[0]*5).images
    # image = generated_images[0]

    num_images = 3
    prompt = [prompt[0] * 5 + inference + prompt[0] * 5] * num_images  # repeat the saved prompt for emphasis
    images = pipe(prompt, height=768, width=768).images
    grid = image_grid(images, rows=1, cols=3)

    return inference, grid

audio_desc = gr.Interface(fn=captioning,
                          inputs=None,
                          outputs=[
                              gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
                              gr.Image(label="Generated Image")  # added output for the image
                          ],
                          title=title,
                          description=description,
                          article=article,
                          cache_examples=False
                          )

music = gr.Video("muzyka_AI.mp4")
voice_cloning = gr.Video("voice_cloning_fraud.mp4")

##### Run All #####

demo_all = gr.TabbedInterface([music, audio_gen, audio_desc, voice_cloning, chat_demo], ["1.Music", "2.Audio Generation", "3.Image Generation", "4.Voice Cloning", "5.Chat with LLama"])

demo_all.queue()
demo_all.launch()
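
For orientation, here is a minimal standalone sketch (not part of the committed app.py) of the MusicGen call that generate_audio() wraps. The model name, attribute paths, and parameter values are taken from the code above; the example prompt and output file name are arbitrary, and it assumes the dependencies in requirements.txt plus a working GPU or CPU.

import torch
import soundfile as sf
from transformers import MusicgenForConditionalGeneration, AutoProcessor, set_seed

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small").to(device)
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")

inputs = processor(text=["80s synth track with electric guitar"], padding=True, return_tensors="pt").to(device)
set_seed(0)
frame_rate = model.audio_encoder.config.frame_rate        # audio tokens per second
audio = model.generate(**inputs, do_sample=True, guidance_scale=8,
                       max_new_tokens=int(frame_rate * 10))  # roughly 10 s of audio
sampling_rate = model.audio_encoder.config.sampling_rate  # 32 kHz for musicgen-small
sf.write("sample.wav", audio[0, 0].cpu().numpy(), sampling_rate)
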
model/__pycache__/bart.cpython-310.pyc
ADDED
Binary file (4.53 kB).
model/__pycache__/modules.cpython-310.pyc
ADDED
Binary file (3.24 kB).
model/bart.py
ADDED
@@ -0,0 +1,151 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from .modules import AudioEncoder
from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig

class BartCaptionModel(nn.Module):
    def __init__(self, n_mels=128, num_of_conv=6, sr=16000, duration=10, max_length=128, label_smoothing=0.1, bart_type="facebook/bart-base", audio_dim=768):
        super(BartCaptionModel, self).__init__()
        # non-finetuning case
        bart_config = BartConfig.from_pretrained(bart_type)
        self.tokenizer = BartTokenizer.from_pretrained(bart_type)
        self.bart = BartForConditionalGeneration(bart_config)

        self.n_sample = sr * duration
        self.hop_length = int(0.01 * sr)  # hard-coded hop size
        self.n_frames = int(self.n_sample // self.hop_length)
        self.num_of_stride_conv = num_of_conv - 1
        self.n_ctx = int(self.n_frames // 2**self.num_of_stride_conv) + 1
        self.audio_encoder = AudioEncoder(
            n_mels=n_mels,  # hard-coded n_mels
            n_ctx=self.n_ctx,
            audio_dim=audio_dim,
            text_dim=self.bart.config.hidden_size,
            num_of_stride_conv=self.num_of_stride_conv
        )

        self.max_length = max_length
        self.loss_fct = nn.CrossEntropyLoss(label_smoothing=label_smoothing, ignore_index=-100)

    @property
    def device(self):
        return list(self.parameters())[0].device

    def shift_tokens_right(self, input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
        """
        Shift input ids one token to the right.
        """
        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
        shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
        shifted_input_ids[:, 0] = decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("self.model.config.pad_token_id has to be defined.")
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
        return shifted_input_ids

    def forward_encoder(self, audio):
        audio_embs = self.audio_encoder(audio)
        encoder_outputs = self.bart.model.encoder(
            input_ids=None,
            inputs_embeds=audio_embs,
            return_dict=True
        )["last_hidden_state"]
        return encoder_outputs, audio_embs

    def forward_decoder(self, text, encoder_outputs):
        text = self.tokenizer(text,
                              padding='longest',
                              truncation=True,
                              max_length=self.max_length,
                              return_tensors="pt")
        input_ids = text["input_ids"].to(self.device)
        attention_mask = text["attention_mask"].to(self.device)

        decoder_targets = input_ids.masked_fill(
            input_ids == self.tokenizer.pad_token_id, -100
        )

        decoder_input_ids = self.shift_tokens_right(
            decoder_targets, self.bart.config.pad_token_id, self.bart.config.decoder_start_token_id
        )

        decoder_outputs = self.bart(
            input_ids=None,
            attention_mask=None,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=attention_mask,
            inputs_embeds=None,
            labels=None,
            encoder_outputs=(encoder_outputs,),
            return_dict=True
        )
        lm_logits = decoder_outputs["logits"]
        loss = self.loss_fct(lm_logits.view(-1, self.tokenizer.vocab_size), decoder_targets.view(-1))
        return loss

    def forward(self, audio, text):
        encoder_outputs, _ = self.forward_encoder(audio)
        loss = self.forward_decoder(text, encoder_outputs)
        return loss

    def generate(self,
                 samples,
                 use_nucleus_sampling=False,
                 num_beams=5,
                 max_length=128,
                 min_length=2,
                 top_p=0.9,
                 repetition_penalty=1.0,
                 ):

        # self.bart.force_bos_token_to_be_generated = True
        audio_embs = self.audio_encoder(samples)
        encoder_outputs = self.bart.model.encoder(
            input_ids=None,
            attention_mask=None,
            head_mask=None,
            inputs_embeds=audio_embs,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=True)

        input_ids = torch.zeros((encoder_outputs['last_hidden_state'].size(0), 1)).long().to(self.device)
        input_ids[:, 0] = self.bart.config.decoder_start_token_id
        decoder_attention_mask = torch.ones((encoder_outputs['last_hidden_state'].size(0), 1)).long().to(self.device)
        if use_nucleus_sampling:
            outputs = self.bart.generate(
                input_ids=None,
                attention_mask=None,
                decoder_input_ids=input_ids,
                decoder_attention_mask=decoder_attention_mask,
                encoder_outputs=encoder_outputs,
                max_length=max_length,
                min_length=min_length,
                do_sample=True,
                top_p=top_p,
                num_return_sequences=1,
                repetition_penalty=1.1)
        else:
            outputs = self.bart.generate(input_ids=None,
                                         attention_mask=None,
                                         decoder_input_ids=input_ids,
                                         decoder_attention_mask=decoder_attention_mask,
                                         encoder_outputs=encoder_outputs,
                                         head_mask=None,
                                         decoder_head_mask=None,
                                         inputs_embeds=None,
                                         decoder_inputs_embeds=None,
                                         use_cache=None,
                                         output_attentions=None,
                                         output_hidden_states=None,
                                         max_length=max_length,
                                         min_length=min_length,
                                         num_beams=num_beams,
                                         repetition_penalty=repetition_penalty)

        captions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return captions
model/modules.py
ADDED
@@ -0,0 +1,95 @@
### code reference: https://github.com/openai/whisper/blob/main/whisper/audio.py

import os
import torch
import torchaudio
import numpy as np
import torch.nn.functional as F
from torch import Tensor, nn
from typing import Dict, Iterable, Optional

# hard-coded audio hyperparameters
SAMPLE_RATE = 16000
N_FFT = 1024
N_MELS = 128
HOP_LENGTH = int(0.01 * SAMPLE_RATE)
DURATION = 10
N_SAMPLES = int(DURATION * SAMPLE_RATE)
N_FRAMES = N_SAMPLES // HOP_LENGTH + 1

def sinusoids(length, channels, max_timescale=10000):
    """Returns sinusoids for positional embedding"""
    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
    scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)

class MelEncoder(nn.Module):
    """
    time-frequency representation
    """
    def __init__(self,
                 sample_rate=16000,
                 f_min=0,
                 f_max=8000,
                 n_fft=1024,
                 win_length=1024,
                 hop_length=int(0.01 * 16000),
                 n_mels=128,
                 power=None,
                 pad=0,
                 normalized=False,
                 center=True,
                 pad_mode="reflect"
                 ):
        super(MelEncoder, self).__init__()
        self.window = torch.hann_window(win_length)
        self.spec_fn = torchaudio.transforms.Spectrogram(
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            power=power
        )
        self.mel_scale = torchaudio.transforms.MelScale(
            n_mels,
            sample_rate,
            f_min,
            f_max,
            n_fft // 2 + 1)

        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()

    def forward(self, wav):
        spec = self.spec_fn(wav)
        power_spec = spec.real.abs().pow(2)
        mel_spec = self.mel_scale(power_spec)
        mel_spec = self.amplitude_to_db(mel_spec)  # log10(max(reference value and amin))
        return mel_spec

class AudioEncoder(nn.Module):
    def __init__(
        self, n_mels: int, n_ctx: int, audio_dim: int, text_dim: int, num_of_stride_conv: int,
    ):
        super().__init__()
        self.mel_encoder = MelEncoder(n_mels=n_mels)
        self.conv1 = nn.Conv1d(n_mels, audio_dim, kernel_size=3, padding=1)
        self.conv_stack = nn.ModuleList([])
        for _ in range(num_of_stride_conv):
            self.conv_stack.append(
                nn.Conv1d(audio_dim, audio_dim, kernel_size=3, stride=2, padding=1)
            )
        # self.proj = nn.Linear(audio_dim, text_dim, bias=False)
        self.register_buffer("positional_embedding", sinusoids(n_ctx, text_dim))

    def forward(self, x: Tensor):
        """
        x : torch.Tensor, shape = (batch_size, waveform)
            single-channel waveform
        """
        x = self.mel_encoder(x)  # (batch_size, n_mels, n_ctx)
        x = F.gelu(self.conv1(x))
        for conv in self.conv_stack:
            x = F.gelu(conv(x))
        x = x.permute(0, 2, 1)
        x = (x + self.positional_embedding).to(x.dtype)
        return x
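
A small shape-check sketch for AudioEncoder with the arguments BartCaptionModel passes it (n_ctx=32 follows from 10 s of 16 kHz audio, a 160-sample hop, and five stride-2 convolutions); the random tensor is a stand-in waveform.

import torch
from model.modules import AudioEncoder

enc = AudioEncoder(n_mels=128, n_ctx=32, audio_dim=768, text_dim=768, num_of_stride_conv=5)
wav = torch.randn(1, 160000)   # 10 s of 16 kHz mono audio
feats = enc(wav)
print(feats.shape)             # torch.Size([1, 32, 768]), fed to BART as inputs_embeds
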
muzyka_AI.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:480742943da1b14e194684919a8e531e383503318c28420a29f723468c3407dc
size 6376447
prompt.txt
ADDED
@@ -0,0 +1 @@
ma to być
requirements.txt
ADDED
@@ -0,0 +1,13 @@
torch
torchaudio
transformers
ctransformers --no-binary=ctransformers
librosa >= 0.8
pip>=23.2
gradio_client==0.2.7
invisible_watermark
safetensors
diffusers
soundfile
openai-whisper
accelerate
temp_audio.wav
ADDED
Binary file (72.7 kB).
temp_file.wav
ADDED
Binary file (228 kB).
utils/__pycache__/audio_utils.cpython-310.pyc
ADDED
Binary file (7.74 kB).
utils/audio_utils.py
ADDED
@@ -0,0 +1,247 @@
STR_CLIP_ID = 'clip_id'
STR_AUDIO_SIGNAL = 'audio_signal'
STR_TARGET_VECTOR = 'target_vector'


STR_CH_FIRST = 'channels_first'
STR_CH_LAST = 'channels_last'

import io
import os
import tqdm
import logging
import subprocess
from typing import Tuple
from pathlib import Path

# import librosa  # only needed when resample_by='librosa'
import numpy as np
import soundfile as sf

import itertools
from numpy.fft import irfft

def _resample_load_ffmpeg(path: str, sample_rate: int, downmix_to_mono: bool) -> Tuple[np.ndarray, int]:
    """
    Decoding, downmixing, and downsampling by ffmpeg.
    Returns a channel-first audio signal.

    Args:
        path:
        sample_rate:
        downmix_to_mono:

    Returns:
        (audio signal, sample rate)
    """

    def _decode_resample_by_ffmpeg(filename, sr):
        """decode, downmix, and resample audio file"""
        channel_cmd = '-ac 1 ' if downmix_to_mono else ''  # downmixing option
        resampling_cmd = f'-ar {str(sr)}' if sr else ''  # downsampling option
        cmd = f"ffmpeg -i \"{filename}\" {channel_cmd} {resampling_cmd} -f wav -"
        p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = p.communicate()
        return out

    src, sr = sf.read(io.BytesIO(_decode_resample_by_ffmpeg(path, sr=sample_rate)))
    return src.T, sr


def _resample_load_librosa(path: str, sample_rate: int, downmix_to_mono: bool, **kwargs) -> Tuple[np.ndarray, int]:
    """
    Decoding, downmixing, and downsampling by librosa.
    Returns a channel-first audio signal.
    """
    src, sr = librosa.load(path, sr=sample_rate, mono=downmix_to_mono, **kwargs)
    return src, sr


def load_audio(
    path: str or Path,
    ch_format: str,
    sample_rate: int = None,
    downmix_to_mono: bool = False,
    resample_by: str = 'ffmpeg',
    **kwargs,
) -> Tuple[np.ndarray, int]:
    """A wrapper of librosa.load that:
    - forces the returned audio to be 2-dim,
    - defaults to sr=None, and
    - defaults to downmix_to_mono=False.

    The audio decoding is done by `audioread` or `soundfile` package and ultimately, often by ffmpeg.
    The resampling is done by `librosa`'s child package `resampy`.

    Args:
        path: audio file path
        ch_format: one of 'channels_first' or 'channels_last'
        sample_rate: target sampling rate. if None, use the rate of the audio file
        downmix_to_mono:
        resample_by (str): 'librosa' or 'ffmpeg'. it decides backend for audio decoding and resampling.
        **kwargs: keyword args for librosa.load - offset, duration, dtype, res_type.

    Returns:
        (audio, sr) tuple
    """
    if ch_format not in (STR_CH_FIRST, STR_CH_LAST):
        raise ValueError(f'ch_format is wrong here -> {ch_format}')

    if os.stat(path).st_size > 8000:
        if resample_by == 'librosa':
            src, sr = _resample_load_librosa(path, sample_rate, downmix_to_mono, **kwargs)
        elif resample_by == 'ffmpeg':
            src, sr = _resample_load_ffmpeg(path, sample_rate, downmix_to_mono)
        else:
            raise NotImplementedError(f'resample_by: "{resample_by}" is not supported yet')
    else:
        raise ValueError('Given audio is too short!')
    return src, sr

    # if src.ndim == 1:
    #     src = np.expand_dims(src, axis=0)
    # # now always 2d and channels_first

    # if ch_format == STR_CH_FIRST:
    #     return src, sr
    # else:
    #     return src.T, sr

def ms(x):
    """Mean value of signal `x` squared.
    :param x: Dynamic quantity.
    :returns: Mean squared of `x`.
    """
    return (np.abs(x)**2.0).mean()

def normalize(y, x=None):
    """Normalize power in y to a (standard normal) white noise signal.
    Optionally normalize to power in signal `x`.
    #The mean power of a Gaussian with :math:`\\mu=0` and :math:`\\sigma=1` is 1.
    """
    if x is not None:
        x = ms(x)
    else:
        x = 1.0
    return y * np.sqrt(x / ms(y))

def noise(N, color='white', state=None):
    """Noise generator.
    :param N: Amount of samples.
    :param color: Color of noise.
    :param state: State of PRNG.
    :type state: :class:`np.random.RandomState`
    """
    try:
        return _noise_generators[color](N, state)
    except KeyError:
        raise ValueError("Incorrect color.")

def white(N, state=None):
    """
    White noise.
    :param N: Amount of samples.
    :param state: State of PRNG.
    :type state: :class:`np.random.RandomState`
    White noise has a constant power density. Its narrowband spectrum is therefore flat.
    The power in white noise will increase by a factor of two for each octave band,
    and therefore increases with 3 dB per octave.
    """
    state = np.random.RandomState() if state is None else state
    return state.randn(N)

def pink(N, state=None):
    """
    Pink noise.
    :param N: Amount of samples.
    :param state: State of PRNG.
    :type state: :class:`np.random.RandomState`
    Pink noise has equal power in bands that are proportionally wide.
    Power density decreases with 3 dB per octave.
    """
    state = np.random.RandomState() if state is None else state
    uneven = N % 2
    X = state.randn(N // 2 + 1 + uneven) + 1j * state.randn(N // 2 + 1 + uneven)
    S = np.sqrt(np.arange(len(X)) + 1.)  # +1 to avoid divide by zero
    y = (irfft(X / S)).real
    if uneven:
        y = y[:-1]
    return normalize(y)

def blue(N, state=None):
    """
    Blue noise.
    :param N: Amount of samples.
    :param state: State of PRNG.
    :type state: :class:`np.random.RandomState`
    Power increases with 6 dB per octave.
    Power density increases with 3 dB per octave.
    """
    state = np.random.RandomState() if state is None else state
    uneven = N % 2
    X = state.randn(N // 2 + 1 + uneven) + 1j * state.randn(N // 2 + 1 + uneven)
    S = np.sqrt(np.arange(len(X)))  # Filter
    y = (irfft(X * S)).real
    if uneven:
        y = y[:-1]
    return normalize(y)

def brown(N, state=None):
    """
    Brown noise.
    :param N: Amount of samples.
    :param state: State of PRNG.
    :type state: :class:`np.random.RandomState`
    Power decreases with -3 dB per octave.
    Power density decreases with 6 dB per octave.
    """
    state = np.random.RandomState() if state is None else state
    uneven = N % 2
    X = state.randn(N // 2 + 1 + uneven) + 1j * state.randn(N // 2 + 1 + uneven)
    S = (np.arange(len(X)) + 1)  # Filter
    y = (irfft(X / S)).real
    if uneven:
        y = y[:-1]
    return normalize(y)

def violet(N, state=None):
    """
    Violet noise.
    :param N: Amount of samples.
    :param state: State of PRNG.
    :type state: :class:`np.random.RandomState`
    Power increases with +9 dB per octave.
    Power density increases with +6 dB per octave.
    """
    state = np.random.RandomState() if state is None else state
    uneven = N % 2
    X = state.randn(N // 2 + 1 + uneven) + 1j * state.randn(N // 2 + 1 + uneven)
    S = (np.arange(len(X)))  # Filter
    y = (irfft(X * S)).real
    if uneven:
        y = y[:-1]
    return normalize(y)

_noise_generators = {
    'white': white,
    'pink': pink,
    'blue': blue,
    'brown': brown,
    'violet': violet,
}

def noise_generator(N=44100, color='white', state=None):
    """Noise generator.
    :param N: Amount of unique samples to generate.
    :param color: Color of noise.
    Generate `N` amount of unique samples and cycle over these samples.
    """
    # yield from itertools.cycle(noise(N, color))  # Python 3.3
    for sample in itertools.cycle(noise(N, color, state)):
        yield sample

def heaviside(N):
    """Heaviside.
    Returns the value 0 for `x < 0`, 1 for `x > 0`, and 1/2 for `x = 0`.
    """
    return 0.5 * (np.sign(N) + 1)
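
A short usage sketch for load_audio, matching how app.py's get_audio() calls it. The file name is one of the examples downloaded in app.py, and ffmpeg must be on PATH for the default backend.

from utils.audio_utils import load_audio, STR_CH_FIRST

audio, sr = load_audio(
    path="folk.wav",
    ch_format=STR_CH_FIRST,
    sample_rate=16000,
    downmix_to_mono=True,   # handled by the ffmpeg '-ac 1' option
)
print(audio.shape, sr)      # mono waveform as a numpy array, resampled to 16 kHz
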
voice_cloning_fraud.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:440d118fdb3e6e210c5435cec6bf50d1c61190a2e567b62ba39137cc9274ce3b
size 4672978