# Spaces: Running  (page-status residue captured with this file; not part of the script)
import os
import sys

import librosa
import torch
from open_clip import create_model
from transformers import RobertaTokenizer

from training.data import get_audio_features
from training.data import int16_to_float32, float32_to_int16
# Shared RoBERTa tokenizer, loaded once at import time so that every call to
# `tokenizer()` reuses the same instance.
tokenize = RobertaTokenizer.from_pretrained("roberta-base")
def tokenizer(text):
    """Tokenize ``text`` with the module-level RoBERTa tokenizer.

    The input is padded/truncated to exactly 77 tokens and returned as
    PyTorch tensors (``return_tensors="pt"``).

    Args:
        text: A string or a list of strings to tokenize.

    Returns:
        A dict mapping each key produced by the tokenizer to its tensor,
        with ``squeeze(0)`` applied. ``squeeze(0)`` only removes the leading
        dimension when it has size 1, so a single string loses its batch
        axis while a batch of several strings keeps it.
    """
    encoded = tokenize(
        text,
        padding="max_length",
        truncation=True,
        max_length=77,
        return_tensors="pt",
    )
    return {key: value.squeeze(0) for key, value in encoded.items()}
# Checkpoint and demo-audio locations. The defaults are the original absolute
# paths, which only exist on the authors' machine; set the environment
# variables below to point the demo at local copies without editing the file.
PRETRAINED_PATH = os.environ.get(
    "CLAP_PRETRAINED_PATH",
    "/mnt/fast/nobackup/users/hl01486/projects/contrastive_pretraining/CLAP/assets/checkpoints/epoch_top_0_audioset_no_fusion.pt",
)
WAVE_48k_PATH = os.environ.get(
    "CLAP_WAVE_48K_PATH",
    "/mnt/fast/nobackup/users/hl01486/projects/contrastive_pretraining/CLAP/assets/audio/machine.wav",
)
def infer_text():
    """Embed two sample sentences with the pretrained model and print the result size.

    Builds the model from ``PRETRAINED_PATH`` (on ``cuda:0`` when CUDA is
    available, otherwise on CPU), tokenizes a hard-coded batch of two
    sentences with ``tokenizer`` and prints ``text_embed.size()``.

    Returns:
        None. The embedding size is written to stdout.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    precision = "fp32"
    amodel = "HTSAT-tiny"  # or 'PANN-14'
    tmodel = "roberta"  # the best text encoder in our training
    enable_fusion = False  # False if you do not want to use the fusion model
    fusion_type = "aff_2d"
    pretrained = PRETRAINED_PATH

    # The returned model config is not needed for text inference.
    model, _ = create_model(
        amodel,
        tmodel,
        pretrained,
        precision=precision,
        device=device,
        enable_fusion=enable_fusion,
        fusion_type=fusion_type,
    )

    # load the text, can be a list (i.e. batch size)
    text_data = ["I love the contrastive learning", "I love the pretrain model"]
    # tokenize for roberta, if you want to tokenize for another text encoder,
    # please refer to data.py#L43-90
    text_data = tokenizer(text_data)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        text_embed = model.get_text_embedding(text_data)
    print(text_embed.size())
def infer_audio():
    """Embed the demo waveform with the pretrained model and print the result size.

    Builds the model from ``PRETRAINED_PATH`` (on ``cuda:0`` when CUDA is
    available, otherwise on CPU), loads ``WAVE_48k_PATH`` resampled to
    48 kHz, round-trips it through int16 quantization, extracts model
    features with ``get_audio_features`` and prints ``audio_embed.size()``.

    Returns:
        None. The embedding size is written to stdout.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    precision = "fp32"
    amodel = "HTSAT-tiny"  # or 'PANN-14'
    tmodel = "roberta"  # the best text encoder in our training
    enable_fusion = False  # False if you do not want to use the fusion model
    fusion_type = "aff_2d"
    pretrained = PRETRAINED_PATH

    model, model_cfg = create_model(
        amodel,
        tmodel,
        pretrained,
        precision=precision,
        device=device,
        enable_fusion=enable_fusion,
        fusion_type=fusion_type,
    )

    # load the waveform of the shape (T,), should resample to 48000
    audio_waveform, _ = librosa.load(WAVE_48k_PATH, sr=48000)
    # quantize
    audio_waveform = int16_to_float32(float32_to_int16(audio_waveform))
    audio_waveform = torch.from_numpy(audio_waveform).float()

    # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode
    # NOTE(review): 480000 samples is 10 s at 48 kHz; presumably the clip
    # length expected by get_audio_features - confirm against its signature.
    audio_dict = get_audio_features(
        {},
        audio_waveform,
        480000,
        data_truncating="fusion",
        data_filling="repeatpad",
        audio_cfg=model_cfg["audio_cfg"],
    )

    # can send a list to the model, to process many audio tracks in one time (i.e. batch size)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        audio_embed = model.get_audio_embedding([audio_dict])
    print(audio_embed.size())
if __name__ == "__main__":
    # Run both demos, text first and then audio, when executed as a script.
    infer_text()
    infer_audio()