DavidCombei committed
Commit • d8876d6
1 Parent(s): 43524c8
Upload 9 files

- PhantomNET.py +168 -0
- PhantomNet_Finetuned_V2.pt +3 -0
- app.py +143 -0
- final_model_ensemble.pkl +3 -0
- model1_ensemble.pkl +3 -0
- model2_ensemble.pkl +3 -0
- model3_ensemble.pkl +3 -0
- model4_ensemble.pkl +3 -0
- requirements.txt +94 -0
PhantomNET.py
ADDED
@@ -0,0 +1,168 @@
import torch
import torch.nn as nn
import math
import librosa

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)  # query
        self.W_k = nn.Linear(d_model, d_model)  # key
        self.W_v = nn.Linear(d_model, d_model)  # value
        self.W_o = nn.Linear(d_model, d_model)  # output

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output

    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output


class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        self.pe = self.pe.to(x.device)
        return x + self.pe[:, :x.size(1)]


class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x


class PhantomNet(nn.Module):
    def __init__(self, use_mode, feature_size, conv_projection, num_classes, num_heads=8, num_layers=6, d_ff=2048, dropout=0.1):
        super(PhantomNet, self).__init__()
        # Convolutional front-end that downsamples the raw waveform.
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=512, kernel_size=10, stride=5)
        self.conv2 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2)
        self.conv3 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2)
        self.conv4 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2)
        self.conv5 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2)
        self.conv6 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=2, stride=2)
        self.conv7 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=2, stride=2)
        self.use_mode = use_mode
        self.conv_projection = conv_projection
        self.num_classes = num_classes
        self.flatten = nn.Flatten()
        self.sigmoid = nn.Sigmoid()
        self.gelu = nn.GELU()
        self.relu = nn.ReLU()

        self.fcIntermidiate = nn.Linear(512, feature_size)
        self.positional_encoding = PositionalEncoding(feature_size, 10000)
        self.encoder_layers = nn.ModuleList(
            [EncoderLayer(feature_size, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout)

        if self.conv_projection:
            self.convProjection = nn.Conv1d(feature_size, feature_size, kernel_size=128, stride=1)

        self.fc1 = nn.Linear(feature_size, feature_size)
        self.fc2 = nn.Linear(feature_size, 1, bias=True)

        if self.use_mode == 'spoof':
            # If there is a shape-mismatch error, replace this input size; it currently assumes 8-second samples.
            # Multiply 95,760 by the clip length in seconds to get this layer's input size,
            # or add a seq_length parameter to the model and use input = seq_length * feature_size.
            self.fcSpoof = nn.Linear(286080, d_ff)
            self.fcFinal = nn.Linear(d_ff, self.num_classes)
        else:
            self.fcSpoof = None

    def forward(self, src):
        src = src.unsqueeze(1)
        src = self.gelu(self.conv1(src))
        src = self.gelu(self.conv2(src))
        src = self.gelu(self.conv3(src))
        src = self.gelu(self.conv4(src))
        src = self.gelu(self.conv5(src))
        src = self.gelu(self.conv6(src))
        src = self.gelu(self.conv7(src))
        src = src.permute(0, 2, 1)
        src = self.fcIntermidiate(src)
        src = src.permute(0, 2, 1)

        if self.conv_projection:
            src = self.gelu(self.convProjection(src))

        src = self.dropout(src)
        src = src.transpose(1, 2)
        src_embedded = self.dropout(self.positional_encoding(src))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, None)

        embeddings = self.fc1(enc_output)
        flatten_embeddings = self.flatten(embeddings)

        if self.use_mode == 'extractor':
            return embeddings
        elif self.use_mode == 'partialSpoof':
            return self.fc2(embeddings)
        elif self.use_mode == 'spoof':
            out_fcSpoof = self.fcSpoof(flatten_embeddings)
            output = self.fcFinal(out_fcSpoof)
            # output = self.sigmoid(self.fcSpoof(flatten_embeddings))
            # print(f"Model output shape: {output.shape}")
            return output
        else:
            raise ValueError('Wrong use mode of PhantomNet, please pick between extractor, partialSpoof, or spoof')
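The hard-coded nn.Linear(286080, d_ff) in the 'spoof' branch depends on the input clip length, which is what the shape-mismatch comment above is warning about. As a rough aid, here is a small helper (not part of the uploaded files) that derives the flattened size directly from the conv stack's kernel/stride settings, assuming 16 kHz audio, conv_projection=False, and the feature_size=1920 used in app.py; under those assumptions the 286080 figure corresponds to 3-second clips, the segment_duration app.py uses.

def spoof_head_input_size(seconds, feature_size=1920, sr=16000):
    # Conv1d output length with no padding: (L - kernel) // stride + 1,
    # applied once per layer of the conv front-end in PhantomNet.__init__.
    length = int(seconds * sr)
    for kernel, stride in [(10, 5), (3, 2), (3, 2), (3, 2), (3, 2), (2, 2), (2, 2)]:
        length = (length - kernel) // stride + 1
    return length * feature_size

print(spoof_head_input_size(3))  # 286080, the in_features of fcSpoof above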
PhantomNet_Finetuned_V2.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:097a9ce5ba542ad1453200f6370a9dcc00af473b8569db150623426aaa58e534
size 2999053243
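Note that this file is a Git LFS pointer rather than the checkpoint itself (the pointed-to blob is roughly 3 GB). When cloning the Space locally, the actual weights would typically be fetched with git-lfs (git lfs install, then git lfs pull) before app.py can load them.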
app.py
ADDED
@@ -0,0 +1,143 @@
from PhantomNET import PhantomNet
import joblib
from transformers import AutoFeatureExtractor, Wav2Vec2Model
import torch
import librosa
import numpy as np
from sklearn.linear_model import LogisticRegression
import gradio as gr
from pytube import YouTube

class HuggingFaceFeatureExtractor:
    def __init__(self, model_class, name):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(name)
        self.model = model_class.from_pretrained(name, output_hidden_states=True)
        self.model.eval()
        self.model.to(self.device)

    def __call__(self, audio, sr):
        inputs = self.feature_extractor(
            audio,
            sampling_rate=sr,
            return_tensors="pt",
            padding=True,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.hidden_states[9], outputs.hidden_states[8], outputs.last_hidden_state


FEATURE_EXTRACTOR = {"wav2vec2-xls-r-2b": lambda: HuggingFaceFeatureExtractor(Wav2Vec2Model,
                                                                              "facebook/wav2vec2-xls-r-2b")}


model1 = joblib.load('model1_ensemble.pkl')
model2 = joblib.load('model2_ensemble.pkl')
model3 = joblib.load('model3_ensemble.pkl')
model4 = joblib.load('model4_ensemble.pkl')
final_model = joblib.load('final_model_ensemble.pkl')

def download_audio_from_youtube(youtube_url, output_path='.'):
    yt = YouTube(youtube_url)
    audio_stream = yt.streams.filter(only_audio=True).first()
    audio_file = audio_stream.download(output_path=output_path)
    return audio_file

def segment_audio(audio, sr, segment_duration):
    segment_samples = int(segment_duration * sr)
    total_samples = len(audio)
    segments = [audio[i:i + segment_samples] for i in range(0, total_samples, segment_samples)]
    return segments

def classify_with_eer_threshold(probabilities, eer_thresh):
    return (probabilities >= eer_thresh).astype(int)

def process_audio(input_data, segment_duration=3):
    if input_data.startswith("http"):
        file_audio = download_audio_from_youtube(input_data)
    else:
        file_audio = input_data

    audio, sr = librosa.load(file_audio, sr=16000)
    if len(audio.shape) > 1:
        audio = audio[0]

    segments = segment_audio(audio, sr, segment_duration)
    all_embeddings_layer10 = []
    all_embeddings_layer9 = []
    all_embeddings_layer48 = []

    # wav2vec2 extractor
    a = FEATURE_EXTRACTOR['wav2vec2-xls-r-2b']()
    for idx, segment in enumerate(segments):
        p1, p2, p3 = a(segment, sr)
        all_embeddings_layer10.append(p1)
        all_embeddings_layer9.append(p2)
        all_embeddings_layer48.append(p3)

    embedding_layer10 = torch.cat(all_embeddings_layer10, dim=1)
    embedding_layer9 = torch.cat(all_embeddings_layer9, dim=1)
    embedding_layer48 = torch.cat(all_embeddings_layer48, dim=1)

    wav2vec2_feature_layer10 = torch.mean(embedding_layer10, dim=1).cpu().numpy()
    wav2vec2_feature_layer9 = torch.mean(embedding_layer9, dim=1).cpu().numpy()
    wav2vec2_feature_layer48 = torch.mean(embedding_layer48, dim=1).cpu().numpy()

    # PhantomNet extractor
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = PhantomNet(feature_size=1920, num_classes=2, conv_projection=False, use_mode='extractor').to(device)
    # NOTE: this commit uploads PhantomNet_Finetuned_V2.pt at the repository root;
    # adjust this path if the checkpoint is not under PhantomNet/saved_models/.
    state_dict = torch.load("PhantomNet/saved_models/PhantomNet_Finetuned_V2.pt", map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.eval()

    all_embeddings_PhantomNet = []
    for idx, segment in enumerate(segments):
        segment_input = torch.Tensor(segment).unsqueeze(0).to(device)
        p = model(segment_input).detach()
        all_embeddings_PhantomNet.append(p)

    embedding_PhantomNet = torch.cat(all_embeddings_PhantomNet, dim=1)
    PhantomNet_feature = torch.mean(embedding_PhantomNet, dim=1)

    wav2vec2_feature_layer9 = wav2vec2_feature_layer9.reshape(1, -1)
    wav2vec2_feature_layer10 = wav2vec2_feature_layer10.reshape(1, -1)
    wav2vec2_feature_layer48 = wav2vec2_feature_layer48.reshape(1, -1)
    PhantomNet_feature = PhantomNet_feature.reshape(1, -1)

    eval_prob1 = model1.predict_proba(wav2vec2_feature_layer10)[:, 1].reshape(-1, 1)
    eval_prob2 = model2.predict_proba(wav2vec2_feature_layer9)[:, 1].reshape(-1, 1)
    eval_prob3 = model3.predict_proba(wav2vec2_feature_layer48)[:, 1].reshape(-1, 1)
    eval_prob4 = model4.predict_proba(PhantomNet_feature)[:, 1].reshape(-1, 1)

    eval_combined_probs = np.hstack((eval_prob1, eval_prob2, eval_prob3, eval_prob4))

    eer_thresh = 0.02  # EER threshold obtained during evaluation

    final_prob = final_model.predict_proba(eval_combined_probs)[:, 1]

    y_pred_inference = classify_with_eer_threshold(final_prob, eer_thresh)

    if y_pred_inference == 1:
        return f"Fake with a confidence of: {100 - final_prob[0] * 100:.2f}%"
    else:
        return f"Real with a confidence of: {final_prob[0] * 100:.2f}%"

def gradio_interface(audio, youtube_link):
    if youtube_link:
        return process_audio(youtube_link)
    elif audio:
        return process_audio(audio)
    else:
        return "Please upload audio or provide a YouTube link."

interface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.Audio(type="filepath", label="Upload Audio"), gr.Textbox(label="YouTube Link (Optional)")],
    outputs="text",
    title="AI4TRUST Development",
    description="Upload an audio file or provide a YouTube link to check for authenticity.",
)

interface.launch(share=True)
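For a quick local sanity check of the pipeline, process_audio can presumably be called directly instead of going through the Gradio UI. The sketch below assumes the four ensemble .pkl files and the PhantomNet checkpoint are available at the paths loaded at the top of app.py; "sample.wav" is a hypothetical file name.

# Hypothetical direct invocation, bypassing the Gradio interface.
verdict = process_audio("sample.wav", segment_duration=3)
print(verdict)  # "Real with a confidence of: ..." or "Fake with a confidence of: ..."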
final_model_ensemble.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c565c251a8fe83f6c27c923a23193f6445f5bbfc2b9c4f15dca78c21051eb7f
size 863
model1_ensemble.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e38fda1d6feb8411cca2164e37e34c1c1a8dafe034e0b87db09256f65e46d90c
size 16191
model2_ensemble.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fa97634c8d25ca7eaa9aa9d67ed9354eda88ee8d641b143d8e901b445ab9f440
size 16191
model3_ensemble.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e9e3d20682f76a0f5871891075c4fe0b30ed686aa03549b25ba0b49fc0f3d75
size 16191
model4_ensemble.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6673686202f04eb52b8683f97c8929318485a78a81558533e03e259ebaf8268d
size 16191
requirements.txt
ADDED
@@ -0,0 +1,94 @@
aiofiles==23.2.1
annotated-types==0.7.0
anyio==4.4.0
audioread==3.0.1
certifi==2024.7.4
cffi==1.17.0
charset-normalizer==3.3.2
click==8.1.7
contourpy==1.2.1
cycler==0.12.1
decorator==5.1.1
exceptiongroup==1.2.2
fastapi==0.114.2
ffmpeg-python==0.2.0
ffmpy==0.4.0
filelock==3.15.4
fonttools==4.53.1
fsspec==2024.6.1
future==1.0.0
gradio==4.44.0
gradio_client==1.3.0
h11==0.14.0
httpcore==1.0.5
httpx==0.27.2
huggingface-hub==0.24.5
HyperPyYAML==1.2.2
idna==3.7
importlib_resources==6.4.0
Jinja2==3.1.4
joblib==1.4.2
kiwisolver==1.4.5
lazy_loader==0.4
librosa==0.10.2.post1
llvmlite==0.43.0
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.2
mdurl==0.1.2
mplcursors==0.5.3
mpmath==1.3.0
msgpack==1.0.8
networkx==3.2.1
numba==0.60.0
numpy==1.26.4
orjson==3.10.7
packaging==24.1
pandas==2.2.2
pillow==10.4.0
platformdirs==4.2.2
pooch==1.8.2
pycparser==2.22
pydantic==2.9.1
pydantic_core==2.23.3
pydub==0.25.1
Pygments==2.18.0
pyparsing==3.1.2
python-dateutil==2.9.0.post0
python-multipart==0.0.9
pytube==15.0.0
pytz==2024.1
PyYAML==6.0.2
regex==2024.7.24
requests==2.32.3
rich==13.8.1
ruamel.yaml==0.18.6
ruamel.yaml.clib==0.2.8
ruff==0.6.5
safetensors==0.4.4
scikit-learn==1.5.1
scipy==1.13.1
semantic-version==2.10.0
sentencepiece==0.2.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
soundfile==0.12.1
soxr==0.4.0
speechbrain==1.0.0
starlette==0.38.5
sympy==1.13.1
threadpoolctl==3.5.0
tokenizers==0.19.1
tomlkit==0.12.0
torch==2.4.0
torchaudio==2.4.0
tqdm==4.66.5
transformers==4.44.0
typer==0.12.5
typing_extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
uvicorn==0.30.6
websockets==12.0
zipp==3.20.0
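Outside the Space, the same pinned environment can presumably be recreated with pip install -r requirements.txt; the Space's own build installs these packages automatically.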