DavidCombei committed on
Commit
d8876d6
1 Parent(s): 43524c8

Upload 9 files

PhantomNET.py ADDED
@@ -0,0 +1,168 @@
+ import torch
+ import torch.nn as nn
+ import math
+ import librosa
+ 
+ class MultiHeadAttention(nn.Module):
+     """Standard multi-head scaled dot-product attention."""
+     def __init__(self, d_model, num_heads):
+         super(MultiHeadAttention, self).__init__()
+         self.d_model = d_model
+         self.num_heads = num_heads
+         self.d_k = d_model // num_heads
+         self.W_q = nn.Linear(d_model, d_model)  # query
+         self.W_k = nn.Linear(d_model, d_model)  # key
+         self.W_v = nn.Linear(d_model, d_model)  # value
+         self.W_o = nn.Linear(d_model, d_model)  # output
+ 
+     def scaled_dot_product_attention(self, Q, K, V, mask=None):
+         attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
+         if mask is not None:
+             attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
+         attn_probs = torch.softmax(attn_scores, dim=-1)
+         output = torch.matmul(attn_probs, V)
+         return output
+ 
+     def split_heads(self, x):
+         batch_size, seq_length, d_model = x.size()
+         return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
+ 
+     def combine_heads(self, x):
+         batch_size, _, seq_length, d_k = x.size()
+         return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
+ 
+     def forward(self, Q, K, V, mask=None):
+         Q = self.split_heads(self.W_q(Q))
+         K = self.split_heads(self.W_k(K))
+         V = self.split_heads(self.W_v(V))
+         attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
+         output = self.W_o(self.combine_heads(attn_output))
+         return output
+ 
+ 
+ class PositionWiseFeedForward(nn.Module):
+     """Two-layer feed-forward block applied at every position."""
+     def __init__(self, d_model, d_ff):
+         super(PositionWiseFeedForward, self).__init__()
+         self.fc1 = nn.Linear(d_model, d_ff)
+         self.fc2 = nn.Linear(d_ff, d_model)
+         self.relu = nn.ReLU()
+ 
+     def forward(self, x):
+         return self.fc2(self.relu(self.fc1(x)))
+ 
+ 
+ class PositionalEncoding(nn.Module):
+     """Fixed sinusoidal positional encoding added to the input embeddings."""
+     def __init__(self, d_model, max_seq_length):
+         super(PositionalEncoding, self).__init__()
+         pe = torch.zeros(max_seq_length, d_model)
+         position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
+         pe[:, 0::2] = torch.sin(position * div_term)
+         pe[:, 1::2] = torch.cos(position * div_term)
+         self.register_buffer('pe', pe.unsqueeze(0))
+ 
+     def forward(self, x):
+         self.pe = self.pe.to(x.device)
+         return x + self.pe[:, :x.size(1)]
+ 
+ 
+ class EncoderLayer(nn.Module):
+     """Transformer encoder layer: self-attention and feed-forward, each with residual + LayerNorm."""
+     def __init__(self, d_model, num_heads, d_ff, dropout):
+         super(EncoderLayer, self).__init__()
+         self.self_attn = MultiHeadAttention(d_model, num_heads)
+         self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.dropout = nn.Dropout(dropout)
+ 
+     def forward(self, x, mask):
+         attn_output = self.self_attn(x, x, x, mask)
+         x = self.norm1(x + self.dropout(attn_output))
+         ff_output = self.feed_forward(x)
+         x = self.norm2(x + self.dropout(ff_output))
+         return x
+ 
+ 
+ class PhantomNet(nn.Module):
+     """1-D convolutional waveform encoder followed by a transformer encoder.
+ 
+     use_mode selects the head: 'extractor' returns frame-level embeddings,
+     'partialSpoof' returns a per-frame score, 'spoof' returns utterance-level class logits.
+     """
+     def __init__(self, use_mode, feature_size, conv_projection, num_classes, num_heads=8, num_layers=6, d_ff=2048, dropout=0.1):
+         super(PhantomNet, self).__init__()
+         # convolutional front-end over the raw waveform (overall stride 5*2*2*2*2*2*2 = 320 samples per frame)
+         self.conv1 = nn.Conv1d(in_channels=1, out_channels=512, kernel_size=10, stride=5)
+         self.conv2 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2)
+         self.conv3 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2)
+         self.conv4 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2)
+         self.conv5 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2)
+         self.conv6 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=2, stride=2)
+         self.conv7 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=2, stride=2)
+         self.use_mode = use_mode
+         self.conv_projection = conv_projection
+         self.num_classes = num_classes
+         self.flatten = nn.Flatten()
+         self.sigmoid = nn.Sigmoid()
+         self.gelu = nn.GELU()
+         self.relu = nn.ReLU()
+ 
+         self.fcIntermidiate = nn.Linear(512, feature_size)
+         self.positional_encoding = PositionalEncoding(feature_size, 10000)
+         self.encoder_layers = nn.ModuleList(
+             [EncoderLayer(feature_size, num_heads, d_ff, dropout) for _ in range(num_layers)])
+         self.dropout = nn.Dropout(dropout)
+ 
+         if self.conv_projection:
+             self.convProjection = nn.Conv1d(feature_size, feature_size, kernel_size=128, stride=1)
+ 
+         self.fc1 = nn.Linear(feature_size, feature_size)
+         self.fc2 = nn.Linear(feature_size, 1, bias=True)
+ 
+         if self.use_mode == 'spoof':
+             # If there is a mismatch error, you will need to replace this input size; currently working with 8-second samples:
+             # just multiply 286,080 / 8 = 35,760 by the number of seconds to get this layer's input size,
+             # or add another parameter seq_length to the model and use input = seq_length * feature_size.
+             self.fcSpoof = nn.Linear(286080, d_ff)
+             self.fcFinal = nn.Linear(d_ff, self.num_classes)
+         else:
+             self.fcSpoof = None
+ 
+     def forward(self, src):
+         src = src.unsqueeze(1)  # (batch, samples) -> (batch, 1, samples)
+         src = self.gelu(self.conv1(src))
+         src = self.gelu(self.conv2(src))
+         src = self.gelu(self.conv3(src))
+         src = self.gelu(self.conv4(src))
+         src = self.gelu(self.conv5(src))
+         src = self.gelu(self.conv6(src))
+         src = self.gelu(self.conv7(src))
+         src = src.permute(0, 2, 1)      # (batch, 512, frames) -> (batch, frames, 512)
+         src = self.fcIntermidiate(src)  # project to feature_size
+         src = src.permute(0, 2, 1)
+ 
+         if self.conv_projection:
+             src = self.gelu(self.convProjection(src))
+ 
+         src = self.dropout(src)
+         src = src.transpose(1, 2)
+         src_embedded = self.dropout(self.positional_encoding(src))
+ 
+         enc_output = src_embedded
+         for enc_layer in self.encoder_layers:
+             enc_output = enc_layer(enc_output, None)
+ 
+         embeddings = self.fc1(enc_output)
+         flatten_embeddings = self.flatten(embeddings)
+ 
+         if self.use_mode == 'extractor':
+             return embeddings
+         elif self.use_mode == 'partialSpoof':
+             return self.fc2(embeddings)
+         elif self.use_mode == 'spoof':
+             out_fcSpoof = self.fcSpoof(flatten_embeddings)
+             output = self.fcFinal(out_fcSpoof)
+             # output = self.sigmoid(self.fcSpoof(flatten_embeddings))
+             # print(f"Model output shape: {output.shape}")
+             return output
+         else:
+             raise ValueError('Wrong use mode of PhantomNet, please pick between extractor, partialSpoof, or spoof')
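
For reference, a minimal sketch of how the extractor mode can be exercised, using the same constructor arguments that app.py passes (use_mode='extractor', feature_size=1920, conv_projection=False, num_classes=2); the 4-second dummy waveform, batch size, and random weights are illustrative assumptions, not values taken from this commit (app.py loads the trained weights from PhantomNet_Finetuned_V2.pt):

import torch
from PhantomNET import PhantomNet

# illustrative: randomly initialized model; load the checkpoint as in app.py for real embeddings
model = PhantomNet(use_mode='extractor', feature_size=1920,
                   conv_projection=False, num_classes=2)
model.eval()

waveform = torch.randn(1, 4 * 16000)   # assumed: one 4-second clip at 16 kHz, shape (batch, samples)
with torch.no_grad():
    embeddings = model(waveform)        # (batch, frames, 1920) frame-level embeddings
print(embeddings.shape)
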
PhantomNet_Finetuned_V2.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:097a9ce5ba542ad1453200f6370a9dcc00af473b8569db150623426aaa58e534
+ size 2999053243
app.py ADDED
@@ -0,0 +1,143 @@
+ from PhantomNET import PhantomNet
+ import joblib
+ from transformers import AutoFeatureExtractor, Wav2Vec2Model
+ import torch
+ import librosa
+ import numpy as np
+ from sklearn.linear_model import LogisticRegression
+ import gradio as gr
+ from pytube import YouTube
+ 
+ class HuggingFaceFeatureExtractor:
+     """Wraps a Hugging Face wav2vec2 model and returns selected hidden states."""
+     def __init__(self, model_class, name):
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.feature_extractor = AutoFeatureExtractor.from_pretrained(name)
+         self.model = model_class.from_pretrained(name, output_hidden_states=True)
+         self.model.eval()
+         self.model.to(self.device)
+ 
+     def __call__(self, audio, sr):
+         inputs = self.feature_extractor(
+             audio,
+             sampling_rate=sr,
+             return_tensors="pt",
+             padding=True,
+         )
+         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+         with torch.no_grad():
+             outputs = self.model(**inputs)
+         # two intermediate hidden states plus the last hidden state
+         return outputs.hidden_states[9], outputs.hidden_states[8], outputs.last_hidden_state
+ 
+ 
+ FEATURE_EXTRACTOR = {"wav2vec2-xls-r-2b": lambda: HuggingFaceFeatureExtractor(Wav2Vec2Model,
+                                                                               "facebook/wav2vec2-xls-r-2b")}
+ 
+ # per-feature ensemble members and the final fusion model (shipped as .pkl files in this repo)
+ model1 = joblib.load('model1_ensemble.pkl')
+ model2 = joblib.load('model2_ensemble.pkl')
+ model3 = joblib.load('model3_ensemble.pkl')
+ model4 = joblib.load('model4_ensemble.pkl')
+ final_model = joblib.load('final_model_ensemble.pkl')
+ 
+ def download_audio_from_youtube(youtube_url, output_path='.'):
+     yt = YouTube(youtube_url)
+     audio_stream = yt.streams.filter(only_audio=True).first()
+     audio_file = audio_stream.download(output_path=output_path)
+     return audio_file
+ 
+ def segment_audio(audio, sr, segment_duration):
+     segment_samples = int(segment_duration * sr)
+     total_samples = len(audio)
+     segments = [audio[i:i + segment_samples] for i in range(0, total_samples, segment_samples)]
+     return segments
+ 
+ def classify_with_eer_threshold(probabilities, eer_thresh):
+     return (probabilities >= eer_thresh).astype(int)
+ 
+ def process_audio(input_data, segment_duration=3):
+     if input_data.startswith("http"):
+         file_audio = download_audio_from_youtube(input_data)
+     else:
+         file_audio = input_data
+ 
+     audio, sr = librosa.load(file_audio, sr=16000)
+     if len(audio.shape) > 1:
+         audio = audio[0]
+ 
+     segments = segment_audio(audio, sr, segment_duration)
+     all_embeddings_layer10 = []
+     all_embeddings_layer9 = []
+     all_embeddings_layer48 = []
+ 
+     # wav2vec2 extractor
+     a = FEATURE_EXTRACTOR['wav2vec2-xls-r-2b']()
+     for idx, segment in enumerate(segments):
+         p1, p2, p3 = a(segment, sr)
+         all_embeddings_layer10.append(p1)
+         all_embeddings_layer9.append(p2)
+         all_embeddings_layer48.append(p3)
+ 
+     embedding_layer10 = torch.cat(all_embeddings_layer10, dim=1)
+     embedding_layer9 = torch.cat(all_embeddings_layer9, dim=1)
+     embedding_layer48 = torch.cat(all_embeddings_layer48, dim=1)
+ 
+     # time-average each representation into one feature vector per recording
+     wav2vec2_feature_layer10 = torch.mean(embedding_layer10, dim=1).cpu().numpy()
+     wav2vec2_feature_layer9 = torch.mean(embedding_layer9, dim=1).cpu().numpy()
+     wav2vec2_feature_layer48 = torch.mean(embedding_layer48, dim=1).cpu().numpy()
+ 
+     # PhantomNet extractor
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     model = PhantomNet(feature_size=1920, num_classes=2, conv_projection=False, use_mode='extractor').to(device)
+     state_dict = torch.load("PhantomNet/saved_models/PhantomNet_Finetuned_V2.pt", map_location=device)
+     model.load_state_dict(state_dict, strict=False)
+     model.eval()
+ 
+     all_embeddings_PhantomNet = []
+     for idx, segment in enumerate(segments):
+         segment_input = torch.Tensor(segment).unsqueeze(0).to(device)
+         p = model(segment_input).detach()
+         all_embeddings_PhantomNet.append(p)
+ 
+     embedding_PhantomNet = torch.cat(all_embeddings_PhantomNet, dim=1)
+     PhantomNet_feature = torch.mean(embedding_PhantomNet, dim=1)
+ 
+     wav2vec2_feature_layer9 = wav2vec2_feature_layer9.reshape(1, -1)
+     wav2vec2_feature_layer10 = wav2vec2_feature_layer10.reshape(1, -1)
+     wav2vec2_feature_layer48 = wav2vec2_feature_layer48.reshape(1, -1)
+     PhantomNet_feature = PhantomNet_feature.reshape(1, -1)
+ 
+     # each ensemble member scores one feature type; the four probabilities are stacked for the fusion model
+     eval_prob1 = model1.predict_proba(wav2vec2_feature_layer10)[:, 1].reshape(-1, 1)
+     eval_prob2 = model2.predict_proba(wav2vec2_feature_layer9)[:, 1].reshape(-1, 1)
+     eval_prob3 = model3.predict_proba(wav2vec2_feature_layer48)[:, 1].reshape(-1, 1)
+     eval_prob4 = model4.predict_proba(PhantomNet_feature)[:, 1].reshape(-1, 1)
+ 
+     eval_combined_probs = np.hstack((eval_prob1, eval_prob2, eval_prob3, eval_prob4))
+ 
+     eer_thresh = 0.02  # EER operating point found during evaluation
+ 
+     final_prob = final_model.predict_proba(eval_combined_probs)[:, 1]
+ 
+     y_pred_inference = classify_with_eer_threshold(final_prob, eer_thresh)
+ 
+     if y_pred_inference == 1:
+         return f"Fake with a confidence of: {100 - final_prob[0] * 100:.2f}%"
+     else:
+         return f"Real with a confidence of: {final_prob[0] * 100:.2f}%"
+ 
+ def gradio_interface(audio, youtube_link):
+     if youtube_link:
+         return process_audio(youtube_link)
+     elif audio:
+         return process_audio(audio)
+     else:
+         return "Please upload audio or provide a YouTube link."
+ 
+ interface = gr.Interface(
+     fn=gradio_interface,
+     inputs=[gr.Audio(type="filepath", label="Upload Audio"), gr.Textbox(label="YouTube Link (Optional)")],
+     outputs="text",
+     title="AI4TRUST Development",
+     description="Upload an audio file or provide a YouTube link to check for authenticity.",
+ )
+ 
+ interface.launch(share=True)
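
To make the fusion step easier to follow, here is a small self-contained sketch of the same stacking-and-threshold logic on made-up numbers; the LogisticRegression stand-in, its toy training rows, and the example branch scores are assumptions for illustration only, not the shipped final_model_ensemble.pkl or its training data:

import numpy as np
from sklearn.linear_model import LogisticRegression

# made-up per-branch probabilities (wav2vec2 layer-10, layer-9, layer-48, PhantomNet), shape (1, 4)
branch_probs = np.array([[0.10, 0.07, 0.12, 0.05]])  # analogous to eval_combined_probs in app.py

# stand-in fusion model; in the app this is loaded from final_model_ensemble.pkl
fusion = LogisticRegression().fit(
    np.array([[0.9, 0.8, 0.85, 0.9], [0.1, 0.05, 0.1, 0.08]]),  # toy training rows
    np.array([1, 0]),
)

final_prob = fusion.predict_proba(branch_probs)[:, 1]
eer_thresh = 0.02                                     # operating point used in app.py
decision = (final_prob >= eer_thresh).astype(int)     # same rule as classify_with_eer_threshold
print(final_prob, decision)
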
final_model_ensemble.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c565c251a8fe83f6c27c923a23193f6445f5bbfc2b9c4f15dca78c21051eb7f
+ size 863
model1_ensemble.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e38fda1d6feb8411cca2164e37e34c1c1a8dafe034e0b87db09256f65e46d90c
+ size 16191
model2_ensemble.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa97634c8d25ca7eaa9aa9d67ed9354eda88ee8d641b143d8e901b445ab9f440
+ size 16191
model3_ensemble.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7e9e3d20682f76a0f5871891075c4fe0b30ed686aa03549b25ba0b49fc0f3d75
+ size 16191
model4_ensemble.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6673686202f04eb52b8683f97c8929318485a78a81558533e03e259ebaf8268d
+ size 16191
requirements.txt ADDED
@@ -0,0 +1,94 @@
+ aiofiles==23.2.1
+ annotated-types==0.7.0
+ anyio==4.4.0
+ audioread==3.0.1
+ certifi==2024.7.4
+ cffi==1.17.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ contourpy==1.2.1
+ cycler==0.12.1
+ decorator==5.1.1
+ exceptiongroup==1.2.2
+ fastapi==0.114.2
+ ffmpeg-python==0.2.0
+ ffmpy==0.4.0
+ filelock==3.15.4
+ fonttools==4.53.1
+ fsspec==2024.6.1
+ future==1.0.0
+ gradio==4.44.0
+ gradio_client==1.3.0
+ h11==0.14.0
+ httpcore==1.0.5
+ httpx==0.27.2
+ huggingface-hub==0.24.5
+ HyperPyYAML==1.2.2
+ idna==3.7
+ importlib_resources==6.4.0
+ Jinja2==3.1.4
+ joblib==1.4.2
+ kiwisolver==1.4.5
+ lazy_loader==0.4
+ librosa==0.10.2.post1
+ llvmlite==0.43.0
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.2
+ mdurl==0.1.2
+ mplcursors==0.5.3
+ mpmath==1.3.0
+ msgpack==1.0.8
+ networkx==3.2.1
+ numba==0.60.0
+ numpy==1.26.4
+ orjson==3.10.7
+ packaging==24.1
+ pandas==2.2.2
+ pillow==10.4.0
+ platformdirs==4.2.2
+ pooch==1.8.2
+ pycparser==2.22
+ pydantic==2.9.1
+ pydantic_core==2.23.3
+ pydub==0.25.1
+ Pygments==2.18.0
+ pyparsing==3.1.2
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.9
+ pytube==15.0.0
+ pytz==2024.1
+ PyYAML==6.0.2
+ regex==2024.7.24
+ requests==2.32.3
+ rich==13.8.1
+ ruamel.yaml==0.18.6
+ ruamel.yaml.clib==0.2.8
+ ruff==0.6.5
+ safetensors==0.4.4
+ scikit-learn==1.5.1
+ scipy==1.13.1
+ semantic-version==2.10.0
+ sentencepiece==0.2.0
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.1
+ soundfile==0.12.1
+ soxr==0.4.0
+ speechbrain==1.0.0
+ starlette==0.38.5
+ sympy==1.13.1
+ threadpoolctl==3.5.0
+ tokenizers==0.19.1
+ tomlkit==0.12.0
+ torch==2.4.0
+ torchaudio==2.4.0
+ tqdm==4.66.5
+ transformers==4.44.0
+ typer==0.12.5
+ typing_extensions==4.12.2
+ tzdata==2024.1
+ urllib3==2.2.2
+ uvicorn==0.30.6
+ websockets==12.0
+ zipp==3.20.0