DavidCombei committed on
Commit
d8876d6
1 Parent(s): 43524c8

Upload 9 files

PhantomNET.py ADDED
@@ -0,0 +1,168 @@
+ import torch
+ import torch.nn as nn
+ import math
+ import librosa
+ 
+ class MultiHeadAttention(nn.Module):
+     """Standard multi-head scaled dot-product attention."""
+     def __init__(self, d_model, num_heads):
+         super(MultiHeadAttention, self).__init__()
+         self.d_model = d_model
+         self.num_heads = num_heads
+         self.d_k = d_model // num_heads
+         self.W_q = nn.Linear(d_model, d_model)  # query
+         self.W_k = nn.Linear(d_model, d_model)  # key
+         self.W_v = nn.Linear(d_model, d_model)  # value
+         self.W_o = nn.Linear(d_model, d_model)  # output
+ 
+     def scaled_dot_product_attention(self, Q, K, V, mask=None):
+         attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
+         if mask is not None:
+             attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
+         attn_probs = torch.softmax(attn_scores, dim=-1)
+         output = torch.matmul(attn_probs, V)
+         return output
+ 
+     def split_heads(self, x):
+         batch_size, seq_length, d_model = x.size()
+         return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
+ 
+     def combine_heads(self, x):
+         batch_size, _, seq_length, d_k = x.size()
+         return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
+ 
+     def forward(self, Q, K, V, mask=None):
+         Q = self.split_heads(self.W_q(Q))
+         K = self.split_heads(self.W_k(K))
+         V = self.split_heads(self.W_v(V))
+         attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
+         output = self.W_o(self.combine_heads(attn_output))
+         return output
+ 
+ 
+ class PositionWiseFeedForward(nn.Module):
+     """Two-layer feed-forward block applied at every position."""
+     def __init__(self, d_model, d_ff):
+         super(PositionWiseFeedForward, self).__init__()
+         self.fc1 = nn.Linear(d_model, d_ff)
+         self.fc2 = nn.Linear(d_ff, d_model)
+         self.relu = nn.ReLU()
+ 
+     def forward(self, x):
+         return self.fc2(self.relu(self.fc1(x)))
+ 
+ 
+ class PositionalEncoding(nn.Module):
+     """Fixed sinusoidal positional encoding added to the input embeddings."""
+     def __init__(self, d_model, max_seq_length):
+         super(PositionalEncoding, self).__init__()
+         pe = torch.zeros(max_seq_length, d_model)
+         position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
+         pe[:, 0::2] = torch.sin(position * div_term)
+         pe[:, 1::2] = torch.cos(position * div_term)
+         self.register_buffer('pe', pe.unsqueeze(0))
+ 
+     def forward(self, x):
+         self.pe = self.pe.to(x.device)
+         return x + self.pe[:, :x.size(1)]
+ 
+ 
+ class EncoderLayer(nn.Module):
+     """Transformer encoder layer: self-attention and feed-forward, each with residual + LayerNorm."""
+     def __init__(self, d_model, num_heads, d_ff, dropout):
+         super(EncoderLayer, self).__init__()
+         self.self_attn = MultiHeadAttention(d_model, num_heads)
+         self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.dropout = nn.Dropout(dropout)
+ 
+     def forward(self, x, mask):
+         attn_output = self.self_attn(x, x, x, mask)
+         x = self.norm1(x + self.dropout(attn_output))
+         ff_output = self.feed_forward(x)
+         x = self.norm2(x + self.dropout(ff_output))
+         return x
+ 
+ 
+ class PhantomNet(nn.Module):
+     """1-D convolutional waveform encoder followed by a transformer encoder.
+ 
+     use_mode selects the head: 'extractor' returns frame-level embeddings,
+     'partialSpoof' returns a per-frame score, 'spoof' returns utterance-level class logits.
+     """
+     def __init__(self, use_mode, feature_size, conv_projection, num_classes, num_heads=8, num_layers=6, d_ff=2048, dropout=0.1):
+         super(PhantomNet, self).__init__()
+         # convolutional front-end over the raw waveform (overall stride 5*2*2*2*2*2*2 = 320 samples per frame)
+         self.conv1 = nn.Conv1d(in_channels=1, out_channels=512, kernel_size=10, stride=5)
+         self.conv2 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2)
+         self.conv3 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2)
+         self.conv4 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2)
+         self.conv5 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2)
+         self.conv6 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=2, stride=2)
+         self.conv7 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=2, stride=2)
+         self.use_mode = use_mode
+         self.conv_projection = conv_projection
+         self.num_classes = num_classes
+         self.flatten = nn.Flatten()
+         self.sigmoid = nn.Sigmoid()
+         self.gelu = nn.GELU()
+         self.relu = nn.ReLU()
+ 
+         self.fcIntermidiate = nn.Linear(512, feature_size)
+         self.positional_encoding = PositionalEncoding(feature_size, 10000)
+         self.encoder_layers = nn.ModuleList(
+             [EncoderLayer(feature_size, num_heads, d_ff, dropout) for _ in range(num_layers)])
+         self.dropout = nn.Dropout(dropout)
+ 
+         if self.conv_projection:
+             self.convProjection = nn.Conv1d(feature_size, feature_size, kernel_size=128, stride=1)
+ 
+         self.fc1 = nn.Linear(feature_size, feature_size)
+         self.fc2 = nn.Linear(feature_size, 1, bias=True)
+ 
+         if self.use_mode == 'spoof':
+             # If there is a mismatch error, you will need to replace this input size; currently working with 8-second samples:
+             # just multiply 286,080 / 8 = 35,760 by the number of seconds to get this layer's input size,
+             # or add another parameter seq_length to the model and use input = seq_length * feature_size.
+             self.fcSpoof = nn.Linear(286080, d_ff)
+             self.fcFinal = nn.Linear(d_ff, self.num_classes)
+         else:
+             self.fcSpoof = None
+ 
+     def forward(self, src):
+         src = src.unsqueeze(1)  # (batch, samples) -> (batch, 1, samples)
+         src = self.gelu(self.conv1(src))
+         src = self.gelu(self.conv2(src))
+         src = self.gelu(self.conv3(src))
+         src = self.gelu(self.conv4(src))
+         src = self.gelu(self.conv5(src))
+         src = self.gelu(self.conv6(src))
+         src = self.gelu(self.conv7(src))
+         src = src.permute(0, 2, 1)      # (batch, 512, frames) -> (batch, frames, 512)
+         src = self.fcIntermidiate(src)  # project to feature_size
+         src = src.permute(0, 2, 1)
+ 
+         if self.conv_projection:
+             src = self.gelu(self.convProjection(src))
+ 
+         src = self.dropout(src)
+         src = src.transpose(1, 2)
+         src_embedded = self.dropout(self.positional_encoding(src))
+ 
+         enc_output = src_embedded
+         for enc_layer in self.encoder_layers:
+             enc_output = enc_layer(enc_output, None)
+ 
+         embeddings = self.fc1(enc_output)
+         flatten_embeddings = self.flatten(embeddings)
+ 
+         if self.use_mode == 'extractor':
+             return embeddings
+         elif self.use_mode == 'partialSpoof':
+             return self.fc2(embeddings)
+         elif self.use_mode == 'spoof':
+             out_fcSpoof = self.fcSpoof(flatten_embeddings)
+             output = self.fcFinal(out_fcSpoof)
+             # output = self.sigmoid(self.fcSpoof(flatten_embeddings))
+             # print(f"Model output shape: {output.shape}")
+             return output
+         else:
+             raise ValueError('Wrong use mode of PhantomNet, please pick between extractor, partialSpoof, or spoof')
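
For reference, a minimal sketch of how the extractor mode can be exercised, using the same constructor arguments that app.py passes (use_mode='extractor', feature_size=1920, conv_projection=False, num_classes=2); the 4-second dummy waveform, batch size, and random weights are illustrative assumptions, not values taken from this commit (app.py loads the trained weights from PhantomNet_Finetuned_V2.pt):

import torch
from PhantomNET import PhantomNet

# illustrative: randomly initialized model; load the checkpoint as in app.py for real embeddings
model = PhantomNet(use_mode='extractor', feature_size=1920,
                   conv_projection=False, num_classes=2)
model.eval()

waveform = torch.randn(1, 4 * 16000)   # assumed: one 4-second clip at 16 kHz, shape (batch, samples)
with torch.no_grad():
    embeddings = model(waveform)        # (batch, frames, 1920) frame-level embeddings
print(embeddings.shape)
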
PhantomNet_Finetuned_V2.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:097a9ce5ba542ad1453200f6370a9dcc00af473b8569db150623426aaa58e534
+ size 2999053243
app.py ADDED
@@ -0,0 +1,143 @@
+ from PhantomNET import PhantomNet
+ import joblib
+ from transformers import AutoFeatureExtractor, Wav2Vec2Model
+ import torch
+ import librosa
+ import numpy as np
+ from sklearn.linear_model import LogisticRegression
+ import gradio as gr
+ from pytube import YouTube
+ 
+ class HuggingFaceFeatureExtractor:
+     """Wraps a Hugging Face wav2vec2 model and returns selected hidden states."""
+     def __init__(self, model_class, name):
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.feature_extractor = AutoFeatureExtractor.from_pretrained(name)
+         self.model = model_class.from_pretrained(name, output_hidden_states=True)
+         self.model.eval()
+         self.model.to(self.device)
+ 
+     def __call__(self, audio, sr):
+         inputs = self.feature_extractor(
+             audio,
+             sampling_rate=sr,
+             return_tensors="pt",
+             padding=True,
+         )
+         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+         with torch.no_grad():
+             outputs = self.model(**inputs)
+         # two intermediate hidden states plus the last hidden state
+         return outputs.hidden_states[9], outputs.hidden_states[8], outputs.last_hidden_state
+ 
+ 
+ FEATURE_EXTRACTOR = {"wav2vec2-xls-r-2b": lambda: HuggingFaceFeatureExtractor(Wav2Vec2Model,
+                                                                               "facebook/wav2vec2-xls-r-2b")}
+ 
+ # per-feature ensemble members and the final fusion model (shipped as .pkl files in this repo)
+ model1 = joblib.load('model1_ensemble.pkl')
+ model2 = joblib.load('model2_ensemble.pkl')
+ model3 = joblib.load('model3_ensemble.pkl')
+ model4 = joblib.load('model4_ensemble.pkl')
+ final_model = joblib.load('final_model_ensemble.pkl')
+ 
+ def download_audio_from_youtube(youtube_url, output_path='.'):
+     yt = YouTube(youtube_url)
+     audio_stream = yt.streams.filter(only_audio=True).first()
+     audio_file = audio_stream.download(output_path=output_path)
+     return audio_file
+ 
+ def segment_audio(audio, sr, segment_duration):
+     segment_samples = int(segment_duration * sr)
+     total_samples = len(audio)
+     segments = [audio[i:i + segment_samples] for i in range(0, total_samples, segment_samples)]
+     return segments
+ 
+ def classify_with_eer_threshold(probabilities, eer_thresh):
+     return (probabilities >= eer_thresh).astype(int)
+ 
+ def process_audio(input_data, segment_duration=3):
+     if input_data.startswith("http"):
+         file_audio = download_audio_from_youtube(input_data)
+     else:
+         file_audio = input_data
+ 
+     audio, sr = librosa.load(file_audio, sr=16000)
+     if len(audio.shape) > 1:
+         audio = audio[0]
+ 
+     segments = segment_audio(audio, sr, segment_duration)
+     all_embeddings_layer10 = []
+     all_embeddings_layer9 = []
+     all_embeddings_layer48 = []
+ 
+     # wav2vec2 extractor
+     a = FEATURE_EXTRACTOR['wav2vec2-xls-r-2b']()
+     for idx, segment in enumerate(segments):
+         p1, p2, p3 = a(segment, sr)
+         all_embeddings_layer10.append(p1)
+         all_embeddings_layer9.append(p2)
+         all_embeddings_layer48.append(p3)
+ 
+     embedding_layer10 = torch.cat(all_embeddings_layer10, dim=1)
+     embedding_layer9 = torch.cat(all_embeddings_layer9, dim=1)
+     embedding_layer48 = torch.cat(all_embeddings_layer48, dim=1)
+ 
+     # time-average each representation into one feature vector per recording
+     wav2vec2_feature_layer10 = torch.mean(embedding_layer10, dim=1).cpu().numpy()
+     wav2vec2_feature_layer9 = torch.mean(embedding_layer9, dim=1).cpu().numpy()
+     wav2vec2_feature_layer48 = torch.mean(embedding_layer48, dim=1).cpu().numpy()
+ 
+     # PhantomNet extractor
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     model = PhantomNet(feature_size=1920, num_classes=2, conv_projection=False, use_mode='extractor').to(device)
+     state_dict = torch.load("PhantomNet/saved_models/PhantomNet_Finetuned_V2.pt", map_location=device)
+     model.load_state_dict(state_dict, strict=False)
+     model.eval()
+ 
+     all_embeddings_PhantomNet = []
+     for idx, segment in enumerate(segments):
+         segment_input = torch.Tensor(segment).unsqueeze(0).to(device)
+         p = model(segment_input).detach()
+         all_embeddings_PhantomNet.append(p)
+ 
+     embedding_PhantomNet = torch.cat(all_embeddings_PhantomNet, dim=1)
+     PhantomNet_feature = torch.mean(embedding_PhantomNet, dim=1)
+ 
+     wav2vec2_feature_layer9 = wav2vec2_feature_layer9.reshape(1, -1)
+     wav2vec2_feature_layer10 = wav2vec2_feature_layer10.reshape(1, -1)
+     wav2vec2_feature_layer48 = wav2vec2_feature_layer48.reshape(1, -1)
+     PhantomNet_feature = PhantomNet_feature.reshape(1, -1)
+ 
+     # each ensemble member scores one feature type; the four probabilities are stacked for the fusion model
+     eval_prob1 = model1.predict_proba(wav2vec2_feature_layer10)[:, 1].reshape(-1, 1)
+     eval_prob2 = model2.predict_proba(wav2vec2_feature_layer9)[:, 1].reshape(-1, 1)
+     eval_prob3 = model3.predict_proba(wav2vec2_feature_layer48)[:, 1].reshape(-1, 1)
+     eval_prob4 = model4.predict_proba(PhantomNet_feature)[:, 1].reshape(-1, 1)
+ 
+     eval_combined_probs = np.hstack((eval_prob1, eval_prob2, eval_prob3, eval_prob4))
+ 
+     eer_thresh = 0.02  # EER operating point found during evaluation
+ 
+     final_prob = final_model.predict_proba(eval_combined_probs)[:, 1]
+ 
+     y_pred_inference = classify_with_eer_threshold(final_prob, eer_thresh)
+ 
+     if y_pred_inference == 1:
+         return f"Fake with a confidence of: {100 - final_prob[0] * 100:.2f}%"
+     else:
+         return f"Real with a confidence of: {final_prob[0] * 100:.2f}%"
+ 
+ def gradio_interface(audio, youtube_link):
+     if youtube_link:
+         return process_audio(youtube_link)
+     elif audio:
+         return process_audio(audio)
+     else:
+         return "Please upload audio or provide a YouTube link."
+ 
+ interface = gr.Interface(
+     fn=gradio_interface,
+     inputs=[gr.Audio(type="filepath", label="Upload Audio"), gr.Textbox(label="YouTube Link (Optional)")],
+     outputs="text",
+     title="AI4TRUST Development",
+     description="Upload an audio file or provide a YouTube link to check for authenticity.",
+ )
+ 
+ interface.launch(share=True)
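
To make the fusion step easier to follow, here is a small self-contained sketch of the same stacking-and-threshold logic on made-up numbers; the LogisticRegression stand-in, its toy training rows, and the example branch scores are assumptions for illustration only, not the shipped final_model_ensemble.pkl or its training data:

import numpy as np
from sklearn.linear_model import LogisticRegression

# made-up per-branch probabilities (wav2vec2 layer-10, layer-9, layer-48, PhantomNet), shape (1, 4)
branch_probs = np.array([[0.10, 0.07, 0.12, 0.05]])  # analogous to eval_combined_probs in app.py

# stand-in fusion model; in the app this is loaded from final_model_ensemble.pkl
fusion = LogisticRegression().fit(
    np.array([[0.9, 0.8, 0.85, 0.9], [0.1, 0.05, 0.1, 0.08]]),  # toy training rows
    np.array([1, 0]),
)

final_prob = fusion.predict_proba(branch_probs)[:, 1]
eer_thresh = 0.02                                     # operating point used in app.py
decision = (final_prob >= eer_thresh).astype(int)     # same rule as classify_with_eer_threshold
print(final_prob, decision)
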
final_model_ensemble.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c565c251a8fe83f6c27c923a23193f6445f5bbfc2b9c4f15dca78c21051eb7f
+ size 863
model1_ensemble.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e38fda1d6feb8411cca2164e37e34c1c1a8dafe034e0b87db09256f65e46d90c
+ size 16191
model2_ensemble.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa97634c8d25ca7eaa9aa9d67ed9354eda88ee8d641b143d8e901b445ab9f440
+ size 16191
model3_ensemble.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7e9e3d20682f76a0f5871891075c4fe0b30ed686aa03549b25ba0b49fc0f3d75
+ size 16191
model4_ensemble.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6673686202f04eb52b8683f97c8929318485a78a81558533e03e259ebaf8268d
+ size 16191
requirements.txt ADDED
@@ -0,0 +1,94 @@
+ aiofiles==23.2.1
+ annotated-types==0.7.0
+ anyio==4.4.0
+ audioread==3.0.1
+ certifi==2024.7.4
+ cffi==1.17.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ contourpy==1.2.1
+ cycler==0.12.1
+ decorator==5.1.1
+ exceptiongroup==1.2.2
+ fastapi==0.114.2
+ ffmpeg-python==0.2.0
+ ffmpy==0.4.0
+ filelock==3.15.4
+ fonttools==4.53.1
+ fsspec==2024.6.1
+ future==1.0.0
+ gradio==4.44.0
+ gradio_client==1.3.0
+ h11==0.14.0
+ httpcore==1.0.5
+ httpx==0.27.2
+ huggingface-hub==0.24.5
+ HyperPyYAML==1.2.2
+ idna==3.7
+ importlib_resources==6.4.0
+ Jinja2==3.1.4
+ joblib==1.4.2
+ kiwisolver==1.4.5
+ lazy_loader==0.4
+ librosa==0.10.2.post1
+ llvmlite==0.43.0
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.2
+ mdurl==0.1.2
+ mplcursors==0.5.3
+ mpmath==1.3.0
+ msgpack==1.0.8
+ networkx==3.2.1
+ numba==0.60.0
+ numpy==1.26.4
+ orjson==3.10.7
+ packaging==24.1
+ pandas==2.2.2
+ pillow==10.4.0
+ platformdirs==4.2.2
+ pooch==1.8.2
+ pycparser==2.22
+ pydantic==2.9.1
+ pydantic_core==2.23.3
+ pydub==0.25.1
+ Pygments==2.18.0
+ pyparsing==3.1.2
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.9
+ pytube==15.0.0
+ pytz==2024.1
+ PyYAML==6.0.2
+ regex==2024.7.24
+ requests==2.32.3
+ rich==13.8.1
+ ruamel.yaml==0.18.6
+ ruamel.yaml.clib==0.2.8
+ ruff==0.6.5
+ safetensors==0.4.4
+ scikit-learn==1.5.1
+ scipy==1.13.1
+ semantic-version==2.10.0
+ sentencepiece==0.2.0
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.1
+ soundfile==0.12.1
+ soxr==0.4.0
+ speechbrain==1.0.0
+ starlette==0.38.5
+ sympy==1.13.1
+ threadpoolctl==3.5.0
+ tokenizers==0.19.1
+ tomlkit==0.12.0
+ torch==2.4.0
+ torchaudio==2.4.0
+ tqdm==4.66.5
+ transformers==4.44.0
+ typer==0.12.5
+ typing_extensions==4.12.2
+ tzdata==2024.1
+ urllib3==2.2.2
+ uvicorn==0.30.6
+ websockets==12.0
+ zipp==3.20.0