Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed 18 days ago

Commit

cdd8a58

verified ·

1 Parent(s): 18779ec

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -5

app.py CHANGED Viewed

@@ -24,6 +24,20 @@ class VirusClassifier(nn.Module):
     def forward(self, x):
         return self.network(x)
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     """Convert sequence to k-mer frequency vector"""
@@ -73,7 +87,6 @@ def predict(file_obj):
     # Read the file content
     try:
-        # Handle both string and file object cases
         if isinstance(file_obj, str):
             text = file_obj
         else:
@@ -81,6 +94,11 @@ def predict(file_obj):
     except Exception as e:
         return f"Error reading file: {str(e)}"
     # Load model and scaler
     try:
         device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -106,20 +124,47 @@ def predict(file_obj):
             # Get k-mer vector
             kmer_vector = sequence_to_kmer_vector(seq)
             kmer_vector = scaler.transform(kmer_vector.reshape(1, -1))
-            # Predict
             with torch.no_grad():
-                output = model(torch.FloatTensor(kmer_vector).to(device))
                 probs = torch.softmax(output, dim=1)
             # Format results
             pred_class = 1 if probs[0][1] > probs[0][0] else 0
             pred_label = 'human' if pred_class == 1 else 'non-human'
             result = f"""Sequence: {header}
 Prediction: {pred_label}
 Confidence: {float(max(probs[0])):0.4f}
 Human probability: {float(probs[0][1]):0.4f}
-Non-human probability: {float(probs[0][0]):0.4f}"""
             results.append(result)
     except Exception as e:
         return f"Error processing sequences: {str(e)}"
@@ -136,4 +181,4 @@ iface = gr.Interface(
 # Launch the interface
 if __name__ == "__main__":
-    iface.launch()  # Remove share=True for Hugging Face Spaces

     def forward(self, x):
         return self.network(x)
+    def get_feature_importance(self, x):
+        """Calculate feature importance using gradient-based method"""
+        x.requires_grad_(True)
+        output = self.network(x)
+        importance = torch.zeros_like(x)
+        for i in range(output.shape[1]):
+            if x.grad is not None:
+                x.grad.zero_()
+            output[..., i].sum().backward(retain_graph=True)
+            importance += torch.abs(x.grad)
+        return importance
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     """Convert sequence to k-mer frequency vector"""
     # Read the file content
     try:
         if isinstance(file_obj, str):
             text = file_obj
         else:
     except Exception as e:
         return f"Error reading file: {str(e)}"
+    # Generate k-mer dictionary
+    k = 4  # k-mer size
+    kmers = [''.join(p) for p in product("ACGT", repeat=k)]
+    kmer_dict = {km: i for i, km in enumerate(kmers)}
     # Load model and scaler
     try:
         device = 'cuda' if torch.cuda.is_available() else 'cpu'
             # Get k-mer vector
             kmer_vector = sequence_to_kmer_vector(seq)
             kmer_vector = scaler.transform(kmer_vector.reshape(1, -1))
+            X_tensor = torch.FloatTensor(kmer_vector).to(device)
+            # Get predictions and feature importance
             with torch.no_grad():
+                output = model(X_tensor)
                 probs = torch.softmax(output, dim=1)
+            # Calculate feature importance
+            importance = model.get_feature_importance(X_tensor)
+            kmer_importance = importance[0].cpu().numpy()
+            # Weight importance by actual k-mer frequency
+            kmer_importance *= kmer_vector[0]
+            # Get top 10 k-mers
+            top_k = 10
+            top_indices = np.argsort(np.abs(kmer_importance))[-top_k:][::-1]
+            important_kmers = [
+                {
+                    'kmer': list(kmer_dict.keys())[list(kmer_dict.values()).index(i)],
+                    'importance': float(kmer_importance[i]),
+                    'frequency': float(kmer_vector[0][i])
+                }
+                for i in top_indices
+            ]
             # Format results
             pred_class = 1 if probs[0][1] > probs[0][0] else 0
             pred_label = 'human' if pred_class == 1 else 'non-human'
             result = f"""Sequence: {header}
 Prediction: {pred_label}
 Confidence: {float(max(probs[0])):0.4f}
 Human probability: {float(probs[0][1]):0.4f}
+Non-human probability: {float(probs[0][0]):0.4f}
+Most influential k-mers:"""
+            for kmer in important_kmers:
+                result += f"\n  {kmer['kmer']}: importance={kmer['importance']:.4f}, frequency={kmer['frequency']:.4f}"
             results.append(result)
     except Exception as e:
         return f"Error processing sequences: {str(e)}"
 # Launch the interface
 if __name__ == "__main__":
+    iface.launch()