Update app.py
app.py CHANGED
@@ -3,8 +3,12 @@ import torch
 import joblib
 import numpy as np
 from itertools import product
-from typing import Dict
 import torch.nn as nn
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
@@ -28,84 +32,125 @@ class VirusClassifier(nn.Module):

 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     """Convert sequence to k-mer frequency vector"""
-
-
-
-
-
-
-
-
-
+    try:
+        kmers = [''.join(p) for p in product("ACGT", repeat=k)]
+        kmer_dict = {kmer: 0 for kmer in kmers}
+
+        for i in range(len(sequence) - k + 1):
+            kmer = sequence[i:i+k]
+            if kmer in kmer_dict:  # only count valid kmers
+                kmer_dict[kmer] += 1
+
+        return np.array(list(kmer_dict.values()))
+    except Exception as e:
+        logger.error(f"Error in sequence_to_kmer_vector: {str(e)}")
+        raise

-def parse_fasta(
-    """Parse FASTA format
-
-
-
-
-
-
-
-
-        if line.startswith('>'):
-            if current_header is not None:
-                sequences.append((current_header, ''.join(current_sequence)))
-            current_header = line[1:]
-            current_sequence = []
-        else:
-            current_sequence.append(line.upper())
-
-    if current_header is not None:
-        sequences.append((current_header, ''.join(current_sequence)))
+def parse_fasta(file_obj) -> list:
+    """Parse FASTA format from file object"""
+    try:
+        # Read the content from the file object
+        content = file_obj.decode('utf-8')
+        logger.info(f"Received file content length: {len(content)}")
+
+        sequences = []
+        current_header = None
+        current_sequence = []

-
+        for line in content.split('\n'):
+            line = line.strip()
+            if not line:
+                continue
+            if line.startswith('>'):
+                if current_header is not None:
+                    sequences.append((current_header, ''.join(current_sequence)))
+                current_header = line[1:]
+                current_sequence = []
+            else:
+                current_sequence.append(line.upper())
+
+        if current_header is not None:
+            sequences.append((current_header, ''.join(current_sequence)))
+
+        logger.info(f"Parsed {len(sequences)} sequences from FASTA")
+        return sequences
+    except Exception as e:
+        logger.error(f"Error parsing FASTA: {str(e)}")
+        raise

-def predict_sequence(
+def predict_sequence(file_obj) -> str:
     """Process FASTA input and return formatted predictions"""
-
-
-
-
-
-
-
-
-
-    # Process sequences
-    sequences = parse_fasta(fasta_content)
-    results = []
-
-    for header, seq in sequences:
-        # Convert sequence to k-mer vector
-        kmer_vector = sequence_to_kmer_vector(seq, k)
-        kmer_vector = scaler.transform(kmer_vector.reshape(1, -1))
+    try:
+        logger.info("Starting prediction process")
+
+        if file_obj is None:
+            return "Please upload a FASTA file"
+
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        logger.info(f"Using device: {device}")
+        k = 4

-        #
-
-
-
+        # Load model and scaler
+        try:
+            logger.info("Loading model and scaler")
+            model = VirusClassifier(256).to(device)  # 256 = 4^4 for 4-mers
+            model.load_state_dict(torch.load('model.pt', map_location=device))
+            scaler = joblib.load('scaler.pkl')
+            model.eval()
+        except Exception as e:
+            logger.error(f"Error loading model or scaler: {str(e)}")
+            return f"Error loading model: {str(e)}"
+
+        # Process sequences
+        try:
+            sequences = parse_fasta(file_obj)
+        except Exception as e:
+            logger.error(f"Error parsing FASTA file: {str(e)}")
+            return f"Error parsing FASTA file: {str(e)}"

-
-        pred_class = 1 if probs[0][1] > probs[0][0] else 0
-        pred_label = 'human' if pred_class == 1 else 'non-human'
+        results = []

-
+        for header, seq in sequences:
+            logger.info(f"Processing sequence: {header}")
+            try:
+                # Convert sequence to k-mer vector
+                kmer_vector = sequence_to_kmer_vector(seq, k)
+                kmer_vector = scaler.transform(kmer_vector.reshape(1, -1))
+
+                # Get prediction
+                with torch.no_grad():
+                    output = model(torch.FloatTensor(kmer_vector).to(device))
+                    probs = torch.softmax(output, dim=1)
+
+                # Format result
+                pred_class = 1 if probs[0][1] > probs[0][0] else 0
+                pred_label = 'human' if pred_class == 1 else 'non-human'
+
+                result = f"""
 Sequence: {header}
 Prediction: {pred_label}
 Confidence: {float(max(probs[0])):0.4f}
 Human probability: {float(probs[0][1]):0.4f}
 Non-human probability: {float(probs[0][0]):0.4f}
 """
-
-
-
+                results.append(result)
+                logger.info(f"Processed sequence {header} successfully")
+
+            except Exception as e:
+                logger.error(f"Error processing sequence {header}: {str(e)}")
+                results.append(f"Error processing sequence {header}: {str(e)}")
+
+        return "\n".join(results)
+
+    except Exception as e:
+        logger.error(f"Unexpected error in predict_sequence: {str(e)}")
+        return f"An unexpected error occurred: {str(e)}"

 # Create Gradio interface
 iface = gr.Interface(
     fn=predict_sequence,
     inputs=gr.File(label="Upload FASTA file", file_types=[".fasta", ".fa", ".txt"]),
-    outputs=gr.Textbox(label="Prediction Results"),
+    outputs=gr.Textbox(label="Prediction Results", lines=10),
     title="Virus Host Classifier",
     description="Upload a FASTA file to predict whether a virus sequence is likely to infect human or non-human hosts.",
     examples=[["example.fasta"]],