Spaces:

SyedSyab
/

AllergenePrediction

Runtime error

App Files Files Community

SyedSyab committed on Feb 29

Commit

d4a5429

•

1 Parent(s): d9199e0

model added

Browse files

Files changed (16) hide show

.gitignore +4 -0
app.py +42 -0
backend/LabelE.joblib +3 -0
backend/LabelESequence.joblib +3 -0
backend/LabelETargeted.joblib +3 -0
backend/__pycache__/gettingFromModel.cpython-39.pyc +0 -0
backend/__pycache__/labelingInput.cpython-39.pyc +0 -0
backend/gettingFromModel.py +30 -0
backend/labelingInput.py +215 -0
backend/xgboost_model.joblib +3 -0
frontend/__pycache__/gettingInput.cpython-39.pyc +0 -0
frontend/__pycache__/instructions.cpython-39.pyc +0 -0
frontend/__pycache__/selectingSeqType.cpython-39.pyc +0 -0
frontend/gettingInput.py +7 -0
frontend/instructions.py +17 -0
frontend/selectingSeqType.py +5 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+ExtraTreelasifier.py
+GBoosting.py
+ExtraData1.csv
+saving model.py

app.py ADDED Viewed

	@@ -0,0 +1,42 @@

+# Streamlit entry point: renders the instruction and input widgets, then on
+# "Predict" builds the feature table for the entered sequence and shows the
+# model's prediction.
+import streamlit as st
+import pandas as pd
+import joblib
+import time
+# Project modules. Each frontend helper is called right after its import, so
+# the widgets are created in this order: instructions, sequence-type selector,
+# sequence text area.
+import frontend.instructions as fi
+fi.displayInstructionSection()
+import frontend.selectingSeqType as sst
+# NOTE(review): seqType is not read again anywhere in this script.
+seqType = sst.selecType()
+import frontend.gettingInput as gi
+sequence = gi.gettingInput()
+from backend.labelingInput import returnValues
+from backend import labelingInput
+from backend import gettingFromModel
+# View selector and the paths of the serialized model / label encoders.
+# NOTE(review): selectingView is not read again anywhere in this script.
+selectingView = st.selectbox("Select View",["Table View","Simple View"])
+modelPath = "backend/xgboost_model.joblib"
+LabelETargetedPath = "backend/LabelETargeted.joblib"
+LabelESequencePath = "backend/LabelESequence.joblib"
+# Prediction is only attempted when the button is pressed.
+predict = st.button('Predict')
+if predict:
+    if sequence == "":
+        st.error('No sequence: Please Provide Sequence First')
+    else:
+        with st.spinner("getting values..."):
+            # returnValues() computes its features from the module-level
+            # sequence stored by giveValues(), so giveValues() must run first.
+            labelingInput.giveValues(sequence)
+            data = returnValues(len(sequence), sequence)
+            print(data.columns)
+            st.success("Generated...")
+            # Show the value in row 0 of every feature column.
+            for key,values in data.items():
+                    st.code(f"{key}: {values[0]}")
+            with st.spinner("Loading model..."):
+                # The model and both encoders are loaded from disk on every click.
+                model = joblib.load(modelPath)
+                LabelT = joblib.load(LabelETargetedPath)
+                LabelS = joblib.load(LabelESequencePath)
+                response = gettingFromModel.getResponse(data,model,LabelT,LabelS)
+                st.info(f'Predicted, it is "{response[0].upper()}"')
+                time.sleep(1)
+                # NOTE(review): getResponse() overwrites data["Sequence"] in
+                # place with its encoded value, so the chart below plots that
+                # encoded column as well - confirm this is intended.
+                st.bar_chart(data)

backend/LabelE.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:301509c6a5c33ab86bfc13742a2bcb795864366a157baea2c955b54f9db7883b
+size 1830380

backend/LabelESequence.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:301509c6a5c33ab86bfc13742a2bcb795864366a157baea2c955b54f9db7883b
+size 1830380

backend/LabelETargeted.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2f5afaa82a17b6d6fbf0ffc942210ceb13fbdf23297d224dff04a88ddaed455
+size 551

backend/__pycache__/gettingFromModel.cpython-39.pyc ADDED Viewed

Binary file (703 Bytes). View file

backend/__pycache__/labelingInput.cpython-39.pyc ADDED Viewed

Binary file (6.48 kB). View file

backend/gettingFromModel.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import datafit.datafit as df
+def getResponse(data, model, LabelT, LabelS):
+    """
+    Predict the label for a feature table and map it back to its original name.
+    Parameters:
+    - data (pd.DataFrame): Feature table with a "Sequence" column. That column is
+      overwritten in place with its encoded value.
+    - model: Fitted estimator exposing predict().
+    - LabelT: Fitted encoder whose inverse_transform() decodes the predictions.
+    - LabelS: Fitted encoder whose transform() encodes the "Sequence" column.
+    Returns:
+    - The output of LabelT.inverse_transform() for the integer predictions.
+    """
+    print(data.columns)
+    # Encode the raw sequence text; this mutates the caller's frame.
+    data["Sequence"] = LabelS.transform(data["Sequence"])
+    # df.normalization() returns a pair; only its first item is used here.
+    normalized, _unused = df.normalization(data)
+    raw_predictions = model.predict(normalized)
+    print("Raw Predictions:")
+    print(raw_predictions)
+    # Cast to int before handing the values to the target encoder.
+    label_ids = raw_predictions.astype(int)
+    print("Predicted Labels:")
+    print(label_ids)
+    decoded = LabelT.inverse_transform(label_ids)
+    print("Inverse Transformed Labels:")
+    print(decoded)
+    return decoded

backend/labelingInput.py ADDED Viewed

	@@ -0,0 +1,215 @@

+#==================================================================================================#
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+from Bio.SeqUtils import ProtParam
+import streamlit as st
+from Bio.SeqUtils import IsoelectricPoint
+# Module-level sequence read by returnValues(); stays empty until giveValues() is called.
+seq = ""
+def giveValues(seq1):
+    """
+    Store the protein sequence that returnValues() will analyse.
+    Parameters:
+    - seq1 (str): Protein sequence.
+    """
+    global seq
+    seq = seq1
+def structure(seq):
+    """
+    Get the secondary structure fractions of a protein sequence.
+    Parameters:
+    - seq (str): Protein sequence.
+    Returns:
+    - tuple: The three values from ProteinAnalysis.secondary_structure_fraction(),
+      in the order that method returns them.
+    """
+    # NOTE(review): Biopython documents this tuple as (helix, turn, sheet), while
+    # callers unpack it as (alpha_helix, beta_sheet, turn) - confirm which order
+    # the model was trained with before changing anything.
+    fractions = ProtParam.ProteinAnalysis(seq).secondary_structure_fraction()
+    first, second, third = fractions
+    return first, second, third
+def calculate_net_charge(protein_sequence):
+    """
+    Calculate the net charge of a protein sequence.
+    Parameters:
+    - protein_sequence (str): Protein sequence.
+    Returns:
+    - int: Number of R/K residues minus number of D/E residues.
+    """
+    positive_residues = ('R', 'K')
+    negative_residues = ('D', 'E')
+    net_charge = 0
+    for residue in protein_sequence:
+        if residue in positive_residues:
+            net_charge += 1
+        elif residue in negative_residues:
+            net_charge -= 1
+        # Every other character, known or not, counts as neutral.
+    return net_charge
+def calculate_electric_potential(protein_sequence, pH=7.0):
+    """
+    Sum a pH-dependent per-residue term over a protein sequence.
+    Parameters:
+    - protein_sequence (str): Protein sequence.
+    - pH (float): pH at which each term is evaluated (default 7.0).
+    Returns:
+    - float: Sum of 10**(pH - pKa) / (1 + 10**(pH - pKa)) over the residues found
+      in the pKa table; characters missing from the table add nothing.
+    """
+    pka_table = {
+        'A': 2.35, 'R': 12.0, 'N': 7.0, 'D': 3.9, 'C': 8.3,
+        'Q': 9.1, 'E': 4.2, 'G': 2.3, 'H': 6.0, 'I': 6.0,
+        'L': 6.0, 'K': 10.5, 'M': 5.7, 'F': 5.5, 'P': 6.3,
+        'S': 2.2, 'T': 2.6, 'W': 5.9, 'Y': 5.7, 'V': 6.0
+    }
+    total = 0.0
+    for residue in protein_sequence:
+        pka = pka_table.get(residue)
+        if pka is None:
+            continue
+        ratio = 10 ** (pH - pka)
+        total += ratio / (1 + ratio)
+    return total
+def molWeight(seq):
+    """
+    Calculate the molecular weight of a protein sequence.
+    Parameters:
+    - seq (str): Protein sequence.
+    Returns:
+    - The value returned by ProteinAnalysis(seq).molecular_weight().
+    """
+    analysis = ProtParam.ProteinAnalysis(seq)
+    return analysis.molecular_weight()
+def hydrophobicityValues(seq):
+    """
+    Add up the per-residue hydrophobicity values of a protein sequence.
+    Parameters:
+    - seq (str): Protein sequence.
+    Returns:
+    - float: Sum of the table values for every residue (0 for an empty sequence).
+    Raises:
+    - KeyError: If the sequence contains a character missing from the table.
+    """
+    scale = {
+        'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
+        'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
+        'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
+        'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2
+    }
+    per_residue = [scale[residue] for residue in seq]
+    return sum(per_residue)
+def helix_propensity_score(sequence):
+    """
+    Add up the per-residue helix propensity values of a protein sequence.
+    Parameters:
+    - sequence (str): Protein sequence.
+    Returns:
+    - float: Sum of the table values for every residue (0 for an empty sequence).
+    Raises:
+    - KeyError: If the sequence contains a character missing from the table.
+    """
+    propensity = {
+        'A': 1.2, 'C': 0.8, 'E': 0.7, 'G': 0.9, 'H': 1.0,
+        'I': 1.1, 'K': 1.3, 'L': 1.0, 'M': 1.2, 'N': 0.9,
+        'Q': 1.0, 'R': 1.2, 'S': 0.8, 'T': 0.9, 'V': 1.1,
+        'W': 1.3, 'Y': 1.1, 'P': 0.7, 'F': 1.1, 'D': 0.7
+    }
+    # Named `total` so the local does not shadow the function itself.
+    total = sum(map(propensity.__getitem__, sequence))
+    return total
+def abundance_dispersion(sequence):
+    """
+    Count the E/F/Y/W residues of a sequence and measure how spread out they are.
+    Parameters:
+    - sequence (str): Protein sequence.
+    Returns:
+    - tuple: (abundance, dispersion). abundance is the number of E/F/Y/W residues;
+      dispersion is the sample standard deviation of their 0-based positions, or
+      0 when fewer than two such residues are present.
+    """
+    tracked = ('E', 'F', 'Y', 'W')
+    hits = [index for index, residue in enumerate(sequence) if residue in tracked]
+    abundance = len(hits)
+    if abundance <= 1:
+        # A spread cannot be computed from fewer than two positions.
+        return abundance, 0
+    center = sum(hits) / abundance
+    squared_deviations = sum((index - center) ** 2 for index in hits)
+    # Divide by n - 1 (sample variance) before taking the square root.
+    variance = squared_deviations / (abundance - 1)
+    return abundance, variance ** 0.5
+def BSpropensity_score(sequence):
+    """
+    Add up the per-residue beta strand propensity values of a protein sequence.
+    Parameters:
+    - sequence (str): Protein sequence.
+    Returns:
+    - float: Sum of the table values for every residue (0 for an empty sequence).
+    Raises:
+    - KeyError: If the sequence contains a character missing from the table.
+    """
+    propensity = {
+        'A': 0.7, 'C': 1.0, 'E': 1.1, 'G': 0.5, 'H': 0.8,
+        'I': 0.6, 'K': 1.2, 'L': 0.7, 'M': 0.5, 'N': 1.1,
+        'Q': 0.9, 'R': 0.8, 'S': 0.6, 'T': 0.7, 'V': 0.8,
+        'W': 1.3, 'Y': 1.2, 'P': 1.0, 'F': 1.1, 'D': 0.9
+    }
+    contributions = [propensity[residue] for residue in sequence]
+    return sum(contributions)
+def getamino_acid(seq):
+    """
+    Get the per-amino-acid composition of a protein sequence.
+    Parameters:
+    - seq (str): Protein sequence.
+    Returns:
+    - The mapping returned by ProteinAnalysis(seq).get_amino_acids_percent().
+    """
+    return ProtParam.ProteinAnalysis(seq).get_amino_acids_percent()
+def returnValues(seqLen,seq_unce):
+    """
+    Return the feature table for the sequence previously stored with giveValues().
+    Parameters:
+    - seqLen (int): Sequence length. Not used at present; kept so existing callers keep working.
+    - seq_unce (str): Sequence text stored verbatim in the "Sequence" column.
+    Returns:
+    - pd.DataFrame: One row holding the sequence and its computed features.
+    Raises:
+    - ValueError: If no sequence has been stored (None or empty).
+    """
+    global seq
+    # Validate before any computation: the features below all read `seq`, so an
+    # unset sequence must be rejected here rather than fail inside a helper.
+    if not seq:
+        raise ValueError("Protein sequence 'seq' is not provided. Use giveValues() to set the sequence.")
+    alpha_helix, beta_sheet, turn = structure(seq)
+    amino_acid_percent = getamino_acid(seq)
+    # Compute once and unpack instead of running the helper twice.
+    abundance, dispersion = abundance_dispersion(seq)
+    features = {
+        "Sequence": [seq_unce],
+        # "Sequence Length": [seqLen],
+        "net_charge": [calculate_net_charge(seq)],
+        "isoelectric point": [IsoelectricPoint.IsoelectricPoint(seq).pi()],
+        'alpha_helix': [alpha_helix],
+        'beta_sheet': [beta_sheet],
+        'turn': [turn],
+        "Molecular Weight": [molWeight(seq)],
+    }
+    # Look each value up by residue letter so the columns do not depend on the
+    # iteration order of the returned mapping. Column order is unchanged.
+    for residue in "ACDEFGHIKLMNPQRSTVWY":
+        features[f"percent_{residue}"] = [amino_acid_percent[residue]]
+    features["Hydrophobicity"] = [hydrophobicityValues(seq)]
+    features["Electrical Potential"] = [calculate_electric_potential(seq)]
+    features["Abundance"] = [abundance]
+    features["Dispersion"] = [dispersion]
+    # "Helix Propensity Score": helix_propensity_score(seq),
+    # "Beta strand propensity values": BSpropensity_score(seq),
+    return pd.DataFrame(features)
+# Example of how to use the functions:
+# giveValues("YOUR_PROTEIN_SEQUENCE_HERE")
+# data = returnValues(len("YOUR_PROTEIN_SEQUENCE_HERE"), "YOUR_PROTEIN_SEQUENCE_HERE")
+# print(data)

backend/xgboost_model.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:65a3b30630861e9c72c2d21d26586cca14c31871d116235bd223ed3d014edd22
+size 2401574

frontend/__pycache__/gettingInput.cpython-39.pyc ADDED Viewed

Binary file (389 Bytes). View file

frontend/__pycache__/instructions.cpython-39.pyc ADDED Viewed

Binary file (1.2 kB). View file

frontend/__pycache__/selectingSeqType.cpython-39.pyc ADDED Viewed

Binary file (367 Bytes). View file

frontend/gettingInput.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import streamlit as st
+def gettingInput():
+    """
+    Show the sequence text area and return the cleaned-up input.
+    Returns:
+    - str: The entered text with all whitespace removed and letters upper-cased;
+      "" when nothing was entered.
+    """
+    raw = st.text_area("Enter the sequence")
+    # split() with no argument breaks on every run of whitespace (spaces, tabs,
+    # "\r", "\n"), so multi-line or space-grouped pastes collapse to one string.
+    compact = "".join(raw.split())
+    # Upper-case so the input matches residue tables keyed by upper-case letters.
+    return compact.upper()

frontend/instructions.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import streamlit as st
+def displayInstructionSection():
+    """
+    Render the page title, the introduction and the step-by-step instructions.
+    """
+    st.title("Allergenicity Prediction of Protein")
+    st.header("Instructions")
+    st.write("The Allergenicity Prediction of Protein tool is specially designed to find out whether a protein is an allergen or not. The machine learning model is specially designed for protein sequences, and through this the model is able to predict the status of a sequence.")
+    st.markdown("""
+            - Select which type of sequence you want to predict for
+            - Enter the sequence in the input box
+            - Click the predict button
+            - Wait for the magic
+            - And boom, you got the result
+            """)
+    st.write("There are many tools and servers for predicting the allergenicity status of the sequences you want to predict, but this tool gives you 94% accuracy.")
+    st.write("You can find the official documentation in the GitHub repository here: http://github.com/syabahmad/alernonpred")

frontend/selectingSeqType.py ADDED Viewed

	@@ -0,0 +1,5 @@

+import streamlit as st
+def selecType():
+    """
+    Show the sequence-type selector.
+    Returns:
+    - str: The chosen option, either "Protien" or "DNA/RNA".
+    """
+    return st.selectbox("Select Seq Type",["Protien", "DNA/RNA"])