Spaces:
Runtime error
Runtime error
model added
Browse files- .gitignore +4 -0
- app.py +42 -0
- backend/LabelE.joblib +3 -0
- backend/LabelESequence.joblib +3 -0
- backend/LabelETargeted.joblib +3 -0
- backend/__pycache__/gettingFromModel.cpython-39.pyc +0 -0
- backend/__pycache__/labelingInput.cpython-39.pyc +0 -0
- backend/gettingFromModel.py +30 -0
- backend/labelingInput.py +215 -0
- backend/xgboost_model.joblib +3 -0
- frontend/__pycache__/gettingInput.cpython-39.pyc +0 -0
- frontend/__pycache__/instructions.cpython-39.pyc +0 -0
- frontend/__pycache__/selectingSeqType.cpython-39.pyc +0 -0
- frontend/gettingInput.py +7 -0
- frontend/instructions.py +17 -0
- frontend/selectingSeqType.py +5 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ExtraTreelasifier.py
|
2 |
+
GBoosting.py
|
3 |
+
ExtraData1.csv
|
4 |
+
saving model.py
|
app.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Streamlit entry point: collect a sequence, derive features, show the prediction."""
import time

import joblib
import pandas as pd
import streamlit as st

import frontend.gettingInput as gi
import frontend.instructions as fi
import frontend.selectingSeqType as sst
from backend import gettingFromModel
from backend import labelingInput
from backend.labelingInput import returnValues

# --- Page layout (widgets render in call order) ------------------------------
fi.displayInstructionSection()
# NOTE: seqType and selectingView are collected but never read further down in
# this script; the widgets are kept so the page looks the same as before.
seqType = sst.selecType()
sequence = gi.gettingInput()
selectingView = st.selectbox("Select View", ["Table View", "Simple View"])

# --- Serialized artifacts -----------------------------------------------------
modelPath = "backend/xgboost_model.joblib"
LabelETargetedPath = "backend/LabelETargeted.joblib"
LabelESequencePath = "backend/LabelESequence.joblib"

predict = st.button('Predict')
if predict:
    if not sequence:
        st.error('No sequence: Please Provide Sequence First')
    else:
        # Feature extraction raises on residues outside the 20 standard amino
        # acids (KeyError from the lookup tables, ValueError from Biopython's
        # molecular weight). Report that to the user instead of a traceback.
        feature_error = None
        data = None
        with st.spinner("getting values..."):
            try:
                labelingInput.giveValues(sequence)
                data = returnValues(len(sequence), sequence)
            except (KeyError, ValueError) as err:
                feature_error = err
        if feature_error is not None:
            st.error(f"Could not compute features for this sequence: {feature_error}")
            st.stop()

        st.success("Generated...")
        for key, values in data.items():
            st.code(f"{key}: {values[0]}")

        prediction_error = None
        response = None
        with st.spinner("Loading model..."):
            model = joblib.load(modelPath)
            LabelT = joblib.load(LabelETargetedPath)
            LabelS = joblib.load(LabelESequencePath)
            try:
                response = gettingFromModel.getResponse(data, model, LabelT, LabelS)
            except ValueError as err:
                # e.g. the sequence encoder rejecting a label it has not seen.
                prediction_error = err
        if prediction_error is not None:
            st.error(f"Prediction failed: {prediction_error}")
            st.stop()

        st.info(f'Predicted, it is "{response[0].upper()}"')
        time.sleep(1)
        # getResponse overwrites data["Sequence"] with its encoded value, so the
        # chart receives that encoded column rather than the raw string.
        st.bar_chart(data)
|
backend/LabelE.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:301509c6a5c33ab86bfc13742a2bcb795864366a157baea2c955b54f9db7883b
|
3 |
+
size 1830380
|
backend/LabelESequence.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:301509c6a5c33ab86bfc13742a2bcb795864366a157baea2c955b54f9db7883b
|
3 |
+
size 1830380
|
backend/LabelETargeted.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a2f5afaa82a17b6d6fbf0ffc942210ceb13fbdf23297d224dff04a88ddaed455
|
3 |
+
size 551
|
backend/__pycache__/gettingFromModel.cpython-39.pyc
ADDED
Binary file (703 Bytes). View file
|
|
backend/__pycache__/labelingInput.cpython-39.pyc
ADDED
Binary file (6.48 kB). View file
|
|
backend/gettingFromModel.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datafit.datafit as df
|
2 |
+
def getResponse(data, model, LabelT, LabelS):
    """Encode, normalize and classify a feature frame, returning decoded labels.

    Parameters:
    - data: feature frame with a "Sequence" column. The column is overwritten
      in place with ``LabelS.transform`` output, so the caller's object changes.
    - model: object exposing ``predict``.
    - LabelT: encoder whose ``inverse_transform`` maps predictions to labels.
    - LabelS: encoder whose ``transform`` is applied to the "Sequence" column.

    Returns:
    - The result of ``LabelT.inverse_transform`` on the integer-cast predictions.

    NOTE(review): presumably both encoders are fitted scikit-learn LabelEncoders;
    if so, ``LabelS.transform`` raises ValueError for any sequence that was not
    present when the encoder was fitted. Confirm against the training pipeline.
    """
    def _dump(title, value):
        # Diagnostic output, written to stdout as a title line plus the value.
        print(title)
        print(value)

    print(data.columns)

    # In-place replacement of the raw sequence by its encoded form.
    data["Sequence"] = LabelS.transform(data["Sequence"])

    # Only the first element of the normalization result is used.
    normalized, _ = df.normalization(data)

    raw_predictions = model.predict(normalized)
    _dump("Raw Predictions:", raw_predictions)

    # The decoder is fed integer class ids.
    class_ids = raw_predictions.astype(int)
    _dump("Predicted Labels:", class_ids)

    decoded = LabelT.inverse_transform(class_ids)
    _dump("Inverse Transformed Labels:", decoded)

    return decoded
|
backend/labelingInput.py
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#==================================================================================================#
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
from sklearn.preprocessing import LabelEncoder
|
5 |
+
from Bio.SeqUtils import ProtParam
|
6 |
+
import streamlit as st
|
7 |
+
from Bio.SeqUtils import IsoelectricPoint
|
8 |
+
# Module-level sequence shared by the feature functions below. It starts empty
# and is expected to be set through giveValues() before returnValues() runs.
seq = ""


def giveValues(seq1):
    """Store *seq1* as the module-level sequence read by returnValues().

    Parameters:
    - seq1 (str): Protein sequence.
    """
    global seq
    seq = seq1
|
13 |
+
|
14 |
+
def structure(seq):
    """Return the three secondary-structure fractions Biopython computes for *seq*.

    The values are passed through in the order the library yields them; callers
    in this module label them (alpha_helix, beta_sheet, turn).

    NOTE(review): Biopython documents the order of secondary_structure_fraction()
    as (helix, turn, sheet), which would mean the 2nd and 3rd labels are swapped.
    Confirm against the data the model was trained on before changing anything.
    """
    analysis = ProtParam.ProteinAnalysis(seq)
    first, second, third = analysis.secondary_structure_fraction()
    return first, second, third
|
17 |
+
|
18 |
+
def calculate_net_charge(protein_sequence):
    """Return the integer net charge of a sequence using fixed unit charges.

    R and K count as +1, D and E as -1; every other symbol (including ones
    that are not amino-acid letters) contributes 0.

    Parameters:
    - protein_sequence (str): Protein sequence.

    Returns:
    - int: Sum of the per-residue charges.
    """
    positive = frozenset("RK")
    negative = frozenset("DE")

    total = 0
    for residue in protein_sequence:
        if residue in positive:
            total += 1
        elif residue in negative:
            total -= 1
    return total
|
31 |
+
|
32 |
+
def calculate_electric_potential(protein_sequence, pH=7.0):
    """Sum a pH-dependent term over the residues of a sequence.

    For each residue with a table entry, the term
    ``10**(pH - pKa) / (1 + 10**(pH - pKa))`` is added; symbols without an
    entry add nothing.

    Parameters:
    - protein_sequence (str): Protein sequence.
    - pH (float): pH used in the per-residue term (default 7.0).

    Returns:
    - float: Accumulated value (0.0 for an empty sequence).
    """
    pka_by_residue = {
        'A': 2.35, 'C': 8.3, 'D': 3.9, 'E': 4.2, 'F': 5.5,
        'G': 2.3, 'H': 6.0, 'I': 6.0, 'K': 10.5, 'L': 6.0,
        'M': 5.7, 'N': 7.0, 'P': 6.3, 'Q': 9.1, 'R': 12.0,
        'S': 2.2, 'T': 2.6, 'V': 6.0, 'W': 5.9, 'Y': 5.7,
    }

    total = 0.0
    # Plain left-to-right accumulation keeps the floating-point result stable.
    for residue in protein_sequence:
        pka = pka_by_residue.get(residue)
        if pka is None:
            continue
        ratio = 10 ** (pH - pka)
        total += ratio / (1 + ratio)
    return total
|
52 |
+
|
53 |
+
def molWeight(seq):
    """
    Return the molecular weight Biopython reports for a protein sequence.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - float: Value of ProteinAnalysis(seq).molecular_weight().
    """
    analysis = ProtParam.ProteinAnalysis(seq)
    return analysis.molecular_weight()
|
65 |
+
|
66 |
+
def hydrophobicityValues(seq):
    """
    Return the summed per-residue hydrophobicity of a protein sequence.

    The result is a plain sum over residues, not an average.

    Parameters:
    - seq (str): Protein sequence made of the 20 standard one-letter codes.

    Returns:
    - float: Sum of the table values (0 for an empty sequence).

    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    scale = {
        'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5, 'F': 2.8,
        'G': -0.4, 'H': -3.2, 'I': 4.5, 'K': -3.9, 'L': 3.8,
        'M': 1.9, 'N': -3.5, 'P': -1.6, 'Q': -3.5, 'R': -4.5,
        'S': -0.8, 'T': -0.7, 'V': 4.2, 'W': -0.9, 'Y': -1.3,
    }
    return sum(map(scale.__getitem__, seq))
|
84 |
+
|
85 |
+
|
86 |
+
|
87 |
+
|
88 |
+
def helix_propensity_score(sequence):
    """
    Return the summed helix-propensity value over a protein sequence.

    Parameters:
    - sequence (str): Protein sequence made of the 20 standard one-letter codes.

    Returns:
    - float: Sum of the per-residue table values (0 for an empty sequence).

    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    propensity = {
        'A': 1.2, 'C': 0.8, 'D': 0.7, 'E': 0.7, 'F': 1.1,
        'G': 0.9, 'H': 1.0, 'I': 1.1, 'K': 1.3, 'L': 1.0,
        'M': 1.2, 'N': 0.9, 'P': 0.7, 'Q': 1.0, 'R': 1.2,
        'S': 0.8, 'T': 0.9, 'V': 1.1, 'W': 1.3, 'Y': 1.1,
    }
    per_residue = [propensity[residue] for residue in sequence]
    return sum(per_residue)
|
106 |
+
|
107 |
+
def abundance_dispersion(sequence):
    """
    Count the residues E, F, Y and W in a sequence and measure their spread.

    Parameters:
    - sequence (str): Protein sequence.

    Returns:
    - tuple: (abundance, dispersion). Abundance is the number of E/F/Y/W
      residues. Dispersion is the sample standard deviation (n - 1
      denominator) of their 0-based positions, or 0 when fewer than two occur.
    """
    targets = ('E', 'F', 'Y', 'W')
    abundance = sum(map(sequence.count, targets))

    # A spread needs at least two positions.
    if abundance <= 1:
        return abundance, 0

    hits = [index for index, residue in enumerate(sequence) if residue in targets]
    centre = sum(hits) / abundance
    variance = sum((index - centre) ** 2 for index in hits) / (abundance - 1)
    return abundance, variance ** 0.5
|
129 |
+
|
130 |
+
def BSpropensity_score(sequence):
    """
    Return the summed beta-strand propensity value over a protein sequence.

    Parameters:
    - sequence (str): Protein sequence made of the 20 standard one-letter codes.

    Returns:
    - float: Sum of the per-residue table values (0 for an empty sequence).

    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    strand_table = {
        'A': 0.7, 'C': 1.0, 'D': 0.9, 'E': 1.1, 'F': 1.1,
        'G': 0.5, 'H': 0.8, 'I': 0.6, 'K': 1.2, 'L': 0.7,
        'M': 0.5, 'N': 1.1, 'P': 1.0, 'Q': 0.9, 'R': 0.8,
        'S': 0.6, 'T': 0.7, 'V': 0.8, 'W': 1.3, 'Y': 1.2,
    }
    return sum(map(strand_table.__getitem__, sequence))
|
148 |
+
|
149 |
+
def getamino_acid(seq):
    """Return Biopython's per-amino-acid composition mapping for *seq*.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - The mapping produced by ProteinAnalysis(seq).get_amino_acids_percent().
    """
    analysis = ProtParam.ProteinAnalysis(seq)
    return analysis.get_amino_acids_percent()
|
152 |
+
|
153 |
+
|
154 |
+
def returnValues(seqLen, seq_unce):
    """
    Build the one-row feature frame for the sequence stored by giveValues().

    All numeric features are computed from the module-level ``seq``;
    ``seq_unce`` is only copied into the "Sequence" column.

    Parameters:
    - seqLen: Sequence length. Accepted for compatibility; not used, because
      the "Sequence Length" feature is disabled.
    - seq_unce (str): Sequence text stored in the "Sequence" column.

    Returns:
    - pd.DataFrame: Single-row frame. Columns, in order: Sequence, net_charge,
      isoelectric point, alpha_helix, beta_sheet, turn, Molecular Weight,
      percent_A ... percent_Y, Hydrophobicity, Electrical Potential,
      Abundance, Dispersion.

    Raises:
    - ValueError: If no sequence has been set (``seq`` is empty or None).
    """
    # Validate before computing anything: the module default is "" (not None),
    # and the feature helpers cannot work on an empty sequence.
    if not seq:
        raise ValueError("Protein sequence 'seq' is not provided. Use giveValues() to set the sequence.")

    alpha_helix, beta_sheet, turn = structure(seq)
    percents = getamino_acid(seq)
    # One call yields both values.
    abundance, dispersion = abundance_dispersion(seq)

    features = {
        "Sequence": [seq_unce],
        "net_charge": [calculate_net_charge(seq)],
        "isoelectric point": [IsoelectricPoint.IsoelectricPoint(seq).pi()],
        'alpha_helix': [alpha_helix],
        'beta_sheet': [beta_sheet],
        'turn': [turn],
        "Molecular Weight": [molWeight(seq)],
    }
    # Look each composition value up by its letter instead of relying on the
    # iteration order of the mapping; the column order stays A..Y as before.
    for letter in "ACDEFGHIKLMNPQRSTVWY":
        features[f"percent_{letter}"] = [percents[letter]]
    features["Hydrophobicity"] = [hydrophobicityValues(seq)]
    features["Electrical Potential"] = [calculate_electric_potential(seq)]
    features["Abundance"] = [abundance]
    features["Dispersion"] = [dispersion]
    # Disabled features: "Sequence Length", "Helix Propensity Score" and
    # "Beta strand propensity values" are intentionally not emitted.

    return pd.DataFrame(features)
|
211 |
+
|
212 |
+
# Example of how to use the functions:
|
213 |
+
# giveValues("YOUR_PROTEIN_SEQUENCE_HERE")
|
214 |
+
# data = returnValues(len("YOUR_PROTEIN_SEQUENCE_HERE"), "YOUR_PROTEIN_SEQUENCE_HERE")
|
215 |
+
# print(data)
|
backend/xgboost_model.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:65a3b30630861e9c72c2d21d26586cca14c31871d116235bd223ed3d014edd22
|
3 |
+
size 2401574
|
frontend/__pycache__/gettingInput.cpython-39.pyc
ADDED
Binary file (389 Bytes). View file
|
|
frontend/__pycache__/instructions.cpython-39.pyc
ADDED
Binary file (1.2 kB). View file
|
|
frontend/__pycache__/selectingSeqType.cpython-39.pyc
ADDED
Binary file (367 Bytes). View file
|
|
frontend/gettingInput.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
def gettingInput():
    """Render the sequence text area and return its cleaned content.

    Newline characters are removed first, then leading and trailing
    whitespace is stripped.

    Returns:
    - str: The cleaned text (empty string when nothing was entered).
    """
    raw_text = st.text_area("Enter the sequence")
    return raw_text.replace("\n", '').strip()
|
frontend/instructions.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
|
4 |
+
def displayInstructionSection():
    """Render the page title, the introduction and the usage steps."""
    intro = "Allerginity Prediction of protien tools is specialy desinged for to find that the protien is allergin or nor, the machine learning model is specaily designed for a protien sequence, through this the model is able to predict its status"
    steps = """
- Select which type of seq you want to predict for
- Enter the sequnce in the input box
- Click the predict button
- wait for the magic
- and boom, you got the result
"""
    accuracy_note = "There are many tools and servers for rediction the alergenity status for sequences you want to predict, but this tool gives you 94% accuracy."
    docs_pointer = "you can find the official documentation in the github repository here: http://github.com/syabahmad/alernonpred"

    # Elements appear on the page in call order.
    st.title("Alerginity Prediction of protien")
    st.header("Instructions")
    st.write(intro)
    st.markdown(steps)
    st.write(accuracy_note)
    st.write(docs_pointer)
|
frontend/selectingSeqType.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
def selecType():
    """Render the sequence-type select box and return the chosen option.

    Returns:
    - str: Either "Protien" or "DNA/RNA", as offered by the widget.
    """
    return st.selectbox("Select Seq Type", ["Protien", "DNA/RNA"])
|