SyedSyab commited on
Commit
d4a5429
1 Parent(s): d9199e0

model added

Browse files
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ ExtraTreelasifier.py
2
+ GBoosting.py
3
+ ExtraData1.csv
4
+ saving model.py
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit page: read a sequence, derive its features and show the model's prediction."""
import time

import joblib
import pandas as pd
import streamlit as st

import frontend.gettingInput as gi
import frontend.instructions as fi
import frontend.selectingSeqType as sst
from backend import gettingFromModel
from backend import labelingInput
from backend.labelingInput import returnValues

# Serialized model and label encoders, given relative to the working directory.
modelPath = "backend/xgboost_model.joblib"
LabelETargetedPath = "backend/LabelETargeted.joblib"
LabelESequencePath = "backend/LabelESequence.joblib"

# The widgets are created in the same order as before: instructions,
# sequence type, sequence input, view selector, predict button.
fi.displayInstructionSection()
seqType = sst.selecType()  # not used further down in this script
sequence = gi.gettingInput()
selectingView = st.selectbox("Select View", ["Table View", "Simple View"])  # not used further down
predict = st.button('Predict')

if predict:
    if not sequence:
        st.error('No sequence: Please Provide Sequence First')
    else:
        try:
            with st.spinner("getting values..."):
                # returnValues() reads the sequence stored by giveValues().
                labelingInput.giveValues(sequence)
                data = returnValues(len(sequence), sequence)
            st.success("Generated...")
            for key, values in data.items():
                st.code(f"{key}: {values[0]}")
            with st.spinner("Loading model..."):
                model = joblib.load(modelPath)
                LabelT = joblib.load(LabelETargetedPath)
                LabelS = joblib.load(LabelESequencePath)
                response = gettingFromModel.getResponse(data, model, LabelT, LabelS)
        except (KeyError, ValueError) as err:
            # Residue-table lookups raise KeyError and returnValues() raises
            # ValueError; report them in the page instead of a raw traceback.
            st.error(f"Could not process the sequence: {err}")
        else:
            st.info(f'Predicted, it is "{response[0].upper()}"')
            time.sleep(1)
            st.bar_chart(data)
backend/LabelE.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:301509c6a5c33ab86bfc13742a2bcb795864366a157baea2c955b54f9db7883b
3
+ size 1830380
backend/LabelESequence.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:301509c6a5c33ab86bfc13742a2bcb795864366a157baea2c955b54f9db7883b
3
+ size 1830380
backend/LabelETargeted.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2f5afaa82a17b6d6fbf0ffc942210ceb13fbdf23297d224dff04a88ddaed455
3
+ size 551
backend/__pycache__/gettingFromModel.cpython-39.pyc ADDED
Binary file (703 Bytes). View file
 
backend/__pycache__/labelingInput.cpython-39.pyc ADDED
Binary file (6.48 kB). View file
 
backend/gettingFromModel.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datafit.datafit as df
2
def getResponse(data, model, LabelT, LabelS):
    """
    Encode the sequence column, normalize the features and predict the label.

    Parameters:
    - data (pd.DataFrame): Feature frame with a "Sequence" column. That column is
      overwritten in place with its encoded value, so the caller's frame changes too.
    - model: Fitted estimator exposing ``predict``.
    - LabelT: Fitted encoder whose ``inverse_transform`` maps class ids back to labels.
    - LabelS: Fitted encoder whose ``transform`` encodes the "Sequence" column.

    Returns:
    - The labels returned by ``LabelT.inverse_transform`` for the predictions.

    Raises:
    - ValueError: If ``LabelS`` cannot encode the sequence.
    """
    print(data.columns)

    # NOTE(review): presumably LabelS is a scikit-learn LabelEncoder, which only
    # knows the sequences it was fitted on and raises ValueError for any other
    # input -- confirm. The error is re-raised with a message the UI can show.
    try:
        data["Sequence"] = LabelS.transform(data["Sequence"])
    except ValueError as err:
        raise ValueError(
            "The sequence could not be encoded: it is not known to the "
            "sequence label encoder."
        ) from err

    # Only the normalized frame is used; the second return value is discarded.
    data, _ = df.normalization(data)

    response = model.predict(data)
    print("Raw Predictions:")
    print(response)

    # The encoder expects integer class ids.
    predicted_labels = response.astype(int)
    print("Predicted Labels:")
    print(predicted_labels)

    inverse_labels = LabelT.inverse_transform(predicted_labels)
    print("Inverse Transformed Labels:")
    print(inverse_labels)

    return inverse_labels
backend/labelingInput.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #==================================================================================================#
2
+
3
+ import pandas as pd
4
+ from sklearn.preprocessing import LabelEncoder
5
+ from Bio.SeqUtils import ProtParam
6
+ import streamlit as st
7
+ from Bio.SeqUtils import IsoelectricPoint
8
+ # The variable 'seq' should be initialized with a valid protein sequence
9
+ seq = ""
10
def giveValues(seq1):
    """
    Store a protein sequence in the module-level ``seq`` variable.

    Parameters:
    - seq1 (str): Protein sequence to remember; returnValues() reads it back from ``seq``.
    """
    global seq
    seq = seq1
13
+
14
def structure(seq):
    """
    Return the three secondary-structure fractions Biopython computes for a sequence.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - tuple: The three values of ``secondary_structure_fraction()``, in the order
      Biopython returns them.
    """
    # NOTE(review): callers label these values alpha_helix, beta_sheet, turn, but
    # Biopython documents the order as (helix, turn, sheet) -- confirm against the
    # installed version and against how the training data was built before renaming.
    analysis = ProtParam.ProteinAnalysis(seq)
    first, second, third = analysis.secondary_structure_fraction()
    return first, second, third
17
+
18
def calculate_net_charge(protein_sequence):
    """
    Calculate the net charge of a protein sequence from fixed per-residue charges.

    R and K count as +1, D and E as -1, every other residue (including unknown
    letters) as 0. Residue letters are matched case-insensitively.

    Parameters:
    - protein_sequence (str): Protein sequence.

    Returns:
    - int: Sum of the per-residue charges.
    """
    amino_acid_charges = {
        'A': 0, 'R': 1, 'N': 0, 'D': -1, 'C': 0,
        'Q': 0, 'E': -1, 'G': 0, 'H': 0, 'I': 0,
        'L': 0, 'K': 1, 'M': 0, 'F': 0, 'P': 0,
        'S': 0, 'T': 0, 'W': 0, 'Y': 0, 'V': 0,
    }

    # Upper-case each residue so lower-case input is not silently scored as 0.
    return sum(amino_acid_charges.get(aa.upper(), 0) for aa in protein_sequence)
31
+
32
def calculate_electric_potential(protein_sequence, pH=7.0):
    """
    Sum a pH-dependent term over the residues of a protein sequence.

    Each known residue contributes ``10**(pH - pKa) / (1 + 10**(pH - pKa))`` using
    the pKa table below; unknown letters contribute nothing. Residue letters are
    matched case-insensitively.

    Parameters:
    - protein_sequence (str): Protein sequence.
    - pH (float): pH at which the terms are evaluated. Defaults to 7.0.

    Returns:
    - float: Sum of the per-residue terms (0.0 for an empty sequence).
    """
    amino_acid_pKa = {
        'A': 2.35, 'R': 12.0, 'N': 7.0, 'D': 3.9, 'C': 8.3,
        'Q': 9.1, 'E': 4.2, 'G': 2.3, 'H': 6.0, 'I': 6.0,
        'L': 6.0, 'K': 10.5, 'M': 5.7, 'F': 5.5, 'P': 6.3,
        'S': 2.2, 'T': 2.6, 'W': 5.9, 'Y': 5.7, 'V': 6.0,
    }

    # Accumulate in sequence order with a plain loop so the floating-point
    # result stays the same as before for upper-case input.
    electric_potential = 0.0
    for aa in protein_sequence:
        pKa = amino_acid_pKa.get(aa.upper())
        if pKa is None:
            continue
        ratio = 10 ** (pH - pKa)
        electric_potential += ratio / (1 + ratio)

    return electric_potential
52
+
53
def molWeight(seq):
    """
    Return the molecular weight Biopython reports for a protein sequence.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - float: Result of ``ProteinAnalysis(seq).molecular_weight()``.
    """
    analysis = ProtParam.ProteinAnalysis(seq)
    return analysis.molecular_weight()
65
+
66
def hydrophobicityValues(seq):
    """
    Sum the per-residue hydrophobicity values of a protein sequence.

    The result is a total over all residues, not an average. Residue letters
    are matched case-insensitively.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - float: Summed hydrophobicity (0 for an empty sequence).

    Raises:
    - KeyError: If the sequence contains a letter that is not in the table.
    """
    hydrophobicity_values = {
        'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
        'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
        'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
        'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2,
    }
    # Upper-case each residue so lower-case input no longer raises KeyError.
    return sum(hydrophobicity_values[aa.upper()] for aa in seq)
84
+
85
+
86
+
87
+
88
def helix_propensity_score(sequence):
    """
    Sum the per-residue helix propensity values of a protein sequence.

    Residue letters are matched case-insensitively.

    Parameters:
    - sequence (str): Protein sequence.

    Returns:
    - float: Summed helix propensity (0 for an empty sequence).

    Raises:
    - KeyError: If the sequence contains a letter that is not in the table.
    """
    helix_propensity_values = {
        'A': 1.2, 'C': 0.8, 'E': 0.7, 'G': 0.9, 'H': 1.0,
        'I': 1.1, 'K': 1.3, 'L': 1.0, 'M': 1.2, 'N': 0.9,
        'Q': 1.0, 'R': 1.2, 'S': 0.8, 'T': 0.9, 'V': 1.1,
        'W': 1.3, 'Y': 1.1, 'P': 0.7, 'F': 1.1, 'D': 0.7,
    }
    # The total gets its own name so it does not shadow this function.
    total = sum(helix_propensity_values[aa.upper()] for aa in sequence)
    return total
106
+
107
def abundance_dispersion(sequence):
    """
    Count the E/F/Y/W residues in a sequence and measure how spread out they are.

    Abundance is the number of E, F, Y and W residues. Dispersion is the sample
    standard deviation (n - 1 denominator) of their 0-based positions, or 0 when
    fewer than two such residues are present. Residue letters are matched
    case-insensitively.

    Parameters:
    - sequence (str): Protein sequence.

    Returns:
    - tuple: (abundance, dispersion).
    """
    beta_strand_amino_acids = frozenset('EFYW')

    # One pass yields both the count and the positions. Upper-casing per
    # character keeps every index aligned with the original string.
    positions = [
        index for index, aa in enumerate(sequence)
        if aa.upper() in beta_strand_amino_acids
    ]
    abundance = len(positions)
    if abundance <= 1:
        return abundance, 0

    mean_position = sum(positions) / abundance
    variance = sum((pos - mean_position) ** 2 for pos in positions) / (abundance - 1)
    return abundance, variance ** 0.5
129
+
130
def BSpropensity_score(sequence):
    """
    Sum the per-residue beta strand propensity values of a protein sequence.

    Residue letters are matched case-insensitively.

    Parameters:
    - sequence (str): Protein sequence.

    Returns:
    - float: Summed beta strand propensity (0 for an empty sequence).

    Raises:
    - KeyError: If the sequence contains a letter that is not in the table.
    """
    beta_strand_propensity_values = {
        'A': 0.7, 'C': 1.0, 'E': 1.1, 'G': 0.5, 'H': 0.8,
        'I': 0.6, 'K': 1.2, 'L': 0.7, 'M': 0.5, 'N': 1.1,
        'Q': 0.9, 'R': 0.8, 'S': 0.6, 'T': 0.7, 'V': 0.8,
        'W': 1.3, 'Y': 1.2, 'P': 1.0, 'F': 1.1, 'D': 0.9,
    }
    # Upper-case each residue so lower-case input no longer raises KeyError.
    return sum(beta_strand_propensity_values[aa.upper()] for aa in sequence)
148
+
149
def getamino_acid(seq):
    """
    Return Biopython's per-residue composition for a protein sequence.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - dict: Result of ``ProteinAnalysis(seq).get_amino_acids_percent()``;
      returnValues() unpacks twenty values from it.
    """
    # NOTE(review): newer Biopython releases appear to deprecate this method in
    # favour of an attribute with a different scale -- verify before upgrading.
    analysis = ProtParam.ProteinAnalysis(seq)
    return analysis.get_amino_acids_percent()
152
+
153
+
154
def returnValues(seqLen, seq_unce):
    """
    Build the single-row feature frame for the sequence stored by giveValues().

    All features are computed from the module-level ``seq``; ``seq_unce`` is only
    copied into the "Sequence" column.

    Parameters:
    - seqLen (int): Length of the sequence. Accepted for compatibility; no
      column is derived from it.
    - seq_unce (str): Sequence text stored in the "Sequence" column.

    Returns:
    - pd.DataFrame: One row with the sequence and its feature columns.

    Raises:
    - ValueError: If no sequence has been set with giveValues().
    """
    # Validate before any feature is computed: the module default is an empty
    # string, so an ``is None`` test placed after the first use could never fire.
    if not seq:
        raise ValueError("Protein sequence 'seq' is not provided. Use giveValues() to set the sequence.")

    alpha_helix, beta_sheet, turn = structure(seq)
    amino_acid_percent = getamino_acid(seq)
    abundance, dispersion = abundance_dispersion(seq)

    # Column order is kept exactly as before in case consumers rely on it.
    features = {
        "Sequence": [seq_unce],
        "net_charge": [calculate_net_charge(seq)],
        "isoelectric point": [IsoelectricPoint.IsoelectricPoint(seq).pi()],
        'alpha_helix': [alpha_helix],
        'beta_sheet': [beta_sheet],
        'turn': [turn],
        "Molecular Weight": [molWeight(seq)],
    }
    # Look each residue up by key so the percent columns do not depend on the
    # iteration order of the composition dict.
    for residue in "ACDEFGHIKLMNPQRSTVWY":
        features[f"percent_{residue}"] = [amino_acid_percent[residue]]
    features["Hydrophobicity"] = [hydrophobicityValues(seq)]
    features["Electrical Potential"] = [calculate_electric_potential(seq)]
    features["Abundance"] = [abundance]
    features["Dispersion"] = [dispersion]
    # "Sequence Length", "Helix Propensity Score" and "Beta strand propensity
    # values" are deliberately not emitted, matching the previous column set.

    return pd.DataFrame(features)

# Example of how to use the functions:
# giveValues("YOUR_PROTEIN_SEQUENCE_HERE")
# data = returnValues(len("YOUR_PROTEIN_SEQUENCE_HERE"), "YOUR_PROTEIN_SEQUENCE_HERE")
# print(data)
backend/xgboost_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65a3b30630861e9c72c2d21d26586cca14c31871d116235bd223ed3d014edd22
3
+ size 2401574
frontend/__pycache__/gettingInput.cpython-39.pyc ADDED
Binary file (389 Bytes). View file
 
frontend/__pycache__/instructions.cpython-39.pyc ADDED
Binary file (1.2 kB). View file
 
frontend/__pycache__/selectingSeqType.cpython-39.pyc ADDED
Binary file (367 Bytes). View file
 
frontend/gettingInput.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def gettingInput():
    """
    Render the sequence text area and return its content without any whitespace.

    Returns:
    - str: The entered text with all whitespace removed (empty string when
      nothing was entered).
    """
    raw_text = st.text_area("Enter the sequence")
    # split() with no separator drops every whitespace run, so carriage returns,
    # tabs and inner spaces from pasted text are removed as well as "\n".
    return "".join(raw_text.split())
frontend/instructions.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
def displayInstructionSection():
    """Render the page title, the usage instructions and the documentation link."""
    st.title("Allergenicity Prediction of Proteins")
    st.header("Instructions")
    st.write(
        "The Allergenicity Prediction of Proteins tool is specially designed to find "
        "out whether a protein is an allergen or not. The machine learning model is "
        "specially designed for protein sequences, which lets it predict the "
        "allergen status of a sequence."
    )
    # Build the list without leading indentation so Markdown sees plain bullets.
    steps = "\n".join([
        "- Select which type of seq you want to predict for",
        "- Enter the sequence in the input box",
        "- Click the predict button",
        "- wait for the magic",
        "- and boom, you got the result",
    ])
    st.markdown(steps)
    st.write(
        "There are many tools and servers for predicting the allergenicity status of "
        "the sequences you want to analyse, but this tool gives you 94% accuracy."
    )

    st.write("you can find the official documentation in the github repository here: http://github.com/syabahmad/alernonpred")
frontend/selectingSeqType.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def selecType():
    """
    Render the sequence-type selector and return the chosen option.

    Returns:
    - str: The selected option, exactly as listed in the select box.
    """
    # The local is named selected_type so it does not shadow the builtin ``type``.
    # The option strings are returned to callers, so they are left unchanged.
    selected_type = st.selectbox("Select Seq Type", ["Protien", "DNA/RNA"])
    return selected_type