Spaces:
Running
Running
supercat666
commited on
Commit
•
78b5697
1
Parent(s):
a6ec4bb
fix
Browse files
app.py
CHANGED
@@ -180,30 +180,40 @@ if selected_model == 'Cas9':
|
|
180 |
st.markdown("SpCas9")
|
181 |
# Include "Target" in the DataFrame's columns
|
182 |
df = pd.DataFrame(st.session_state['on_target_results'],
|
183 |
-
columns=["Gene ID", "Start Pos", "End Pos", "Strand", "Target", "gRNA", "Prediction"])
|
184 |
st.dataframe(df)
|
185 |
-
# Now create a Plotly plot with the sorted_predictions# Initialize Plotly figure
|
186 |
# Initialize Plotly figure
|
187 |
fig = go.Figure()
|
188 |
|
189 |
-
#
|
|
|
|
|
|
|
|
|
|
|
190 |
for exon in st.session_state['exons']:
|
191 |
-
exon_start, exon_end =
|
192 |
-
#
|
193 |
-
fig.
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
|
|
|
|
|
|
200 |
for cds in st.session_state['cds']:
|
201 |
-
cds_start, cds_end =
|
202 |
-
fig.
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
|
|
|
|
|
|
207 |
|
208 |
# Plot gRNAs using triangles to indicate direction
|
209 |
# Initialize the y position for the positive and negative strands
|
@@ -213,7 +223,7 @@ if selected_model == 'Cas9':
|
|
213 |
|
214 |
# Iterate over the sorted predictions to create the plot
|
215 |
for i, prediction in enumerate(st.session_state['on_target_results'], start=1):
|
216 |
-
chrom, start, end, strand, target, gRNA, pred_score = prediction
|
217 |
|
218 |
midpoint = (int(start) + int(end)) / 2
|
219 |
y_value = i * 0.1 if strand == '1' else -i * 0.1 # Adjust multiplier for spacing
|
@@ -225,7 +235,7 @@ if selected_model == 'Cas9':
|
|
225 |
marker=dict(symbol='triangle-up' if strand == '1' else 'triangle-down', size=12),
|
226 |
text=f"Rank: {i}", # Adjust based on your data
|
227 |
hoverinfo='text',
|
228 |
-
hovertext=f"Rank: {i}<br>Chromosome: {chrom}<br>Target Sequence: {target}<br>gRNA: {gRNA}<br>Start: {start}<br>End: {end}<br>Strand: {'+' if strand == '1' else '-'}<br>Prediction Score: {pred_score:.4f}",
|
229 |
))
|
230 |
|
231 |
# Update the layout of the plot for better clarity and interactivity
|
@@ -234,8 +244,18 @@ if selected_model == 'Cas9':
|
|
234 |
xaxis_title='Genomic Position',
|
235 |
yaxis_title='Strand',
|
236 |
showlegend=False, # Toggle based on preference
|
237 |
-
xaxis=dict(
|
238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
239 |
)
|
240 |
|
241 |
# Display the plot
|
|
|
180 |
st.markdown("SpCas9")
|
181 |
# Include "Target" in the DataFrame's columns
|
182 |
df = pd.DataFrame(st.session_state['on_target_results'],
|
183 |
+
columns=["Gene ID", "Start Pos", "End Pos", "Strand", "Transcript_id", "Target", "gRNA", "Prediction"])
|
184 |
st.dataframe(df)
|
|
|
185 |
# Initialize Plotly figure
|
186 |
fig = go.Figure()
|
187 |
|
188 |
+
# Constants for the appearance
|
189 |
+
EXON_HEIGHT = 0.05 # How 'tall' the exon markers should appear
|
190 |
+
CDS_HEIGHT = 0.05 # How 'tall' the CDS markers should appear
|
191 |
+
Y_POS = -0.1 # Position on the Y axis to place these markers
|
192 |
+
|
193 |
+
# Plot Exons as small markers on the X-axis
|
194 |
for exon in st.session_state['exons']:
|
195 |
+
exon_start, exon_end = exon['start'], exon['end']
|
196 |
+
# Using bars for better control over width and position
|
197 |
+
fig.add_trace(go.Bar(
|
198 |
+
x=[(exon_start + exon_end) / 2], # Position at the center of the exon
|
199 |
+
y=[EXON_HEIGHT],
|
200 |
+
width=[exon_end - exon_start], # Width of the bar is the exon length
|
201 |
+
base=[Y_POS],
|
202 |
+
marker_color='purple',
|
203 |
+
name='Exon'
|
204 |
+
))
|
205 |
+
|
206 |
+
# Plot CDS in a similar manner
|
207 |
for cds in st.session_state['cds']:
|
208 |
+
cds_start, cds_end = cds['start'], cds['end']
|
209 |
+
fig.add_trace(go.Bar(
|
210 |
+
x=[(cds_start + cds_end) / 2], # Position at the center of the CDS
|
211 |
+
y=[CDS_HEIGHT],
|
212 |
+
width=[cds_end - cds_start], # Width of the bar is the CDS length
|
213 |
+
base=[Y_POS - EXON_HEIGHT], # Slightly offset from the exons
|
214 |
+
marker_color='blue',
|
215 |
+
name='CDS'
|
216 |
+
))
|
217 |
|
218 |
# Plot gRNAs using triangles to indicate direction
|
219 |
# Initialize the y position for the positive and negative strands
|
|
|
223 |
|
224 |
# Iterate over the sorted predictions to create the plot
|
225 |
for i, prediction in enumerate(st.session_state['on_target_results'], start=1):
|
226 |
+
chrom, start, end, strand,transcript_id, target, gRNA, pred_score = prediction
|
227 |
|
228 |
midpoint = (int(start) + int(end)) / 2
|
229 |
y_value = i * 0.1 if strand == '1' else -i * 0.1 # Adjust multiplier for spacing
|
|
|
235 |
marker=dict(symbol='triangle-up' if strand == '1' else 'triangle-down', size=12),
|
236 |
text=f"Rank: {i}", # Adjust based on your data
|
237 |
hoverinfo='text',
|
238 |
+
hovertext=f"Rank: {i}<br>Chromosome: {chrom}<br>Target Sequence: {target}<br>gRNA: {gRNA}<br>Start: {start}<br>End: {end}<br>Strand: {'+' if strand == '1' else '-'}<br>Transcript_id: {transcript_id}<br>Prediction Score: {pred_score:.4f}",
|
239 |
))
|
240 |
|
241 |
# Update the layout of the plot for better clarity and interactivity
|
|
|
244 |
xaxis_title='Genomic Position',
|
245 |
yaxis_title='Strand',
|
246 |
showlegend=False, # Toggle based on preference
|
247 |
+
xaxis=dict(
|
248 |
+
showspikes=True, # Show spike line for X-axis
|
249 |
+
spikemode='across',
|
250 |
+
spikesnap='cursor',
|
251 |
+
spikethickness=1,
|
252 |
+
spikecolor='grey',
|
253 |
+
showline=True,
|
254 |
+
showgrid=True,
|
255 |
+
tickformat='.2f', # Adjust based on the precision you need
|
256 |
+
),
|
257 |
+
hovermode='x',
|
258 |
+
hoverdistance=100, # Adjust for best hover interaction
|
259 |
)
|
260 |
|
261 |
# Display the plot
|
cas9on.py
CHANGED
@@ -39,27 +39,6 @@ class DCModelOntar:
|
|
39 |
yp = self.model.predict(x)
|
40 |
return yp.ravel()
|
41 |
|
42 |
-
# Function to predict on-target efficiency and format output
|
43 |
-
def format_prediction_output(targets, model_path):
|
44 |
-
dcModel = DCModelOntar(model_path)
|
45 |
-
formatted_data = []
|
46 |
-
|
47 |
-
for target in targets:
|
48 |
-
# Encode the gRNA sequence
|
49 |
-
encoded_seq = get_seqcode(target[0]).reshape(-1,4,1,23)
|
50 |
-
|
51 |
-
# Predict on-target efficiency using the model
|
52 |
-
prediction = dcModel.ontar_predict(encoded_seq)
|
53 |
-
|
54 |
-
# Format output
|
55 |
-
gRNA = target[1]
|
56 |
-
chr = target[2]
|
57 |
-
start = target[3]
|
58 |
-
end = target[4]
|
59 |
-
strand = target[5]
|
60 |
-
formatted_data.append([chr, start, end, strand, target[0], gRNA, prediction[0]])
|
61 |
-
|
62 |
-
return formatted_data
|
63 |
|
64 |
def fetch_ensembl_transcripts(gene_symbol):
|
65 |
url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
|
@@ -89,71 +68,76 @@ def fetch_ensembl_sequence(transcript_id):
|
|
89 |
print(f"Error fetching sequence data from Ensembl: {response.text}")
|
90 |
return None
|
91 |
|
92 |
-
def find_crispr_targets(sequence, chr, start, strand, pam="NGG", target_length=20):
|
93 |
targets = []
|
94 |
len_sequence = len(sequence)
|
|
|
95 |
|
|
|
|
|
96 |
for i in range(len_sequence - len(pam) + 1):
|
97 |
if sequence[i + 1:i + 3] == pam[1:]:
|
98 |
if i >= target_length:
|
99 |
target_seq = sequence[i - target_length:i + 3]
|
100 |
tar_start = start + i - target_length
|
101 |
tar_end = start + i + 3
|
102 |
-
|
103 |
-
targets.append([target_seq,
|
104 |
|
105 |
return targets
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
def process_gene(gene_symbol, model_path):
|
109 |
transcripts = fetch_ensembl_transcripts(gene_symbol)
|
110 |
-
|
111 |
-
gene_sequence = '' # Initialize an empty string for the gene sequence
|
112 |
-
|
113 |
if transcripts:
|
114 |
-
for
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
# Return the data, fetched sequence, and possibly exon/CDS data
|
136 |
-
return all_data, gene_sequence, exons, cds_list
|
137 |
-
|
138 |
-
def fetch_ensembl_exons(transcript_id):
|
139 |
-
"""Fetch exon information for a given transcript from Ensembl."""
|
140 |
-
url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=exon;content-type=application/json"
|
141 |
-
response = requests.get(url)
|
142 |
-
if response.status_code == 200:
|
143 |
-
return response.json() # Returns a list of exons for the transcript
|
144 |
else:
|
145 |
-
print(
|
146 |
-
return None
|
147 |
|
148 |
-
|
149 |
-
|
150 |
-
url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds;content-type=application/json"
|
151 |
-
response = requests.get(url)
|
152 |
-
if response.status_code == 200:
|
153 |
-
return response.json() # Returns a list of CDS regions for the transcript
|
154 |
-
else:
|
155 |
-
print(f"Error fetching CDS data from Ensembl for transcript {transcript_id}: {response.text}")
|
156 |
-
return None
|
157 |
|
158 |
def create_genbank_features(formatted_data):
|
159 |
features = []
|
|
|
39 |
yp = self.model.predict(x)
|
40 |
return yp.ravel()
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
def fetch_ensembl_transcripts(gene_symbol):
|
44 |
url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
|
|
|
68 |
print(f"Error fetching sequence data from Ensembl: {response.text}")
|
69 |
return None
|
70 |
|
71 |
+
def find_crispr_targets(sequence, chr, start, strand, transcript_id, pam="NGG", target_length=20):
    """Scan ``sequence`` for protospacer + PAM sites and return one row per hit.

    A hit is a position ``i`` where the bases following the first PAM
    position equal ``pam[1:]`` (the first PAM base is always treated as a
    wildcard) and at least ``target_length`` bases precede it.

    Args:
        sequence: Nucleotide string to scan.
        chr: Chromosome / sequence-region name, copied verbatim into each row.
            (The name shadows the builtin but is kept for caller compatibility.)
        start: Coordinate of the first base of ``sequence``; hit coordinates
            are reported as offsets from this value.
        strand: When equal to ``-1`` the sequence is reverse-complemented
            before scanning; any other value scans the sequence as given.
        transcript_id: Identifier copied verbatim into each row.
        pam: PAM motif; only ``pam[1:]`` is compared literally.
        target_length: Number of protospacer bases required before the PAM.

    Returns:
        A list of rows
        ``[target_seq, sgRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id]``
        where ``target_seq`` is protospacer + PAM and ``sgRNA`` is the
        protospacer alone.
    """
    targets = []
    len_sequence = len(sequence)
    pam_length = len(pam)

    if strand == -1:
        # Translation table instead of a four-key dict: ambiguous IUPAC codes
        # (notably 'N') and lowercase bases no longer raise KeyError, and
        # unknown characters pass through unchanged.
        complement_table = str.maketrans(
            "ACGTRYSWKMBDHVNacgtryswkmbdhvn",
            "TGCAYRSWMKVHDBNtgcayrswmkvhdbn",
        )
        sequence = sequence[::-1].translate(complement_table)
        # NOTE(review): coordinates below are still computed as start + offset
        # into the reverse-complemented string — confirm this matches the
        # orientation of the sequences supplied by the caller.

    # Start at target_length so every hit has a full protospacer upstream.
    first_index = max(target_length, 0)
    for i in range(first_index, len_sequence - pam_length + 1):
        # Compare everything after the wildcard first PAM position; using
        # pam_length (not a literal 3) keeps non-3-base PAMs working.
        if sequence[i + 1:i + pam_length] != pam[1:]:
            continue
        target_seq = sequence[i - target_length:i + pam_length]
        sgRNA = sequence[i - target_length:i]
        tar_start = start + i - target_length
        tar_end = start + i + pam_length
        targets.append([target_seq, sgRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id])

    return targets
|
88 |
|
89 |
+
# Function to predict on-target efficiency and format output
|
90 |
+
def format_prediction_output(targets, model_path):
    """Score each candidate target with the on-target model and build output rows.

    Args:
        targets: Iterable of rows whose first seven fields are
            ``[target_seq, sgRNA, chr, start, end, strand, transcript_id]``.
        model_path: Path passed to ``DCModelOntar`` to load the model.

    Returns:
        A list of rows
        ``[chr, start, end, strand, transcript_id, target_seq, sgRNA, score]``,
        one per input target, in input order. An empty input yields ``[]``
        without loading the model.
    """
    # Nothing to score: skip constructing the model entirely.
    if not targets:
        return []

    dcModel = DCModelOntar(model_path)
    formatted_data = []

    for target in targets:
        # Unpack by name; 'chrom' avoids shadowing the builtin chr().
        target_seq, sgRNA, chrom, start, end, strand, transcript_id = target[:7]

        # The encoded sequence is reshaped to (-1, 4, 1, 23) before prediction.
        encoded_seq = get_seqcode(target_seq).reshape(-1, 4, 1, 23)

        # Predict on-target efficiency using the model
        prediction = dcModel.ontar_predict(encoded_seq)

        formatted_data.append(
            [chrom, start, end, strand, transcript_id, target_seq, sgRNA, prediction[0]]
        )

    return formatted_data
|
111 |
|
112 |
def process_gene(gene_symbol, model_path):
    """Find and score CRISPR targets in every exon of every transcript of a gene.

    Args:
        gene_symbol: Gene symbol passed to ``fetch_ensembl_transcripts``.
        model_path: Model path forwarded to ``format_prediction_output``.

    Returns:
        A 4-tuple ``(results, gene_sequence, exons, cds)``:
        ``results`` holds one list of formatted rows per exon that produced
        targets; the other three are whatever was seen last in the loops
        (last fetched exon sequence, last transcript's exon list and ``cds``
        entry). When nothing could be fetched they are ``''``, ``[]`` and
        ``[]`` respectively.
    """
    results = []
    # Pre-bind everything that is returned so a failed lookup (or a transcript
    # without exons) no longer raises UnboundLocalError at the return below.
    gene_sequence = ''
    exons = []
    cds = []

    transcripts = fetch_ensembl_transcripts(gene_symbol)
    if not transcripts:
        print("Failed to retrieve transcripts.")
        return results, gene_sequence, exons, cds

    for transcript in transcripts:
        exons = transcript['Exon']
        # NOTE(review): assumes each transcript dict carries a 'cds' key —
        # confirm against what fetch_ensembl_transcripts returns.
        cds = transcript['cds']
        transcript_id = transcript['display_name']

        for exon in exons:
            gene_sequence = fetch_ensembl_sequence(exon['id'])
            if not gene_sequence:
                print("Failed to retrieve gene sequence.")
                continue

            start = exon['start']
            strand = exon['strand']
            chrom = exon['seq_region_name']
            targets = find_crispr_targets(gene_sequence, chrom, start, strand, transcript_id)
            if not targets:
                print("No gRNA sites found in the gene sequence.")
                continue

            # Predict on-target efficiency for each gRNA site
            formatted_data = format_prediction_output(targets, model_path)
            # NOTE(review): this appends one list per exon, so results is a
            # list of lists of rows — confirm callers expect that shape.
            results.append(formatted_data)

    # Note: Returning last exon's sequence, might need adjustment based on use-case
    return results, gene_sequence, exons, cds
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
|
142 |
def create_genbank_features(formatted_data):
|
143 |
features = []
|