supercat666 committed on
Commit
78b5697
1 Parent(s): a6ec4bb
Files changed (2) hide show
  1. app.py +42 -22
  2. cas9on.py +52 -68
app.py CHANGED
@@ -180,30 +180,40 @@ if selected_model == 'Cas9':
180
  st.markdown("SpCas9")
181
  # Include "Target" in the DataFrame's columns
182
  df = pd.DataFrame(st.session_state['on_target_results'],
183
- columns=["Gene ID", "Start Pos", "End Pos", "Strand", "Target", "gRNA", "Prediction"])
184
  st.dataframe(df)
185
- # Now create a Plotly plot with the sorted_predictions# Initialize Plotly figure
186
  # Initialize Plotly figure
187
  fig = go.Figure()
188
 
189
- # Plot Exons as horizontal lines or rectangles
 
 
 
 
 
190
  for exon in st.session_state['exons']:
191
- exon_start, exon_end = int(exon['start']), int(exon['end'])
192
- # Create a rectangle for each exon
193
- fig.add_shape(type="rect",
194
- x0=exon_start, y0=-1, # Start slightly below the axis for visibility
195
- x1=exon_end, y1=1, # End slightly above the axis
196
- line=dict(color="purple", width=2),
197
- fillcolor="rgba(128, 0, 128, 0.3)") # Semi-transparent purple
198
-
199
- # Plot CDS areas with similar approach but different color
 
 
 
200
  for cds in st.session_state['cds']:
201
- cds_start, cds_end = int(cds['start']), int(cds['end'])
202
- fig.add_shape(type="rect",
203
- x0=cds_start, y0=-1,
204
- x1=cds_end, y1=1,
205
- line=dict(color="blue", width=2),
206
- fillcolor="rgba(0, 0, 255, 0.3)")
 
 
 
207
 
208
  # Plot gRNAs using triangles to indicate direction
209
  # Initialize the y position for the positive and negative strands
@@ -213,7 +223,7 @@ if selected_model == 'Cas9':
213
 
214
  # Iterate over the sorted predictions to create the plot
215
  for i, prediction in enumerate(st.session_state['on_target_results'], start=1):
216
- chrom, start, end, strand, target, gRNA, pred_score = prediction
217
 
218
  midpoint = (int(start) + int(end)) / 2
219
  y_value = i * 0.1 if strand == '1' else -i * 0.1 # Adjust multiplier for spacing
@@ -225,7 +235,7 @@ if selected_model == 'Cas9':
225
  marker=dict(symbol='triangle-up' if strand == '1' else 'triangle-down', size=12),
226
  text=f"Rank: {i}", # Adjust based on your data
227
  hoverinfo='text',
228
- hovertext=f"Rank: {i}<br>Chromosome: {chrom}<br>Target Sequence: {target}<br>gRNA: {gRNA}<br>Start: {start}<br>End: {end}<br>Strand: {'+' if strand == '1' else '-'}<br>Prediction Score: {pred_score:.4f}",
229
  ))
230
 
231
  # Update the layout of the plot for better clarity and interactivity
@@ -234,8 +244,18 @@ if selected_model == 'Cas9':
234
  xaxis_title='Genomic Position',
235
  yaxis_title='Strand',
236
  showlegend=False, # Toggle based on preference
237
- xaxis=dict(showspikes=True, spikecolor="grey", spikesnap="cursor", spikemode="across"),
238
- hovermode='closest' # Adjust for best hover interaction
 
 
 
 
 
 
 
 
 
 
239
  )
240
 
241
  # Display the plot
 
180
  st.markdown("SpCas9")
181
  # Include "Target" in the DataFrame's columns
182
  df = pd.DataFrame(st.session_state['on_target_results'],
183
+ columns=["Gene ID", "Start Pos", "End Pos", "Strand", "Transcript_id", "Target", "gRNA", "Prediction"])
184
  st.dataframe(df)
 
185
  # Initialize Plotly figure
186
  fig = go.Figure()
187
 
188
+ # Constants for the appearance
189
+ EXON_HEIGHT = 0.05 # How 'tall' the exon markers should appear
190
+ CDS_HEIGHT = 0.05 # How 'tall' the CDS markers should appear
191
+ Y_POS = -0.1 # Position on the Y axis to place these markers
192
+
193
+ # Plot Exons as small markers on the X-axis
194
  for exon in st.session_state['exons']:
195
+ exon_start, exon_end = exon['start'], exon['end']
196
+ # Using bars for better control over width and position
197
+ fig.add_trace(go.Bar(
198
+ x=[(exon_start + exon_end) / 2], # Position at the center of the exon
199
+ y=[EXON_HEIGHT],
200
+ width=[exon_end - exon_start], # Width of the bar is the exon length
201
+ base=[Y_POS],
202
+ marker_color='purple',
203
+ name='Exon'
204
+ ))
205
+
206
+ # Plot CDS in a similar manner
207
  for cds in st.session_state['cds']:
208
+ cds_start, cds_end = cds['start'], cds['end']
209
+ fig.add_trace(go.Bar(
210
+ x=[(cds_start + cds_end) / 2], # Position at the center of the CDS
211
+ y=[CDS_HEIGHT],
212
+ width=[cds_end - cds_start], # Width of the bar is the CDS length
213
+ base=[Y_POS - EXON_HEIGHT], # Slightly offset from the exons
214
+ marker_color='blue',
215
+ name='CDS'
216
+ ))
217
 
218
  # Plot gRNAs using triangles to indicate direction
219
  # Initialize the y position for the positive and negative strands
 
223
 
224
  # Iterate over the sorted predictions to create the plot
225
  for i, prediction in enumerate(st.session_state['on_target_results'], start=1):
226
+ chrom, start, end, strand,transcript_id, target, gRNA, pred_score = prediction
227
 
228
  midpoint = (int(start) + int(end)) / 2
229
  y_value = i * 0.1 if strand == '1' else -i * 0.1 # Adjust multiplier for spacing
 
235
  marker=dict(symbol='triangle-up' if strand == '1' else 'triangle-down', size=12),
236
  text=f"Rank: {i}", # Adjust based on your data
237
  hoverinfo='text',
238
+ hovertext=f"Rank: {i}<br>Chromosome: {chrom}<br>Target Sequence: {target}<br>gRNA: {gRNA}<br>Start: {start}<br>End: {end}<br>Strand: {'+' if strand == '1' else '-'}<br>Transcript_id: {transcript_id}<br>Prediction Score: {pred_score:.4f}",
239
  ))
240
 
241
  # Update the layout of the plot for better clarity and interactivity
 
244
  xaxis_title='Genomic Position',
245
  yaxis_title='Strand',
246
  showlegend=False, # Toggle based on preference
247
+ xaxis=dict(
248
+ showspikes=True, # Show spike line for X-axis
249
+ spikemode='across',
250
+ spikesnap='cursor',
251
+ spikethickness=1,
252
+ spikecolor='grey',
253
+ showline=True,
254
+ showgrid=True,
255
+ tickformat='.2f', # Adjust based on the precision you need
256
+ ),
257
+ hovermode='x',
258
+ hoverdistance=100, # Adjust for best hover interaction
259
  )
260
 
261
  # Display the plot
cas9on.py CHANGED
@@ -39,27 +39,6 @@ class DCModelOntar:
39
  yp = self.model.predict(x)
40
  return yp.ravel()
41
 
42
- # Function to predict on-target efficiency and format output
43
- def format_prediction_output(targets, model_path):
44
- dcModel = DCModelOntar(model_path)
45
- formatted_data = []
46
-
47
- for target in targets:
48
- # Encode the gRNA sequence
49
- encoded_seq = get_seqcode(target[0]).reshape(-1,4,1,23)
50
-
51
- # Predict on-target efficiency using the model
52
- prediction = dcModel.ontar_predict(encoded_seq)
53
-
54
- # Format output
55
- gRNA = target[1]
56
- chr = target[2]
57
- start = target[3]
58
- end = target[4]
59
- strand = target[5]
60
- formatted_data.append([chr, start, end, strand, target[0], gRNA, prediction[0]])
61
-
62
- return formatted_data
63
 
64
  def fetch_ensembl_transcripts(gene_symbol):
65
  url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
@@ -89,71 +68,76 @@ def fetch_ensembl_sequence(transcript_id):
89
  print(f"Error fetching sequence data from Ensembl: {response.text}")
90
  return None
91
 
92
- def find_crispr_targets(sequence, chr, start, strand, pam="NGG", target_length=20):
93
  targets = []
94
  len_sequence = len(sequence)
 
95
 
 
 
96
  for i in range(len_sequence - len(pam) + 1):
97
  if sequence[i + 1:i + 3] == pam[1:]:
98
  if i >= target_length:
99
  target_seq = sequence[i - target_length:i + 3]
100
  tar_start = start + i - target_length
101
  tar_end = start + i + 3
102
- gRNA = sequence[i - target_length:i]
103
- targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand)])
104
 
105
  return targets
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  def process_gene(gene_symbol, model_path):
109
  transcripts = fetch_ensembl_transcripts(gene_symbol)
110
- all_data = []
111
- gene_sequence = '' # Initialize an empty string for the gene sequence
112
-
113
  if transcripts:
114
- for transcript in transcripts:
115
- transcript_id = transcript['id']
116
- chr = transcript.get('seq_region_name', 'unknown')
117
- start = transcript.get('start', 0)
118
- strand = transcript.get('strand', 'unknown')
119
- # Fetch the sequence here and concatenate if multiple transcripts
120
- gene_sequence += fetch_ensembl_sequence(transcript_id) or ''
121
-
122
- # Fetch exon and CDS information
123
- exons = fetch_ensembl_exons(transcript_id)
124
- cds_list = fetch_ensembl_cds(transcript_id)
125
-
126
- # You might want to do something specific with exons and CDS information here
127
- # For example, store them, print them, or include them in your analysis
128
-
129
- if gene_sequence:
130
- gRNA_sites = find_crispr_targets(gene_sequence, chr, start, strand)
131
- if gRNA_sites:
132
- formatted_data = format_prediction_output(gRNA_sites, model_path)
133
- all_data.extend(formatted_data)
134
-
135
- # Return the data, fetched sequence, and possibly exon/CDS data
136
- return all_data, gene_sequence, exons, cds_list
137
-
138
- def fetch_ensembl_exons(transcript_id):
139
- """Fetch exon information for a given transcript from Ensembl."""
140
- url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=exon;content-type=application/json"
141
- response = requests.get(url)
142
- if response.status_code == 200:
143
- return response.json() # Returns a list of exons for the transcript
144
  else:
145
- print(f"Error fetching exon data from Ensembl for transcript {transcript_id}: {response.text}")
146
- return None
147
 
148
- def fetch_ensembl_cds(transcript_id):
149
- """Fetch coding sequence (CDS) information for a given transcript from Ensembl."""
150
- url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds;content-type=application/json"
151
- response = requests.get(url)
152
- if response.status_code == 200:
153
- return response.json() # Returns a list of CDS regions for the transcript
154
- else:
155
- print(f"Error fetching CDS data from Ensembl for transcript {transcript_id}: {response.text}")
156
- return None
157
 
158
  def create_genbank_features(formatted_data):
159
  features = []
 
39
  yp = self.model.predict(x)
40
  return yp.ravel()
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  def fetch_ensembl_transcripts(gene_symbol):
44
  url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
 
68
  print(f"Error fetching sequence data from Ensembl: {response.text}")
69
  return None
70
 
71
def find_crispr_targets(sequence, chr, start, strand, transcript_id, pam="NGG", target_length=20):
    """Scan a sequence for protospacer + PAM sites.

    A site is reported at every offset ``i`` (with at least ``target_length``
    bases upstream) where the bases following the first PAM position equal
    ``pam[1:]``.  The first PAM position is treated as a wildcard; the
    remaining PAM positions are matched literally.

    Args:
        sequence: Nucleotide string to scan.
        chr: Chromosome / region name copied verbatim into each result row.
        start: Integer coordinate of ``sequence[0]``; added to the offsets.
        strand: When equal to ``-1`` the sequence is reverse-complemented
            before scanning.
        transcript_id: Identifier copied verbatim into each result row.
        pam: PAM motif; only ``pam[1:]`` is compared against the sequence.
        target_length: Number of protospacer bases taken upstream of the PAM.

    Returns:
        A list of rows ``[target_seq, sgRNA, chr, str(tar_start),
        str(tar_end), str(strand), transcript_id]`` where ``target_seq`` is
        the protospacer plus PAM and ``sgRNA`` is the protospacer alone.
    """
    if not sequence or not pam:
        return []

    if strand == -1:
        # str.translate leaves unmapped characters (e.g. 'N' or other IUPAC
        # codes) untouched instead of raising KeyError like a dict lookup.
        complement_table = str.maketrans("ACGTacgt", "TGCAtgca")
        sequence = sequence[::-1].translate(complement_table)

    # NOTE(review): coordinates are always computed as ``start + offset`` in
    # the scanned string, including after reverse-complementing -- confirm
    # this matches the orientation of the sequence supplied by the caller.
    pam_length = len(pam)
    pam_suffix = pam[1:]
    targets = []

    # Starting at target_length guarantees a full protospacer upstream of the
    # PAM; stopping at len - pam_length guarantees a full PAM downstream.
    for i in range(target_length, len(sequence) - pam_length + 1):
        if sequence[i + 1:i + pam_length] != pam_suffix:
            continue
        target_seq = sequence[i - target_length:i + pam_length]
        sgRNA = sequence[i - target_length:i]
        tar_start = start + i - target_length
        tar_end = start + i + pam_length
        targets.append(
            [target_seq, sgRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id]
        )

    return targets
88
 
89
# Function to predict on-target efficiency and format output
def format_prediction_output(targets, model_path):
    """Score each candidate target with the on-target model and build output rows.

    Args:
        targets: Iterable of rows whose first seven fields are
            ``[target_seq, sgRNA, chr, start, end, strand, transcript_id]``.
        model_path: Passed unchanged to ``DCModelOntar``.

    Returns:
        A list of rows ``[chr, start, end, strand, transcript_id, target_seq,
        sgRNA, prediction]`` in the same order as ``targets``; empty when
        ``targets`` is empty.
    """
    # Nothing to score: skip constructing the model altogether.
    if not targets:
        return []

    dcModel = DCModelOntar(model_path)
    formatted_data = []

    for target in targets:
        target_seq, sgRNA, chrom, start, end, strand, transcript_id = target[:7]

        # Encode the target sequence and reshape it to (-1, 4, 1, 23) before
        # handing it to the model.
        encoded_seq = get_seqcode(target_seq).reshape(-1, 4, 1, 23)

        # ontar_predict returns a flattened array; one sequence is scored per
        # call, so the first element is this target's prediction.
        prediction = dcModel.ontar_predict(encoded_seq)

        formatted_data.append(
            [chrom, start, end, strand, transcript_id, target_seq, sgRNA, prediction[0]]
        )

    return formatted_data
111
 
112
def process_gene(gene_symbol, model_path):
    """Find and score CRISPR targets in every exon of every transcript of a gene.

    Args:
        gene_symbol: Passed to ``fetch_ensembl_transcripts``.
        model_path: Passed to ``format_prediction_output`` for scoring.

    Returns:
        A 4-tuple ``(results, gene_sequence, Exons, cds)``:
        ``results`` holds one entry per exon that yielded targets, each entry
        being the list returned by ``format_prediction_output``;
        ``gene_sequence`` is the sequence fetched for the last exon processed;
        ``Exons`` and ``cds`` are the values from the last transcript processed.
        When no transcript or exon was processed these are ``None``, ``[]``
        and ``[]`` respectively.
    """
    results = []
    # Defaults so the return statement is always well-defined, even when the
    # transcript lookup fails or a transcript has no exons.
    gene_sequence = None
    Exons = []
    cds = []

    transcripts = fetch_ensembl_transcripts(gene_symbol)
    if not transcripts:
        print("Failed to retrieve transcripts.")
        return results, gene_sequence, Exons, cds

    for transcript in transcripts:
        Exons = transcript['Exon']
        # NOTE(review): assumes every transcript record carries a 'cds' key --
        # confirm against what fetch_ensembl_transcripts returns.
        cds = transcript['cds']
        transcript_id = transcript['display_name']

        for exon in Exons:
            gene_sequence = fetch_ensembl_sequence(exon['id'])
            if not gene_sequence:
                print("Failed to retrieve gene sequence.")
                continue

            start = exon['start']
            strand = exon['strand']
            chrom = exon['seq_region_name']
            targets = find_crispr_targets(gene_sequence, chrom, start, strand, transcript_id)
            if not targets:
                print("No gRNA sites found in the gene sequence.")
                continue

            # Predict on-target efficiency for each gRNA site
            # NOTE(review): each exon's rows are appended as one nested list,
            # so `results` is a list of lists of rows -- confirm the caller
            # flattens it before building a table.
            formatted_data = format_prediction_output(targets, model_path)
            results.append(formatted_data)

    # Note: Returning last exon's sequence, might need adjustment based on use-case
    return results, gene_sequence, Exons, cds
 
 
 
 
 
 
 
141
 
142
  def create_genbank_features(formatted_data):
143
  features = []