supercat666 committed on
Commit
5272e74
1 Parent(s): 7708ddd
Files changed (2) hide show
  1. app.py +3 -3
  2. cas9on.py +67 -62
app.py CHANGED
@@ -181,7 +181,7 @@ if selected_model == 'Cas9':
181
  # Include "Target" in the DataFrame's columns
182
  try:
183
  df = pd.DataFrame(st.session_state['on_target_results'],
184
- columns=["Chr", "Start", "End", "Strand", "Transcript ID", "Target Sequence",
185
  "sgRNA", "Prediction"])
186
  st.dataframe(df)
187
  except ValueError as e:
@@ -229,7 +229,7 @@ if selected_model == 'Cas9':
229
 
230
  # Iterate over the sorted predictions to create the plot
231
  for i, prediction in enumerate(st.session_state['on_target_results'], start=1):
232
- chrom, start, end, strand,transcript_id, target, gRNA, pred_score = prediction
233
 
234
  midpoint = (int(start) + int(end)) / 2
235
  y_value = i * 0.1 if strand == '1' else -i * 0.1 # Adjust multiplier for spacing
@@ -241,7 +241,7 @@ if selected_model == 'Cas9':
241
  marker=dict(symbol='triangle-up' if strand == '1' else 'triangle-down', size=12),
242
  text=f"Rank: {i}", # Adjust based on your data
243
  hoverinfo='text',
244
- hovertext=f"Rank: {i}<br>Chromosome: {chrom}<br>Target Sequence: {target}<br>gRNA: {gRNA}<br>Start: {start}<br>End: {end}<br>Strand: {'+' if strand == '1' else '-'}<br>Transcript_id: {transcript_id}<br>Prediction Score: {pred_score:.4f}",
245
  ))
246
 
247
  # Update the layout of the plot for better clarity and interactivity
 
181
  # Include "Target" in the DataFrame's columns
182
  try:
183
  df = pd.DataFrame(st.session_state['on_target_results'],
184
+ columns=["Chr", "Start", "End", "Strand", "Target Sequence",
185
  "sgRNA", "Prediction"])
186
  st.dataframe(df)
187
  except ValueError as e:
 
229
 
230
  # Iterate over the sorted predictions to create the plot
231
  for i, prediction in enumerate(st.session_state['on_target_results'], start=1):
232
+ chrom, start, end, strand, target, gRNA, pred_score = prediction
233
 
234
  midpoint = (int(start) + int(end)) / 2
235
  y_value = i * 0.1 if strand == '1' else -i * 0.1 # Adjust multiplier for spacing
 
241
  marker=dict(symbol='triangle-up' if strand == '1' else 'triangle-down', size=12),
242
  text=f"Rank: {i}", # Adjust based on your data
243
  hoverinfo='text',
244
+ hovertext=f"Rank: {i}<br>Chromosome: {chrom}<br>Target Sequence: {target}<br>gRNA: {gRNA}<br>Start: {start}<br>End: {end}<br>Strand: {'+' if strand == '1' else '-'}<br>Prediction Score: {pred_score:.4f}",
245
  ))
246
 
247
  # Update the layout of the plot for better clarity and interactivity
cas9on.py CHANGED
@@ -39,6 +39,27 @@ class DCModelOntar:
39
  yp = self.model.predict(x)
40
  return yp.ravel()
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  def fetch_ensembl_transcripts(gene_symbol):
44
  url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
@@ -68,87 +89,71 @@ def fetch_ensembl_sequence(transcript_id):
68
  print(f"Error fetching sequence data from Ensembl: {response.text}")
69
  return None
70
 
71
- def fetch_ensembl_cds(transcript_id):
72
- url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds;content-type=application/json"
73
- response = requests.get(url)
74
- if response.status_code == 200:
75
- cds_data = response.json()
76
- return cds_data
77
- else:
78
- print(f"Error fetching CDS data from Ensembl: {response.text}")
79
- return []
80
- def find_crispr_targets(sequence, chr, start, strand, transcript_id, pam="NGG", target_length=20):
81
  targets = []
82
  len_sequence = len(sequence)
83
- complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
84
 
85
- if strand == -1:
86
- sequence = ''.join([complement[base] for base in reversed(sequence)])
87
  for i in range(len_sequence - len(pam) + 1):
88
  if sequence[i + 1:i + 3] == pam[1:]:
89
  if i >= target_length:
90
  target_seq = sequence[i - target_length:i + 3]
91
  tar_start = start + i - target_length
92
  tar_end = start + i + 3
93
- sgRNA = sequence[i - target_length:i]
94
- targets.append([target_seq, sgRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id])
95
 
96
  return targets
97
 
98
- # Function to predict on-target efficiency and format output
99
- def format_prediction_output(targets, model_path):
100
- dcModel = DCModelOntar(model_path)
101
- formatted_data = []
102
-
103
- for target in targets:
104
- # Encode the gRNA sequence
105
- encoded_seq = get_seqcode(target[0]).reshape(-1,4,1,23)
106
-
107
- # Predict on-target efficiency using the model
108
- prediction = dcModel.ontar_predict(encoded_seq)
109
-
110
- # Format output
111
- sgRNA = target[1]
112
- chr = target[2]
113
- start = target[3]
114
- end = target[4]
115
- strand = target[5]
116
- transcript_id = target[6]
117
- formatted_data.append([chr, start, end, strand, transcript_id, target[0], sgRNA, prediction[0]])
118
-
119
- return formatted_data
120
 
121
  def process_gene(gene_symbol, model_path):
122
  transcripts = fetch_ensembl_transcripts(gene_symbol)
123
- results = []
 
 
124
  if transcripts:
125
- for i in range(len(transcripts)):
126
- transcript_id = transcripts[i]['display_name']
127
- Exons = transcripts[i]['Exon']
 
 
 
 
 
 
 
128
  cds_list = fetch_ensembl_cds(transcript_id)
129
- for j in range(len(Exons)):
130
- exon_id = Exons[j]['id']
131
- gene_sequence = fetch_ensembl_sequence(exon_id)
132
- if gene_sequence:
133
- start = Exons[j]['start']
134
- strand = Exons[j]['strand']
135
- chr = Exons[j]['seq_region_name']
136
- targets = find_crispr_targets(gene_sequence, chr, start, strand, transcript_id)
137
- if not targets:
138
- print("No gRNA sites found in the gene sequence.")
139
- else:
140
- # Predict on-target efficiency for each gRNA site
141
- formatted_data = format_prediction_output(targets,model_path)
142
- results.append(formatted_data)
143
- print(f"Appended formatted data: {formatted_data[-1]}") # Check the last appended item
144
- print(f"Current results length: {len(results)}")
145
- else:
146
- print("Failed to retrieve gene sequence.")
 
147
  else:
148
- print("Failed to retrieve transcripts.")
 
149
 
150
- # Note: Returning last exon's sequence, might need adjustment based on use-case
151
- return results, gene_sequence, Exons, cds_list
 
 
 
 
 
 
 
152
 
153
  def create_genbank_features(formatted_data):
154
  features = []
 
39
  yp = self.model.predict(x)
40
  return yp.ravel()
41
 
42
+ # Function to predict on-target efficiency and format output
43
+ def format_prediction_output(targets, model_path):
44
+ dcModel = DCModelOntar(model_path)
45
+ formatted_data = []
46
+
47
+ for target in targets:
48
+ # Encode the gRNA sequence
49
+ encoded_seq = get_seqcode(target[0]).reshape(-1,4,1,23)
50
+
51
+ # Predict on-target efficiency using the model
52
+ prediction = dcModel.ontar_predict(encoded_seq)
53
+
54
+ # Format output
55
+ gRNA = target[1]
56
+ chr = target[2]
57
+ start = target[3]
58
+ end = target[4]
59
+ strand = target[5]
60
+ formatted_data.append([chr, start, end, strand, target[0], gRNA, prediction[0]])
61
+
62
+ return formatted_data
63
 
64
  def fetch_ensembl_transcripts(gene_symbol):
65
  url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
 
89
  print(f"Error fetching sequence data from Ensembl: {response.text}")
90
  return None
91
 
92
+ def find_crispr_targets(sequence, chr, start, strand, pam="NGG", target_length=20):
 
 
 
 
 
 
 
 
 
93
  targets = []
94
  len_sequence = len(sequence)
 
95
 
 
 
96
  for i in range(len_sequence - len(pam) + 1):
97
  if sequence[i + 1:i + 3] == pam[1:]:
98
  if i >= target_length:
99
  target_seq = sequence[i - target_length:i + 3]
100
  tar_start = start + i - target_length
101
  tar_end = start + i + 3
102
+ gRNA = sequence[i - target_length:i]
103
+ targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand)])
104
 
105
  return targets
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  def process_gene(gene_symbol, model_path):
109
  transcripts = fetch_ensembl_transcripts(gene_symbol)
110
+ all_data = []
111
+ gene_sequence = '' # Initialize an empty string for the gene sequence
112
+
113
  if transcripts:
114
+ for transcript in transcripts:
115
+ transcript_id = transcript['id']
116
+ chr = transcript.get('seq_region_name', 'unknown')
117
+ start = transcript.get('start', 0)
118
+ strand = transcript.get('strand', 'unknown')
119
+ # Fetch the sequence here and concatenate if multiple transcripts
120
+ gene_sequence += fetch_ensembl_sequence(transcript_id) or ''
121
+
122
+ # Fetch exon and CDS information
123
+ exons = fetch_ensembl_exons(transcript_id)
124
  cds_list = fetch_ensembl_cds(transcript_id)
125
+
126
+ # You might want to do something specific with exons and CDS information here
127
+ # For example, store them, print them, or include them in your analysis
128
+
129
+ if gene_sequence:
130
+ gRNA_sites = find_crispr_targets(gene_sequence, chr, start, strand)
131
+ if gRNA_sites:
132
+ formatted_data = format_prediction_output(gRNA_sites, model_path)
133
+ all_data.extend(formatted_data)
134
+
135
+ # Return the data, fetched sequence, and possibly exon/CDS data
136
+ return all_data, gene_sequence, exons, cds_list
137
+
138
+ def fetch_ensembl_exons(transcript_id):
139
+ """Fetch exon information for a given transcript from Ensembl."""
140
+ url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=exon;content-type=application/json"
141
+ response = requests.get(url)
142
+ if response.status_code == 200:
143
+ return response.json() # Returns a list of exons for the transcript
144
  else:
145
+ print(f"Error fetching exon data from Ensembl for transcript {transcript_id}: {response.text}")
146
+ return None
147
 
148
+ def fetch_ensembl_cds(transcript_id):
149
+ """Fetch coding sequence (CDS) information for a given transcript from Ensembl."""
150
+ url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds;content-type=application/json"
151
+ response = requests.get(url)
152
+ if response.status_code == 200:
153
+ return response.json() # Returns a list of CDS regions for the transcript
154
+ else:
155
+ print(f"Error fetching CDS data from Ensembl for transcript {transcript_id}: {response.text}")
156
+ return None
157
 
158
  def create_genbank_features(formatted_data):
159
  features = []