Spaces:
Running
Running
supercat666
committed on
Commit
•
5272e74
1
Parent(s):
7708ddd
fix
Browse files
app.py
CHANGED
@@ -181,7 +181,7 @@ if selected_model == 'Cas9':
|
|
181 |
# Include "Target" in the DataFrame's columns
|
182 |
try:
|
183 |
df = pd.DataFrame(st.session_state['on_target_results'],
|
184 |
-
columns=["Chr", "Start", "End", "Strand", "
|
185 |
"sgRNA", "Prediction"])
|
186 |
st.dataframe(df)
|
187 |
except ValueError as e:
|
@@ -229,7 +229,7 @@ if selected_model == 'Cas9':
|
|
229 |
|
230 |
# Iterate over the sorted predictions to create the plot
|
231 |
for i, prediction in enumerate(st.session_state['on_target_results'], start=1):
|
232 |
-
chrom, start, end, strand,
|
233 |
|
234 |
midpoint = (int(start) + int(end)) / 2
|
235 |
y_value = i * 0.1 if strand == '1' else -i * 0.1 # Adjust multiplier for spacing
|
@@ -241,7 +241,7 @@ if selected_model == 'Cas9':
|
|
241 |
marker=dict(symbol='triangle-up' if strand == '1' else 'triangle-down', size=12),
|
242 |
text=f"Rank: {i}", # Adjust based on your data
|
243 |
hoverinfo='text',
|
244 |
-
hovertext=f"Rank: {i}<br>Chromosome: {chrom}<br>Target Sequence: {target}<br>gRNA: {gRNA}<br>Start: {start}<br>End: {end}<br>Strand: {'+' if strand == '1' else '-'}<br>
|
245 |
))
|
246 |
|
247 |
# Update the layout of the plot for better clarity and interactivity
|
|
|
181 |
# Include "Target" in the DataFrame's columns
|
182 |
try:
|
183 |
df = pd.DataFrame(st.session_state['on_target_results'],
|
184 |
+
columns=["Chr", "Start", "End", "Strand", "Target Sequence",
|
185 |
"sgRNA", "Prediction"])
|
186 |
st.dataframe(df)
|
187 |
except ValueError as e:
|
|
|
229 |
|
230 |
# Iterate over the sorted predictions to create the plot
|
231 |
for i, prediction in enumerate(st.session_state['on_target_results'], start=1):
|
232 |
+
chrom, start, end, strand, target, gRNA, pred_score = prediction
|
233 |
|
234 |
midpoint = (int(start) + int(end)) / 2
|
235 |
y_value = i * 0.1 if strand == '1' else -i * 0.1 # Adjust multiplier for spacing
|
|
|
241 |
marker=dict(symbol='triangle-up' if strand == '1' else 'triangle-down', size=12),
|
242 |
text=f"Rank: {i}", # Adjust based on your data
|
243 |
hoverinfo='text',
|
244 |
+
hovertext=f"Rank: {i}<br>Chromosome: {chrom}<br>Target Sequence: {target}<br>gRNA: {gRNA}<br>Start: {start}<br>End: {end}<br>Strand: {'+' if strand == '1' else '-'}<br>Prediction Score: {pred_score:.4f}",
|
245 |
))
|
246 |
|
247 |
# Update the layout of the plot for better clarity and interactivity
|
cas9on.py
CHANGED
@@ -39,6 +39,27 @@ class DCModelOntar:
|
|
39 |
yp = self.model.predict(x)
|
40 |
return yp.ravel()
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
def fetch_ensembl_transcripts(gene_symbol):
|
44 |
url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
|
@@ -68,87 +89,71 @@ def fetch_ensembl_sequence(transcript_id):
|
|
68 |
print(f"Error fetching sequence data from Ensembl: {response.text}")
|
69 |
return None
|
70 |
|
71 |
-
def
|
72 |
-
url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds;content-type=application/json"
|
73 |
-
response = requests.get(url)
|
74 |
-
if response.status_code == 200:
|
75 |
-
cds_data = response.json()
|
76 |
-
return cds_data
|
77 |
-
else:
|
78 |
-
print(f"Error fetching CDS data from Ensembl: {response.text}")
|
79 |
-
return []
|
80 |
-
def find_crispr_targets(sequence, chr, start, strand, transcript_id, pam="NGG", target_length=20):
|
81 |
targets = []
|
82 |
len_sequence = len(sequence)
|
83 |
-
complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
|
84 |
|
85 |
-
if strand == -1:
|
86 |
-
sequence = ''.join([complement[base] for base in reversed(sequence)])
|
87 |
for i in range(len_sequence - len(pam) + 1):
|
88 |
if sequence[i + 1:i + 3] == pam[1:]:
|
89 |
if i >= target_length:
|
90 |
target_seq = sequence[i - target_length:i + 3]
|
91 |
tar_start = start + i - target_length
|
92 |
tar_end = start + i + 3
|
93 |
-
|
94 |
-
targets.append([target_seq,
|
95 |
|
96 |
return targets
|
97 |
|
98 |
-
# Function to predict on-target efficiency and format output
|
99 |
-
def format_prediction_output(targets, model_path):
|
100 |
-
dcModel = DCModelOntar(model_path)
|
101 |
-
formatted_data = []
|
102 |
-
|
103 |
-
for target in targets:
|
104 |
-
# Encode the gRNA sequence
|
105 |
-
encoded_seq = get_seqcode(target[0]).reshape(-1,4,1,23)
|
106 |
-
|
107 |
-
# Predict on-target efficiency using the model
|
108 |
-
prediction = dcModel.ontar_predict(encoded_seq)
|
109 |
-
|
110 |
-
# Format output
|
111 |
-
sgRNA = target[1]
|
112 |
-
chr = target[2]
|
113 |
-
start = target[3]
|
114 |
-
end = target[4]
|
115 |
-
strand = target[5]
|
116 |
-
transcript_id = target[6]
|
117 |
-
formatted_data.append([chr, start, end, strand, transcript_id, target[0], sgRNA, prediction[0]])
|
118 |
-
|
119 |
-
return formatted_data
|
120 |
|
121 |
def process_gene(gene_symbol, model_path):
|
122 |
transcripts = fetch_ensembl_transcripts(gene_symbol)
|
123 |
-
|
|
|
|
|
124 |
if transcripts:
|
125 |
-
for
|
126 |
-
transcript_id =
|
127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
cds_list = fetch_ensembl_cds(transcript_id)
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
|
|
147 |
else:
|
148 |
-
print("
|
|
|
149 |
|
150 |
-
|
151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
|
153 |
def create_genbank_features(formatted_data):
|
154 |
features = []
|
|
|
39 |
yp = self.model.predict(x)
|
40 |
return yp.ravel()
|
41 |
|
42 |
+
# Score every candidate target with the on-target model and build result rows
def format_prediction_output(targets, model_path):
    """Predict on-target efficiency for each target and return tabular rows.

    Args:
        targets: Iterable of indexable records.  Index 0 is the sequence that
            is encoded and scored; indices 1-5 are copied into the output as
            gRNA, chromosome, start, end and strand respectively.
        model_path: Passed unchanged to ``DCModelOntar`` to build the model.

    Returns:
        A list with one row per target, laid out as
        ``[chr, start, end, strand, target_sequence, gRNA, prediction]``.
    """
    predictor = DCModelOntar(model_path)
    rows = []

    for site in targets:
        target_seq = site[0]

        # The encoded sequence is reshaped to (-1, 4, 1, 23) before prediction;
        # presumably 23 = 20-nt protospacer + 3-nt PAM -- confirm against get_seqcode.
        model_input = get_seqcode(target_seq).reshape(-1, 4, 1, 23)
        scores = predictor.ontar_predict(model_input)

        guide = site[1]
        chrom = site[2]
        begin = site[3]
        finish = site[4]
        strand_label = site[5]
        # Only the first element of the model output is kept for this target.
        rows.append([chrom, begin, finish, strand_label, target_seq, guide, scores[0]])

    return rows
|
63 |
|
64 |
def fetch_ensembl_transcripts(gene_symbol):
|
65 |
url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
|
|
|
89 |
print(f"Error fetching sequence data from Ensembl: {response.text}")
|
90 |
return None
|
91 |
|
92 |
+
def find_crispr_targets(sequence, chr, start, strand, pam="NGG", target_length=20):
    """Scan ``sequence`` for PAM sites and return the protospacers preceding them.

    A site is reported at index ``i`` when the bases that follow the first PAM
    position equal ``pam[1:]`` and at least ``target_length`` bases precede
    ``i``.  The first PAM position is always treated as a wildcard (the ``N``
    of ``NGG``); the remaining positions are compared literally and
    case-sensitively.

    The PAM length is taken from ``pam`` itself rather than assumed to be 3,
    so motifs of other lengths (e.g. ``"NG"``) are matched and their
    coordinates computed correctly.  Results for the default ``"NGG"`` are
    unchanged.

    Args:
        sequence: Nucleotide string to scan.
        chr: Chromosome/region label copied verbatim into every hit.  The
            name shadows the ``chr`` builtin but is part of the public
            signature, so it is kept.
        start: Numeric coordinate of ``sequence[0]``; hit coordinates are
            offsets added to it.
        strand: Strand label, stringified into every hit.
        pam: PAM motif.  Only its length and ``pam[1:]`` are used.
        target_length: Number of bases taken immediately upstream of the PAM.

    Returns:
        A list of ``[target_seq, gRNA, chr, tar_start, tar_end, strand]``
        rows, where ``target_seq`` is protospacer plus PAM, ``gRNA`` is the
        protospacer alone, and the last three entries are strings.
    """
    pam_len = len(pam)
    pam_tail = pam[1:]
    targets = []

    # Starting at target_length replaces a per-index "enough upstream bases"
    # check; max() keeps the scan starting at 0 for non-positive lengths.
    first_site = max(target_length, 0)
    for i in range(first_site, len(sequence) - pam_len + 1):
        if sequence[i + 1:i + pam_len] != pam_tail:
            continue

        site_begin = i - target_length
        site_end = i + pam_len
        target_seq = sequence[site_begin:site_end]
        gRNA = sequence[site_begin:i]
        # NOTE(review): offsets are added to ``start`` regardless of ``strand``
        # -- confirm this is intended for reverse-strand input.
        tar_start = start + site_begin
        tar_end = start + site_end
        targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand)])

    return targets
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
def process_gene(gene_symbol, model_path):
    """Fetch a gene's transcripts from Ensembl and score its CRISPR targets.

    For every transcript returned by ``fetch_ensembl_transcripts`` the
    transcript sequence is appended to one combined sequence, and the exon and
    CDS annotations are fetched.  The combined sequence is then scanned with
    ``find_crispr_targets`` and the hits are scored through
    ``format_prediction_output``.

    NOTE(review): the source this was reconstructed from had lost its
    indentation; the scan is assumed to run once, after all transcripts have
    been fetched -- confirm against the repository.
    NOTE(review): the scan uses the chromosome/start/strand of the *last*
    transcript for a sequence concatenated from *all* transcripts, so reported
    coordinates may not be meaningful for multi-transcript genes -- confirm.

    Args:
        gene_symbol: Gene symbol passed to ``fetch_ensembl_transcripts``.
        model_path: Passed unchanged to ``format_prediction_output``.

    Returns:
        A 4-tuple ``(all_data, gene_sequence, exons, cds_list)``.  ``exons``
        and ``cds_list`` are whatever the fetchers returned for the last
        transcript processed, or empty lists when no transcript was found.
    """
    transcripts = fetch_ensembl_transcripts(gene_symbol)
    all_data = []

    # Bound up front so the return below cannot raise UnboundLocalError when
    # the lookup yields no transcripts.
    exons = []
    cds_list = []
    chrom, start, strand = 'unknown', 0, 'unknown'
    sequence_parts = []

    for transcript in transcripts or []:
        transcript_id = transcript['id']
        chrom = transcript.get('seq_region_name', 'unknown')
        start = transcript.get('start', 0)
        strand = transcript.get('strand', 'unknown')

        # A failed sequence fetch contributes nothing instead of aborting.
        sequence_parts.append(fetch_ensembl_sequence(transcript_id) or '')

        # Overwritten on every iteration: only the last transcript's
        # annotations reach the caller.
        exons = fetch_ensembl_exons(transcript_id)
        cds_list = fetch_ensembl_cds(transcript_id)

    gene_sequence = ''.join(sequence_parts)

    if gene_sequence:
        gRNA_sites = find_crispr_targets(gene_sequence, chrom, start, strand)
        if gRNA_sites:
            all_data.extend(format_prediction_output(gRNA_sites, model_path))

    return all_data, gene_sequence, exons, cds_list
|
137 |
+
|
138 |
+
def fetch_ensembl_exons(transcript_id, timeout=30):
    """Fetch exon information for a given transcript from Ensembl.

    Args:
        transcript_id: Identifier interpolated into the Ensembl REST
            ``overlap/id`` URL with ``feature=exon``.
        timeout: Seconds to wait on the HTTP request.  Without a timeout a
            stalled connection would block the caller indefinitely.

    Returns:
        The decoded JSON body on HTTP 200 (presumably a list of exon
        features -- confirm against the Ensembl response schema), or ``None``
        when the request fails or returns any other status.
    """
    url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=exon;content-type=application/json"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Transport failures are reported the same way as HTTP errors so
        # callers only have to handle the ``None`` result.
        print(f"Error fetching exon data from Ensembl for transcript {transcript_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching exon data from Ensembl for transcript {transcript_id}: {response.text}")
        return None
    return response.json()
|
147 |
|
148 |
+
def fetch_ensembl_cds(transcript_id, timeout=30):
    """Fetch coding sequence (CDS) information for a given transcript from Ensembl.

    Args:
        transcript_id: Identifier interpolated into the Ensembl REST
            ``overlap/id`` URL with ``feature=cds``.
        timeout: Seconds to wait on the HTTP request.  Without a timeout a
            stalled connection would block the caller indefinitely.

    Returns:
        The decoded JSON body on HTTP 200 (presumably a list of CDS
        regions -- confirm against the Ensembl response schema), or ``None``
        when the request fails or returns any other status.
    """
    url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds;content-type=application/json"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Transport failures are reported the same way as HTTP errors so
        # callers only have to handle the ``None`` result.
        print(f"Error fetching CDS data from Ensembl for transcript {transcript_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching CDS data from Ensembl for transcript {transcript_id}: {response.text}")
        return None
    return response.json()
|
157 |
|
158 |
def create_genbank_features(formatted_data):
|
159 |
features = []
|