Spaces:
Running
Running
File size: 4,151 Bytes
ce4236e 0d0c645 ce4236e 0d0c645 ce4236e 0d0c645 ce4236e 7ef3dbe ce4236e 7ef3dbe ce4236e 7ef3dbe ce4236e 7ef3dbe ce4236e 7ef3dbe ce4236e 7ef3dbe ce4236e 7ef3dbe ce4236e 7ef3dbe ce4236e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
import requests
import tensorflow as tf
import pandas as pd
import numpy as np
from operator import add
from functools import reduce
from keras.models import load_model
import random
# Configure GPUs: enable on-demand memory growth on every physical GPU that
# TensorFlow reports, then make only the first one visible to the runtime.
# The device list is queried once and reused instead of re-querying it for
# every step.
_gpus = tf.config.list_physical_devices('GPU')
for gpu in _gpus:
    tf.config.experimental.set_memory_growth(gpu, enable=True)
if _gpus:
    tf.config.experimental.set_visible_devices(_gpus[0], 'GPU')
# One-hot code for each nucleotide; the four positions correspond to
# A, C, G and T respectively.
ntmap = {
    'A': (1, 0, 0, 0),
    'C': (0, 1, 0, 0),
    'G': (0, 0, 1, 0),
    'T': (0, 0, 0, 1),
}


def get_seqcode(seq):
    """Return the one-hot encoding of a nucleotide sequence.

    Args:
        seq: String of nucleotides. It is upper-cased before lookup, so
            lower-case input is accepted.

    Returns:
        An integer NumPy array of shape ``(1, len(seq), 4)`` where row ``i``
        is the ``ntmap`` code of ``seq[i]``. An empty sequence yields an
        array of shape ``(1, 0, 4)``.

    Raises:
        KeyError: If ``seq`` contains a character other than A, C, G or T.
    """
    # An explicit length test (rather than truthiness) keeps non-sequence
    # arguments such as None failing loudly instead of encoding as "empty".
    if len(seq) == 0:
        return np.zeros((1, 0, 4), dtype=int)
    # Build the rows in one linear pass; concatenating tuples pairwise with
    # reduce() re-copies the growing tuple at every step.
    one_hot_rows = [ntmap[base] for base in seq.upper()]
    return np.array(one_hot_rows).reshape((1, len(seq), -1))
from keras.models import load_model
class DCModelOntar:
    """Thin wrapper around a saved Keras model used for on-target scoring."""

    def __init__(self, ontar_model_dir, is_reg=False):
        """Load the model stored at ``ontar_model_dir``.

        ``is_reg`` is accepted for interface compatibility; this class does
        not read it.
        """
        self.model = load_model(ontar_model_dir)

    def ontar_predict(self, x, channel_first=True):
        """Run the model on ``x`` and return the predictions flattened to 1-D.

        Args:
            x: Array passed to the model. When ``channel_first`` is true it
                must be 4-D, because its axes are reordered to
                ``(0, 2, 3, 1)`` before prediction.
                NOTE(review): presumably this converts a channels-first
                layout to channels-last - confirm against the model.
            channel_first: Whether to apply the axis reordering above.

        Returns:
            The result of ``self.model.predict`` after ``ravel()``.
        """
        model_input = x.transpose([0, 2, 3, 1]) if channel_first else x
        scores = self.model.predict(model_input)
        return scores.ravel()
# Predict on-target efficiency for each gRNA and assemble the output rows
def format_prediction_output(gRNAs, model_path):
    """Score every gRNA with the model at ``model_path`` and build result rows.

    Args:
        gRNAs: Iterable of entries indexed as
            ``[sequence, chr, start, end, strand]``.
        model_path: Path handed to ``DCModelOntar``; the model is loaded once
            per call to this function.

    Returns:
        A list with one ``[chr, start, end, strand, sequence, prediction]``
        row per input entry, in input order.
    """
    predictor = DCModelOntar(model_path)
    rows = []
    for site in gRNAs:
        guide_seq = site[0]
        # The encoded sequence is reshaped to (-1, 4, 1, 23) before being
        # passed to the model, so each sequence must hold 23 bases.
        model_input = get_seqcode(guide_seq).reshape(-1, 4, 1, 23)
        scores = predictor.ontar_predict(model_input)
        chrom, site_start, site_end, site_strand = site[1], site[2], site[3], site[4]
        rows.append([chrom, site_start, site_end, site_strand, guide_seq, scores[0]])
    return rows
def fetch_ensembl_transcripts(gene_symbol):
    """Fetch the transcripts of a human gene from the Ensembl REST API.

    Args:
        gene_symbol: Gene symbol inserted into the
            ``lookup/symbol/homo_sapiens`` endpoint URL.

    Returns:
        The value stored under ``'Transcript'`` in the JSON response, or
        ``None`` if the request fails, the status code is not 200, the body
        is not valid JSON, or the response has no ``'Transcript'`` entry.
        Every failure is reported with ``print``.
    """
    url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report connection problems the same way as HTTP errors below.
        print(f"Error fetching gene data from Ensembl: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching gene data from Ensembl: {response.text}")
        return None

    try:
        gene_data = response.json()
    except ValueError as exc:
        print(f"Error fetching gene data from Ensembl: {exc}")
        return None

    if 'Transcript' not in gene_data:
        print("No transcripts found for gene:", gene_symbol)
        return None
    return gene_data['Transcript']
def fetch_ensembl_sequence(transcript_id):
    """Fetch the sequence for an Ensembl identifier from the REST API.

    Args:
        transcript_id: Identifier inserted into the ``sequence/id`` endpoint
            URL.

    Returns:
        The value stored under ``'seq'`` in the JSON response, or ``None`` if
        the request fails, the status code is not 200, the body is not valid
        JSON, or the response has no ``'seq'`` entry. Every failure is
        reported with ``print``.
    """
    url = f"https://rest.ensembl.org/sequence/id/{transcript_id}?content-type=application/json"
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report connection problems the same way as HTTP errors below.
        print(f"Error fetching sequence data from Ensembl: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching sequence data from Ensembl: {response.text}")
        return None

    try:
        sequence_data = response.json()
    except ValueError as exc:
        print(f"Error fetching sequence data from Ensembl: {exc}")
        return None

    if 'seq' not in sequence_data:
        print("No sequence found for transcript:", transcript_id)
        return None
    return sequence_data['seq']
def find_crispr_targets(sequence, chr, start, strand, pam="NGG", target_length=20):
    """Find every PAM site in ``sequence`` that has a full-length target before it.

    Args:
        sequence: Nucleotide string to scan. Matching is case-sensitive.
        chr: Chromosome label copied unchanged into each result.
        start: Coordinate of ``sequence[0]``; added to the in-sequence
            offsets to produce the reported positions.
        strand: Strand label copied unchanged into each result.
        pam: PAM pattern. ``N`` matches any base; every other character must
            match exactly. Patterns of any length are supported.
        target_length: Number of bases required immediately before the PAM.

    Returns:
        A list of ``[target_seq, chr, tar_start, tar_end, strand]`` entries,
        ordered by position. ``target_seq`` is the ``target_length`` bases
        followed by the PAM bases; ``tar_start`` is ``start`` plus the offset
        of the first target base and ``tar_end`` is ``start`` plus the offset
        just past the last PAM base.
        NOTE(review): the offsets are always added to ``start``, regardless
        of ``strand`` - confirm this is intended for reverse-strand input.
    """
    pam_length = len(pam)
    targets = []
    # i is the index of the first PAM base. Starting at target_length ensures
    # a complete target precedes the PAM; the upper bound keeps the whole PAM
    # inside the sequence.
    for i in range(target_length, len(sequence) - pam_length + 1):
        pam_site = sequence[i:i + pam_length]
        if all(code == "N" or code == base for code, base in zip(pam, pam_site)):
            target_seq = sequence[i - target_length:i + pam_length]
            tar_start = start + i - target_length
            tar_end = start + i + pam_length
            targets.append([target_seq, chr, tar_start, tar_end, strand])
    return targets
def process_gene(gene_symbol, model_path):
    """Collect scored gRNA sites for every transcript of a gene.

    For each transcript returned by ``fetch_ensembl_transcripts`` the
    sequence is fetched, scanned with ``find_crispr_targets`` and scored with
    ``format_prediction_output``.

    Args:
        gene_symbol: Gene symbol passed to ``fetch_ensembl_transcripts``.
        model_path: Model path passed to ``format_prediction_output``.

    Returns:
        A single list holding the rows from ``format_prediction_output`` for
        all transcripts. It is empty when no transcripts, sequences or target
        sites are found.
    """
    all_data = []
    transcripts = fetch_ensembl_transcripts(gene_symbol)
    if not transcripts:
        return all_data

    for transcript in transcripts:
        transcript_id = transcript['id']
        gene_sequence = fetch_ensembl_sequence(transcript_id)
        if not gene_sequence:
            continue

        # find_crispr_targets needs the transcript's location to report
        # coordinates. These are the field names used by the Ensembl lookup
        # response; a transcript lacking any of them is skipped rather than
        # scored with made-up coordinates.
        try:
            chrom = transcript['seq_region_name']
            start = transcript['start']
            strand = transcript['strand']
        except KeyError as missing:
            print(f"Transcript {transcript_id} is missing {missing}; skipping")
            continue

        gRNA_sites = find_crispr_targets(gene_sequence, chrom, start, strand)
        if gRNA_sites:
            # format_prediction_output takes (gRNAs, model_path) only.
            all_data.extend(format_prediction_output(gRNA_sites, model_path))
    return all_data
# Function to save results as CSV
def save_to_csv(data, filename="crispr_results.csv"):
    """Write result rows to a CSV file with a fixed six-column header.

    Args:
        data: Rows of six values each, in the column order
            ``Gene ID, Start Pos, End Pos, Strand, gRNA, Prediction``.
            NOTE(review): rows built by ``format_prediction_output`` carry the
            chromosome value in the first position, so the "Gene ID" label
            may be a misnomer - confirm before renaming the column.
        filename: Destination path; the file is overwritten if it exists.

    Returns:
        None. The DataFrame index is not written.
    """
    columns = ["Gene ID", "Start Pos", "End Pos", "Strand", "gRNA", "Prediction"]
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(filename, index=False)