File size: 4,151 Bytes
ce4236e
0d0c645
 
 
 
 
ce4236e
 
0d0c645
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce4236e
0d0c645
 
 
 
 
 
 
ce4236e
 
7ef3dbe
ce4236e
 
 
7ef3dbe
ce4236e
7ef3dbe
ce4236e
 
7ef3dbe
ce4236e
 
7ef3dbe
 
 
 
 
ce4236e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ef3dbe
ce4236e
 
 
 
 
 
 
7ef3dbe
 
 
ce4236e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ef3dbe
ce4236e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import requests
import tensorflow as tf
import pandas as pd
import numpy as np
from operator import add
from functools import reduce
from keras.models import load_model
import random

# Configure GPUs: turn on memory growth for every physical GPU TensorFlow
# reports, then make only the first one visible to this process.
physical_gpus = tf.config.list_physical_devices('GPU')
for gpu in physical_gpus:
    tf.config.experimental.set_memory_growth(gpu, enable=True)
if physical_gpus:
    tf.config.experimental.set_visible_devices(physical_gpus[0], 'GPU')


# One-hot code for each nucleotide; the four positions are A, C, G, T.
ntmap = {'A': (1, 0, 0, 0),
         'C': (0, 1, 0, 0),
         'G': (0, 0, 1, 0),
         'T': (0, 0, 0, 1)
         }


def get_seqcode(seq):
    """One-hot encode a nucleotide sequence.

    Args:
        seq: String made of A/C/G/T characters; case is ignored.

    Returns:
        Integer array of shape ``(1, len(seq), 4)`` where row ``i`` of the
        middle axis is the ``ntmap`` code of ``seq[i]``. An empty string
        yields an array of shape ``(1, 0, 4)``.

    Raises:
        KeyError: If ``seq`` contains a character that is not in ``ntmap``.
    """
    # Build the rows directly instead of concatenating tuples pairwise,
    # which is quadratic in the sequence length and fails on empty input.
    rows = [ntmap[base] for base in seq.upper()]
    # The last axis is spelled out (not -1) so a zero-length sequence still
    # reshapes cleanly.
    return np.array(rows, dtype=int).reshape((1, len(seq), 4))

from keras.models import load_model
class DCModelOntar:
    """Wrapper around a saved Keras model used for on-target prediction."""

    def __init__(self, ontar_model_dir, is_reg=False):
        """Load the model stored at ``ontar_model_dir``.

        ``is_reg`` is accepted for interface compatibility but is not used
        by this class.
        """
        self.model = load_model(ontar_model_dir)

    def ontar_predict(self, x, channel_first=True):
        """Run the model on ``x`` and return its output flattened to 1-D.

        When ``channel_first`` is true, the axes of ``x`` are reordered from
        (0, 1, 2, 3) to (0, 2, 3, 1) before the model is called; otherwise
        ``x`` is passed through unchanged.
        """
        model_input = x.transpose([0, 2, 3, 1]) if channel_first else x
        return self.model.predict(model_input).ravel()


# Score every gRNA with the on-target model and build the output rows
def format_prediction_output(gRNAs, model_path):
    """Predict on-target efficiency for each gRNA and format the results.

    Args:
        gRNAs: Iterable of ``[sequence, chr, start, end, strand]`` entries.
        model_path: Path handed to ``DCModelOntar`` to load the model.

    Returns:
        List of ``[chr, start, end, strand, sequence, prediction]`` rows,
        one per input entry and in the same order.
    """
    predictor = DCModelOntar(model_path)
    rows = []

    for site in gRNAs:
        # One-hot encode the sequence and reshape it to the 4-D input layout.
        # NOTE(review): reshape reinterprets the (1, len, 4) buffer as
        # (-1, 4, 1, 23) without transposing - confirm this is the layout
        # the model was trained on.
        model_input = get_seqcode(site[0]).reshape(-1, 4, 1, 23)

        # The model is called once per sequence; the first value is kept.
        scores = predictor.ontar_predict(model_input)

        rows.append([site[1], site[2], site[3], site[4], site[0], scores[0]])

    return rows

def fetch_ensembl_transcripts(gene_symbol, timeout=30):
    """Look up a human gene by symbol on Ensembl and return its transcripts.

    Args:
        gene_symbol: Gene symbol inserted into the Ensembl lookup URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value of the ``Transcript`` key in the JSON response, or ``None``
        (after printing a message) when the request fails, the status code
        is not 200, or the response has no ``Transcript`` key.
    """
    url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport failures the same way as HTTP errors below.
        print(f"Error fetching gene data from Ensembl: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching gene data from Ensembl: {response.text}")
        return None

    gene_data = response.json()
    if 'Transcript' not in gene_data:
        print("No transcripts found for gene:", gene_symbol)
        return None
    return gene_data['Transcript']

def fetch_ensembl_sequence(transcript_id, timeout=30):
    """Fetch the sequence Ensembl returns for a transcript ID.

    Args:
        transcript_id: Identifier inserted into the Ensembl sequence URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value of the ``seq`` key in the JSON response, or ``None``
        (after printing a message) when the request fails, the status code
        is not 200, or the response has no ``seq`` key.
    """
    url = f"https://rest.ensembl.org/sequence/id/{transcript_id}?content-type=application/json"
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport failures the same way as HTTP errors below.
        print(f"Error fetching sequence data from Ensembl: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching sequence data from Ensembl: {response.text}")
        return None

    sequence_data = response.json()
    if 'seq' not in sequence_data:
        print("No sequence found for transcript:", transcript_id)
        return None
    return sequence_data['seq']

def find_crispr_targets(sequence, chr, start, strand, pam="NGG", target_length=20):
    """Find target sites that are immediately followed by a PAM.

    Args:
        sequence: Sequence to scan. Matching is case-sensitive.
        chr: Value copied unchanged into every result entry.
        start: Coordinate of ``sequence[0]``; added to the in-sequence
            offsets to produce the reported positions.
        strand: Value copied unchanged into every result entry.
        pam: PAM pattern of any length. ``N`` (or ``n``) matches any
            character; every other character must match exactly.
        target_length: Number of bases required upstream of the PAM.

    Returns:
        List of ``[target_seq, chr, tar_start, tar_end, strand]`` entries in
        order of position. ``target_seq`` is the ``target_length`` bases
        before the PAM followed by the PAM itself, ``tar_start`` is
        ``start`` plus the offset of its first base and ``tar_end`` is
        ``start`` plus the offset just past its last base.
        NOTE(review): whether ``tar_end`` is meant to be exclusive depends
        on the coordinate convention of ``start`` - confirm with callers.
    """
    pam_length = len(pam)
    targets = []

    # A PAM at index i needs target_length bases before it, so smaller
    # indices can never produce a site.
    first_index = max(target_length, 0)
    for i in range(first_index, len(sequence) - pam_length + 1):
        window = sequence[i:i + pam_length]
        # Compare position by position so the PAM length is not hard-coded.
        if all(p in ('N', 'n') or p == base for p, base in zip(pam, window)):
            targets.append([
                sequence[i - target_length:i + pam_length],
                chr,
                start + i - target_length,
                start + i + pam_length,
                strand,
            ])

    return targets

def process_gene(gene_symbol, model_path):
    """Collect scored gRNA sites for every transcript of a gene.

    Args:
        gene_symbol: Gene symbol passed to ``fetch_ensembl_transcripts``.
        model_path: Model path passed to ``format_prediction_output``.

    Returns:
        List of the rows produced by ``format_prediction_output`` for all
        transcripts, concatenated in transcript order. Empty when no
        transcripts, sequences or sites are found.
    """
    all_data = []
    transcripts = fetch_ensembl_transcripts(gene_symbol)
    if not transcripts:
        return all_data

    valid_bases = set("ACGT")
    for transcript in transcripts:
        gene_sequence = fetch_ensembl_sequence(transcript['id'])
        if not gene_sequence:
            continue

        # The transcript record supplies the location arguments that
        # find_crispr_targets requires.
        # NOTE(review): for strand -1 the returned sequence is presumably
        # reverse-complemented, in which case start-based offsets would not
        # be genomic coordinates - confirm against the Ensembl API.
        gRNA_sites = find_crispr_targets(
            gene_sequence,
            transcript['seq_region_name'],
            transcript['start'],
            transcript['strand'],
        )

        # The one-hot encoder only knows A/C/G/T; drop sites with any other
        # base (e.g. N) instead of letting them raise KeyError.
        gRNA_sites = [
            site for site in gRNA_sites
            if set(site[0].upper()) <= valid_bases
        ]
        if gRNA_sites:
            all_data.extend(format_prediction_output(gRNA_sites, model_path))

    return all_data


# Write the result rows to a CSV file
def save_to_csv(data, filename="crispr_results.csv"):
    """Save result rows to ``filename`` as CSV with a fixed header.

    Args:
        data: Rows of six values each, matching the header columns below.
        filename: Destination path; defaults to ``crispr_results.csv``.
    """
    header = ["Gene ID", "Start Pos", "End Pos", "Strand", "gRNA", "Prediction"]
    table = pd.DataFrame(data, columns=header)
    # index=False keeps the DataFrame row index out of the file.
    table.to_csv(filename, index=False)