Spaces:

NiniCat
/

CRISPRTool

Running

App Files Community

supercat666 committed on Jan 29

Commit

7ef3dbe

•

1 Parent(s): dd69d15

updated cas9on

Browse files

Files changed (3) hide show

app.py +10 -4
cas9_model/on-cla.h5 +2 -2
cas9on.py +14 -52

app.py CHANGED Viewed

@@ -90,7 +90,6 @@ if selected_model == 'Cas9':
         key='target_selection'
     )
-    # Actions based on the selected enzyme
     if target_selection == 'on-target':
         # Gene symbol entry
         gene_symbol = st.text_input('Enter a Gene Symbol:', key='gene_symbol')
@@ -107,10 +106,17 @@ if selected_model == 'Cas9':
         # On-target results display
         if 'on_target_results' in st.session_state and st.session_state['on_target_results']:
-            st.write('On-target predictions:', st.session_state['on_target_results'])
             if 'full_on_target_results' in st.session_state:
-                # Provide a download button for the full results
-                full_predictions_csv = cas9on.convert_df(st.session_state['full_on_target_results'])
                 st.download_button(
                     label='Download on-target predictions',
                     data=full_predictions_csv,

         key='target_selection'
     )
     if target_selection == 'on-target':
         # Gene symbol entry
         gene_symbol = st.text_input('Enter a Gene Symbol:', key='gene_symbol')
         # On-target results display
         if 'on_target_results' in st.session_state and st.session_state['on_target_results']:
+            # Convert the results to a pandas DataFrame for better display
+            df = pd.DataFrame(st.session_state['on_target_results'],
+                              columns=["Gene ID", "Start Pos", "End Pos", "Strand", "gRNA", "Prediction"])
+            st.write('On-target predictions:')
+            st.dataframe(df)
             if 'full_on_target_results' in st.session_state:
+                # Convert full results to a CSV for download
+                full_df = pd.DataFrame(st.session_state['full_on_target_results'],
+                                       columns=["Gene ID", "Start Pos", "End Pos", "Strand", "gRNA", "Prediction"])
+                full_predictions_csv = full_df.to_csv(index=False).encode('utf-8')
                 st.download_button(
                     label='Download on-target predictions',
                     data=full_predictions_csv,

cas9_model/on-cla.h5 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5acf8f740cf326052ad08db2ca71d7204526c61f6a9fcdca36e15004bc16ad04
-size 34044032

 version https://git-lfs.github.com/spec/v1
+oid sha256:3426146f71d42c25fdc2baa959d3c43d23404c5f9200a064701bb86788d38fe9
+size 34040392

cas9on.py CHANGED Viewed

@@ -19,42 +19,11 @@ ntmap = {'A': (1, 0, 0, 0),
          'G': (0, 0, 1, 0),
          'T': (0, 0, 0, 1)
          }
-epimap = {'A': 1, 'N': 0}
 def get_seqcode(seq):
     return np.array(reduce(add, map(lambda c: ntmap[c], seq.upper()))).reshape(
         (1, len(seq), -1))
-def get_epicode(eseq):
-    return np.array(list(map(lambda c: epimap[c], eseq))).reshape(1, len(eseq), -1)
-class Episgt:
-    def __init__(self, fpath, num_epi_features, with_y=True):
-        self._fpath = fpath
-        self._ori_df = pd.read_csv(fpath, sep='\t', index_col=None, header=None)
-        self._num_epi_features = num_epi_features
-        self._with_y = with_y
-        self._num_cols = num_epi_features + 2 if with_y else num_epi_features + 1
-        self._cols = list(self._ori_df.columns)[-self._num_cols:]
-        self._df = self._ori_df[self._cols]
-    @property
-    def length(self):
-        return len(self._df)
-    def get_dataset(self, x_dtype=np.float32, y_dtype=np.float32):
-        x_seq = np.concatenate(list(map(get_seqcode, self._df[self._cols[0]])))
-        x_epis = np.concatenate([np.concatenate(list(map(get_epicode, self._df[col]))) for col in
-                                 self._cols[1: 1 + self._num_epi_features]], axis=-1)
-        x = np.concatenate([x_seq, x_epis], axis=-1).astype(x_dtype)
-        x = x.transpose(0, 2, 1)
-        if self._with_y:
-            y = np.array(self._df[self._cols[-1]]).astype(y_dtype)
-            return x, y
-        else:
-            return x
 from keras.models import load_model
 class DCModelOntar:
     def __init__(self, ontar_model_dir, is_reg=False):
@@ -66,32 +35,25 @@ class DCModelOntar:
         yp = self.model.predict(x)
         return yp.ravel()
-# Function to generate random epigenetic data
-def generate_random_epigenetic_data(length):
-    return ''.join(random.choice('AN') for _ in range(length))
 # Function to predict on-target efficiency and format output
-def format_prediction_output(gRNA_sites, gene_id, model_path):
     dcModel = DCModelOntar(model_path)
     formatted_data = []
-    for gRNA in gRNA_sites:
         # Encode the gRNA sequence
-        encoded_seq = get_seqcode(gRNA).reshape(-1,4,1,23)
-        #encoded_seq = np.expand_dims(encoded_seq, axis=2)  # Adjust the shape for the model
-        # Generate random epigenetic features (as placeholders)
-        ctcf = get_epicode(generate_random_epigenetic_data(len(gRNA))).reshape(-1,1,1,23)
-        dnase = get_epicode(generate_random_epigenetic_data(len(gRNA))).reshape(-1,1,1,23)
-        h3k4me3 = get_epicode(generate_random_epigenetic_data(len(gRNA))).reshape(-1,1,1,23)
-        rrbs = get_epicode(generate_random_epigenetic_data(len(gRNA))).reshape(-1,1,1,23)
         # Predict on-target efficiency using the model
-        input = np.concatenate((encoded_seq, ctcf, dnase, h3k4me3, rrbs), axis=1)
-        prediction = dcModel.ontar_predict(input)
         # Format output
-        formatted_data.append([gene_id, "start_pos", "end_pos", "strand", gRNA, ctcf, dnase, h3k4me3, rrbs, prediction[0]])
     return formatted_data
@@ -123,7 +85,7 @@ def fetch_ensembl_sequence(transcript_id):
         print(f"Error fetching sequence data from Ensembl: {response.text}")
         return None
-def find_crispr_targets(sequence, pam="NGG", target_length=20):
     targets = []
     len_sequence = len(sequence)
@@ -131,11 +93,12 @@ def find_crispr_targets(sequence, pam="NGG", target_length=20):
         if sequence[i + 1:i + 3] == pam[1:]:
             if i >= target_length:
                 target_seq = sequence[i - target_length:i + 3]
-                targets.append(target_seq)
     return targets
 def process_gene(gene_symbol, model_path):
     transcripts = fetch_ensembl_transcripts(gene_symbol)
     all_data = []
@@ -156,6 +119,5 @@ def process_gene(gene_symbol, model_path):
 # Function to save results as CSV
 def save_to_csv(data, filename="crispr_results.csv"):
     df = pd.DataFrame(data,
-                      columns=["Gene ID", "Start Pos", "End Pos", "Strand", "gRNA", "CTCF", "Dnase", "H3K4me3", "RRBS",
-                               "Prediction"])
     df.to_csv(filename, index=False)

          'G': (0, 0, 1, 0),
          'T': (0, 0, 0, 1)
          }
 def get_seqcode(seq):
     return np.array(reduce(add, map(lambda c: ntmap[c], seq.upper()))).reshape(
         (1, len(seq), -1))
 from keras.models import load_model
 class DCModelOntar:
     def __init__(self, ontar_model_dir, is_reg=False):
         yp = self.model.predict(x)
         return yp.ravel()
 # Function to predict on-target efficiency and format output
+def format_prediction_output(gRNAs, model_path):
     dcModel = DCModelOntar(model_path)
     formatted_data = []
+    for gRNA in gRNAs:
         # Encode the gRNA sequence
+        encoded_seq = get_seqcode(gRNA[0]).reshape(-1,4,1,23)
         # Predict on-target efficiency using the model
+        prediction = dcModel.ontar_predict(encoded_seq)
         # Format output
+        chr = gRNA[1]
+        start = gRNA[2]
+        end = gRNA[3]
+        strand = gRNA[4]
+        formatted_data.append([chr, start, end, strand, gRNA[0], prediction[0]])
     return formatted_data
         print(f"Error fetching sequence data from Ensembl: {response.text}")
         return None
+def find_crispr_targets(sequence, chr, start, strand, pam="NGG", target_length=20):
     targets = []
     len_sequence = len(sequence)
         if sequence[i + 1:i + 3] == pam[1:]:
             if i >= target_length:
                 target_seq = sequence[i - target_length:i + 3]
+                tar_start = start + i - target_length
+                tar_end = start + i + 3
+                targets.append([target_seq, chr, tar_start, tar_end, strand])
     return targets
 def process_gene(gene_symbol, model_path):
     transcripts = fetch_ensembl_transcripts(gene_symbol)
     all_data = []
 # Function to save results as CSV
 def save_to_csv(data, filename="crispr_results.csv"):
     df = pd.DataFrame(data,
+                      columns=["Gene ID", "Start Pos", "End Pos", "Strand", "gRNA", "Prediction"])
     df.to_csv(filename, index=False)