Spaces:

NiniCat
/

CRISPRTool

Running

App Files Files Community

supercat666 committed on Mar 4

Commit

78b5697

•

1 Parent(s): a6ec4bb

fix

Browse files

Files changed (2) hide show

app.py +42 -22
cas9on.py +52 -68

app.py CHANGED Viewed

@@ -180,30 +180,40 @@ if selected_model == 'Cas9':
                     st.markdown("SpCas9")
                 # Include "Target" in the DataFrame's columns
                 df = pd.DataFrame(st.session_state['on_target_results'],
-                                  columns=["Gene ID", "Start Pos", "End Pos", "Strand", "Target", "gRNA", "Prediction"])
                 st.dataframe(df)
-                # Now create a Plotly plot with the sorted_predictions# Initialize Plotly figure
                 # Initialize Plotly figure
                 fig = go.Figure()
-                # Plot Exons as horizontal lines or rectangles
                 for exon in st.session_state['exons']:
-                    exon_start, exon_end = int(exon['start']), int(exon['end'])
-                    # Create a rectangle for each exon
-                    fig.add_shape(type="rect",
-                                  x0=exon_start, y0=-1,  # Start slightly below the axis for visibility
-                                  x1=exon_end, y1=1,  # End slightly above the axis
-                                  line=dict(color="purple", width=2),
-                                  fillcolor="rgba(128, 0, 128, 0.3)")  # Semi-transparent purple
-                # Plot CDS areas with similar approach but different color
                 for cds in st.session_state['cds']:
-                    cds_start, cds_end = int(cds['start']), int(cds['end'])
-                    fig.add_shape(type="rect",
-                                  x0=cds_start, y0=-1,
-                                  x1=cds_end, y1=1,
-                                  line=dict(color="blue", width=2),
-                                  fillcolor="rgba(0, 0, 255, 0.3)")
                     # Plot gRNAs using triangles to indicate direction
                 # Initialize the y position for the positive and negative strands
@@ -213,7 +223,7 @@ if selected_model == 'Cas9':
                 # Iterate over the sorted predictions to create the plot
                 for i, prediction in enumerate(st.session_state['on_target_results'], start=1):
-                    chrom, start, end, strand, target, gRNA, pred_score = prediction
                     midpoint = (int(start) + int(end)) / 2
                     y_value = i * 0.1 if strand == '1' else -i * 0.1  # Adjust multiplier for spacing
@@ -225,7 +235,7 @@ if selected_model == 'Cas9':
                         marker=dict(symbol='triangle-up' if strand == '1' else 'triangle-down', size=12),
                         text=f"Rank: {i}",  # Adjust based on your data
                         hoverinfo='text',
-                        hovertext=f"Rank: {i}<br>Chromosome: {chrom}<br>Target Sequence: {target}<br>gRNA: {gRNA}<br>Start: {start}<br>End: {end}<br>Strand: {'+' if strand == '1' else '-'}<br>Prediction Score: {pred_score:.4f}",
                     ))
                 # Update the layout of the plot for better clarity and interactivity
@@ -234,8 +244,18 @@ if selected_model == 'Cas9':
                     xaxis_title='Genomic Position',
                     yaxis_title='Strand',
                     showlegend=False,  # Toggle based on preference
-                    xaxis=dict(showspikes=True, spikecolor="grey", spikesnap="cursor", spikemode="across"),
-                    hovermode='closest'  # Adjust for best hover interaction
                 )
                 # Display the plot

                     st.markdown("SpCas9")
                 # Include "Target" in the DataFrame's columns
                 df = pd.DataFrame(st.session_state['on_target_results'],
+                                  columns=["Gene ID", "Start Pos", "End Pos", "Strand", "Transcript_id", "Target", "gRNA", "Prediction"])
                 st.dataframe(df)
                 # Initialize Plotly figure
                 fig = go.Figure()
+                # Constants for the appearance
+                EXON_HEIGHT = 0.05  # How 'tall' the exon markers should appear
+                CDS_HEIGHT = 0.05  # How 'tall' the CDS markers should appear
+                Y_POS = -0.1  # Position on the Y axis to place these markers
+                # Plot Exons as small markers on the X-axis
                 for exon in st.session_state['exons']:
+                    exon_start, exon_end = exon['start'], exon['end']
+                    # Using bars for better control over width and position
+                    fig.add_trace(go.Bar(
+                        x=[(exon_start + exon_end) / 2],  # Position at the center of the exon
+                        y=[EXON_HEIGHT],
+                        width=[exon_end - exon_start],  # Width of the bar is the exon length
+                        base=[Y_POS],
+                        marker_color='purple',
+                        name='Exon'
+                    ))
+                # Plot CDS in a similar manner
                 for cds in st.session_state['cds']:
+                    cds_start, cds_end = cds['start'], cds['end']
+                    fig.add_trace(go.Bar(
+                        x=[(cds_start + cds_end) / 2],  # Position at the center of the CDS
+                        y=[CDS_HEIGHT],
+                        width=[cds_end - cds_start],  # Width of the bar is the CDS length
+                        base=[Y_POS - EXON_HEIGHT],  # Slightly offset from the exons
+                        marker_color='blue',
+                        name='CDS'
+                    ))
                     # Plot gRNAs using triangles to indicate direction
                 # Initialize the y position for the positive and negative strands
                 # Iterate over the sorted predictions to create the plot
                 for i, prediction in enumerate(st.session_state['on_target_results'], start=1):
+                    chrom, start, end, strand,transcript_id, target, gRNA, pred_score = prediction
                     midpoint = (int(start) + int(end)) / 2
                     y_value = i * 0.1 if strand == '1' else -i * 0.1  # Adjust multiplier for spacing
                         marker=dict(symbol='triangle-up' if strand == '1' else 'triangle-down', size=12),
                         text=f"Rank: {i}",  # Adjust based on your data
                         hoverinfo='text',
+                        hovertext=f"Rank: {i}<br>Chromosome: {chrom}<br>Target Sequence: {target}<br>gRNA: {gRNA}<br>Start: {start}<br>End: {end}<br>Strand: {'+' if strand == '1' else '-'}<br>Transcript_id: {transcript_id}<br>Prediction Score: {pred_score:.4f}",
                     ))
                 # Update the layout of the plot for better clarity and interactivity
                     xaxis_title='Genomic Position',
                     yaxis_title='Strand',
                     showlegend=False,  # Toggle based on preference
+                    xaxis=dict(
+                        showspikes=True,  # Show spike line for X-axis
+                        spikemode='across',
+                        spikesnap='cursor',
+                        spikethickness=1,
+                        spikecolor='grey',
+                        showline=True,
+                        showgrid=True,
+                        tickformat='.2f',  # Adjust based on the precision you need
+                    ),
+                    hovermode='x',
+                    hoverdistance=100,  # Adjust for best hover interaction
                 )
                 # Display the plot

cas9on.py CHANGED Viewed

@@ -39,27 +39,6 @@ class DCModelOntar:
         yp = self.model.predict(x)
         return yp.ravel()
-# Function to predict on-target efficiency and format output
-def format_prediction_output(targets, model_path):
-    dcModel = DCModelOntar(model_path)
-    formatted_data = []
-    for target in targets:
-        # Encode the gRNA sequence
-        encoded_seq = get_seqcode(target[0]).reshape(-1,4,1,23)
-        # Predict on-target efficiency using the model
-        prediction = dcModel.ontar_predict(encoded_seq)
-        # Format output
-        gRNA = target[1]
-        chr = target[2]
-        start = target[3]
-        end = target[4]
-        strand = target[5]
-        formatted_data.append([chr, start, end, strand, target[0], gRNA, prediction[0]])
-    return formatted_data
 def fetch_ensembl_transcripts(gene_symbol):
     url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
@@ -89,71 +68,76 @@ def fetch_ensembl_sequence(transcript_id):
         print(f"Error fetching sequence data from Ensembl: {response.text}")
         return None
-def find_crispr_targets(sequence, chr, start, strand, pam="NGG", target_length=20):
     targets = []
     len_sequence = len(sequence)
     for i in range(len_sequence - len(pam) + 1):
         if sequence[i + 1:i + 3] == pam[1:]:
             if i >= target_length:
                 target_seq = sequence[i - target_length:i + 3]
                 tar_start = start + i - target_length
                 tar_end = start + i + 3
-                gRNA = sequence[i - target_length:i]
-                targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand)])
     return targets
 def process_gene(gene_symbol, model_path):
     transcripts = fetch_ensembl_transcripts(gene_symbol)
-    all_data = []
-    gene_sequence = ''  # Initialize an empty string for the gene sequence
     if transcripts:
-        for transcript in transcripts:
-            transcript_id = transcript['id']
-            chr = transcript.get('seq_region_name', 'unknown')
-            start = transcript.get('start', 0)
-            strand = transcript.get('strand', 'unknown')
-            # Fetch the sequence here and concatenate if multiple transcripts
-            gene_sequence += fetch_ensembl_sequence(transcript_id) or ''
-            # Fetch exon and CDS information
-            exons = fetch_ensembl_exons(transcript_id)
-            cds_list = fetch_ensembl_cds(transcript_id)
-            # You might want to do something specific with exons and CDS information here
-            # For example, store them, print them, or include them in your analysis
-            if gene_sequence:
-                gRNA_sites = find_crispr_targets(gene_sequence, chr, start, strand)
-                if gRNA_sites:
-                    formatted_data = format_prediction_output(gRNA_sites, model_path)
-                    all_data.extend(formatted_data)
-    # Return the data, fetched sequence, and possibly exon/CDS data
-    return all_data, gene_sequence, exons, cds_list
-def fetch_ensembl_exons(transcript_id):
-    """Fetch exon information for a given transcript from Ensembl."""
-    url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=exon;content-type=application/json"
-    response = requests.get(url)
-    if response.status_code == 200:
-        return response.json()  # Returns a list of exons for the transcript
     else:
-        print(f"Error fetching exon data from Ensembl for transcript {transcript_id}: {response.text}")
-        return None
-def fetch_ensembl_cds(transcript_id):
-    """Fetch coding sequence (CDS) information for a given transcript from Ensembl."""
-    url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds;content-type=application/json"
-    response = requests.get(url)
-    if response.status_code == 200:
-        return response.json()  # Returns a list of CDS regions for the transcript
-    else:
-        print(f"Error fetching CDS data from Ensembl for transcript {transcript_id}: {response.text}")
-        return None
 def create_genbank_features(formatted_data):
     features = []

         yp = self.model.predict(x)
         return yp.ravel()
 def fetch_ensembl_transcripts(gene_symbol):
     url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
         print(f"Error fetching sequence data from Ensembl: {response.text}")
         return None
def find_crispr_targets(sequence, chr, start, strand, transcript_id, pam="NGG", target_length=20):
    """Scan a nucleotide sequence for PAM sites and collect target windows.

    A site is reported at offset ``i`` when the bases following ``sequence[i]``
    equal ``pam[1:]`` (the first PAM position is treated as a wildcard) and at
    least ``target_length`` bases precede it.

    Args:
        sequence: Nucleotide string to scan.
        chr: Sequence-region label copied unchanged into every record.
        start: Coordinate added to each window offset to build the reported
            start/end positions.
        strand: When equal to the integer ``-1`` the sequence is
            reverse-complemented before scanning; any other value scans the
            sequence as given.
        transcript_id: Identifier copied unchanged into every record.
        pam: PAM motif; only ``pam[1:]`` is compared literally.
        target_length: Number of bases taken upstream of the PAM.

    Returns:
        A list of records
        ``[target_seq, sgRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id]``
        where ``target_seq`` is the upstream bases plus the PAM and ``sgRNA``
        is the upstream bases alone.
    """
    if strand == -1:
        # str.translate leaves characters that are not in the table (for
        # example the ambiguity code N) untouched, whereas a plain dict lookup
        # raised KeyError on them.
        complement = str.maketrans("ACGTacgt", "TGCAtgca")
        sequence = sequence.translate(complement)[::-1]

    pam_len = len(pam)
    pam_tail = pam[1:]
    first_offset = max(target_length, 0)  # a window needs target_length bases upstream

    targets = []
    for i in range(first_offset, len(sequence) - pam_len + 1):
        if sequence[i + 1:i + pam_len] != pam_tail:
            continue
        # NOTE(review): coordinates are start + offset for both strands;
        # confirm this is the intended convention for strand == -1.
        tar_start = start + i - target_length
        tar_end = start + i + pam_len
        target_seq = sequence[i - target_length:i + pam_len]
        sgRNA = sequence[i - target_length:i]
        targets.append([target_seq, sgRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id])
    return targets
def format_prediction_output(targets, model_path):
    """Score each target with the on-target model and build output rows.

    Args:
        targets: Iterable of records indexed as
            ``[target_seq, sgRNA, chr, start, end, strand, transcript_id]``.
        model_path: Passed to ``DCModelOntar`` to construct the predictor.

    Returns:
        A list of rows
        ``[chr, start, end, strand, transcript_id, target_seq, sgRNA, score]``
        where ``score`` is the first element of the model's prediction for
        that target.
    """
    predictor = DCModelOntar(model_path)
    rows = []
    for site in targets:
        target_seq = site[0]
        # The encoded sequence is reshaped to (-1, 4, 1, 23) before prediction.
        encoded = get_seqcode(target_seq).reshape(-1, 4, 1, 23)
        scores = predictor.ontar_predict(encoded)
        rows.append([
            site[2],      # chromosome / region label
            site[3],      # start
            site[4],      # end
            site[5],      # strand
            site[6],      # transcript id
            target_seq,
            site[1],      # sgRNA
            scores[0],
        ])
    return rows
 def process_gene(gene_symbol, model_path):
     transcripts = fetch_ensembl_transcripts(gene_symbol)
+    results = []
     if transcripts:
+        for i in range(len(transcripts)):
+            Exons = transcripts[i]['Exon']
+            cds = transcripts[i]['cds']
+            transcript_id = transcripts[i]['display_name']
+            for j in range(len(Exons)):
+                exon_id = Exons[j]['id']
+                gene_sequence = fetch_ensembl_sequence(exon_id)
+                if gene_sequence:
+                    start = Exons[j]['start']
+                    strand = Exons[j]['strand']
+                    chr = Exons[j]['seq_region_name']
+                    targets = find_crispr_targets(gene_sequence, chr, start, strand, transcript_id)
+                    if not targets:
+                        print("No gRNA sites found in the gene sequence.")
+                    else:
+                        # Predict on-target efficiency for each gRNA site
+                        formatted_data = format_prediction_output(targets,model_path)
+                        results.append(formatted_data)
+                else:
+                    print("Failed to retrieve gene sequence.")
     else:
+        print("Failed to retrieve transcripts.")
+    # Note: Returning last exon's sequence, might need adjustment based on use-case
+    return results, gene_sequence, Exons, cds
 def create_genbank_features(formatted_data):
     features = []