"""Peptide Structure Analyzer.

Heuristic, string-based analysis of peptide SMILES: detects macrocyclic
peptides, separates peptide macrocycles from aromatic rings, and derives an
approximate residue sequence. A Gradio UI is exposed as ``iface`` when the
``gradio`` package is available.
"""

import logging
import re
from io import StringIO

import pandas as pd

try:
    import gradio as gr
except ImportError:
    # The analysis helpers below are pure string processing; keep them
    # importable (tests, batch scripts) when the UI dependency is missing.
    gr = None

logger = logging.getLogger(__name__)

# Substrings that mark a proline ring inside a single backbone segment.
_PROLINE_PATTERNS = ('CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC')

# Backbone linkages used to split a SMILES string into residue segments.
# The capturing group makes re.split keep the linkage tokens in the result.
_BOND_SPLIT_RE = re.compile(r'(NC\(=O\)|N\(C\)C\(=O\)|N\dC\(=O\)|OC\(=O\))')

# A segment ending in C(=O) is treated as a linkage token, not a residue.
_BOND_ONLY_RE = re.compile(r'.*C\(=O\)$')

# Amide bond: plain, N-methylated, or with a ring-closure digit on the N.
_PEPTIDE_BOND_RE = re.compile(r'N(?:\(C\))?\d?C\(=O\)')


def remove_nested_branches(smiles):
    """Return *smiles* with every parenthesised branch removed.

    Only characters at parenthesis depth 0 are kept; the parentheses
    themselves are dropped.
    """
    kept = []
    depth = 0
    for char in smiles:
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
        elif depth == 0:
            kept.append(char)
    return ''.join(kept)


def identify_linkage_type(segment):
    """Identify the type of linkage contained in *segment*.

    Returns:
        tuple ``(type, is_n_methylated)`` where ``type`` is ``'ester'``,
        ``'peptide'`` or ``None`` when no known linkage is present.
    """
    if 'OC(=O)' in segment:
        return ('ester', False)
    if 'N(C)C(=O)' in segment:
        return ('peptide', True)   # N-methylated peptide bond
    if 'NC(=O)' in segment:
        return ('peptide', False)  # Regular peptide bond
    return (None, False)


def identify_residue(segment, next_segment=None, prev_segment=None):
    """Identify the amino acid residue encoded by a backbone segment.

    Args:
        segment: SMILES fragment between two backbone linkages.
        next_segment: The fragment that follows (normally the linkage token);
            used to detect N-methylation / ester linkage and proline context.
        prev_segment: The fragment that precedes; used for proline context.

    Returns:
        tuple ``(residue, modifications)``; ``residue`` is a three-letter
        code or ``None`` when nothing matched, ``modifications`` is a list
        drawn from ``'N-Me'`` and ``'O-linked'``.
    """
    modifications = []

    # Modifications are read off the linkage that follows this residue.
    if next_segment:
        if 'N(C)C(=O)' in next_segment:
            modifications.append('N-Me')
        if 'OC(=O)' in next_segment:
            modifications.append('O-linked')

    # Proline: the ring can show up inside the segment itself ...
    if any(pattern in segment for pattern in _PROLINE_PATTERNS):
        return ('Pro', modifications)

    # ... or be split across neighbouring segments by the linkage regex
    # (e.g. '[C@@H]2CCC' + 'N2C(=O)'), so look at the joined context.
    if prev_segment and next_segment:
        ring_may_span = (
            ('CCC' in segment and 'N' in next_segment)
            or ('N' in segment and 'CCC' in prev_segment)
        )
        if ring_may_span:
            combined = prev_segment + segment + next_segment
            if re.search(r'CCCN.*C\(=O\)', combined):
                return ('Pro', modifications)

    # Aromatic amino acids
    if 'Cc2ccccc2' in segment or 'c1ccccc1' in segment:
        return ('Phe', modifications)
    if 'c2ccc(O)cc2' in segment:
        return ('Tyr', modifications)
    if 'c1c[nH]c2ccccc12' in segment:
        return ('Trp', modifications)
    if 'c1cnc[nH]1' in segment:
        return ('His', modifications)

    # Branched chain amino acids (order matters: Leu before Val before Ile)
    if 'CC(C)C[C@H]' in segment or 'CC(C)C[C@@H]' in segment:
        return ('Leu', modifications)
    if '[C@H](CC(C)C)' in segment or '[C@@H](CC(C)C)' in segment:
        return ('Leu', modifications)
    if 'C(C)C' in segment and not any(
        pat in segment for pat in ('CC(C)C', 'C(C)C[C@H]', 'C(C)C[C@@H]')
    ):
        return ('Val', modifications)
    if 'C(C)C[C@H]' in segment or 'C(C)C[C@@H]' in segment:
        return ('Ile', modifications)

    # Small/polar amino acids
    if ('[C@H](C)' in segment or '[C@@H](C)' in segment) and 'C(C)C' not in segment:
        return ('Ala', modifications)
    # Accept both chiralities, like every other residue in this function.
    if '[C@H](CO)' in segment or '[C@@H](CO)' in segment:
        return ('Ser', modifications)
    if '[C@H](C(C)O)' in segment or '[C@@H](C(C)O)' in segment:
        return ('Thr', modifications)
    if '[C@H]' in segment and not any(
        pat in segment for pat in ('C(C)', 'CC', 'O', 'N', 'S')
    ):
        return ('Gly', modifications)

    # TODO: remaining amino acids are not recognised yet.
    return (None, modifications)


def parse_peptide(smiles):
    """Parse a peptide SMILES string into a residue sequence string.

    The string is split on backbone linkages; each non-linkage segment is
    passed to ``identify_residue``. Unrecognised segments are omitted.

    Returns:
        ``'A-B-C'`` for linear peptides or ``'cyclo(A-B-C)'`` when
        ``is_cyclic_peptide`` reports a peptide macrocycle. Residues carry
        their modifications in parentheses, e.g. ``'Leu(N-Me)'``.
    """
    segments = [s for s in _BOND_SPLIT_RE.split(smiles) if s]

    sequence = []
    for index, segment in enumerate(segments):
        # Linkage tokens only provide context for their neighbours.
        if _BOND_ONLY_RE.match(segment):
            continue

        next_segment = segments[index + 1] if index + 1 < len(segments) else None
        prev_segment = segments[index - 1] if index > 0 else None

        residue, modifications = identify_residue(segment, next_segment, prev_segment)
        if residue:
            formatted_residue = residue
            if modifications:
                formatted_residue += f"({','.join(modifications)})"
            sequence.append(formatted_residue)

    # is_cyclic_peptide returns a 3-tuple; testing the tuple itself is always
    # truthy and would label every peptide as cyclic.
    is_cyclic, _, _ = is_cyclic_peptide(smiles)

    logger.debug("Segments: %s", segments)
    logger.debug("Found sequence: %s", sequence)

    joined = '-'.join(sequence)
    if is_cyclic:
        return f"cyclo({joined})"
    return joined


def _ring_closure_positions(smiles):
    """Yield ``(digit, index)`` for each ring-closure digit in *smiles*.

    Digits inside bracket atoms (isotope, H-count, charge) are skipped.
    Two-digit ``%nn`` closures are not interpreted specially.
    """
    in_bracket = False
    for index, char in enumerate(smiles):
        if char == '[':
            in_bracket = True
        elif char == ']':
            in_bracket = False
        elif not in_bracket and '0' <= char <= '9':
            yield char, index


def is_cyclic_peptide(smiles):
    """Determine whether *smiles* contains a peptide macrocycle.

    Ring-closure digits are paired in order of appearance (first occurrence
    opens, next occurrence of the same digit closes), so a digit reused for
    several rings is handled. Each ring is then classified:

    * aromatic - at least two aromatic ``c`` atoms within three characters
      of either closure digit;
    * peptide  - otherwise, if the span between the two digits contains an
      amide bond.

    Returns:
        tuple ``(is_cyclic, peptide_cycles, aromatic_cycles)`` where the two
        lists hold the ring-closure digits (as strings, without duplicates)
        of each class and ``is_cyclic`` is True when any peptide cycle
        was found.
    """
    open_rings = {}
    peptide_cycles = []
    aromatic_cycles = []

    for number, position in _ring_closure_positions(smiles):
        if number not in open_rings:
            open_rings[number] = position
            continue

        start = open_rings.pop(number)
        segment = smiles[start:position + 1]
        contexts = [smiles[max(0, p - 3):p + 4] for p in (start, position)]

        if any(context.count('c') >= 2 for context in contexts):
            target = aromatic_cycles
        elif _PEPTIDE_BOND_RE.search(segment):
            target = peptide_cycles
        else:
            continue  # e.g. a proline ring: neither aromatic nor macrocycle

        if number not in target:
            target.append(number)

    return bool(peptide_cycles), peptide_cycles, aromatic_cycles


def analyze_single_smiles(smiles):
    """Analyze a single SMILES string.

    Returns:
        dict with keys ``'SMILES'``, ``'Sequence'``, ``'Is Cyclic'``,
        ``'Peptide Cycles'`` and ``'Aromatic Cycles'``. Any exception raised
        during analysis is reported inside the dict instead of propagating,
        so one bad entry does not abort a batch.
    """
    try:
        is_cyclic, peptide_cycles, aromatic_cycles = is_cyclic_peptide(smiles)
        sequence = parse_peptide(smiles)
    except Exception as e:  # UI boundary: report the failure per entry
        return {
            'SMILES': smiles,
            'Sequence': f'Error: {str(e)}',
            'Is Cyclic': 'Error',
            'Peptide Cycles': 'Error',
            'Aromatic Cycles': 'Error'
        }

    return {
        'SMILES': smiles,
        'Sequence': sequence,
        'Is Cyclic': 'Yes' if is_cyclic else 'No',
        'Peptide Cycles': ', '.join(peptide_cycles) if peptide_cycles else 'None',
        'Aromatic Cycles': ', '.join(aromatic_cycles) if aromatic_cycles else 'None'
    }


def process_input(smiles_input=None, file_obj=None):
    """Process either direct SMILES input or file input.

    Args:
        smiles_input: A single SMILES string typed by the user, or ``None``.
        file_obj: Raw bytes of an uploaded text file with one SMILES per
            line, or ``None``.

    Returns:
        A formatted multi-line report with one entry per analysed SMILES
        (empty string when no input was supplied).
    """
    results = []

    # Handle direct SMILES input
    if smiles_input:
        results.append(analyze_single_smiles(smiles_input.strip()))

    # Handle file input
    if file_obj is not None:
        # 'utf-8-sig' drops a leading BOM, which would otherwise be glued to
        # the first SMILES of files saved by some Windows editors.
        content = file_obj.decode('utf-8-sig')
        for line in StringIO(content):
            smiles = line.strip()
            if smiles:  # Skip empty lines
                results.append(analyze_single_smiles(smiles))

    # Create formatted output
    parts = []
    for i, result in enumerate(results, 1):
        parts.append(f"Entry {i}:\n")
        parts.append(f"SMILES: {result['SMILES']}\n")
        parts.append(f"Sequence: {result['Sequence']}\n")
        parts.append(f"Is Cyclic: {result['Is Cyclic']}\n")
        parts.append(f"Peptide Cycles: {result['Peptide Cycles']}\n")
        parts.append(f"Aromatic Cycles: {result['Aromatic Cycles']}\n")
        parts.append("-" * 50 + "\n")
    return "".join(parts)


def _build_interface():
    """Create the Gradio interface wired to ``process_input``."""
    return gr.Interface(
        fn=process_input,
        inputs=[
            gr.Textbox(
                label="Enter SMILES string",
                placeholder="Enter SMILES notation of peptide...",
                lines=2
            ),
            gr.File(
                label="Or upload a text file with SMILES",
                file_types=[".txt"],
                type="binary"
            )
        ],
        outputs=gr.Textbox(
            label="Analysis Results",
            lines=10
        ),
        title="Peptide Structure Analyzer",
        description="""
        Analyze peptide structures from SMILES notation to:
        1. Determine if the peptide is cyclic
        2. Identify peptide cycles vs aromatic rings
        3. Parse the amino acid sequence

        Input: Either enter a SMILES string directly or upload a text file with multiple SMILES (one per line)
        """,
        examples=[
            # Example cyclic peptide with Proline
            ["CC(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O", None],
            # Example cyclic peptide with ester bond
            ["CC(C)C[C@@H]1OC(=O)[C@H](C)NC(=O)[C@H](C(C)C)OC(=O)[C@H](C)N(C)C(=O)[C@@H](C)NC(=O)[C@@H](Cc2ccccc2)N(C)C1=O", None]
        ],
        allow_flagging="never"
    )


# Create Gradio interface (None when gradio is not installed)
iface = _build_interface() if gr is not None else None

# Launch the app
if __name__ == "__main__":
    if iface is None:
        raise SystemExit("gradio is required to launch the web interface")
    iface.launch()