import gradio as gr
import re
import pandas as pd
from io import StringIO

def remove_nested_branches(smiles):
    """Return ``smiles`` with all parenthesised content stripped out.

    Only characters seen while the parenthesis depth is exactly zero are
    kept; the parentheses themselves are always dropped.  The depth counter
    is not clamped, so after an unmatched ``)`` the depth is negative and
    characters are discarded until a ``(`` brings it back to zero.
    """
    kept = []
    level = 0
    for symbol in smiles:
        if symbol == '(':
            level += 1
            continue
        if symbol == ')':
            level -= 1
            continue
        if level == 0:
            kept.append(symbol)
    return ''.join(kept)
def identify_linkage_type(segment):
    """
    Classify the linkage motif found in ``segment``.

    Returns: tuple (type, is_n_methylated) where ``type`` is ``'ester'``,
    ``'peptide'`` or ``None`` when no known motif is present.
    """
    # Checked in order; the first motif contained in the segment wins, so an
    # ester motif takes precedence over either peptide motif.
    linkage_table = (
        ('OC(=O)', ('ester', False)),
        ('N(C)C(=O)', ('peptide', True)),   # N-methylated peptide bond
        ('NC(=O)', ('peptide', False)),     # regular peptide bond
    )
    for motif, verdict in linkage_table:
        if motif in segment:
            return verdict
    return (None, False)
def identify_residue(segment, next_segment=None, prev_segment=None):
    """
    Guess which amino acid a SMILES fragment represents.

    Recognition is purely heuristic: fixed substrings are looked up in
    ``segment`` in a fixed priority order and the first hit wins.

    Args:
        segment: SMILES fragment to classify.
        next_segment: fragment that follows ``segment`` (or None).  It is
            scanned for linkage motifs that imply a modification and is
            used as context for the Proline check.
        prev_segment: fragment that precedes ``segment`` (or None); only
            used as context for the Proline check.

    Returns: tuple (residue, modifications) where ``residue`` is a
        three-letter code or None when nothing matched, and
        ``modifications`` is a list that may contain 'N-Me' and/or
        'O-linked'.
    """
    def has(*patterns):
        # True when at least one of the given substrings occurs in segment.
        return any(pattern in segment for pattern in patterns)

    # Modifications are inferred from the linkage that follows the residue.
    modifications = []
    if next_segment:
        if 'N(C)C(=O)' in next_segment:
            modifications.append('N-Me')
        if 'OC(=O)' in next_segment:
            modifications.append('O-linked')

    # Proline: its ring shows up as a CCCN run next to a ring-closure digit.
    if has('CCCN2', 'N2CCC', '[C@@H]2CCCN2', 'CCCN1', 'N1CCC'):
        return ('Pro', modifications)

    # Proline whose ring was split across neighbouring fragments: look at
    # the three fragments together.
    if prev_segment and next_segment:
        ring_may_span = (
            ('CCC' in segment and 'N' in next_segment)
            or ('N' in segment and 'CCC' in prev_segment)
        )
        if ring_may_span:
            combined = prev_segment + segment + next_segment
            if re.search(r'CCCN.*C\(=O\)', combined):
                return ('Pro', modifications)

    # Aromatic amino acids
    if has('Cc2ccccc2', 'c1ccccc1'):
        return ('Phe', modifications)
    if has('c2ccc(O)cc2'):
        return ('Tyr', modifications)
    if has('c1c[nH]c2ccccc12'):
        return ('Trp', modifications)
    if has('c1cnc[nH]1'):
        return ('His', modifications)

    # Branched chain amino acids (order matters: Leu, then Val, then Ile)
    if has('CC(C)C[C@H]', 'CC(C)C[C@@H]', '[C@H](CC(C)C)', '[C@@H](CC(C)C)'):
        return ('Leu', modifications)
    if has('C(C)C') and not has('CC(C)C', 'C(C)C[C@H]', 'C(C)C[C@@H]'):
        return ('Val', modifications)
    if has('C(C)C[C@H]', 'C(C)C[C@@H]'):
        return ('Ile', modifications)

    # Small/polar amino acids
    if has('[C@H](C)', '[C@@H](C)') and not has('C(C)C'):
        return ('Ala', modifications)
    # Both chirality marks are accepted, like every other residue check
    # here; previously only '[C@H](CO)' was recognised as Ser.
    if has('[C@H](CO)', '[C@@H](CO)'):
        return ('Ser', modifications)
    if has('[C@H](C(C)O)', '[C@@H](C(C)O)'):
        return ('Thr', modifications)
    if has('[C@H]') and not has('C(C)', 'CC', 'O', 'N', 'S'):
        return ('Gly', modifications)

    # NOTE: no other residues are recognised by this function; anything
    # else falls through and is reported as None.
    return (None, modifications)
def parse_peptide(smiles):
    """
    Parse a peptide SMILES string into a residue sequence string.

    The SMILES is split on the linkage motifs ``NC(=O)``, ``N(C)C(=O)``,
    ``N<digit>C(=O)`` and ``OC(=O)``; each remaining fragment is passed to
    ``identify_residue`` together with its neighbours.  Fragments that are
    not recognised are silently left out of the sequence.

    Args:
        smiles: SMILES string of the peptide.

    Returns:
        Recognised residues joined by ``-`` (each optionally followed by
        its modifications in parentheses), wrapped in ``cyclo(...)`` when
        ``is_cyclic_peptide`` reports a peptide macrocycle.

    Side effects:
        Prints the fragments and the recognised sequence to stdout.
    """
    # The capturing group makes re.split keep the linkage motifs as their
    # own fragments, so residues can see which linkage follows them.
    bond_pattern = r'(NC\(=O\)|N\(C\)C\(=O\)|N\dC\(=O\)|OC\(=O\))'
    segments = [piece for piece in re.split(bond_pattern, smiles) if piece]

    sequence = []
    for index, segment in enumerate(segments):
        # Skip pure bond patterns
        if re.match(r'.*C\(=O\)$', segment):
            continue

        next_segment = segments[index + 1] if index + 1 < len(segments) else None
        prev_segment = segments[index - 1] if index > 0 else None

        residue, modifications = identify_residue(segment, next_segment, prev_segment)
        if not residue:
            continue

        # Format residue with modifications
        formatted_residue = residue
        if modifications:
            formatted_residue += f"({','.join(modifications)})"
        sequence.append(formatted_residue)

    # is_cyclic_peptide returns (is_cyclic, peptide_cycles, aromatic_cycles).
    # Only the boolean matters here; testing the whole tuple was always
    # truthy and labelled every peptide as cyclic.
    is_cyclic = is_cyclic_peptide(smiles)[0]

    # Print debug information
    print("\nDetailed Analysis:")
    print("Segments:", segments)
    print("Found sequence:", sequence)

    # Format the final sequence
    joined = '-'.join(sequence)
    if is_cyclic:
        return f"cyclo({joined})"
    return joined

def is_cyclic_peptide(smiles):
    """
    Decide whether a SMILES string contains a peptide macrocycle.

    Ring-closure labels (single digits or ``%nn``) are collected outside of
    bracket atoms, so digits such as the ``3`` in ``[NH3+]`` are not
    mistaken for ring labels.  Occurrences of the same label are paired in
    order of appearance (1st with 2nd, 3rd with 4th, ...) because SMILES
    allows a label to be reused once its ring is closed; an unpaired
    trailing occurrence is ignored.

    Each ring is then classified heuristically:

    * aromatic - at least two lowercase ``c`` characters occur within three
      characters of either closure point;
    * peptide  - otherwise, when the text between the two closure points
      contains ``NC(=O)`` (optionally with a ring label directly after the
      ``N``, as in a proline amide ``N2C(=O)``).

    Rings matching neither rule are not reported.

    Args:
        smiles: SMILES string to inspect.

    Returns:
        Tuple ``(is_cyclic, peptide_cycles, aromatic_cycles)``:
        ``is_cyclic`` is True when at least one peptide ring was found; the
        two lists hold the ring label (as a string) of every ring in the
        respective class, one entry per ring.
    """
    # Map ring label -> list of (start, end) spans of that label's tokens.
    # The first alternative consumes whole bracket atoms without capturing.
    closures = {}
    for token in re.finditer(r'\[[^\]]*\]|%(\d\d)|(\d)', smiles):
        if token.lastindex is None:
            continue  # bracket atom, not a ring closure
        label = token.group(token.lastindex)
        closures.setdefault(label, []).append(token.span())

    peptide_cycles = []
    aromatic_cycles = []

    for label, spans in closures.items():
        # Pair consecutive occurrences: each pair opens and closes one ring.
        for opening, closing in zip(spans[0::2], spans[1::2]):
            # Up to three characters of context on each side of a closure.
            windows = [
                smiles[max(0, begin - 3):begin + 4]
                for begin, _ in (opening, closing)
            ]
            ring_text = smiles[opening[0]:closing[1]]

            if any(window.count('c') >= 2 for window in windows):
                aromatic_cycles.append(label)
            elif re.search(r'N(?:\d|%\d\d)?C\(=O\)', ring_text):
                peptide_cycles.append(label)

    return len(peptide_cycles) > 0, peptide_cycles, aromatic_cycles

def analyze_single_smiles(smiles):
    """Build the report dictionary for one SMILES string.

    The result always has the keys 'SMILES', 'Sequence', 'Is Cyclic',
    'Peptide Cycles' and 'Aromatic Cycles', all with string values.  Any
    exception raised during analysis is caught: the message is placed in
    'Sequence' and the three remaining fields are set to 'Error'.
    """
    def _labels(cycle_labels):
        # Comma-separated labels, or the word 'None' for an empty list.
        return ', '.join(cycle_labels) if cycle_labels else 'None'

    try:
        cyclic_flag, peptide_labels, aromatic_labels = is_cyclic_peptide(smiles)
        parsed_sequence = parse_peptide(smiles)
        cyclic_text = 'Yes' if cyclic_flag else 'No'
        return {
            'SMILES': smiles,
            'Sequence': parsed_sequence,
            'Is Cyclic': cyclic_text,
            'Peptide Cycles': _labels(peptide_labels),
            'Aromatic Cycles': _labels(aromatic_labels),
        }
    except Exception as exc:
        failed_fields = dict.fromkeys(
            ('Is Cyclic', 'Peptide Cycles', 'Aromatic Cycles'), 'Error'
        )
        return {
            'SMILES': smiles,
            'Sequence': f'Error: {str(exc)}',
            **failed_fields,
        }

def process_input(smiles_input=None, file_obj=None):
    """Analyze SMILES from the text box and/or an uploaded file.

    Args:
        smiles_input: a single SMILES string typed by the user, or
            None/empty.  Surrounding whitespace is ignored and a
            whitespace-only value is treated as no input.
        file_obj: raw bytes of a text file with one SMILES per line, or
            None.  The bytes are decoded as UTF-8; a leading byte-order
            mark is dropped so it cannot end up inside the first SMILES.
            Blank lines are skipped.

    Returns:
        A plain-text report with one numbered entry per analyzed SMILES
        (direct input first, then file lines in order), or an empty string
        when there was nothing to analyze.
    """
    results = []

    # Handle direct SMILES input
    direct_smiles = smiles_input.strip() if smiles_input else ''
    if direct_smiles:
        results.append(analyze_single_smiles(direct_smiles))

    # Handle file input
    if file_obj is not None:
        # 'utf-8-sig' decodes plain UTF-8 as well and strips a BOM if present.
        content = file_obj.decode('utf-8-sig')
        for line in content.splitlines():
            smiles = line.strip()
            if smiles:  # Skip empty lines
                results.append(analyze_single_smiles(smiles))

    # Create formatted output
    separator = "-" * 50 + "\n"
    entries = [
        f"Entry {number}:\n"
        f"SMILES: {result['SMILES']}\n"
        f"Sequence: {result['Sequence']}\n"
        f"Is Cyclic: {result['Is Cyclic']}\n"
        f"Peptide Cycles: {result['Peptide Cycles']}\n"
        f"Aromatic Cycles: {result['Aromatic Cycles']}\n"
        f"{separator}"
        for number, result in enumerate(results, 1)
    ]
    return "".join(entries)

# Create Gradio interface
# process_input receives the two inputs below positionally
# (text box value, uploaded file) and returns the text shown in the output box.
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(
            label="Enter SMILES string",
            placeholder="Enter SMILES notation of peptide...",
            lines=2
        ),
        # type="binary": process_input calls .decode() on the upload, so it
        # needs the file contents as bytes rather than a path.
        gr.File(
            label="Or upload a text file with SMILES",
            file_types=[".txt"],
            type="binary"
        )
    ],
    outputs=gr.Textbox(
        label="Analysis Results",
        lines=10
    ),
    title="Peptide Structure Analyzer",
    description="""
    Analyze peptide structures from SMILES notation to:
    1. Determine if the peptide is cyclic
    2. Identify peptide cycles vs aromatic rings
    3. Parse the amino acid sequence
    
    Input: Either enter a SMILES string directly or upload a text file with multiple SMILES (one per line)
    """,
    # Each example is [text box value, file]; None leaves the file input empty.
    examples=[
        # Example cyclic peptide with Proline
        ["CC(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O", None],
        # Example cyclic peptide with ester bond
        ["CC(C)C[C@@H]1OC(=O)[C@H](C)NC(=O)[C@H](C(C)C)OC(=O)[C@H](C)N(C)C(=O)[C@@H](C)NC(=O)[C@@H](Cc2ccccc2)N(C)C1=O", None]
    ],
    # NOTE(review): presumably turns off Gradio's flagging button; confirm the
    # keyword is still accepted by the installed Gradio version.
    allow_flagging="never"
)

# Launch the app
# Only start the web server when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()