import streamlit as st
import pandas as pd
import numpy as np
import torch
import networkx as nx
import plotly.express as px
import plotly.graph_objs as go
import spacy
from scipy.signal import savgol_filter
from sklearn.feature_extraction.text import TfidfVectorizer

# Advanced NLP libraries
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    pipeline,
    RobertaTokenizer,
    RobertaForSequenceClassification
)
import nltk
from textstat import flesch_reading_ease, flesch_kincaid_grade

st.set_page_config(page_title="Advanced Political Speech Analysis",
                   page_icon="🗣️", layout="wide")

# Load spaCy model (requires a separate download)
try:
    nlp = spacy.load('en_core_web_lg')
except OSError:
    st.error("Please install spaCy and the en_core_web_lg model:\n"
             "pip install spacy\n"
             "python -m spacy download en_core_web_lg")
    st.stop()

# Constants and configurations
MORAL_FOUNDATIONS = {
    'care': 'Care/Harm',
    'fairness': 'Fairness/Cheating',
    'loyalty': 'Loyalty/Betrayal',
    'authority': 'Authority/Subversion',
    'sanctity': 'Sanctity/Degradation'
}

# Surface markers for a rough, lexicon-based device count;
# substring matching is crude (e.g. 'as' also matches 'was')
RHETORICAL_DEVICES = {
    'analogy': ['like', 'as', 'similar to'],
    'repetition': ['repetitive', 'recurring'],
    'metaphor': ['as if', 'like', 'represents'],
    'hyperbole': ['always', 'never', 'absolute'],
    'rhetorical_question': ['?']
}


class SpeechAnalyzer:
    def __init__(self):
        # Load the moral-foundations classifier
        self.moral_model_path = "MMADS/MoralFoundationsClassifier"
        self.moral_tokenizer = RobertaTokenizer.from_pretrained(self.moral_model_path)
        self.moral_model = RobertaForSequenceClassification.from_pretrained(self.moral_model_path)

        # Label order is assumed to match the model's classification head
        self.label_names = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']

        # General-purpose pipelines
        self.sentiment_pipeline = pipeline("sentiment-analysis")
        self.ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        self.ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        self.ner_pipeline = pipeline("ner", model=self.ner_model,
                                     tokenizer=self.ner_tokenizer,
                                     aggregation_strategy="simple")

    def split_text(self, text, max_length=512, overlap=50):
        """Split long text into overlapping word-based segments."""
        words = text.split()
        segments = []
        current_segment = []

        for word in words:
            if len(current_segment) >= max_length:
                segments.append(' '.join(current_segment))
                # Carry the last `overlap` words into the next segment
                current_segment = current_segment[-overlap:]
            current_segment.append(word)

        if current_segment:
            segments.append(' '.join(current_segment))

        return segments

    def analyze_moral_foundations(self, text):
        """Analyze moral foundations using the RoBERTa-based classifier."""
        segments = self.split_text(text)

        foundation_scores = {foundation: [] for foundation in MORAL_FOUNDATIONS}

        for segment in segments:
            inputs = self.moral_tokenizer(segment, return_tensors="pt",
                                          truncation=True, max_length=512)
            with torch.no_grad():
                outputs = self.moral_model(**inputs)
            probabilities = torch.softmax(outputs.logits, dim=1)

            for idx, label in enumerate(self.label_names):
                if label in foundation_scores:
                    foundation_scores[label].append(probabilities[0][idx].item())

        # Average the scores across segments
        aggregated_scores = {
            foundation: float(np.mean(scores)) if scores else 0.0
            for foundation, scores in foundation_scores.items()
        }
        return aggregated_scores

    def analyze_emotional_trajectory(self, text, window_size=5):
        """Track sentiment segment by segment across the speech."""
        segments = self.split_text(text, max_length=256)

        sentiment_scores = []
        for segment in segments:
            result = self.sentiment_pipeline(segment, truncation=True)[0]
            # Signed confidence: positive sentiment above zero, negative below
            score = result['score'] if result['label'] == 'POSITIVE' else -result['score']
            sentiment_scores.append(score)

        # Smooth the trajectory once there are enough points for the filter
        # (window_length must be odd and no larger than the series)
        if len(sentiment_scores) > window_size:
            return savgol_filter(sentiment_scores,
                                 window_length=window_size, polyorder=2)
        return sentiment_scores

    def detect_named_entities(self, text):
        """Detect named entities, chunking so long texts fit the model.

        Character offsets in the results are relative to each segment.
        """
        entities = []
        for segment in self.split_text(text, max_length=256, overlap=0):
            entities.extend(self.ner_pipeline(segment))
        return entities

    def extract_key_phrases(self, text, top_n=10):
        """Extract key phrases using TF-IDF over uni- and bigrams."""
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
        tfidf_matrix = vectorizer.fit_transform([text])
        feature_names = vectorizer.get_feature_names_out()

        # Rank phrases by TF-IDF score, highest first
        sorted_idx = tfidf_matrix.toarray()[0].argsort()[::-1]
        return [feature_names[i] for i in sorted_idx[:top_n]]

    def calculate_readability(self, text):
        """Calculate readability metrics."""
        return {
            'Flesch Reading Ease': flesch_reading_ease(text),
            'Flesch-Kincaid Grade Level': flesch_kincaid_grade(text)
        }

    def detect_rhetorical_devices(self, text):
        """Count occurrences of the marker phrases for each device."""
        devices_found = {}
        for device, markers in RHETORICAL_DEVICES.items():
            count = sum(text.lower().count(marker) for marker in markers)
            if count > 0:
                devices_found[device] = count
        return devices_found

    def create_semantic_network(self, text, top_n=20):
        """Build a co-occurrence network of the most frequent nouns."""
        # Use spaCy for part-of-speech tagging and sentence segmentation
        doc = nlp(text)

        G = nx.Graph()

        # The most frequent nouns become the nodes
        nouns = [token.text for token in doc if token.pos_ == 'NOUN']
        noun_freq = nltk.FreqDist(nouns)
        top_nouns = [noun for noun, _ in noun_freq.most_common(top_n)]
        G.add_nodes_from(top_nouns)

        # Connect nouns that co-occur within the same sentence
        top_noun_set = set(top_nouns)
        for sent in doc.sents:
            sent_nouns = list({t.text for t in sent if t.text in top_noun_set})
            for i in range(len(sent_nouns)):
                for j in range(i + 1, len(sent_nouns)):
                    G.add_edge(sent_nouns[i], sent_nouns[j])

        # Precompute a layout so the caller can draw the graph
        pos = nx.spring_layout(G, seed=42)
        nx.set_node_attributes(G, pos, 'pos')

        return G


@st.cache_resource
def load_analyzer():
    """Load the model-heavy analyzer once and reuse it across reruns."""
    return SpeechAnalyzer()


def main():
    st.title("🗣️ Advanced Political Speech Analysis Toolkit")

    # Initialize analyzer (cached so models load only once)
    analyzer = load_analyzer()

    # File upload
    uploaded_file = st.file_uploader("Upload Political Speech",
                                     type=['txt', 'docx', 'pdf'])

    if uploaded_file is not None:
        # Read the file according to its extension
        if uploaded_file.name.endswith('.txt'):
            text = uploaded_file.getvalue().decode('utf-8')
        elif uploaded_file.name.endswith('.docx'):
            import docx
            doc = docx.Document(uploaded_file)
            text = '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        else:  # .pdf
            import PyPDF2
            pdf_reader = PyPDF2.PdfReader(uploaded_file)
            # extract_text() can return None for image-only pages
            text = ' '.join(page.extract_text() or '' for page in pdf_reader.pages)

        # Create tabs for the different analyses
        tab1, tab2, tab3, tab4, tab5 = st.tabs([
            "Moral Foundations",
            "Emotional Analysis",
            "Linguistic Insights",
            "Semantic Network",
            "Advanced NLP"
        ])

        with tab1:
            st.subheader("Moral Foundations Analysis")
            moral_scores = analyzer.analyze_moral_foundations(text)

            # Plotly bar chart
            moral_df = pd.DataFrame.from_dict(moral_scores, orient='index',
                                              columns=['Score'])
            moral_df.index.name = 'Moral Foundation'
            moral_df = moral_df.reset_index()
            fig = px.bar(
                moral_df,
                x='Moral Foundation',
                y='Score',
                title='Moral Foundations Breakdown',
                color='Moral Foundation'
            )
            st.plotly_chart(fig)

            # Detailed insights
            for foundation, score in moral_scores.items():
                st.write(f"**{MORAL_FOUNDATIONS[foundation]}**: {score:.2%}")

        with tab2:
            st.subheader("Emotional Trajectory")
            emotional_trajectory = analyzer.analyze_emotional_trajectory(text)

            # Plotly line chart
            trajectory_fig = go.Figure(data=go.Scatter(
                y=emotional_trajectory,
                mode='lines+markers',
                name='Emotional Intensity'
            ))
            trajectory_fig.update_layout(
                title='Speech Emotional Trajectory',
                xaxis_title='Speech Segments',
                yaxis_title='Emotional Intensity'
            )
            st.plotly_chart(trajectory_fig)

        with tab3:
            st.subheader("Linguistic Complexity")
            readability = analyzer.calculate_readability(text)

            col1, col2 = st.columns(2)
            with col1:
                st.metric("Flesch Reading Ease",
                          f"{readability['Flesch Reading Ease']:.2f}")
            with col2:
                st.metric("Flesch-Kincaid Grade Level",
                          f"{readability['Flesch-Kincaid Grade Level']:.2f}")

            # Key phrases
            st.subheader("Key Phrases")
            key_phrases = analyzer.extract_key_phrases(text)
            st.write(", ".join(key_phrases))

        with tab4:
            st.subheader("Semantic Network")
            semantic_graph = analyzer.create_semantic_network(text)

            # Edge coordinates from the precomputed layout;
            # None separates one edge from the next so Plotly draws segments
            edge_x, edge_y = [], []
            for u, v in semantic_graph.edges():
                x0, y0 = semantic_graph.nodes[u]['pos']
                x1, y1 = semantic_graph.nodes[v]['pos']
                edge_x.extend([x0, x1, None])
                edge_y.extend([y0, y1, None])

            node_x = [semantic_graph.nodes[n]['pos'][0] for n in semantic_graph.nodes()]
            node_y = [semantic_graph.nodes[n]['pos'][1] for n in semantic_graph.nodes()]

            # Plotly network visualization: edges first, then labeled nodes
            network_fig = go.Figure()
            network_fig.add_trace(go.Scatter(
                x=edge_x, y=edge_y,
                mode='lines',
                line=dict(width=0.5, color='#888'),
                hoverinfo='none'
            ))
            network_fig.add_trace(go.Scatter(
                x=node_x, y=node_y,
                mode='markers+text',
                text=list(semantic_graph.nodes()),
                textposition='top center',
                marker=dict(size=10)
            ))
            st.plotly_chart(network_fig)

        with tab5:
            st.subheader("Advanced NLP Analysis")

            # Named entities
            st.write("### Named Entities")
            named_entities = analyzer.detect_named_entities(text)
            entities_df = pd.DataFrame(named_entities)
            st.dataframe(entities_df)

            # Rhetorical devices
            st.write("### Rhetorical Devices")
            rhetorical_devices = analyzer.detect_rhetorical_devices(text)
            for device, count in rhetorical_devices.items():
                st.write(f"**{device.capitalize()}**: {count} instances")


if __name__ == "__main__":
    main()
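# Usage sketch: assumes this script is saved as app.py and the packages below
# (their usual PyPI names, unpinned) are installed. The first run also
# downloads the Hugging Face models (MMADS/MoralFoundationsClassifier and
# dslim/bert-base-NER), so it needs network access:
#
#   pip install streamlit torch transformers spacy nltk scikit-learn scipy \
#       plotly networkx pandas numpy textstat python-docx PyPDF2
#   python -m spacy download en_core_web_lg
#   streamlit run app.py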