import os
import logging
from typing import Optional, Tuple

import gradio as gr
import torch
from gtts import gTTS
from transformers import pipeline

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Unset proxy environment variables before initializing any Hugging Face
# components; stale proxy settings can break model downloads from the Hub.
os.environ.pop("HTTP_PROXY", None)
os.environ.pop("HTTPS_PROXY", None)
os.environ.pop("http_proxy", None)
os.environ.pop("https_proxy", None)


def create_temp_dir():
    """Create temporary directory for audio files if it doesn't exist"""
    temp_dir = "temp_audio"
    os.makedirs(temp_dir, exist_ok=True)
    return temp_dir

class EnhancedAIAgent:
    def __init__(self):
        """Initialize the AI agent with models and pipelines"""
        try:
            logger.info("Initializing AI Agent...")
            
            # Set device
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"Using device: {self.device}")
            
            # Initialize speech recognition
            logger.info("Loading speech recognition model...")
            self.speech_recognizer = pipeline(
                "automatic-speech-recognition",
                model="facebook/wav2vec2-base-960h",
                device=self.device
            )
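            # wav2vec2-base-960h is an English-only CTC model; given a file
            # path, the ASR pipeline decodes the audio and resamples it to the
            # model's 16 kHz rate (ffmpeg must be installed for decoding)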
            
            # Initialize sentiment analysis
            logger.info("Loading sentiment analysis model...")
            self.sentiment_analyzer = pipeline(
                "sentiment-analysis",
                model="distilbert-base-uncased-finetuned-sst-2-english",
                device=self.device
            )
            
            # Initialize emotion recognition
            logger.info("Loading emotion recognition model...")
            self.emotion_recognizer = pipeline(
                "text-classification",
                model="j-hartmann/emotion-english-distilroberta-base",
                device=self.device
            )
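            # This checkpoint predicts Ekman's six basic emotions plus neutral
            # (anger, disgust, fear, joy, sadness, surprise, neutral)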
            
            # Create temporary directory for audio files
            self.temp_dir = create_temp_dir()
            
            logger.info("AI Agent initialized successfully")
            
        except Exception as e:
            logger.error(f"Error initializing AI Agent: {str(e)}")
            raise

    def process_audio(
        self, 
        audio_path: Optional[str], 
        history: Optional[str]
    ) -> Tuple[str, Optional[str], str, str]:
        """Process audio input and generate response"""
        try:
            # Initialize history
            history_list = [] if not history else history.split('\n')
            
            # Handle no audio input
            if not audio_path:
                return history or "", None, "", ""
            
            # Convert speech to text
            logger.info("Converting speech to text...")
            user_input = self.speech_recognizer(audio_path)["text"]
            logger.info(f"Transcribed text: {user_input}")
            
            # Generate simple response (can be enhanced later)
            response = f"I heard you say: {user_input}"
            
            # Analyze sentiment (truncate long transcripts to the model's
            # 512-token limit; the kwarg is forwarded to the tokenizer)
            logger.info("Analyzing sentiment...")
            sentiment_result = self.sentiment_analyzer(user_input, truncation=True)[0]
            sentiment = f"Sentiment: {sentiment_result['label']} ({sentiment_result['score']:.2f})"
            
            # Analyze emotion
            logger.info("Analyzing emotion...")
            emotion_result = self.emotion_recognizer(user_input, truncation=True)[0]
            emotion = f"Emotion: {emotion_result['label']} ({emotion_result['score']:.2f})"
            
            # Generate audio response
            logger.info("Generating audio response...")
            audio_output = self.text_to_speech(response)
            
            # Update history
            history_list.extend([
                f"User: {user_input}",
                f"Assistant: {response}",
                sentiment,
                emotion
            ])
            
            return '\n'.join(history_list), audio_output, sentiment, emotion
            
        except Exception as e:
            logger.error(f"Error processing audio: {str(e)}")
            return history or "", None, f"Error: {str(e)}", f"Error: {str(e)}"
    
    def text_to_speech(self, text: str) -> Optional[str]:
        """Convert text to speech"""
        try:
            # gTTS writes MP3, so use a matching extension (Gradio plays MP3
            # file paths directly); hashing the text lets repeated responses
            # reuse the same file within a run
            output_path = os.path.join(self.temp_dir, f"response_{abs(hash(text))}.mp3")
            tts = gTTS(text=text, lang='en')
            tts.save(output_path)
            return output_path
        except Exception as e:
            logger.error(f"Error in text-to-speech conversion: {str(e)}")
            return None

def create_interface():
    """Create the Gradio interface"""
    try:
        # Initialize AI Agent
        agent = EnhancedAIAgent()
        
        # Define interface
        with gr.Blocks() as interface:
            gr.Markdown("# AI Speech Analysis App")
            
            with gr.Row():
                with gr.Column(scale=2):
                    # Audio input
                    audio_input = gr.Audio(
                        label="Record your message",
                        type="filepath"
                    )
                    
                    # Analyze button
                    analyze_button = gr.Button(
                        "Analyze Speech",
                        variant="primary"
                    )
                    
                    # Chat history
                    chat_history = gr.Textbox(
                        label="Conversation History",
                        lines=10,
                        interactive=False
                    )
                    
                    # Audio output
                    audio_output = gr.Audio(
                        label="AI Response",
                        type="filepath"
                    )
                    
                    # Analysis displays
                    sentiment_display = gr.Textbox(
                        label="Sentiment Analysis",
                        interactive=False
                    )
                    
                    emotion_display = gr.Textbox(
                        label="Emotion Recognition",
                        interactive=False
                    )
            
            # Wire the button to process_audio; the outputs list mirrors its
            # return tuple (history, response audio path, sentiment, emotion)
            analyze_button.click(
                fn=agent.process_audio,
                inputs=[audio_input, chat_history],
                outputs=[chat_history, audio_output, sentiment_display, emotion_display]
            )
            
            # Instructions
            gr.Markdown("""
            ### How to Use:
            1. Click the microphone icon to start recording
            2. Speak your message
            3. Click stop when finished
            4. Press "Analyze Speech" to process your message
            5. View the results and listen to the response
            """)
        
        return interface
    
    except Exception as e:
        logger.error(f"Error creating interface: {str(e)}")
        raise

# Create the interface at import time; Hugging Face Spaces expects a
# module-level `demo` object to serve
demo = create_interface()

# Launch the app
if __name__ == "__main__":
    demo.launch()