import gradio as gr
import cv2
import easyocr
import numpy as np
import requests
import os
import whisper
from transformers import pipeline

# Hugging Face access token, read from the environment and sent with every Inference API request.
API_KEY = os.getenv("API_KEY")

# Hosted Inference API endpoint used for facial emotion detection on the uploaded image.
IMAGE_API_URL = "https://api-inference.huggingface.co/models/dima806/facial_emotions_image_detection"
# GoEmotions Inference API endpoint; text sentiment below is run through a local pipeline instead.
EMOTIONS_API_URL = "https://api-inference.huggingface.co/models/SamLowe/roberta-base-go_emotions"

headers = {"Authorization": f"Bearer {API_KEY}"}

# EasyOCR reader for extracting English text from the uploaded image (CPU mode).
reader = easyocr.Reader(['en'], gpu=False)

# Local models: Whisper (base) for speech-to-text and a GoEmotions classifier for text sentiment.
model = whisper.load_model("base")
sentiment_analysis = pipeline("sentiment-analysis", framework="pt", model="SamLowe/roberta-base-go_emotions")

def query(image):
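    """Send the image to the facial-emotion Inference API and return a {label: score} dict."""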
    # Gradio delivers the image as an RGB array; convert to BGR before JPEG-encoding with OpenCV.
    image_data = cv2.cvtColor(np.array(image, dtype=np.uint8), cv2.COLOR_RGB2BGR)
    _, buffer = cv2.imencode('.jpg', image_data)
    binary_data = buffer.tobytes()

    response = requests.post(IMAGE_API_URL, headers=headers, data=binary_data)
    result = {item['label']: item['score'] for item in response.json()}

    return result

def text_extraction(image):
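    """OCR the image, draw boxes around confident detections, and get facial-emotion scores.

    Returns the annotated image, the concatenated extracted text, and the facial emotion dict.
    """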
    text_content = ''
    facial_data = query(image)
    detections = reader.readtext(image)
    threshold = 0.25
    for bbox, text, score in detections:
        # EasyOCR returns each detection's text as a string; append it as-is.
        text_content = text_content + ' ' + text
        # Only draw a bounding box when the OCR confidence clears the threshold.
        if score > threshold:
            cv2.rectangle(image, tuple(map(int, bbox[0])), tuple(map(int, bbox[2])), (0, 255, 0), 5)

    return image, text_content, facial_data

def analyze_sentiment(text):
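    """Classify the text with the GoEmotions pipeline and return a {label: score} dict."""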
    results = sentiment_analysis(text)
    sentiment_results = {result['label']: result['score'] for result in results}
    return sentiment_results

def get_sentiment_emoji(sentiment):
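    """Map a GoEmotions label to a representative emoji (empty string for unknown labels)."""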
    emoji_mapping = {
        "disappointment": "😞",
        "sadness": "😢",
        "annoyance": "😠",
        "neutral": "😐",
        "disapproval": "👎",
        "realization": "😮",
        "nervousness": "😬",
        "approval": "👍",
        "joy": "😄",
        "anger": "😡",
        "embarrassment": "😳",
        "caring": "🤗",
        "remorse": "😔",
        "disgust": "🤢",
        "grief": "😥",
        "confusion": "😕",
        "relief": "😌",
        "desire": "😍",
        "admiration": "😌",
        "optimism": "😊",
        "fear": "😨",
        "love": "❤️",
        "excitement": "🎉",
        "curiosity": "🤔",
        "amusement": "😄",
        "surprise": "😲",
        "gratitude": "🙏",
        "pride": "🦁"
    }
    return emoji_mapping.get(sentiment, "")

def display_sentiment_results(sentiment_results, option):
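    """Format sentiment scores as text, one emotion per line, optionally with its score."""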
    sentiment_text = ""
    for sentiment, score in sentiment_results.items():
        emoji = get_sentiment_emoji(sentiment)
        if option == "Sentiment Only":
            sentiment_text += f"{sentiment} {emoji}\n"
        elif option == "Sentiment + Score":
            sentiment_text += f"{sentiment} {emoji}: {score}\n"
    return sentiment_text

def inference(image, text, audio, sentiment_option):
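    """Run the full pipeline: OCR and facial emotions on the image, Whisper transcription and
    language detection on the audio, and GoEmotions sentiment on each piece of text.
    """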
    extracted_image, extracted_text, extracted_facial_data = text_extraction(image)
    
    # Load the audio, pad/trim it to Whisper's 30-second window, and compute the log-Mel spectrogram.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Identify the spoken language from the spectrogram.
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)

    # Decode the transcription (fp16 disabled so it also runs on CPU).
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    audio_sentiment_results = analyze_sentiment(result.text)            # Ta - Text from audio
    image_sentiment_results = analyze_sentiment(extracted_text)         # Ti - Text from image 
    text_sentiment_results = analyze_sentiment(text)                    # T  - User defined Text
    
    audio_sentiment_output = display_sentiment_results(audio_sentiment_results, sentiment_option)
    image_sentiment_output = display_sentiment_results(image_sentiment_results, sentiment_option)
    text_sentiment_output = display_sentiment_results(text_sentiment_results, sentiment_option)

    return extracted_image, extracted_facial_data, extracted_text, image_sentiment_output, text_sentiment_output, lang.upper(), result.text, audio_sentiment_output

title = """<h1 align="center">Cross-Modal Machine Learning (Sentiment Analysis)</h1>"""
image_path = "thmbnail.png"
description = """
💻 This demo showcases a cross-modal machine learning pipeline for sentiment analysis.<br><br>
<br>
⚙️ Components of the tool:<br>
<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Sentiment analysis of the image (facial emotion detection)<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Text extraction from the image<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Sentiment analysis of user-provided text<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Multilingual speech recognition<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Language identification<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Sentiment analysis of the transcription<br>
<br>
🎯 The sentiment analysis results are returned as a dictionary of emotions and their corresponding scores.<br>
<br>
😃 Each result is displayed with an emoji representing the corresponding sentiment.<br>
<br>
✅ The higher the score for an emotion, the stronger its presence in the analysed text.<br>
<br>
❓ Use the microphone to record speech for recognition.<br>
<br>
⚡️ The model transcribes the audio and performs sentiment analysis on the transcribed text.<br>

"""

custom_css = """
#banner-image {
    display: block;
    margin-left: auto;
    margin-right: auto;
}
#chat-message {
    font-size: 14px;
    min-height: 300px;
}
"""

block = gr.Blocks(css=custom_css)

with block:
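    # Layout: banner image and description on top, then the image, text, and audio inputs with their outputs.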
    gr.HTML(title)

    with gr.Row():
        with gr.Column():
            gr.Image(image_path, elem_id="banner-image", show_label=False)
        with gr.Column():
            gr.HTML(description)

    with gr.Group():
        with gr.Column():
            with gr.Row():
                image = gr.Image()
                
                image_output = gr.Image()
                text_output = gr.Textbox(label="Text Content")
                image_text_sentiment = gr.Textbox(label="Image Text Sentiment")
                facial_output = gr.Label(label='Facial Data', container=True, scale=2)

            with gr.Row():
                with gr.Column():
                    # Free-form text input that goes directly to the text sentiment model.
                    user_text = gr.Textbox(label="Input Text")

                    output_text_sentiment = gr.Textbox(label="Text Sentiment")

                with gr.Group():
                    with gr.Row():
                        audio = gr.Audio(label="Input Audio", show_label=False, type="filepath")
                    
                        with gr.Row():
                            sentiment_option = gr.Radio(choices=["Sentiment Only", "Sentiment + Score"], label="Select an option")
                            
                            lang_str = gr.Textbox(label="Language")
                            text = gr.Textbox(label="Transcription")
                            sentiment_output = gr.Textbox(label="Audio Text Sentiment")

        
        btn = gr.Button("Run")

        # Wire the Run button: image, user text, audio, and display option go in; annotated image, facial
        # emotions, extracted text, the three sentiment summaries, language, and transcription come out.
        btn.click(inference, inputs=[image, user_text, audio, sentiment_option], outputs=[image_output, facial_output, text_output, image_text_sentiment, output_text_sentiment, lang_str, text, sentiment_output])

block.launch()