Spaces:

mskov
/

Speech-Trigger-Detection

Runtime error

File size: 7,930 Bytes

babca6f
dc9cf4c
cbe4d4c
 
c8e54ed
1ae8e53
ec796a2
a6b9c5b
ff14337
218afdc
87e9ad0
ff14337
df85058
ff14337
 
 
53eb88c
 
 
 
 
 
 
28ff844
ff14337
 
df85058
a94b06f
df85058
 
 
 
 
 
61fa7d4
 
d4a83f2
34bf2a6
18d712a
 
61fa7d4
 
4b9eea9
df85058
d90b7ed
 
 
 
 
0c5e4a4
ed2f0b8
d90b7ed
ed2f0b8
 
fd26334
 
e7cf2e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c8e54ed
dc2eabd
 
c8e54ed
dc2eabd
 
d7bfcf2
 
395d676
d7bfcf2
395d676
d7bfcf2
 
395d676
d7bfcf2
59bfc5c
d90b7ed
d7bfcf2
22b7cff
d7bfcf2
eabbe21
d7bfcf2
 
d90b7ed
d7bfcf2
 
 
 
 
b394174
d90b7ed
d7bfcf2
d90b7ed
b394174
 
 
 
 
 
 
 
 
 
 
 
 
 
3f0c79b
b394174
3f0c79b
b394174
 
 
 
 
 
 
 
 
d7bfcf2
7cd77f2
6d9a5de
 
 
 
 
b394174
6d9a5de
 
d7bfcf2
ff14337
a6b9c5b
 
 
 
 
 
 
 
 
 
33b1b5b
df92cf7
53eb88c
18d712a
218afdc
18d712a
3e45c8c
33b1b5b
ca7ae8f
335e90e
 
02a7c9f
 
 
 
 
30dbd25
c8e54ed

import os
import whisper
import evaluate
from evaluate.utils import launch_gradio_widget
import gradio as gr
import torch
import pandas as pd
import random
import classify
import replace_explitives
from whisper.model import Whisper
from whisper.tokenizer import get_tokenizer
from speechbrain.pretrained.interfaces import foreign_class
from transformers import AutoModelForSequenceClassification, pipeline, WhisperTokenizer, RobertaForSequenceClassification, RobertaTokenizer, AutoTokenizer


# pull in emotion detection
# --- Add element for specification
# pull in text classification
# --- Add custom labels
# --- Associate labels with radio elements
# add logic to initiate mock notification when detected
# pull in misophonia-specific model

# NOTE(review): nothing visible in this file reads model_cache; confirm it is
# still needed before removing it.
model_cache = {}

# Maps the short emotion codes produced by the speech emotion classifier to
# the display names used by this app (see classify_emotion).
emo_dict = {
    'sad': 'Sad',
    'hap': 'Happy',
    'ang': 'Anger',
    'neu': 'Neutral',
}

# Candidate zero-shot labels for each trigger category. The keys must match
# the choices offered by the category radio button in the UI, because the
# selected choice is used directly as the lookup key.
# Static classes for now; it would be best to let the user select several
# categories and to enter their own.
class_options = {
    "racism": [
        "racism",
        "hate speech",
        "bigotry",
        "racially targeted",
        "racial slur",
        "ethnic slur",
        "ethnic hate",
        "pro-white nationalism",
    ],
    "LGBTQ+ hate": [
        "gay slur",
        "trans slur",
        "homophobic slur",
        "transphobia",
        # Fixed spelling (was "anti-LBGTQ+"): the label text is what the
        # zero-shot model scores against and what the UI displays.
        "anti-LGBTQ+",
        "hate speech",
    ],
    "sexually explicit": [
        "sexually explicit",
        "sexually coercive",
        "sexual exploitation",
        "vulgar",
        "raunchy",
        "sexist",
        "sexually demeaning",
        "sexual violence",
        "victim blaming",
    ],
    "alcohol use": [
        "alcohol",
        "drinking",
        "drinks",
        "under the influence",
        "liquor",
        "beer",
        "wine",
    ],
}

# Speech-to-text pipeline used to transcribe the uploaded audio.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large",
)

# Toxicity measurement backed by a RoBERTa hate-speech checkpoint.
toxicity_module = evaluate.load(
    "toxicity",
    "facebook/roberta-hate-speech-dynabench-r4-target",
)

# Speech emotion classifier loaded through SpeechBrain's custom-interface hook.
emotion_classifier = foreign_class(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
)

# NOTE(review): `device` is computed here but not passed to any of the models
# in this file; confirm whether the pipelines were meant to use it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Zero-shot classifier used to score transcripts against the category labels.
text_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)

def classify_emotion(audio):
    """Return the display name of the emotion detected in an audio file.

    Runs the module-level ``emotion_classifier`` on ``audio`` and translates
    the first predicted label through ``emo_dict``.

    Args:
        audio: Value handed to ``emotion_classifier.classify_file``
            (presumably a path to an audio file; confirm against the caller).

    Returns:
        The ``emo_dict`` entry for the first predicted label.

    Raises:
        KeyError: If the predicted label has no entry in ``emo_dict``.
    """
    # classify_file yields four values; only the text labels are needed here.
    _, _, _, predicted_labels = emotion_classifier.classify_file(audio)
    top_label = predicted_labels[0]
    return emo_dict[top_label]

def slider_logic(slider):
    """Translate the sensitivity slider position into a score threshold.

    Position 1 maps to the highest threshold (intervene only on very high
    scores) and position 5 to the lowest. Whole-number floats such as ``3.0``
    are accepted because they compare and hash equal to the matching int.

    Args:
        slider: Slider position, expected to be one of 1, 2, 3, 4 or 5.

    Returns:
        The threshold as a float. Any unrecognised position (``None``, a
        value outside 1-5, a non-numeric value) falls back to the threshold
        for position 1, so callers can always compare a score against the
        result. The previous fallback was an empty list, which made
        ``score > threshold`` raise ``TypeError``.
    """
    thresholds = {1: .98, 2: .88, 3: .77, 4: .66, 5: .55}
    fallback = thresholds[1]
    try:
        return thresholds.get(slider, fallback)
    except TypeError:
        # Unhashable input (e.g. a list) cannot be a valid slider position.
        return fallback

# Gradio click callback: transcribe the uploaded audio, score the transcript, and pick a response
def classify_toxicity(audio_file, classify_anxiety, emo_class, explitive_selection, slider):
    """Transcribe an audio file and score the transcript for triggering content.

    Args:
        audio_file: Audio input passed to the speech-recognition pipeline and,
            when ``emo_class`` is set, to ``classify_emotion``.
        classify_anxiety: Key into ``class_options`` selecting the category to
            score with the zero-shot classifier, or ``None`` to rely on the
            toxicity score alone.
        emo_class: When ``None``, the transcript is passed through
            ``replace_explitives.sub_explitives`` before scoring. Any other
            value skips that step and runs the emotion classifier instead.
        explitive_selection: Forwarded unchanged to
            ``replace_explitives.sub_explitives``.
        slider: Sensitivity setting, converted to a threshold by
            ``slider_logic``.

    Returns:
        A 4-tuple ``(transcribed_text, top_score, label_score_dict, affirm)``:
        the (possibly substituted) transcript; the larger of the toxicity
        score and the best category score (just the toxicity score when no
        category is selected); a label-to-score mapping; and an affirmation
        string when ``top_score`` exceeds the threshold, otherwise ``""``.
    """
    # Transcribe the audio file using Whisper ASR.
    transcribed_text = pipe(audio_file)["text"]

    threshold = slider_logic(slider)

    # `replace_explitives` is the imported module and is therefore always set,
    # so only `emo_class` decides whether substitution runs.
    # NOTE(review): the guard was probably meant to look at
    # `explitive_selection`; confirm how sub_explitives treats a None
    # selection before tightening it.
    if emo_class is None:
        transcribed_text = replace_explitives.sub_explitives(transcribed_text, explitive_selection)

    #### Toxicity classifier ####
    toxicity_results = toxicity_module.compute(predictions=[transcribed_text])
    toxicity_score = toxicity_results["toxicity"][0]
    print(toxicity_score)

    if emo_class is not None:
        # NOTE(review): the detected emotion is discarded; presumably it was
        # meant to influence the decision below. Confirm intent.
        classify_emotion(audio_file)

    #### Text classification ####
    if classify_anxiety is not None:
        print(classify_anxiety, class_options)
        candidate_labels = class_options.get(classify_anxiety, [])
        classification_output = text_classifier(transcribed_text, candidate_labels, multi_label=True)
        print("class output ", type(classification_output))
        print("keys ", classification_output.keys())

        label_score_dict = dict(zip(classification_output['labels'], classification_output['scores']))
        k = max(label_score_dict, key=label_score_dict.get)
        print("k keys: ", k)
        maxval = label_score_dict[k]
        print("max value: ", maxval)

        # Use whichever score is larger. Previously nothing was assigned when
        # the toxicity score was at least as large as the best category
        # score, so the return statement raised UnboundLocalError.
        topScore = max(maxval, toxicity_score)
        print("Toxic" if topScore > threshold else "Not Toxic")
    else:
        label_score_dict = {"toxicity": toxicity_score}
        topScore = toxicity_score

    # Only offer an affirmation when the score crosses the user's threshold.
    affirm = positive_affirmations() if topScore > threshold else ""

    return transcribed_text, topScore, label_score_dict, affirm
    
def positive_affirmations():
    """Return one calming affirmation chosen at random.

    Returns:
        One of the four fixed affirmation strings, picked with
        ``random.choice``.
    """
    return random.choice([
        "I have survived my anxiety before and I will survive again now",
        "I am not in danger; I am just uncomfortable; this too will pass",
        "I forgive and release the past and look forward to the future",
        "I can't control what other people say but I can control my breathing and my response",
    ])
    
# Build the UI: preference inputs, the audio upload, and the result widgets,
# then wire the button to classify_toxicity and start the app.
with gr.Blocks() as iface:
    # NOTE(review): show_state is not read anywhere visible in this file.
    show_state = gr.State([])
    with gr.Column():
        # Choices must match the keys of class_options; the selected value is
        # used directly as the lookup key.
        anxiety_class = gr.Radio(["racism", "LGBTQ+ hate", "sexually explicit", "alcohol use"])
        # The choice values are forwarded to replace_explitives.sub_explitives,
        # so their spelling is kept exactly as is; only the help text is fixed.
        explit_preference = gr.Radio(
            choices=["N-Word", "B-Word", "All Explitives"],
            label="Words to omit from general anxiety classes",
            info="Certain words may be acceptable within certain contexts for given groups of people, and some people may be unbothered by expletives broadly speaking.",
        )
        # classify_toxicity only checks whether this value is None, so the
        # choice text can be spelled correctly without affecting the logic.
        emo_class = gr.Radio(
            choices=["negative emotionality"],
            label="Negative Emotionality",
            info="Select if you would like expletives to be considered anxiety-inducing in the case of anger/negative emotionality.",
        )
        sense_slider = gr.Slider(
            minimum=1,
            maximum=5,
            step=1.0,
            label="How readily do you want the tool to intervene? 1 = in extreme cases and 5 = at every opportunity",
        )
    with gr.Column():
        # NOTE(review): `source=` is the older keyword spelling; newer Gradio
        # releases use `sources=[...]`. Confirm against the pinned version.
        aud_input = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
        # A button's caption is its `value`; `label` is not a declared Button
        # parameter, so the caption is passed as `value` instead.
        submit_btn = gr.Button(value="Run")
    with gr.Column():
        out_text = gr.Textbox(label="Transcribed Audio")
        out_val = gr.Textbox(label="Overall Toxicity")
        out_class = gr.Label(label="Toxicity Class Breakdown")
        out_affirm = gr.Textbox(label="Automated Text Message")
    # Input order must match the classify_toxicity parameter order, and output
    # order must match its returned tuple.
    submit_btn.click(
        fn=classify_toxicity,
        inputs=[aud_input, anxiety_class, emo_class, explit_preference, sense_slider],
        outputs=[out_text, out_val, out_class, out_affirm],
    )

iface.launch()