Spaces:

mskov
/

Speech-Trigger-Detection

Runtime error

File size: 7,669 Bytes

babca6f
 
dc9cf4c
cbe4d4c
 
c8e54ed
1ae8e53
ec796a2
ff14337
218afdc
87e9ad0
ff14337
df85058
ff14337
 
 
53eb88c
 
 
 
 
 
 
28ff844
ff14337
 
df85058
a94b06f
df85058
 
 
 
 
 
61fa7d4
 
d4a83f2
34bf2a6
d4a83f2
61fa7d4
 
 
4b9eea9
df85058
0c5e4a4
ed2f0b8
 
 
 
fd26334
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9a0cdb
c8e54ed
dd5c246
c8e54ed
bbd3701
2cadcf2
f10b2fa
6bfef5d
395d676
ed2f0b8
dd5c246
 
fd26334
8c28395
e91d036
 
 
218afdc
 
395d676
 
 
 
73d041b
395d676
 
 
 
b31f1e8
 
0c5e4a4
218afdc
395d676
 
 
 
 
 
 
 
 
 
 
95f2d9c
184643c
f5f212e
b9a0cdb
184643c
fd26334
 
74753c4
395d676
 
fd26334
6615174
ff14337
7a481f6
187b547
bb7f792
187b547
bb7f792
 
 
7a481f6
187b547
9bae889
 
ff14337
 
6615174
ff14337
789fd51
ff14337
 
be06195
ff14337
 
 
 
 
 
 
 
 
 
395d676
ff14337
33b1b5b
53eb88c
187b547
218afdc
 
dd5c246
33b1b5b
ca7ae8f
 
335e90e
 
ec796a2
401d5c0
33b1b5b
dd5c246
30dbd25
c8e54ed

import os
import subprocess
import sys

# Install Whisper from source at startup, before it is imported below.
# The command is passed as an argument list (no shell) and goes through
# ``sys.executable -m pip`` so the package lands in the interpreter that is
# running this script. As before, a failed install is not fatal here; if
# Whisper is unavailable the import on the next line raises ImportError.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "git+https://github.com/openai/whisper.git"],
    check=False,
)
import whisper
import evaluate
from evaluate.utils import launch_gradio_widget
import gradio as gr
import torch
import pandas as pd
import classify
import replace_explitives
from whisper.model import Whisper
from whisper.tokenizer import get_tokenizer
from speechbrain.pretrained.interfaces import foreign_class
from transformers import AutoModelForSequenceClassification, pipeline, WhisperTokenizer, RobertaForSequenceClassification, RobertaTokenizer, AutoTokenizer


# pull in emotion detection
# --- Add element for specification
# pull in text classification
# --- Add custom labels
# --- Associate labels with radio elements
# add logic to initiate mock notification when detected
# pull in misophonia-specific model

# Module-level cache for loaded models; starts empty.
model_cache = dict()

# Short emotion labels (as indexed by classify_emotion) -> display names.
emo_dict = dict(sad='Sad', hap='Happy', ang='Anger', neu='Neutral')

# Candidate labels for each selectable anxiety class.
# Static for now; it would be best to let the user select several classes
# and to enter their own.
class_options = {
    "racism": [
        "racism",
        "hate speech",
        "bigotry",
        "racially targeted",
        "racial slur",
        "ethnic slur",
        "ethnic hate",
        "pro-white nationalism",
    ],
    "LGBTQ+ hate": [
        "gay slur",
        "trans slur",
        "homophobic slur",
        "transphobia",
        "anti-LBGTQ+",
        "hate speech",
    ],
    "sexually explicit": [
        "sexually explicit",
        "sexually coercive",
        "sexual exploitation",
        "vulgar",
        "raunchy",
        "sexist",
        "sexually demeaning",
        "sexual violence",
        "victim blaming",
    ],
    "misophonia": [
        "chewing",
        "breathing",
        "mouthsounds",
        "popping",
        "sneezing",
        "yawning",
        "smacking",
        "sniffling",
        "panting",
    ],
}

# Speech-recognition pipeline, built once at import time; classify_toxicity
# uses it to transcribe uploaded audio.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large",
)

def classify_emotion(audio):
    """Return the display name of the emotion predicted for an audio file.

    Args:
        audio: Audio file reference handed to the classifier's
            ``classify_file`` (the UI in this file supplies a file path).

    Returns:
        The ``emo_dict`` display name for the first predicted label.

    Raises:
        KeyError: If the predicted label has no entry in ``emo_dict``.
    """
    cache_key = "emotion_classifier"
    emotion_classifier = model_cache.get(cache_key)
    if emotion_classifier is None:
        # Building the classifier is expensive; do it once and reuse it on
        # later calls instead of reloading it for every request.
        emotion_classifier = foreign_class(
            source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
            pymodule_file="custom_interface.py",
            classname="CustomEncoderWav2vec2Classifier",
        )
        model_cache[cache_key] = emotion_classifier
    # Only the text label is needed; the other three outputs are ignored.
    _out_prob, _score, _index, text_lab = emotion_classifier.classify_file(audio)
    return emo_dict[text_lab[0]]

# Toxicity threshold for each sensitivity level. Per the slider's label,
# level 1 intervenes only in extreme cases (highest threshold) and level 5
# intervenes at every opportunity (lowest threshold).
_SLIDER_THRESHOLDS = {1: .98, 2: .88, 3: .77, 4: .66, 5: .55}


def slider_logic(slider):
    """Map a sensitivity slider value to a toxicity threshold.

    Args:
        slider: Slider value. Values within 1..5 are rounded to the nearest
            whole level (halves round up), so a fractional position such as
            2.4 still yields a usable threshold.

    Returns:
        The float threshold for the level, or ``[]`` (the original fallback
        sentinel) when ``slider`` is not numeric or lies outside 1..5.
    """
    try:
        value = float(slider)
    except (TypeError, ValueError, OverflowError):
        return []
    # NaN also fails this range test and falls back to the sentinel.
    if not 1 <= value <= 5:
        return []
    return _SLIDER_THRESHOLDS[int(value + 0.5)]
    
# Gradio callback: transcribe the audio (or take the typed text) and classify it
def _cached_model(key, loader):
    """Return the object stored under ``key`` in ``model_cache``, creating it once via ``loader``."""
    if key not in model_cache:
        model_cache[key] = loader()
    return model_cache[key]


def _score_misophonia(audio_file, class_names):
    """Score ``audio_file`` against ``class_names`` with Whisper and return ``{class_name: score}``."""
    model = _cached_model("whisper-large", lambda: whisper.load_model("large"))
    tokenizer = get_tokenizer("large")

    internal_lm_average_logprobs = classify.calculate_internal_lm_average_logprobs(
        model=model,
        class_names=class_names,
        tokenizer=tokenizer,
    )
    audio_features = classify.calculate_audio_features(audio_file, model)
    average_logprobs = classify.calculate_average_logprobs(
        model=model,
        audio_features=audio_features,
        class_names=class_names,
        tokenizer=tokenizer,
    )
    # Subtract the text-only log-probabilities before normalising.
    average_logprobs -= internal_lm_average_logprobs
    scores = average_logprobs.softmax(-1).tolist()
    return dict(zip(class_names, scores))


def classify_toxicity(audio_file, text_input, classify_anxiety, emo_class, explitive_selection, slider):
    """Gradio callback: transcribe the input and classify it for the chosen anxiety class.

    Args:
        audio_file: Uploaded audio file, or ``None`` to use ``text_input``.
        text_input: Text used as the transcript when no audio is given.
        classify_anxiety: Selected class; a key of ``class_options``.
            ``"misophonia"`` selects the audio-based Whisper scoring path.
        emo_class: Emotion option; when ``None`` the transcript is passed
            through ``replace_explitives.sub_explitives`` first.
        explitive_selection: Option forwarded to ``sub_explitives``.
        slider: Sensitivity level, converted by ``slider_logic``.

    Returns:
        A 3-tuple ``(toxicity_score, classification_output, transcribed_text)``
        matching the three output components bound to this callback. On the
        misophonia path ``toxicity_score`` is ``None`` and
        ``classification_output`` is a ``{class_name: score}`` dict.

    Raises:
        ValueError: If ``"misophonia"`` is selected without an audio file.
    """
    # Transcribe the audio file using Whisper ASR
    if audio_file is not None:
        transcribed_text = pipe(audio_file)["text"]
    else:
        transcribed_text = text_input

    if classify_anxiety == "misophonia":
        if audio_file is None:
            # This path scores the audio itself, so fail early and clearly
            # instead of loading the model and crashing on a missing file.
            raise ValueError("The misophonia class requires an uploaded audio file.")
        # Use the label list directly; joining with "," and splitting again
        # appended an empty class name that was then scored with the rest.
        class_names = list(class_options.get(classify_anxiety, []))
        print("class names ", class_names, "classify_anxiety ", classify_anxiety)
        return None, _score_misophonia(audio_file, class_names), transcribed_text

    print("emo_class ", emo_class, "explitive select", explitive_selection)

    ## SLIDER ##
    threshold = slider_logic(slider)

    #------- explitive call ---------------
    # NOTE(review): the original also tested `replace_explitives != None`,
    # which is always true for an imported module; `explitive_selection` may
    # have been intended. Behaviour is kept as it was. Confirm.
    if emo_class is None:
        transcribed_text = replace_explitives.sub_explitives(transcribed_text, explitive_selection)

    #### Toxicity Classifier ####
    toxicity_module = _cached_model(
        "toxicity",
        lambda: evaluate.load("toxicity", "facebook/roberta-hate-speech-dynabench-r4-target"),
    )
    toxicity_results = toxicity_module.compute(predictions=[transcribed_text])
    toxicity_score = toxicity_results["toxicity"][0]
    print(toxicity_score)

    # emo call: needs an audio file, so it is skipped for text-only input.
    # The result is only logged; it does not yet influence the outputs.
    if emo_class is not None and audio_file is not None:
        print("emotion ", classify_emotion(audio_file))

    #### Text classification #####
    print(classify_anxiety, class_options)
    candidate_labels = class_options.get(classify_anxiety, [])
    if candidate_labels:
        text_classifier = _cached_model(
            "zero-shot",
            lambda: pipeline("zero-shot-classification", model="facebook/bart-large-mnli"),
        )
        classification_output = text_classifier(transcribed_text, candidate_labels, multi_label=True)
    else:
        # No known class selected: the zero-shot pipeline rejects an empty
        # label list, so return an empty result with the same keys instead.
        classification_output = {"sequence": transcribed_text, "labels": [], "scores": []}
    print("class output ", type(classification_output))
    print("keys ", classification_output.keys())

    # slider_logic may return a non-numeric sentinel; only compare numbers.
    if isinstance(threshold, (int, float)) and toxicity_score > threshold:
        print("threshold exceeded!!")
    return toxicity_score, classification_output, transcribed_text
     
# Gradio UI: option controls, inputs, and three text outputs wired to
# classify_toxicity.
with gr.Blocks() as iface:
    with gr.Column():
        anxiety_class = gr.Radio(["racism", "LGBTQ+ hate", "sexually explicit", "misophonia"])
        # NOTE(review): the selected choice string is forwarded to
        # replace_explitives.sub_explitives, so the choice values are left
        # exactly as they were. Confirm before renaming any of them.
        explit_preference = gr.Radio(choices=["N-Word", "B-Word", "All Explitives"], label="Words to omit from general anxiety classes", info="certain words may be acceptible within certain contects for given groups of people, and some people may be unbothered by explitives broadly speaking.")
        emo_class = gr.Radio(choices=["negaitve emotionality"], label="label", info="Select if you would like explitives to be considered anxiety-indiucing in the case of anger/ negative emotionality.")
        # step=1 restricts the slider to the whole levels 1..5 that
        # slider_logic maps to thresholds.
        sense_slider = gr.Slider(minimum=1, maximum=5, step=1, label="How readily do you want the tool to intervene? 1 = in extreme cases and 5 = at every opportunity")
    with gr.Column():
        aud_input = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
        text = gr.Textbox(label="Enter Text", placeholder="Enter text here...")
        # A button's caption is its `value`.
        submit_btn = gr.Button(value="Run")
    with gr.Column():
        out_val = gr.Textbox()
        out_class = gr.Textbox()
        out_text = gr.Textbox()
    # Input order must match the parameter order of classify_toxicity.
    submit_btn.click(fn=classify_toxicity, inputs=[aud_input, text, anxiety_class, emo_class, explit_preference, sense_slider], outputs=[out_val, out_class, out_text])

iface.launch()