File size: 2,457 Bytes
2427679
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#importing the necessary libraries
import gradio as gr
import numpy as np
import pandas as pd
import re
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from topic_labels import labels

# Define the model and tokenizer.
# Both are loaded once, at import time, from the checkpoint named below and are
# then used as module-level globals by predict_topic.
model_name = "valurank/distilroberta-topic-classification"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# NOTE(review): device placement is disabled; no `device` name is defined in
# the visible code, so the model stays wherever from_pretrained puts it.
#model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def clean_text(raw_text):
    """Normalize raw input text before it is tokenized.

    Steps, in order:
      1. Drop every non-ASCII character (e.g. accented or CJK characters).
      2. Collapse each run of whitespace (spaces, tabs, ``\\n``, ``\\r``, ...)
         into a single space and trim the ends.
      3. Remove ``Date D/M/YYYY`` stamps and ``H:MM XX YYY`` time stamps.
      4. Collapse spaces again, because removing a stamp from the middle of
         a sentence leaves two adjacent spaces (or a stray leading/trailing
         space) behind.

    Args:
        raw_text: The text to clean, as a ``str``.

    Returns:
        The cleaned ASCII-only string with single spaces between words and
        no leading or trailing whitespace.
    """
    # Remove non-ASCII characters.
    text = raw_text.encode("ascii", errors="ignore").decode("ascii")

    # Normalize whitespace first so the single-``\s`` date/time patterns
    # below also match stamps that were split by several blanks or a newline.
    text = re.sub(r"\s+", " ", text).strip()

    text = re.sub(r"Date\s\d{1,2}\/\d{1,2}\/\d{4}", "", text)  # remove date
    text = re.sub(r"\d{1,2}:\d{2}\s[A-Z]+\s[A-Z]+", "", text)  # remove time

    # Tidy up the gaps left by the removals above.
    return re.sub(r" +", " ", text).strip()


def find_two_highest_indices(arr):
    """Return the positions of the largest and second-largest values.

    Args:
        arr: An indexable, sized sequence of mutually comparable values.

    Returns:
        A ``(leader, runner_up)`` tuple of indices into ``arr``. On ties the
        earliest position wins, because only a strictly greater value
        displaces a current holder.

    Raises:
        ValueError: If ``arr`` holds fewer than two elements.
    """
    if len(arr) < 2:
        raise ValueError("Array must have at least two elements")

    leader = None
    runner_up = None

    for position, candidate in enumerate(arr):
        if leader is None or candidate > arr[leader]:
            # New best value: the previous best is demoted to second place.
            leader, runner_up = position, leader
            continue

        if runner_up is None or candidate > arr[runner_up]:
            runner_up = position

    return leader, runner_up


def predict_topic(text):
    """Classify ``text`` and return its two most probable topics.

    The input is cleaned with ``clean_text``, encoded by the module-level
    ``tokenizer`` (with truncation enabled) and scored by the module-level
    ``model``. The logits are turned into probabilities with a softmax.

    Args:
        text: Raw input string to classify.

    Returns:
        A dict mapping the two highest-probability entries of ``labels`` to
        their probabilities, rounded to two decimals and converted to
        built-in ``float`` values.
    """
    text = clean_text(text)

    input_tensor = tokenizer.encode(text, return_tensors="pt", truncation=True)

    # Inference only: disable gradient tracking so no autograd graph is
    # built or kept alive for each request.
    with torch.no_grad():
        logits = model(input_tensor).logits
        probs = torch.softmax(logits, dim=1)[0]

    probs = probs.cpu().numpy()

    first_idx, second_idx = find_two_highest_indices(probs)

    # Convert numpy scalars to plain floats so the result is an ordinary,
    # JSON-friendly dict of str -> float.
    return {
        labels[first_idx]: round(float(probs[first_idx]), 2),
        labels[second_idx]: round(float(probs[second_idx]), 2),
    }


# Create the interface for the Gradio app: a single textbox feeds
# predict_topic, and the returned label -> score mapping is displayed in a
# Label component configured with num_top_classes=2.
demo = gr.Interface(predict_topic, inputs=gr.Textbox(),
                    outputs = gr.Label(num_top_classes=2),
                    title="Topic Classification")

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
  demo.launch(debug=True)