File size: 2,148 Bytes
c29d314
 
 
 
a79be1a
c29d314
 
 
 
2c5b5b4
1073c6d
c29d314
efb3d97
c85007d
 
c29d314
4ee9c74
c29d314
 
 
 
458779c
c29d314
4ee9c74
 
 
 
5dd89c8
 
 
 
 
 
 
 
 
 
 
efb3d97
 
458779c
5dd89c8
c29d314
 
4ee9c74
 
c29d314
efb3d97
c29d314
 
 
 
 
 
 
458779c
c29d314
 
 
c078293
efb3d97
 
9d55375
 
7ff01e2
efb3d97
b56f45b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#importing the necessary libraries
import gradio as gr
import numpy as np
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Category names returned to the user. get_category() indexes this list with
# the argmax of the model output, so the order matters.
# NOTE(review): the order is assumed to match the checkpoint's class ids --
# confirm against the model's config (id2label).
labels = [
  "business",
  "science",
  "health",
  "world",
  "sport",
  "politics",
  "entertainment",
  "technology",
  "education",
  "environment",
  "travel",
  "lifestyle",
  "crime",
  "opinion",
  "weather",
  "culture",
  "art",
  "food",
  "automotive",
  "finance",
  "international",
]

# Load the fine-tuned classifier and its tokenizer once, at import time.
model_name = "valurank/finetuned-distilbert-news-article-categorization"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

"""
#Reading in the text file
def read_in_text(url):
  with open(url, 'r') as file:
    article = file.read()
      
    return article
"""

def _collapse_whitespace(text):
  """Replace every run of whitespace with one space and trim both ends."""
  return re.sub(r"\s+", " ", text).strip()


def clean_text(raw_text):
  """Normalise raw article text before it is tokenised.

  Steps, in order:
    1. Drop every non-ASCII character.
    2. Collapse all whitespace (spaces, tabs, newlines, carriage returns)
       into single spaces.
    3. Remove "Date d/m/yyyy" stamps and "hh:mm XX YYY" time stamps.
    4. Collapse whitespace again, because removing a stamp leaves the
       spaces that surrounded it.

  Args:
    raw_text: The article text as a str.

  Returns:
    The cleaned text, with no leading/trailing whitespace and no runs of
    consecutive spaces.
  """
  # Encoding with errors="ignore" silently discards anything outside ASCII
  # (accented letters, CJK characters, emoji, ...).
  text = raw_text.encode("ascii", errors="ignore").decode("ascii")

  # Normalise first: the patterns below allow exactly one whitespace
  # character between their parts.
  text = _collapse_whitespace(text)

  text = re.sub(r"Date\s\d{1,2}\/\d{1,2}\/\d{4}", "", text)  # remove date
  text = re.sub(r"\d{1,2}:\d{2}\s[A-Z]+\s[A-Z]+", "", text)  # remove time

  # Tidy up the gaps left behind by the removals above.
  return _collapse_whitespace(text)
 
#Defining a function to get the category of the news article   
def get_category(text):
  """Predict the news category of an article.

  The text is cleaned with clean_text(), tokenised (truncated to the
  tokenizer's maximum length), and passed through the classifier. The label
  whose class has the highest softmax probability is returned.

  Args:
    text: The article text as a str.

  Returns:
    The entry of `labels` at the index of the most probable class.
  """
  text = clean_text(text)

  input_tensor = tokenizer.encode(text, return_tensors="pt", truncation=True)

  # Inference only: skip autograd bookkeeping so no graph is built per request.
  with torch.no_grad():
    logits = model(input_tensor).logits

  # logits has one row per input sequence; a single sequence is encoded
  # above, so row 0 holds its class scores.
  probs = torch.softmax(logits, dim=1)[0]
  max_index = int(torch.argmax(probs))

  return labels[max_index]
  
# Build the Gradio UI: a single text box in, the predicted category out as text.
demo = gr.Interface(
  fn=get_category,
  inputs=gr.Textbox(label="Drop your articles here"),
  outputs="text",
  title="News Article Categorization",
)


# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
  demo.launch(debug=True)