toxic_model / app.py
kanav0183's picture
Upload 3 files
e1a5a68
raw
history blame
3.82 kB
import pandas as pd
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
MAX_LEN = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 1e-05
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
class MultiLabelDataset(Dataset):
def __init__(self, dataframe, tokenizer, max_len, new_data=False):
self.tokenizer = tokenizer
self.data = dataframe
self.text = dataframe.comment_text
self.new_data = new_data
if not new_data:
self.targets = self.data.labels
self.max_len = max_len
def __len__(self):
return len(self.text)
def __getitem__(self, index):
text = str(self.text[index])
text = " ".join(text.split())
inputs = self.tokenizer.encode_plus(
text,
None,
add_special_tokens=True,
max_length=self.max_len,
pad_to_max_length=True,
return_token_type_ids=True
)
ids = inputs['input_ids']
mask = inputs['attention_mask']
token_type_ids = inputs["token_type_ids"]
out = {
'ids': torch.tensor(ids, dtype=torch.long),
'mask': torch.tensor(mask, dtype=torch.long),
'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
}
if not self.new_data:
out['targets'] = torch.tensor(self.targets[index], dtype=torch.float)
return out
class DistilBERTClass(torch.nn.Module):
def __init__(self):
super(DistilBERTClass, self).__init__()
self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
self.classifier = torch.nn.Sequential(
torch.nn.Linear(768, 768),
torch.nn.ReLU(),
torch.nn.Dropout(0.1),
torch.nn.Linear(768, 6)
)
def forward(self, input_ids, attention_mask, token_type_ids):
output_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)
hidden_state = output_1[0]
out = hidden_state[:, 0]
out = self.classifier(out)
return out
model = DistilBERTClass()
model.to(DEVICE);
model_loaded = torch.load('model/inference_models_output_4fold_distilbert_fold_best_model.pth')
model.load_state_dict(model_loaded['model'])
val_params = {'batch_size': VALID_BATCH_SIZE,
'shuffle': False,
}
def give_toxic(text):
# text = "You fucker "
test_data = pd.DataFrame([text],columns=['comment_text'])
test_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN, new_data=True)
test_loader = DataLoader(test_set, **val_params)
all_test_pred = []
def test(epoch):
model.eval()
with torch.inference_mode():
for _, data in tqdm(enumerate(test_loader, 0)):
ids = data['ids'].to(DEVICE, dtype=torch.long)
mask = data['mask'].to(DEVICE, dtype=torch.long)
token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)
outputs = model(ids, mask, token_type_ids)
probas = torch.sigmoid(outputs)
all_test_pred.append(probas)
probas = test(model)
all_test_pred = torch.cat(all_test_pred)
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
preds = all_test_pred.detach().cpu().numpy()[0]
final_dict = dict(zip(label_columns , preds))
return final_dict
def device():
return DEVICE
print(give_toxic("fuck"))