toxic_model / app.py
kanav0183's picture
Update app.py
5f4c80b
raw
history blame
3.81 kB
import pandas as pd
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
MAX_LEN = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 1e-05
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
class MultiLabelDataset(Dataset):
def __init__(self, dataframe, tokenizer, max_len, new_data=False):
self.tokenizer = tokenizer
self.data = dataframe
self.text = dataframe.comment_text
self.new_data = new_data
if not new_data:
self.targets = self.data.labels
self.max_len = max_len
def __len__(self):
return len(self.text)
def __getitem__(self, index):
text = str(self.text[index])
text = " ".join(text.split())
inputs = self.tokenizer.encode_plus(
text,
None,
add_special_tokens=True,
max_length=self.max_len,
pad_to_max_length=True,
return_token_type_ids=True
)
ids = inputs['input_ids']
mask = inputs['attention_mask']
token_type_ids = inputs["token_type_ids"]
out = {
'ids': torch.tensor(ids, dtype=torch.long),
'mask': torch.tensor(mask, dtype=torch.long),
'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
}
if not self.new_data:
out['targets'] = torch.tensor(self.targets[index], dtype=torch.float)
return out
class DistilBERTClass(torch.nn.Module):
def __init__(self):
super(DistilBERTClass, self).__init__()
self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
self.classifier = torch.nn.Sequential(
torch.nn.Linear(768, 768),
torch.nn.ReLU(),
torch.nn.Dropout(0.1),
torch.nn.Linear(768, 6)
)
def forward(self, input_ids, attention_mask, token_type_ids):
output_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)
hidden_state = output_1[0]
out = hidden_state[:, 0]
out = self.classifier(out)
return out
model = DistilBERTClass()
model.to(DEVICE);
model_loaded = torch.load('inference_models_output_4fold_distilbert_fold_best_model.pth')
model.load_state_dict(model_loaded['model'])
val_params = {'batch_size': VALID_BATCH_SIZE,
'shuffle': False,
}
def give_toxic(text):
# text = "You fucker "
test_data = pd.DataFrame([text],columns=['comment_text'])
test_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN, new_data=True)
test_loader = DataLoader(test_set, **val_params)
all_test_pred = []
def test(epoch):
model.eval()
with torch.inference_mode():
for _, data in tqdm(enumerate(test_loader, 0)):
ids = data['ids'].to(DEVICE, dtype=torch.long)
mask = data['mask'].to(DEVICE, dtype=torch.long)
token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)
outputs = model(ids, mask, token_type_ids)
probas = torch.sigmoid(outputs)
all_test_pred.append(probas)
probas = test(model)
all_test_pred = torch.cat(all_test_pred)
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
preds = all_test_pred.detach().cpu().numpy()[0]
final_dict = dict(zip(label_columns , preds))
return final_dict
def device():
return DEVICE
print(give_toxic("fuck"))