Update README.md
Browse files
README.md
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Multilingual Joint Fine-tuning of Transformer models for identifying Trolling, Aggression and Cyberbullying at TRAC 2020
|
2 |
+
|
3 |
+
Models and predictions for submission to TRAC - 2020 Second Workshop on Trolling, Aggression and Cyberbullying.
|
4 |
+
|
5 |
+
Our trained models, as well as the evaluation metrics recorded during training, are available at: https://databank.illinois.edu/datasets/IDB-8882752#
|
6 |
+
We also make a few of our models available in HuggingFace's models repository at https://huggingface.co/socialmediaie/. These models can be further fine-tuned on your dataset of choice.
|
7 |
+
|
8 |
+
Our approach is described in our paper titled:
|
9 |
+
|
10 |
+
> Mishra, Sudhanshu, Shivangi Prasad, and Shubhanshu Mishra. 2020. "Multilingual Joint Fine-Tuning of Transformer Models for Identifying Trolling, Aggression and Cyberbullying at TRAC 2020." In Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying (TRAC-2020).
|
11 |
+
|
12 |
+
The source code for training this model and more details can be found on our code repository: https://github.com/socialmediaie/TRAC2020
|
13 |
+
|
14 |
+
NOTE: These models were retrained after our submission for uploading here, so the evaluation measures may differ slightly from the ones reported in the paper.
|
15 |
+
|
16 |
+
If you plan to use the dataset please cite the following resources:
|
17 |
+
|
18 |
+
* Mishra, Sudhanshu, Shivangi Prasad, and Shubhanshu Mishra. 2020. "Multilingual Joint Fine-Tuning of Transformer Models for Identifying Trolling, Aggression and Cyberbullying at TRAC 2020." In Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying (TRAC-2020).
|
19 |
+
* Mishra, Sudhanshu, Shivangi Prasad, and Shubhanshu Mishra. 2020. “Trained Models for Multilingual Joint Fine-Tuning of Transformer Models for Identifying Trolling, Aggression and Cyberbullying at TRAC 2020.” University of Illinois at Urbana-Champaign. https://doi.org/10.13012/B2IDB-8882752_V1.
|
20 |
+
|
21 |
+
|
22 |
+
```
|
23 |
+
@inproceedings{Mishra2020TRAC,
|
24 |
+
author = {Mishra, Sudhanshu and Prasad, Shivangi and Mishra, Shubhanshu},
|
25 |
+
booktitle = {Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying (TRAC-2020)},
|
26 |
+
title = {{Multilingual Joint Fine-tuning of Transformer models for identifying Trolling, Aggression and Cyberbullying at TRAC 2020}},
|
27 |
+
year = {2020}
|
28 |
+
}
|
29 |
+
|
30 |
+
@data{illinoisdatabankIDB-8882752,
|
31 |
+
author = {Mishra, Sudhanshu and Prasad, Shivangi and Mishra, Shubhanshu},
|
32 |
+
doi = {10.13012/B2IDB-8882752_V1},
|
33 |
+
publisher = {University of Illinois at Urbana-Champaign},
|
34 |
+
title = {{Trained models for Multilingual Joint Fine-tuning of Transformer models for identifying Trolling, Aggression and Cyberbullying at TRAC 2020}},
|
35 |
+
url = {https://doi.org/10.13012/B2IDB-8882752_V1},
|
36 |
+
year = {2020}
|
37 |
+
}
|
38 |
+
```
|
39 |
+
|
40 |
+
|
41 |
+
## Usage
|
42 |
+
|
43 |
+
The models can be used via the following code:
|
44 |
+
|
45 |
+
```python
|
46 |
+
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
|
47 |
+
import torch
|
48 |
+
from pathlib import Path
|
49 |
+
from scipy.special import softmax
|
50 |
+
import numpy as np
|
51 |
+
import pandas as pd
|
52 |
+
|
53 |
+
# Label names for each TRAC 2020 sub-task. A predicted class index is turned
# into a label by its position in the corresponding list.
TASK_LABEL_IDS = {
    "Sub-task A": ["OAG", "NAG", "CAG"],
    "Sub-task B": ["GEN", "NGEN"],
    "Sub-task C": ["OAG-GEN", "OAG-NGEN", "NAG-GEN", "NAG-NGEN", "CAG-GEN", "CAG-NGEN"],
}

model_version = "databank"  # any other value loads the model from the Hugging Face hub
if model_version == "databank":
    # Make sure you have downloaded the required model file from https://databank.illinois.edu/datasets/IDB-8882752
    # Unzip the file at some model_path (we are using: "databank_model")
    # Assuming you get the following type of structure inside "databank_model"
    # 'databank_model/ALL/Sub-task C/output/bert-base-multilingual-uncased/model'
    #
    # Search at any depth so the lookup works whether or not the archive
    # unpacks with a leading language directory; sort so the choice is
    # deterministic when several models are present.
    model_dirs = sorted(Path("databank_model").glob("**/*/output/*/model"))
    if not model_dirs:
        raise FileNotFoundError(
            "No '<task>/output/<base_model>/model' directory found under 'databank_model'"
        )
    model_path = model_dirs[0]
    # Read the components from the end of the path: .../<lang>/<task>/output/<base_model>/model
    # NOTE(review): `lang` is only a language code when the archive unpacks
    # with a language directory above the task directory - confirm for your download.
    path_parts = model_path.parts
    lang, task, base_model = path_parts[-5], path_parts[-4], path_parts[-2]
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    # str() keeps this working with library versions that expect a string path.
    model = AutoModelForSequenceClassification.from_pretrained(str(model_path))
else:
    lang, task, base_model = "ALL", "Sub-task C", "bert-base-multilingual-uncased"
    # The hub id is built from the language and the task letter (last word of
    # the task name), e.g. socialmediaie/TRAC2020_ALL_C_bert-base-multilingual-uncased
    base_model = f"socialmediaie/TRAC2020_{lang}_{task.split()[-1]}_{base_model}"
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForSequenceClassification.from_pretrained(base_model)

# For doing inference set model in eval mode
model.eval()
# If you want to further fine-tune the model you can reset the model to model.train()

task_labels = TASK_LABEL_IDS[task]

sentence = "This is a good cat and this is a bad dog."
# Prepend the classification token and tokenize the *processed* sentence so
# that the token is actually part of the model input.
processed_sentence = f"{tokenizer.cls_token} {sentence}"
tokens = tokenizer.tokenize(processed_sentence)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
tokens_tensor = torch.tensor([indexed_tokens])  # batch containing one sequence

with torch.no_grad():
    # Index instead of tuple-unpacking: works both when the model returns a
    # plain tuple and when it returns a ModelOutput object.
    logits = model(tokens_tensor, labels=None)[0]

raw_scores = logits.detach().cpu().numpy()
preds_probs = softmax(raw_scores, axis=1)  # per-row class probabilities
preds = np.argmax(preds_probs, axis=1)  # index of the most probable class per row
preds_labels = np.array(task_labels)[preds]
print(dict(zip(task_labels, preds_probs[0])), preds_labels)
# Reference output for this example; treat the exact probabilities as
# approximate, since they depend on the model files and library versions used.
"""You should get an output as follows:

({'CAG-GEN': 0.06762535,
'CAG-NGEN': 0.03244293,
'NAG-GEN': 0.6897794,
'NAG-NGEN': 0.15498641,
'OAG-GEN': 0.034373745,
'OAG-NGEN': 0.020792078},
array(['NAG-GEN'], dtype='<U8'))

"""
|
108 |
+
|
109 |
+
```
|