class created

Browse files

Files changed (3) hide show

.gitignore +2 -0
explainableai.py +58 -0
finetune-emotions.py +42 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__/
2	+ .DS_Store

explainableai.py ADDED Viewed

	@@ -0,0 +1,58 @@

+from transformers import AutoTokenizer, Trainer, TrainingArguments
+from sklearn.metrics import accuracy_score, f1_score
+import numpy as np
+CITDA_EPOCHS = 10  # passed to TrainingArguments as num_train_epochs
+CITDA_WEIGHT_DECAY = 0.05 # passed to TrainingArguments as weight_decay (regularization strength)
+CITDA_BATCH_SIZE = 32  # used for both the per-device train and eval batch sizes
+CITDA_LEARNINGRATE= 2e-5  # passed to TrainingArguments as learning_rate
+class CITDA:
+    def __init__(self, model, labels, base_model_name, tokenizer, encoded_data):
+        self.labels = labels
+        # self.device = device
+        self.tokenizer = tokenizer
+        self.model = model
+        self.encoded_data = encoded_data
+    def _get_trainer(self):
+        def compute_metrics(pred):
+            labels = pred.label_ids
+            preds = pred.predictions.argmax(-1)
+            f1 = f1_score(labels, preds, average="weighted")
+            acc = accuracy_score(labels, preds)
+            return {"accuracy": acc, "f1": f1}
+        training_args = TrainingArguments(output_dir="results",
+                                        num_train_epochs=CITDA_EPOCHS,
+                                        learning_rate=CITDA_LEARNINGRATE,
+                                        per_device_train_batch_size=CITDA_BATCH_SIZE,
+                                        per_device_eval_batch_size=CITDA_BATCH_SIZE,
+                                        load_best_model_at_end=True,
+                                        metric_for_best_model="f1",
+                                        weight_decay=CITDA_WEIGHT_DECAY,
+                                        evaluation_strategy="epoch",
+                                        save_strategy="epoch",
+                                        disable_tqdm=False)
+        trainer = Trainer(model=self.model, tokenizer=self.tokenizer, args=training_args,
+                    compute_metrics=compute_metrics,
+                    train_dataset = self.encoded_data["train"],
+                    eval_dataset = self.encoded_data["validation"],
+                    report_to="wandb")
+        return trainer
+    def train(self):
+        trainer = self._get_trainer()
+        results = trainer.evaluate()
+        preds_output = trainer.predict(encoded_data["validation"])
+        y_valid = np.array(encoded_data["validation"]["label"])
+        y_preds = np.argmax(preds_output.predictions, axis=1)
+        #Saving the fine-tuned model
+        self.model.save_pretrained('./model')
+        self.tokenizer.save_pretrained('./model')
+        return y_valid, y_pred

finetune-emotions.py ADDED Viewed

	@@ -0,0 +1,42 @@

+# Adapted from https://github.com/bhadreshpsavani/ExploringSentimentalAnalysis/blob/main/SentimentalAnalysisWithDistilbert.ipynb
+import torch
+from sklearn.metrics import confusion_matrix
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from datasets import load_dataset
+#import matplotlib.pyplot as plt
+import seaborn as sns
+import explainableai
+BASE_MODEL_NAME = "bert-base-uncased"
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+def save_confusion_matrix(y_valid, y_preds):
+    cm = confusion_matrix(y_valid, y_preds)
+    f = sns.heatmap(cm, annot=True, fmt='d')
+    f.figure.savefig("confusion_matrix.png")
+def get_encoded_data(tokenizer):
+    def tokenize(batch):
+        return tokenizer(batch["text"], padding=True, truncation=True)
+    emotions = load_dataset("emotion")
+    emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
+    emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
+    return emotions_encoded
+if __name__ == "__main__":
+    labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
+    model = AutoModelForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path = BASE_MODEL_NAME,
+                num_labels = len(labels),
+                id2label=[{i: labels[i]} for i in range(len(labels))],
+                resume_download=True,).to(device)
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
+    encoded_data = get_encoded_data(tokenizer)
+    citda = explainableai.CITDA(model, labels, BASE_MODEL_NAME, tokenizer, encoded_data)
+    y_valid, y_pred = citda.train()
+    save_confusion_matrix(y_valid, y_preds)
+    print("y_valid=",len(y_valid), "y_pred=", len(y_pred))