import evaluate
from evaluate.evaluation_suite import SubTask
# This is odd because the first dataset is multi-class and
# the second dataset is binary. The model I'm using has 4 labels
# and is fine-tuned on the first dataset.
# So what does it mean to evaluate this model on the second
# dataset?
metric = evaluate.combine(["accuracy"])
class Suite(evaluate.EvaluationSuite):
    def __init__(self, name):
        super().__init__(name)
        # Lowercase the text column before evaluation.
        self.preprocessor = lambda x: {"text": x["text"].lower()}
        self.suite = [
            SubTask(
                task_type="text-classification",
                data="hate_speech18",
                split="train[:1000]",
                args_for_task={
                    "metric": metric,
                    "input_column": "text",
                    "label_column": "label",
                    # Maps the pipeline's output label names to the
                    # dataset's label ids.
                    "label_mapping": {
                        "NO_HATE": 0.0,
                        "HATE": 1.0,
                        "RELATION": 1.0,
                        "IDK": 1.0
                    }
                }
            ),
            SubTask(
                task_type="text-classification",
                data="mteb/toxic_conversations_50k",
                split="test[:1000]",
                args_for_task={
                    "metric": metric,
                    "input_column": "text",
                    "label_column": "label",
                    "label_mapping": {
                        "NO_HATE": 0.0,
                        "HATE": 1.0
                    }
                }
            )
        ]
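
For context, this is a minimal sketch of how I run the suite, continuing from the definitions above. The model id "my-org/hate-speech-model" is a placeholder for my fine-tuned 4-label checkpoint; any Hub model id or text-classification pipeline would be passed the same way.

# Minimal usage sketch; "my-org/hate-speech-model" is a placeholder model id.
suite = Suite("hate-speech-suite")
results = suite.run("my-org/hate-speech-model")  # one result dict per SubTask
print(results)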