mdocekal's picture
Init commit containing the implementation of example-based evaluation metrics for multi-label classification presented in Zhang and Zhou (2014), and a multiset variant.
5ae6761
raw
history blame
9.33 kB
from unittest import TestCase
from multi_label_precision_recall_accuracy_fscore import MultiLabelPrecisionRecallAccuracyFscore
class MultiLabelPrecisionRecallAccuracyFscoreTest(TestCase):
    """
    Tests for the example-based multi-label precision/recall/accuracy/F-score metric.

    All of these tests are also used for the multiset configuration. So please mind this and write
    each test in a way that is valid for both configurations (do not use the same label multiple
    times within one sample).
    """

    def setUp(self):
        # A fresh metric instance per test so per-test configuration (e.g. setting beta) cannot
        # leak into other tests.
        self.multi_label_precision_recall_accuracy_fscore = MultiLabelPrecisionRecallAccuracyFscore()

    def assert_metrics(self, expected, result):
        """
        Assert that a computed metric dict matches the expected values.

        Uses approximate float comparison instead of exact dict equality, because expected values
        such as 2/3 may differ from the metric's internally computed value by rounding error,
        depending on the order of floating-point operations inside the metric.

        :param expected: mapping of metric name -> expected float value
        :param result: dict returned by the metric's compute()
        """
        self.assertEqual(set(expected), set(result))
        for name in expected:
            self.assertAlmostEqual(expected[name], result[name], places=7, msg=f"metric {name}")

    def test_eok(self):
        # Perfect predictions: every metric must be 1.
        self.assert_metrics(
            {"precision": 1.0, "recall": 1.0, "accuracy": 1.0, "fscore": 1.0},
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[[0, 1], [1, 2], [0, 1, 2]],
                references=[[0, 1], [1, 2], [0, 1, 2]],
            )
        )

    def test_eok_string(self):
        # Same as test_eok but with string labels: label type must not matter.
        self.assert_metrics(
            {"precision": 1.0, "recall": 1.0, "accuracy": 1.0, "fscore": 1.0},
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[["0", "1"], ["1", "2"], ["0", "1", "2"]],
                references=[["0", "1"], ["1", "2"], ["0", "1", "2"]],
            )
        )

    def test_empty(self):
        # Both sides empty: the metric reports 0 for everything (no division-by-zero crash).
        self.assert_metrics(
            {"precision": 0.0, "recall": 0.0, "accuracy": 0.0, "fscore": 0.0},
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[[], [], []],
                references=[[], [], []],
            )
        )

    def test_empty_reference(self):
        # Predictions present but references empty: nothing can be correct.
        self.assert_metrics(
            {"precision": 0.0, "recall": 0.0, "accuracy": 0.0, "fscore": 0.0},
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[[0, 1], [1, 2], [0, 1, 2]],
                references=[[], [], []],
            )
        )

    def test_empty_prediction(self):
        # References present but predictions empty: nothing was retrieved.
        self.assert_metrics(
            {"precision": 0.0, "recall": 0.0, "accuracy": 0.0, "fscore": 0.0},
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[[], [], []],
                references=[[0, 1], [1, 2], [0, 1, 2]],
            )
        )

    def test_completely_different(self):
        # Disjoint label sets in every sample: all metrics are 0.
        self.assert_metrics(
            {"precision": 0.0, "recall": 0.0, "accuracy": 0.0, "fscore": 0.0},
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[[0, 1], [1, 2], [0, 1, 2]],
                references=[[3, 4], [5, 6], [7, 8, 9]],
            )
        )

    def test_max_precision(self):
        # Every predicted label is correct (precision 1), but only half of the reference labels
        # were predicted (recall 0.5). Accuracy = |intersection| / |union| = 2/4.
        self.assert_metrics(
            {"precision": 1.0, "recall": 0.5, "accuracy": 0.5, "fscore": 2 / 3},
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[[0, 1]],
                references=[[0, 1, 2, 3]],
            )
        )

    def test_max_recall(self):
        # Mirror of test_max_precision: all reference labels were found (recall 1), but only half
        # of the predictions are correct (precision 0.5).
        self.assert_metrics(
            {"precision": 0.5, "recall": 1.0, "accuracy": 0.5, "fscore": 2 / 3},
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[[0, 1, 2, 3]],
                references=[[0, 1]],
            )
        )

    def test_partial_match(self):
        # One of two predictions is correct; accuracy = |{0}| / |{0, 1, 2}| = 1/3.
        self.assert_metrics(
            {"precision": 0.5, "recall": 0.5, "accuracy": 1 / 3, "fscore": 0.5},
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[[0, 1]],
                references=[[0, 2]],
            )
        )

    def test_partial_match_multi_sample(self):
        # Example-based averaging: per-sample precisions are 1, 1 and 1/2 -> mean 2.5/3;
        # per-sample recalls are 1/2, 1/2 and 1 -> mean 2/3; F-score is the harmonic mean of
        # the averaged precision and recall (beta = 1).
        self.assert_metrics(
            {
                "precision": 2.5 / 3,
                "recall": 2 / 3,
                "accuracy": 0.5,
                "fscore": 2 * (2.5 / 3 * 2 / 3) / (2.5 / 3 + 2 / 3),
            },
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[[0, 1], [0, 1], [2, 3]],
                references=[[0, 1, 2, 3], [0, 1, 2, 3], [2]],
            )
        )

    def test_beta(self):
        # beta can be configured on the metric instance...
        self.multi_label_precision_recall_accuracy_fscore.beta = 2
        self.assert_metrics(
            {
                "precision": 2.5 / 3,
                "recall": 2 / 3,
                "accuracy": 0.5,
                # F-beta with beta = 2: (1 + beta^2) * p * r / (beta^2 * p + r)
                "fscore": 5 * (2.5 / 3 * 2 / 3) / (4 * 2.5 / 3 + 2 / 3),
            },
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[[0, 1], [0, 1], [2, 3]],
                references=[[0, 1, 2, 3], [0, 1, 2, 3], [2]],
            )
        )
        # ...and overridden per compute() call; the call-level beta must win.
        self.assert_metrics(
            {
                "precision": 2.5 / 3,
                "recall": 2 / 3,
                "accuracy": 0.5,
                # F-beta with beta = 3: (1 + 9) * p * r / (9 * p + r)
                "fscore": 10 * (2.5 / 3 * 2 / 3) / (9 * 2.5 / 3 + 2 / 3),
            },
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[[0, 1], [0, 1], [2, 3]],
                references=[[0, 1, 2, 3], [0, 1, 2, 3], [2]],
                beta=3,
            )
        )
class MultiLabelPrecisionRecallAccuracyFscoreTestMultiset(MultiLabelPrecisionRecallAccuracyFscoreTest):
    """
    Runs every inherited test against the "multiset" configuration and adds multiset-specific
    tests in which the same label legitimately occurs multiple times within one sample.
    """

    def setUp(self):
        # Same metric, but configured to treat each sample's labels as a multiset.
        self.multi_label_precision_recall_accuracy_fscore = MultiLabelPrecisionRecallAccuracyFscore(
            config_name="multiset"
        )

    def _assert_metrics_close(self, expected, result):
        """
        Assert that a computed metric dict matches the expected values with float tolerance.

        Exact dict equality on derived floats (e.g. 2/3) is fragile: the metric may compute the
        same mathematical value with a different floating-point operation order.

        :param expected: mapping of metric name -> expected float value
        :param result: dict returned by the metric's compute()
        """
        self.assertEqual(set(expected), set(result))
        for name in expected:
            self.assertAlmostEqual(expected[name], result[name], places=7, msg=f"metric {name}")

    def test_multiset_eok(self):
        # Identical multisets in different orders: every metric must be 1
        # (order must not matter, multiplicity must).
        self._assert_metrics_close(
            {"precision": 1.0, "recall": 1.0, "accuracy": 1.0, "fscore": 1.0},
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[[0, 1, 1], [1, 2, 2], [0, 1, 2, 1]],
                references=[[1, 0, 1], [1, 2, 2], [0, 1, 1, 2]],
            )
        )

    def test_multiset_partial_match(self):
        # Predicted multiset {0, 1, 1} is fully contained in the size-6 reference multiset:
        # precision 3/3 = 1, recall 3/6 = 0.5, accuracy |intersection| / |union| = 3/6.
        self._assert_metrics_close(
            {"precision": 1.0, "recall": 0.5, "accuracy": 0.5, "fscore": 2 / 3},
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[[0, 1, 1]],
                references=[[1, 0, 1, 1, 0, 0]],
            )
        )

    def test_multiset_partial_match_multi_sample(self):
        # Per-sample precisions: 3/3 and 2/3; per-sample recalls: 3/4 and 2/2;
        # example-based averaging takes the mean over samples.
        precision = (1 + 2 / 3) / 2
        recall = (3 / 4 + 1) / 2
        self._assert_metrics_close(
            {
                "precision": precision,
                "recall": recall,
                "accuracy": (3 / 4 + 2 / 3) / 2,
                "fscore": 2 * precision * recall / (precision + recall),
            },
            self.multi_label_precision_recall_accuracy_fscore.compute(
                predictions=[[0, 1, 1], [1, 2, 2]],
                references=[[1, 0, 1, 1], [1, 2]],
            )
        )