Spaces:
Build error
Build error
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ISCO-08 Hierarchical Accuracy Measure.""" | |
from typing import List, Set, Dict, Tuple | |
import evaluate | |
import datasets | |
# import ham | |
# import isco | |
# TODO: Add BibTeX citation
# NOTE(review): the entry below is the scikit-learn citation, which looks like
# leftover template text rather than the citation for this measure; replace it
# with the Kiritchenko & Famili reference named in _DESCRIPTION once confirmed.
_CITATION = """
@article{scikit-learn,
title={Scikit-learn: Machine Learning in {P}ython},
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={2825--2830},
year={2011}
}
"""

# Short description shown on the module page.
_DESCRIPTION = """
The ISCO-08 Hierarchical Accuracy Measure is an implementation of the measure described in [Functional Annotation of Genes Using Hierarchical Text Categorization](https://www.researchgate.net/publication/44046343_Functional_Annotation_of_Genes_Using_Hierarchical_Text_Categorization) (Kiritchenko, Svetlana and Famili, Fazel. 2005) and adapted for the ISCO-08 classification scheme by the International Labour Organization.
"""

# Usage documentation for `compute`. The keyword arguments are `references`
# and `predictions`, matching the feature names declared by the metric.
# NOTE(review): the scores in the example output below appear to predate the
# weighted-ancestor implementation - re-run the example and refresh the values.
_KWARGS_DESCRIPTION = """
Calculates hierarchical precision, hierarchical recall and hierarchical F1 given a list of reference codes and predicted codes from the ISCO-08 taxonomy by the International Labour Organization.
Args:
- references (List[str]): List of ISCO-08 reference codes. Each reference code should be a single token, 4-digit ISCO-08 code string.
- predictions (List[str]): List of machine predicted or human assigned ISCO-08 codes to score. Each prediction should be a single token, 4-digit ISCO-08 code string.
Returns:
- accuracy: Fraction of predictions that exactly match their reference code. Minimum possible value is 0. Maximum possible value is 1.0.
- hierarchical_precision (`float` or `int`): Hierarchical precision score. Minimum possible value is 0. Maximum possible value is 1.0. A higher score means higher accuracy.
- hierarchical_recall: Hierarchical recall score. Minimum possible value is 0. Maximum possible value is 1.0. A higher score means higher accuracy.
- hierarchical_fmeasure: Hierarchical F1 score. Minimum possible value is 0. Maximum possible value is 1.0. A higher score means higher accuracy.
Examples:
Example 1
>>> ham = evaluate.load("danieldux/isco_hierarchical_accuracy")
>>> results = ham.compute(references=["1111", "1112", "1113", "1114"], predictions=["1111", "1113", "1120", "1211"])
>>> print(results)
{
'accuracy': 0.25,
'hierarchical_precision': 0.7142857142857143,
'hierarchical_recall': 0.5,
'hierarchical_fmeasure': 0.588235294117647
}
"""

# External resources: a mirror of the ISCO-08 structure table (the one the
# metric downloads) and the original ILO-hosted CSV.
ISCO_CSV_MIRROR_URL = (
    "https://storage.googleapis.com/isco-public/tables/ISCO_structure.csv"
)
ILO_ISCO_CSV_URL = (
    "https://www.ilo.org/ilostat-files/ISCO/newdocs-08-2021/ISCO-08/ISCO-08%20EN.csv"
)
class ISCO_Hierarchical_Accuracy(evaluate.Metric):
    """The ISCO-08 Hierarchical Accuracy Measure.

    Scores predicted ISCO-08 codes against reference codes with plain accuracy
    plus hierarchical precision, recall and F-measure. Each 4-digit unit code
    is extended with its minor (3-digit), sub-major (2-digit) and major
    (1-digit) group codes, so a prediction that shares ancestor groups with
    the reference receives partial credit.
    """

    def _info(self):
        """Return the `evaluate.MetricInfo` describing this module and its inputs."""
        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference: a
            # sequence of strings for the "multilabel" config, otherwise a
            # single string per example.
            features=datasets.Features(
                {
                    "references": datasets.Sequence(datasets.Value("string")),
                    "predictions": datasets.Sequence(datasets.Value("string")),
                }
                if self.config_name == "multilabel"
                else {
                    "references": datasets.Value("string"),
                    "predictions": datasets.Value("string"),
                }
            ),
            # TODO: Homepage of the module for documentation
            homepage="http://module.homepage",
            # TODO: Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def create_hierarchy_dict(self, file: str, timeout: float = 30.0) -> dict:
        """
        Creates a dictionary where keys are unit codes and values are dictionaries of their ancestor
        group codes with weights, representing the group level hierarchy of the ISCO-08 structure.

        Args:
            - file: A string representing the path to the CSV file containing the 4-digit ISCO-08 codes.
              It can be a local path or a web URL. The CSV must have a `unit` column.
            - timeout: Seconds to wait for the server when `file` is a URL. Ignored for local paths.

        Returns:
            - A dictionary where keys are ISCO-08 unit codes and values are dictionaries mapping the
              minor, sub-major and major group codes to the weights 0.75, 0.5 and 0.25.

        Raises:
            - KeyError: If a row has no `unit` column.
            - requests.HTTPError: If `file` is a URL and the server answers with an error status.
        """
        import csv

        if file.startswith(("http://", "https://")):
            # Imported lazily so that reading a local file does not require `requests`.
            import requests

            # Without a timeout the request can block forever; without the
            # status check an error page would be parsed as if it were the CSV.
            response = requests.get(file, timeout=timeout)
            response.raise_for_status()
            lines = response.text.splitlines()
        else:
            with open(file, newline="") as csvfile:
                lines = csvfile.readlines()

        isco_hierarchy = {}
        for row in csv.DictReader(lines):
            # Left-pad with zeros so every unit code has at least 4 characters.
            unit_code = row["unit"].zfill(4)
            minor_code = unit_code[0:3]
            sub_major_code = unit_code[0:2]
            major_code = unit_code[0]
            # Store ancestors with their weights, higher for closer ancestors.
            isco_hierarchy[unit_code] = {
                minor_code: 0.75,
                sub_major_code: 0.5,
                major_code: 0.25,
            }
        return isco_hierarchy

    def find_ancestors(self, node: str, hierarchy: dict) -> set:
        """
        Find the ancestors of a given node in a hierarchy.

        Args:
            node (str): The node for which to find ancestors.
            hierarchy (dict): A dictionary representing the hierarchy, where the keys are nodes and
                the values are iterables of their parents (for a dict value, its keys are the parents).

        Returns:
            set: A set of ancestors of the given node.
        """
        ancestors = set()
        nodes_to_visit = [node]
        while nodes_to_visit:
            current_node = nodes_to_visit.pop()
            for parent in hierarchy.get(current_node, ()):
                # Expand each ancestor only once: this avoids repeated work on
                # shared ancestors and guarantees termination on a cyclic input.
                if parent not in ancestors:
                    ancestors.add(parent)
                    nodes_to_visit.append(parent)
        return ancestors

    def extend_with_ancestors(self, classes: set, hierarchy: dict) -> set:
        """
        Extend the given set of classes with their ancestors from the hierarchy.

        Args:
            classes (set): The set of classes to extend.
            hierarchy (dict): The hierarchy of classes.

        Returns:
            set: A new set holding the given classes and all of their ancestors.
        """
        extended_classes = set(classes)
        for cls in classes:
            extended_classes.update(self.find_ancestors(cls, hierarchy))
        return extended_classes

    @staticmethod
    def _extend_with_weights(
        codes: List[str], hierarchy: Dict[str, Dict[str, float]]
    ) -> Dict[str, float]:
        """Map every given code (weight 1.0) and each of its ancestors to the highest weight seen."""
        extended: Dict[str, float] = {}
        for code in codes:
            extended[code] = 1.0  # Full weight for exact match
            for ancestor, ancestor_weight in hierarchy.get(code, {}).items():
                extended[ancestor] = max(extended.get(ancestor, 0), ancestor_weight)
        return extended

    def calculate_hierarchical_precision_recall(
        self,
        reference_codes: List[str],
        predicted_codes: List[str],
        hierarchy: Dict[str, Dict[str, float]],
    ) -> Tuple[float, float]:
        """
        Calculates the hierarchical precision and recall given the reference codes, predicted codes,
        and hierarchy definition.

        Args:
            reference_codes (List[str]): The list of reference codes.
            predicted_codes (List[str]): The list of predicted codes.
            hierarchy (Dict[str, Dict[str, float]]): The hierarchy definition where keys are nodes and
                values map each ancestor node to its weight. Codes missing from the hierarchy are
                scored on exact match only.

        Returns:
            Tuple[float, float]: A tuple containing the hierarchical precision and recall. Each is 0
            when its denominator (total predicted / total reference weight) is zero.
        """
        # NOTE(review): all references are pooled into one weighted set and all
        # predictions into another, so a prediction can be credited against the
        # reference of a different example. Confirm this is the intended
        # aggregation rather than a per-example comparison.
        extended_real = self._extend_with_weights(reference_codes, hierarchy)
        extended_predicted = self._extend_with_weights(predicted_codes, hierarchy)

        # Calculate weighted correct predictions: a code present on both sides
        # is credited with the smaller of its two weights.
        correct_weights = 0
        for code, weight in extended_predicted.items():
            if code in extended_real:
                correct_weights += min(weight, extended_real[code])

        total_predicted_weights = sum(extended_predicted.values())
        total_real_weights = sum(extended_real.values())

        # Calculate hierarchical precision and recall using weighted sums
        hP = correct_weights / total_predicted_weights if total_predicted_weights else 0.0
        hR = correct_weights / total_real_weights if total_real_weights else 0.0
        return hP, hR

    def hierarchical_f_measure(self, hP, hR, beta=1.0):
        """
        Calculate the hierarchical F-measure.

        Parameters:
            hP (float): The hierarchical precision.
            hR (float): The hierarchical recall.
            beta (float, optional): The beta value for F-measure calculation. Default is 1.0.

        Returns:
            float: The hierarchical F-measure, or 0.0 when the denominator
            `beta**2 * hP + hR` is zero.
        """
        beta_squared = beta**2
        denominator = beta_squared * hP + hR
        # Guard on the actual denominator: checking `hP + hR` alone misses the
        # case beta == 0 with hR == 0, which would divide by zero.
        if denominator == 0:
            return 0.0
        return (beta_squared + 1) * hP * hR / denominator

    def _download_and_prepare(self, dl_manager):
        """Download the ISCO-08 structure CSV from the mirror URL and build the hierarchy dictionary."""
        isco_csv = dl_manager.download_and_extract(ISCO_CSV_MIRROR_URL)
        print("ISCO CSV file downloaded")
        self.isco_hierarchy = self.create_hierarchy_dict(isco_csv)
        print("ISCO hierarchy dictionary created")
        # NOTE(review): this dumps the whole hierarchy to stdout on every
        # load; it looks like debugging output that could be dropped.
        print(self.isco_hierarchy)

    def _compute(self, predictions, references):
        """Returns the accuracy and the hierarchical precision, recall and F-measure scores."""
        # Convert the inputs to strings
        # NOTE(review): with the "multilabel" config each item is a sequence,
        # so str() yields the sequence's repr rather than a code - confirm
        # whether multilabel input is actually supported here.
        predictions = [str(p) for p in predictions]
        references = [str(r) for r in references]

        # Calculate accuracy; an empty input scores 0.0 instead of dividing by zero.
        if predictions:
            accuracy = sum(i == j for i, j in zip(predictions, references)) / len(
                predictions
            )
        else:
            accuracy = 0.0
        print(f"Accuracy: {accuracy}")

        # Calculate hierarchical precision, recall and f-measure
        hierarchy = self.isco_hierarchy
        hP, hR = self.calculate_hierarchical_precision_recall(
            references, predictions, hierarchy
        )
        hF = self.hierarchical_f_measure(hP, hR)
        print(
            f"Hierarchical Precision: {hP}, Hierarchical Recall: {hR}, Hierarchical F-measure: {hF}"
        )

        return {
            "accuracy": accuracy,
            "hierarchical_precision": hP,
            "hierarchical_recall": hR,
            "hierarchical_fmeasure": hF,
        }