File size: 4,962 Bytes
77961b6
 
0c7d7d0
77961b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c7d7d0
 
 
 
 
 
 
 
 
 
 
 
 
 
77961b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c7d7d0
77961b6
 
 
0c7d7d0
 
77961b6
 
 
 
 
 
 
 
 
54410d4
 
 
0c7d7d0
 
d65e913
0c7d7d0
 
77961b6
 
 
 
 
 
0c7d7d0
 
d65e913
77961b6
0c7d7d0
54410d4
77961b6
0c7d7d0
 
d65e913
77961b6
54410d4
77961b6
 
0c7d7d0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import datasets
import logging
import json
import pandas as pd


def text_classificaiton_match_label_case_unsensative(id2label_mapping, label):
    """Find the model label that equals ``label`` when case is ignored.

    Args:
        id2label_mapping: Mapping whose keys are the model label strings.
        label: Dataset label to look up.

    Returns:
        A tuple ``(model_label, label)``. ``model_label`` is the first key of
        ``id2label_mapping`` whose upper-cased form equals the upper-cased
        ``label``, or ``None`` when no key matches. ``label`` is returned
        unchanged.
    """
    # First key wins; iteration follows the mapping's own key order.
    matched = next(
        (candidate for candidate in id2label_mapping if candidate.upper() == label.upper()),
        None,
    )
    return matched, label


def text_classification_map_model_and_dataset_labels(id2label, dataset_features):
    """Pair the model's labels with the label names declared by the dataset.

    Every ``datasets.ClassLabel`` feature whose number of names equals the
    number of distinct model labels is examined. A dataset label is paired
    with a model label when both are identical or, failing that, when they
    are equal ignoring case.

    Args:
        id2label: Mapping whose values are the model's label names.
        dataset_features: Mapping of column name to dataset feature.

    Returns:
        A tuple ``(mapping, dataset_labels)``. ``mapping`` is keyed by model
        label and holds the matched dataset label, or ``None`` when nothing
        matched. ``dataset_labels`` is the ``names`` list of the last
        examined ``ClassLabel`` feature, or ``None`` if no feature qualified.
    """
    mapping = dict.fromkeys(id2label.values())
    dataset_labels = None

    class_label_features = (
        feature
        for feature in dataset_features.values()
        if isinstance(feature, datasets.ClassLabel)
    )
    for feature in class_label_features:
        # Only a feature with as many classes as the model can line up with it.
        if len(feature.names) != len(mapping):
            continue

        dataset_labels = feature.names

        for dataset_label in dataset_labels:
            if dataset_label in mapping:
                # Exact match.
                mapping[dataset_label] = dataset_label
                continue

            # Otherwise fall back to a case-insensitive lookup.
            matched, dataset_label = text_classificaiton_match_label_case_unsensative(
                mapping, dataset_label
            )
            if matched is not None:
                mapping[matched] = dataset_label

    return mapping, dataset_labels


def check_column_mapping_keys_validity(column_mapping, ppl):
    """Check that a user-supplied label mapping covers exactly the model labels.

    Args:
        column_mapping: JSON string. When the decoded object has a ``"data"``
            entry, that entry is read as a sequence of
            ``[user_label, model_label]`` pairs.
        ppl: Object exposing ``model.config.id2label``; only read when a
            ``"data"`` entry is present.

    Returns:
        ``True`` when there is no ``"data"`` entry. Otherwise ``True`` only
        if the set of user labels, the set of model labels from the pairs,
        and the set of ``id2label`` values are all equal.
    """
    parsed = json.loads(column_mapping)
    if "data" not in parsed.keys():
        # Nothing was mapped by the user, so there is nothing to validate.
        return True

    pairs = parsed["data"]
    user_labels = {pair[0] for pair in pairs}
    model_labels = {pair[1] for pair in pairs}
    original_labels = set(ppl.model.config.id2label.values())

    if user_labels != model_labels:
        return False
    return model_labels == original_labels


def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, split):
    """Complete a text-classification column mapping from the dataset and model.

    The dataset split is loaded, a text column is chosen (the one given in
    ``column_mapping["text"]`` if it exists in the dataset, otherwise the
    first string-typed feature), the first row is run through ``ppl``, and
    the model labels are aligned with the dataset labels.

    Args:
        column_mapping: Dict that may hold ``"text"`` (dataset column name)
            and ``"data"`` (list of ``[user_label, model_label]`` pairs).
            It is updated in place: ``"text"`` may be filled in and
            ``"label"`` may be added.
        ppl: Callable invoked as ``ppl({"text": ...}, top_k=None)`` that
            exposes ``model.config.id2label``. Presumably a Hugging Face
            text-classification pipeline; confirm against the caller.
        d_id: Dataset identifier passed to ``datasets.load_dataset``.
        config: Dataset configuration name passed to ``load_dataset``.
        split: Name of the split to read from the loaded dataset.

    Returns:
        Always a 4-tuple
        ``(column_mapping, prediction_input, prediction_result, id2label_df)``
        so callers can unpack every exit path the same way:

        * ``(None, None, None, None)`` when the dataset exposes no features.
        * ``(column_mapping, None, None, None)`` when no text column can be
          found.
        * ``(column_mapping, prediction_input, None, None)`` when the test
          prediction fails (``prediction_input`` is ``None`` if the first
          row could not be read).
        * ``(column_mapping, prediction_input, prediction_result, None)``
          when labels could not all be matched automatically and the user
          supplied no ``"data"`` mapping; ``column_mapping["label"]`` is
          then set to ``None`` for every model label id.
        * Otherwise the full result, with ``id2label_df`` a DataFrame of
          dataset labels next to the model labels they map to.
    """
    # We assume dataset is ok here
    ds = datasets.load_dataset(d_id, config)[split]

    try:
        dataset_features = ds.features
    except AttributeError:
        # Dataset does not have features, need to provide everything
        return None, None, None, None

    # Check whether we need to infer the text input column
    infer_text_input_column = True
    if "text" in column_mapping.keys():
        dataset_text_column = column_mapping["text"]
        if dataset_text_column in dataset_features.keys():
            infer_text_input_column = False
        else:
            logging.warning(f"Provided {dataset_text_column} is not in Dataset columns")

    if infer_text_input_column:
        # Try to retrieve one. getattr guards features that carry no dtype
        # attribute (e.g. nested dict features).
        candidates = [
            f for f in dataset_features
            if getattr(dataset_features[f], "dtype", None) == "string"
        ]
        if not candidates:
            # Not found a text feature
            return column_mapping, None, None, None
        logging.debug(f"Candidates are {candidates}")
        column_mapping["text"] = candidates[0]

    # Load dataset as DataFrame
    df = ds.to_pandas()

    # Retrieve all labels
    id2label = ppl.model.config.id2label
    label2id = {v: k for k, v in id2label.items()}
    prediction_input = None
    try:
        # Use the first item to test prediction
        prediction_input = df.head(1).at[0, column_mapping["text"]]
        results = ppl({"text": prediction_input}, top_k=None)
        prediction_result = {
            f'{result["label"]}({label2id[result["label"]]})': result["score"] for result in results
        }
    except Exception:
        # Pipeline prediction failed, need to provide labels. Keep this
        # best-effort, but leave a trace of why it failed.
        logging.warning("Pipeline prediction failed", exc_info=True)
        return column_mapping, prediction_input, None, None

    # Infer labels
    id2label_mapping, dataset_labels = text_classification_map_model_and_dataset_labels(id2label, dataset_features)
    if "data" in column_mapping.keys():
        if isinstance(column_mapping["data"], list):
            # Use the column mapping passed by user
            for user_label, model_label in column_mapping["data"]:
                id2label_mapping[model_label] = user_label
    elif None in id2label_mapping.values():
        column_mapping["label"] = {
            i: None for i in id2label.keys()
        }
        return column_mapping, prediction_input, prediction_result, None

    # Build the dataset -> model lookup only now, so that it reflects the
    # user's overrides; unmatched model labels (None) are left out.
    id2label_mapping_dataset_model = {
        v: k for k, v in id2label_mapping.items() if v is not None
    }
    if dataset_labels is None:
        # No ClassLabel feature qualified: fall back to the mapped labels.
        dataset_labels = list(id2label_mapping_dataset_model)

    prediction_result = {
        f'[{label2id[result["label"]]}]{result["label"]}(original) - {id2label_mapping[result["label"]]}(mapped)': result["score"] for result in results
    }
    id2label_df = pd.DataFrame({
        "Dataset Labels": dataset_labels,
        # .get: a dataset label the user did not map shows up as None
        # instead of aborting the whole summary.
        "Model Prediction Labels": [id2label_mapping_dataset_model.get(label) for label in dataset_labels],
    })

    if "data" not in column_mapping.keys():
        # Column mapping should contain original model labels
        column_mapping["label"] = {
            str(i): id2label_mapping_dataset_model[label] for i, label in zip(id2label.keys(), dataset_labels)
        }

    return column_mapping, prediction_input, prediction_result, id2label_df