giskard-evaluator

Running

App Files Files Community

200

inoki-giskard

ZeroCommand committed on Jan 20

Commit

35be7f4

•

1 Parent(s): e7115eb

GSK-2547-get-rid-of-pipeline (#51)

Browse files

- remove pipeline and improve events trigger (461883adf15e41590810f0a6b15b21cb9ec07ffd)

Co-authored-by: zcy <ZeroCommand@users.noreply.huggingface.co>

Files changed (4) hide show

app_text_classification.py +14 -53
text_classification.py +63 -13
text_classification_ui_helpers.py +67 -81
wordings.py +20 -1

app_text_classification.py CHANGED Viewed

@@ -8,11 +8,10 @@ from text_classification_ui_helpers import (
     align_columns_and_show_prediction,
     check_dataset,
     precheck_model_ds_enable_example_btn,
-    select_run_mode,
     try_submit,
     write_column_mapping_to_config,
 )
-from wordings import CONFIRM_MAPPING_DETAILS_MD, INTRODUCTION_MD
 MAX_LABELS = 40
 MAX_FEATURES = 20
@@ -80,30 +79,9 @@ def get_demo():
                         column_mappings.append(gr.Dropdown(visible=False))
     with gr.Accordion(label="Model Wrap Advance Config", open=True):
-        run_inference = gr.Checkbox(
-            value=True,
-            label="Run with HF Inference API"
-        )
-        gr.HTML(
-            value="""
-                We recommend to use
-                <a href="https://huggingface.co/docs/api-inference/detailed_parameters#text-classification-task">
-                    Hugging Face Inference API
-                </a>
-                for the evaluation,
-                which requires your <a href="https://huggingface.co/settings/tokens">HF token</a>.
-                <br/>
-                Otherwise, an
-                <a href="https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.TextClassificationPipeline">
-                    HF pipeline
-                </a>
-                will be created and run in this Space. It takes more time to get the result.
-                <br/>
-                <b>
-                Do not worry, your HF token is only used in this Space for your evaluation.
-                </b>
-            """,
-        )
         inference_token = gr.Textbox(
             placeholder="hf-xxxxxxxxxxxxxxxxxxxx",
             value="",
@@ -112,7 +90,6 @@ def get_demo():
             interactive=True,
         )
     with gr.Accordion(label="Scanner Advance Config (optional)", open=False):
         scanners = gr.CheckboxGroup(label="Scan Settings", visible=True)
@@ -143,37 +120,21 @@ def get_demo():
             every=0.5,
         )
-    dataset_id_input.change(
-        check_dataset,
-        inputs=[dataset_id_input],
-        outputs=[dataset_config_input, dataset_split_input, first_line_ds, loading_status],
-    )
-    dataset_config_input.change(
-        check_dataset,
-        inputs=[dataset_id_input, dataset_config_input],
-        outputs=[dataset_config_input, dataset_split_input, first_line_ds, loading_status],
-    )
-    dataset_split_input.change(
-        check_dataset,
-        inputs=[dataset_id_input, dataset_config_input, dataset_split_input],
-        outputs=[dataset_config_input, dataset_split_input, first_line_ds, loading_status],
-    )
     scanners.change(write_scanners, inputs=[scanners, uid_label])
-    run_inference.change(
-        select_run_mode,
-        inputs=[run_inference],
-        outputs=[inference_token],
-    )
     gr.on(
         triggers=[model_id_input.change],
         fn=get_related_datasets_from_leaderboard,
         inputs=[model_id_input],
         outputs=[dataset_id_input],
     )
     gr.on(
@@ -209,7 +170,7 @@ def get_demo():
             dataset_config_input,
             dataset_split_input,
         ],
-        outputs=[example_btn, loading_status],
     )
     gr.on(
@@ -254,7 +215,7 @@ def get_demo():
     )
     def enable_run_btn(run_inference, inference_token, model_id, dataset_id, dataset_config, dataset_split):
-        if run_inference and inference_token == "":
             return gr.update(interactive=False)
         if model_id == "" or dataset_id == "" or dataset_config == "" or dataset_split == "":
             return gr.update(interactive=False)

     align_columns_and_show_prediction,
     check_dataset,
     precheck_model_ds_enable_example_btn,
     try_submit,
     write_column_mapping_to_config,
 )
+from wordings import CONFIRM_MAPPING_DETAILS_MD, INTRODUCTION_MD, USE_INFERENCE_API_TIP
 MAX_LABELS = 40
 MAX_FEATURES = 20
                         column_mappings.append(gr.Dropdown(visible=False))
     with gr.Accordion(label="Model Wrap Advance Config", open=True):
+        gr.HTML(USE_INFERENCE_API_TIP)
+        run_inference = gr.Checkbox(value=True, label="Run with Inference API")
         inference_token = gr.Textbox(
             placeholder="hf-xxxxxxxxxxxxxxxxxxxx",
             value="",
             interactive=True,
         )
     with gr.Accordion(label="Scanner Advance Config (optional)", open=False):
         scanners = gr.CheckboxGroup(label="Scan Settings", visible=True)
             every=0.5,
         )
     scanners.change(write_scanners, inputs=[scanners, uid_label])
     gr.on(
         triggers=[model_id_input.change],
         fn=get_related_datasets_from_leaderboard,
         inputs=[model_id_input],
         outputs=[dataset_id_input],
+    ).then(fn=check_dataset, inputs=[dataset_id_input], outputs=[dataset_config_input, dataset_split_input, loading_status])
+    gr.on(
+        triggers=[dataset_id_input.input],
+        fn=check_dataset,
+        inputs=[dataset_id_input],
+        outputs=[dataset_config_input, dataset_split_input, loading_status]
     )
     gr.on(
             dataset_config_input,
             dataset_split_input,
         ],
+        outputs=[example_btn, first_line_ds, loading_status],
     )
     gr.on(
     )
     def enable_run_btn(run_inference, inference_token, model_id, dataset_id, dataset_config, dataset_split):
+        if not run_inference or inference_token == "":
             return gr.update(interactive=False)
         if model_id == "" or dataset_id == "" or dataset_config == "" or dataset_split == "":
             return gr.update(interactive=False)

text_classification.py CHANGED Viewed

@@ -5,15 +5,13 @@ import datasets
 import huggingface_hub
 import pandas as pd
 from transformers import pipeline
-def get_labels_and_features_from_dataset(dataset_id, dataset_config, split):
-    if not dataset_config:
-        dataset_config = "default"
-    if not split:
-        split = "train"
     try:
-        ds = datasets.load_dataset(dataset_id, dataset_config)[split]
         dataset_features = ds.features
         label_keys = [i for i in dataset_features.keys() if i.startswith('label')]
         if len(label_keys) == 0: # no labels found
@@ -29,12 +27,60 @@ def get_labels_and_features_from_dataset(dataset_id, dataset_config, split):
         return labels, features
     except Exception as e:
         logging.warning(
-            f"Failed to load dataset {dataset_id} with config {dataset_config}: {e}"
         )
         return None, None
-def check_model(model_id):
     try:
         task = huggingface_hub.model_info(model_id).pipeline_tag
     except Exception:
@@ -207,7 +253,7 @@ def check_dataset_features_validity(d_id, config, split):
     return df, dataset_features
-def get_example_prediction(ppl, dataset_id, dataset_config, dataset_split):
     # get a sample prediction from the model on the dataset
     prediction_input = None
     prediction_result = None
@@ -220,9 +266,13 @@ def get_example_prediction(ppl, dataset_id, dataset_config, dataset_split):
         else:
             prediction_input = ds[0]["text"]
-        print("prediction_input", prediction_input)
-        results = ppl(prediction_input, top_k=None)
-        # Display results in original label and mapped label
         prediction_result = {
             f'{result["label"]}': result["score"] for result in results
         }
@@ -298,4 +348,4 @@ def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, sp
         prediction_result,
         id2label_df,
         feature_map_df,
-    )

 import huggingface_hub
 import pandas as pd
 from transformers import pipeline
+import requests
+import os
+HF_WRITE_TOKEN = "HF_WRITE_TOKEN"
+def get_labels_and_features_from_dataset(ds):
     try:
         dataset_features = ds.features
         label_keys = [i for i in dataset_features.keys() if i.startswith('label')]
         if len(label_keys) == 0: # no labels found
         return labels, features
     except Exception as e:
         logging.warning(
+            f"Get Labels/Features Failed for dataset: {e}"
         )
         return None, None
def check_model_task(model_id):
    """Return the Hugging Face Hub pipeline tag of ``model_id``.

    Args:
        model_id: Repository id of the model on the Hugging Face Hub.

    Returns:
        The ``pipeline_tag`` reported by ``huggingface_hub.model_info``
        (which may itself be ``None``), or ``None`` when the lookup fails
        for any reason.
    """
    try:
        # A missing tag is already None, so no separate None branch is needed.
        return huggingface_hub.model_info(model_id).pipeline_tag
    except Exception as e:
        # Best-effort validity check: keep returning None, but leave a trace
        # of why the lookup failed instead of swallowing it silently.
        logging.warning(f"Failed to get model info for {model_id}: {e}")
        return None
def get_model_labels(model_id, example_input):
    """Query the inference API with one example and collect the labels it returns.

    The token is read from the environment variable named by
    ``HF_WRITE_TOKEN`` (empty string when unset).

    Returns:
        ``None`` when the response contains ``"error"``, otherwise every
        value stored under a ``"label"`` key anywhere in the response.
    """
    token = os.environ.get(HF_WRITE_TOKEN, default="")
    api_result = hf_inference_api(
        model_id,
        token,
        {"inputs": example_input, "options": {"use_cache": True}},
    )
    return None if "error" in api_result else extract_from_response(api_result, "label")
def extract_from_response(data, key):
    """Collect every non-``None`` value stored under ``key`` in nested dicts/lists.

    Dicts contribute their own ``key`` entry first and are then searched
    value by value; lists are searched element by element. Any other type
    yields an empty list.
    """
    if isinstance(data, dict):
        direct = data.get(key)
        found = [] if direct is None else [direct]
        children = data.values()
    elif isinstance(data, list):
        found = []
        children = data
    else:
        return []
    for child in children:
        found += extract_from_response(child, key)
    return found
def hf_inference_api(model_id, hf_token, payload):
    """POST ``payload`` to the inference API for ``model_id`` and decode the reply.

    The base URL comes from the ``HF_INFERENCE_ENDPOINT`` environment
    variable and defaults to ``https://api-inference.huggingface.co``.

    Args:
        model_id: Model repository id appended to ``/models/``.
        hf_token: Token sent as a ``Bearer`` authorization header.
        payload: JSON-serializable request body.

    Returns:
        The decoded JSON body. When the body cannot be decoded, a dict
        ``{"error": <raw response bytes>}`` is returned instead.
    """
    hf_inference_api_endpoint = os.environ.get(
        "HF_INFERENCE_ENDPOINT", default="https://api-inference.huggingface.co"
    )
    url = f"{hf_inference_api_endpoint}/models/{model_id}"
    headers = {"Authorization": f"Bearer {hf_token}"}
    # NOTE(review): no timeout is passed, so a stalled connection blocks the caller.
    response = requests.post(url, headers=headers, json=payload)
    if response.status_code != 200:
        # logging.ERROR is the integer level constant and is not callable;
        # logging.error() is the function that actually emits the record.
        logging.error(f"Request to inference API returns {response.status_code}")
    try:
        return response.json()
    except Exception:
        return {"error": response.content}
+def check_model_pipeline(model_id):
     try:
         task = huggingface_hub.model_info(model_id).pipeline_tag
     except Exception:
     return df, dataset_features
+def get_example_prediction(model_id, dataset_id, dataset_config, dataset_split):
     # get a sample prediction from the model on the dataset
     prediction_input = None
     prediction_result = None
         else:
             prediction_input = ds[0]["text"]
+        hf_token = os.environ.get(HF_WRITE_TOKEN, default="")
+        payload = {"inputs": prediction_input, "options": {"use_cache": True}}
+        results = hf_inference_api(model_id, hf_token, payload)
+        while isinstance(results, list):
+            if isinstance(results[0], dict):
+                break
+            results = results[0]
         prediction_result = {
             f'{result["label"]}': result["score"] for result in results
         }
         prediction_result,
         id2label_df,
         feature_map_df,
+    )

text_classification_ui_helpers.py CHANGED Viewed

@@ -9,7 +9,6 @@ import leaderboard
 import datasets
 import gradio as gr
 import pandas as pd
-from transformers.pipelines import TextClassificationPipeline
 from io_utils import (
     get_yaml_path,
@@ -19,7 +18,7 @@ from io_utils import (
     write_log_to_user_file,
 )
 from text_classification import (
-    check_model,
     get_example_prediction,
     get_labels_and_features_from_dataset,
 )
@@ -43,72 +42,55 @@ HF_GSK_HUB_HF_TOKEN = "GSK_HF_TOKEN"
 HF_GSK_HUB_UNLOCK_TOKEN = "GSK_HUB_UNLOCK_TOKEN"
 LEADERBOARD = "giskard-bot/evaluator-leaderboard"
 def get_related_datasets_from_leaderboard(model_id):
     records = leaderboard.records
     model_records = records[records["model_id"] == model_id]
-    datasets_unique = model_records["dataset_id"].unique()
     if len(datasets_unique) == 0:
         all_unique_datasets = list(records["dataset_id"].unique())
-        print(type(all_unique_datasets), all_unique_datasets)
         return gr.update(choices=all_unique_datasets, value="")
     return gr.update(choices=datasets_unique, value=datasets_unique[0])
 logger = logging.getLogger(__file__)
-def check_dataset(dataset_id, dataset_config=None, dataset_split=None):
-    configs = ["default"]
-    splits = ["default"]
-    logger.info(f"Loading {dataset_id}, {dataset_config}, {dataset_split}")
     try:
         configs = datasets.get_dataset_config_names(dataset_id)
         splits = list(
-            datasets.load_dataset(
-                dataset_id, configs[0] if not dataset_config else dataset_config
-            ).keys()
         )
-        if dataset_config == None:
-            dataset_config = configs[0]
-            dataset_split = splits[0]
-        elif dataset_split == None:
-            dataset_split = splits[0]
     except Exception as e:
-        # Dataset may not exist
-        logger.warn(
-            f"Failed to load dataset {dataset_id} with config {dataset_config}: {e}"
         )
-        if dataset_config == None:
-            return (
-                gr.Dropdown(configs, value=configs[0], visible=True),
-                gr.Dropdown(splits, value=splits[0], visible=True),
-                gr.DataFrame(pd.DataFrame(), visible=False),
-                "",
-            )
-        elif dataset_split == None:
-            return (
-                gr.Dropdown(configs, value=dataset_config, visible=True),
-                gr.Dropdown(splits, value=splits[0], visible=True),
-                gr.DataFrame(pd.DataFrame(), visible=False),
-                "",
-            )
-    dataset_dict = datasets.load_dataset(dataset_id, dataset_config)
-    dataframe: pd.DataFrame = dataset_dict[dataset_split].to_pandas().head(5)
-    return (
-        gr.Dropdown(configs, value=dataset_config, visible=True),
-        gr.Dropdown(splits, value=dataset_split, visible=True),
-        gr.DataFrame(dataframe, visible=True),
-        "",
-    )
-def select_run_mode(run_inf):
-    if run_inf:
-        return gr.update(visible=True)
-    else:
-        return gr.update(visible=False)
 def write_column_mapping_to_config(uid, *labels):
     # TODO: Substitute 'text' with more features for zero-shot
@@ -144,8 +126,7 @@ def export_mappings(all_mappings, key, subkeys, values):
     return all_mappings
-def list_labels_and_features_from_dataset(ds_labels, ds_features, model_id2label, uid):
-    model_labels = list(model_id2label.values())
     all_mappings = read_column_mapping(uid)
     # For flattened raw datasets with no labels
     # check if there are shared labels between model and dataset
@@ -163,7 +144,7 @@ def list_labels_and_features_from_dataset(ds_labels, ds_features, model_id2label
         gr.Dropdown(
             label=f"{label}",
             choices=model_labels,
-            value=model_id2label[i % len(model_labels)],
             interactive=True,
             visible=True,
         )
@@ -195,25 +176,37 @@ def list_labels_and_features_from_dataset(ds_labels, ds_features, model_id2label
 def precheck_model_ds_enable_example_btn(
     model_id, dataset_id, dataset_config, dataset_split
 ):
-    ppl = check_model(model_id)
-    if ppl is None or not isinstance(ppl, TextClassificationPipeline):
         gr.Warning("Please check your model.")
         return gr.update(interactive=False), ""
-    ds_labels, ds_features = get_labels_and_features_from_dataset(
-        dataset_id, dataset_config, dataset_split
-    )
-    if not isinstance(ds_labels, list) or not isinstance(ds_features, list):
-        gr.Warning(CHECK_CONFIG_OR_SPLIT_RAW)
-        return gr.update(interactive=False), ""
-    return gr.update(interactive=True), ""
 def align_columns_and_show_prediction(
     model_id, dataset_id, dataset_config, dataset_split, uid, run_inference, inference_token
 ):
-    ppl = check_model(model_id)
-    if ppl is None or not isinstance(ppl, TextClassificationPipeline):
         gr.Warning("Please check your model.")
         return (
             gr.update(visible=False),
@@ -228,20 +221,15 @@ def align_columns_and_show_prediction(
         gr.Dropdown(visible=False) for _ in range(MAX_LABELS + MAX_FEATURES)
     ]
-    if ppl is None:  # pipeline not found
-        gr.Warning("Model not found")
-        return (
-            gr.update(visible=False),
-            gr.update(visible=False),
-            gr.update(visible=False, open=False),
-            gr.update(interactive=False),
-            *dropdown_placement,
-        )
-    model_id2label = ppl.model.config.id2label
-    ds_labels, ds_features = get_labels_and_features_from_dataset(
-        dataset_id, dataset_config, dataset_split
     )
     # when dataset does not have labels or features
     if not isinstance(ds_labels, list) or not isinstance(ds_features, list):
         gr.Warning(CHECK_CONFIG_OR_SPLIT_RAW)
@@ -257,14 +245,14 @@ def align_columns_and_show_prediction(
     column_mappings = list_labels_and_features_from_dataset(
         ds_labels,
         ds_features,
-        model_id2label,
         uid,
     )
     # when labels or features are not aligned
     # show manually column mapping
     if (
-        collections.Counter(model_id2label.values()) != collections.Counter(ds_labels)
         or ds_features[0] != "text"
     ):
         return (
@@ -276,9 +264,6 @@ def align_columns_and_show_prediction(
             *column_mappings,
         )
-    prediction_input, prediction_output = get_example_prediction(
-        ppl, dataset_id, dataset_config, dataset_split
-    )
     return (
         gr.update(value=get_styled_input(prediction_input), visible=True),
         gr.update(value=prediction_output, visible=True),
@@ -322,10 +307,10 @@ def try_submit(m_id, d_id, config, split, inference, inference_token, uid):
     if os.environ.get("SPACE_ID") == "giskardai/giskard-evaluator":
         leaderboard_dataset = LEADERBOARD
-    inference_type = "hf_pipeline"
-    if inference and inference_token:
         inference_type = "hf_inference_api"
     # TODO: Set column mapping for some dataset such as `amazon_polarity`
     command = [
         "giskard_scanner",
@@ -354,6 +339,7 @@ def try_submit(m_id, d_id, config, split, inference, inference_token, uid):
         "--inference_api_token",
         inference_token,
     ]
     # The token to publish post
     if os.environ.get(HF_WRITE_TOKEN):
         command.append("--hf_token")

 import datasets
 import gradio as gr
 import pandas as pd
 from io_utils import (
     get_yaml_path,
     write_log_to_user_file,
 )
 from text_classification import (
+    check_model_task,
     get_example_prediction,
     get_labels_and_features_from_dataset,
 )
 HF_GSK_HUB_UNLOCK_TOKEN = "GSK_HUB_UNLOCK_TOKEN"
 LEADERBOARD = "giskard-bot/evaluator-leaderboard"
+global ds_dict, ds_config
+ds_dict = None
+ds_config = None
 def get_related_datasets_from_leaderboard(model_id):
     """Build the dataset dropdown update for ``model_id`` from leaderboard records.

     When the leaderboard has rows for the model, their dataset ids become
     the choices and the first one is preselected. Otherwise every dataset
     id on the leaderboard is offered with an empty selection.
     """
     all_records = leaderboard.records
     matching = all_records[all_records["model_id"] == model_id]
     evaluated = list(matching["dataset_id"].unique())
     if evaluated:
         return gr.update(choices=evaluated, value=evaluated[0])
     fallback = list(all_records["dataset_id"].unique())
     return gr.update(choices=fallback, value="")
 logger = logging.getLogger(__file__)
def check_dataset(dataset_id):
    """Populate the config and split dropdowns for ``dataset_id``.

    Args:
        dataset_id: Dataset repository id typed or selected in the UI.

    Returns:
        A 3-tuple ``(config_update, split_update, loading_status)``. On
        success both dropdowns get their choices and are set to the first
        config and the first split of that config. When the dataset has no
        configs or cannot be loaded, both updates are no-ops. The loading
        status is always the empty string.
    """
    # Update that leaves both dropdowns untouched; shared by every failure path.
    unchanged = (gr.update(), gr.update(), "")

    logger.info(f"Loading {dataset_id}")
    try:
        configs = datasets.get_dataset_config_names(dataset_id)
        if not configs:
            return unchanged
        # NOTE(review): this loads the whole first config only to read its
        # split names; a metadata-only lookup may be cheaper, to be confirmed.
        splits = list(datasets.load_dataset(dataset_id, configs[0]).keys())
        return (
            gr.update(choices=configs, value=configs[0], visible=True),
            gr.update(choices=splits, value=splits[0], visible=True),
            "",
        )
    except Exception as e:
        # Logger.warn is a deprecated alias; warning() is the supported name.
        logger.warning(f"Check your dataset {dataset_id}: {e}")
        return unchanged
 def write_column_mapping_to_config(uid, *labels):
     # TODO: Substitute 'text' with more features for zero-shot
     return all_mappings
+def list_labels_and_features_from_dataset(ds_labels, ds_features, model_labels, uid):
     all_mappings = read_column_mapping(uid)
     # For flattened raw datasets with no labels
     # check if there are shared labels between model and dataset
         gr.Dropdown(
             label=f"{label}",
             choices=model_labels,
+            value=model_labels[i % len(model_labels)],
             interactive=True,
             visible=True,
         )
 def precheck_model_ds_enable_example_btn(
     model_id, dataset_id, dataset_config, dataset_split
 ):
     """Validate the model/dataset pair and decide whether the example button is usable.

     Every branch returns exactly three values, one per output the handler
     is wired to: the example button update, the dataset preview update
     and the loading status text.

     Args:
         model_id: Model repository id; must be a ``text-classification`` model.
         dataset_id: Dataset repository id.
         dataset_config: Selected dataset config name.
         dataset_split: Selected dataset split name.

     Returns:
         ``(example_btn_update, preview_update, "")``. The button is only
         made interactive when the model task matches and the split exposes
         both labels and features as lists.
     """
     # check_model_task returns None on failure, which also fails this comparison.
     if check_model_task(model_id) != "text-classification":
         gr.Warning("Please check your model.")
         # Must be a 3-tuple like the other branches; the preview is left
         # as it is because only the model was found to be invalid.
         return (gr.update(interactive=False), gr.update(), "")

     if dataset_config is None or dataset_split is None or len(dataset_config) == 0:
         # Dropdowns are not filled in yet: nothing to validate.
         return (gr.update(), gr.update(), "")

     try:
         ds = datasets.load_dataset(dataset_id, dataset_config)
         df: pd.DataFrame = ds[dataset_split].to_pandas().head(5)
         ds_labels, ds_features = get_labels_and_features_from_dataset(ds[dataset_split])

         if not isinstance(ds_labels, list) or not isinstance(ds_features, list):
             gr.Warning(CHECK_CONFIG_OR_SPLIT_RAW)
             return (gr.update(interactive=False), gr.update(value=df, visible=True), "")

         return (gr.update(interactive=True), gr.update(value=df, visible=True), "")
     except Exception as e:
         # Config or split wrong
         gr.Warning(f"Failed to load dataset {dataset_id} with config {dataset_config}: {e}")
         return (gr.update(interactive=False), gr.update(value=pd.DataFrame(), visible=False), "")
 def align_columns_and_show_prediction(
     model_id, dataset_id, dataset_config, dataset_split, uid, run_inference, inference_token
 ):
+    model_task = check_model_task(model_id)
+    if model_task is None or model_task != "text-classification":
         gr.Warning("Please check your model.")
         return (
             gr.update(visible=False),
         gr.Dropdown(visible=False) for _ in range(MAX_LABELS + MAX_FEATURES)
     ]
+    prediction_input, prediction_output = get_example_prediction(
+        model_id, dataset_id, dataset_config, dataset_split
     )
+    model_labels = list(prediction_output.keys())
+    ds = datasets.load_dataset(dataset_id, dataset_config)[dataset_split]
+    ds_labels, ds_features = get_labels_and_features_from_dataset(ds)
     # when dataset does not have labels or features
     if not isinstance(ds_labels, list) or not isinstance(ds_features, list):
         gr.Warning(CHECK_CONFIG_OR_SPLIT_RAW)
     column_mappings = list_labels_and_features_from_dataset(
         ds_labels,
         ds_features,
+        model_labels,
         uid,
     )
     # when labels or features are not aligned
     # show manually column mapping
     if (
+        collections.Counter(model_labels) != collections.Counter(ds_labels)
         or ds_features[0] != "text"
     ):
         return (
             *column_mappings,
         )
     return (
         gr.update(value=get_styled_input(prediction_input), visible=True),
         gr.update(value=prediction_output, visible=True),
     if os.environ.get("SPACE_ID") == "giskardai/giskard-evaluator":
         leaderboard_dataset = LEADERBOARD
+    if inference:
         inference_type = "hf_inference_api"
     # TODO: Set column mapping for some dataset such as `amazon_polarity`
     command = [
         "giskard_scanner",
         "--inference_api_token",
         inference_token,
     ]
     # The token to publish post
     if os.environ.get(HF_WRITE_TOKEN):
         command.append("--hf_token")

wordings.py CHANGED Viewed

@@ -38,7 +38,26 @@ MAPPING_STYLED_ERROR_WARNING = """
                         </h3>
                         """
 def get_styled_input(input):
-    return f"""<h3 style="text-align: center;color: #5ec26a; background-color: #e2fbe8; border-radius: 8px; padding: 10px; ">
             Sample input: {input}
             </h3>"""

                         </h3>
                         """
+USE_INFERENCE_API_TIP = """
+                We recommend to use
+                <a href="https://huggingface.co/docs/api-inference/detailed_parameters#text-classification-task">
+                    Hugging Face Inference API
+                </a>
+                for the evaluation,
+                which requires your <a href="https://huggingface.co/settings/tokens">HF token</a>.
+                <br/>
+                Otherwise, an
+                <a href="https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.TextClassificationPipeline">
+                    HF pipeline
+                </a>
+                will be created and run in this Space. It takes more time to get the result.
+                <br/>
+                <b>
+                Do not worry, your HF token is only used in this Space for your evaluation.
+                </b>
+            """
 def get_styled_input(input):
     """Wrap a sample input in the centered green ``<h3>`` banner markup.

     The sample comes from an arbitrary dataset row, so it is HTML-escaped
     before interpolation; otherwise markup inside the sample would be
     rendered rather than displayed.

     Args:
         input: The sample to display; converted with ``str`` before escaping.

     Returns:
         The HTML snippet as a string.
     """
     from html import escape  # local import: the module header is not visible here

     safe_input = escape(str(input))
     return f"""<h3 style="text-align: center;color: #4ca154; background-color: #e2fbe8; border-radius: 8px; padding: 10px; ">
             Sample input: {safe_input}
             </h3>"""