File size: 10,399 Bytes
1ce5e18 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
import inspect
import warnings
from typing import Dict
import numpy as np
from ..utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
from .base import PIPELINE_INIT_ARGS, GenericTensor, Pipeline
if is_tf_available():
from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
def sigmoid(_outputs):
return 1.0 / (1.0 + np.exp(-_outputs))
def softmax(_outputs):
maxes = np.max(_outputs, axis=-1, keepdims=True)
shifted_exp = np.exp(_outputs - maxes)
return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
class ClassificationFunction(ExplicitEnum):
SIGMOID = "sigmoid"
SOFTMAX = "softmax"
NONE = "none"
@add_end_docstrings(
PIPELINE_INIT_ARGS,
r"""
return_all_scores (`bool`, *optional*, defaults to `False`):
Whether to return all prediction scores or just the one of the predicted class.
function_to_apply (`str`, *optional*, defaults to `"default"`):
The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:
- `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model
has several labels, will apply the softmax function on the output.
- `"sigmoid"`: Applies the sigmoid function on the output.
- `"softmax"`: Applies the softmax function on the output.
- `"none"`: Does not apply any function on the output.
""",
)
class TextClassificationPipeline(Pipeline):
"""
Text classification pipeline using any `ModelForSequenceClassification`. See the [sequence classification
examples](../task_summary#sequence-classification) for more information.
Example:
```python
>>> from transformers import pipeline
>>> classifier = pipeline(model="distilbert-base-uncased-finetuned-sst-2-english")
>>> classifier("This movie is disgustingly good !")
[{'label': 'POSITIVE', 'score': 1.0}]
>>> classifier("Director tried too much.")
[{'label': 'NEGATIVE', 'score': 0.996}]
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
This text classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
`"sentiment-analysis"` (for classifying sequences according to positive or negative sentiments).
If multiple classification labels are available (`model.config.num_labels >= 2`), the pipeline will run a softmax
over the results. If there is a single label, the pipeline will run a sigmoid over the result.
The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See
the up-to-date list of available models on
[huggingface.co/models](https://huggingface.co/models?filter=text-classification).
"""
return_all_scores = False
function_to_apply = ClassificationFunction.NONE
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.check_model_type(
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
if self.framework == "tf"
else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
)
def _sanitize_parameters(self, return_all_scores=None, function_to_apply=None, top_k="", **tokenizer_kwargs):
# Using "" as default argument because we're going to use `top_k=None` in user code to declare
# "No top_k"
preprocess_params = tokenizer_kwargs
postprocess_params = {}
if hasattr(self.model.config, "return_all_scores") and return_all_scores is None:
return_all_scores = self.model.config.return_all_scores
if isinstance(top_k, int) or top_k is None:
postprocess_params["top_k"] = top_k
postprocess_params["_legacy"] = False
elif return_all_scores is not None:
warnings.warn(
"`return_all_scores` is now deprecated, if want a similar functionality use `top_k=None` instead of"
" `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.",
UserWarning,
)
if return_all_scores:
postprocess_params["top_k"] = None
else:
postprocess_params["top_k"] = 1
if isinstance(function_to_apply, str):
function_to_apply = ClassificationFunction[function_to_apply.upper()]
if function_to_apply is not None:
postprocess_params["function_to_apply"] = function_to_apply
return preprocess_params, {}, postprocess_params
def __call__(self, *args, **kwargs):
"""
Classify the text(s) given as inputs.
Args:
args (`str` or `List[str]` or `Dict[str]`, or `List[Dict[str]]`):
One or several texts to classify. In order to use text pairs for your classification, you can send a
dictionary containing `{"text", "text_pair"}` keys, or a list of those.
top_k (`int`, *optional*, defaults to `1`):
How many results to return.
function_to_apply (`str`, *optional*, defaults to `"default"`):
The function to apply to the model outputs in order to retrieve the scores. Accepts four different
values:
If this argument is not specified, then it will apply the following functions according to the number
of labels:
- If the model has a single label, will apply the sigmoid function on the output.
- If the model has several labels, will apply the softmax function on the output.
Possible values are:
- `"sigmoid"`: Applies the sigmoid function on the output.
- `"softmax"`: Applies the softmax function on the output.
- `"none"`: Does not apply any function on the output.
Return:
A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:
- **label** (`str`) -- The label predicted.
- **score** (`float`) -- The corresponding probability.
If `top_k` is used, one such dictionary is returned per label.
"""
result = super().__call__(*args, **kwargs)
# TODO try and retrieve it in a nicer way from _sanitize_parameters.
_legacy = "top_k" not in kwargs
if isinstance(args[0], str) and _legacy:
# This pipeline is odd, and return a list when single item is run
return [result]
else:
return result
def preprocess(self, inputs, **tokenizer_kwargs) -> Dict[str, GenericTensor]:
return_tensors = self.framework
if isinstance(inputs, dict):
return self.tokenizer(**inputs, return_tensors=return_tensors, **tokenizer_kwargs)
elif isinstance(inputs, list) and len(inputs) == 1 and isinstance(inputs[0], list) and len(inputs[0]) == 2:
# It used to be valid to use a list of list of list for text pairs, keeping this path for BC
return self.tokenizer(
text=inputs[0][0], text_pair=inputs[0][1], return_tensors=return_tensors, **tokenizer_kwargs
)
elif isinstance(inputs, list):
# This is likely an invalid usage of the pipeline attempting to pass text pairs.
raise ValueError(
"The pipeline received invalid inputs, if you are trying to send text pairs, you can try to send a"
' dictionary `{"text": "My text", "text_pair": "My pair"}` in order to send a text pair.'
)
return self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)
def _forward(self, model_inputs):
# `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported
model_forward = self.model.forward if self.framework == "pt" else self.model.call
if "use_cache" in inspect.signature(model_forward).parameters.keys():
model_inputs["use_cache"] = False
return self.model(**model_inputs)
def postprocess(self, model_outputs, function_to_apply=None, top_k=1, _legacy=True):
# `_legacy` is used to determine if we're running the naked pipeline and in backward
# compatibility mode, or if running the pipeline with `pipeline(..., top_k=1)` we're running
# the more natural result containing the list.
# Default value before `set_parameters`
if function_to_apply is None:
if self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1:
function_to_apply = ClassificationFunction.SIGMOID
elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1:
function_to_apply = ClassificationFunction.SOFTMAX
elif hasattr(self.model.config, "function_to_apply") and function_to_apply is None:
function_to_apply = self.model.config.function_to_apply
else:
function_to_apply = ClassificationFunction.NONE
outputs = model_outputs["logits"][0]
outputs = outputs.numpy()
if function_to_apply == ClassificationFunction.SIGMOID:
scores = sigmoid(outputs)
elif function_to_apply == ClassificationFunction.SOFTMAX:
scores = softmax(outputs)
elif function_to_apply == ClassificationFunction.NONE:
scores = outputs
else:
raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}")
if top_k == 1 and _legacy:
return {"label": self.model.config.id2label[scores.argmax().item()], "score": scores.max().item()}
dict_scores = [
{"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores)
]
if not _legacy:
dict_scores.sort(key=lambda x: x["score"], reverse=True)
if top_k is not None:
dict_scores = dict_scores[:top_k]
return dict_scores
|