Spaces:

minskiter
/

resume-token-classification

Runtime error

App Files Files Community

minskiter committed on Jul 14, 2023

Commit

07eca22

•

1 Parent(s): 7d71864

feat(app.py): update app.py

Browse files

Files changed (9) hide show

app.py +13 -7
models/bert/__init__.py +0 -1
models/bert/configuration_bert.py +0 -51
models/bert/model_bert.py +0 -41
models/crf/__init__.py +0 -1
models/crf/model_crf.py +0 -166
pipelines/__init__.py +0 -1
pipelines/ner_pipeline.py +0 -114
register.py +0 -8

app.py CHANGED Viewed

@@ -1,18 +1,24 @@
-from transformers import BertTokenizer,AutoModel
 from transformers.pipelines import pipeline
-from register import register
 import gradio as gr
 from huggingface_hub import login
 import os
-register()
 login(os.environ["HF_Token"])
-tokenizer = BertTokenizer.from_pretrained("minskiter/resume_token_classification",use_auth_token=True)
-model = AutoModel.from_pretrained("minskiter/resume_token_classification",use_auth_token=True)
 ner_predictor = pipeline(
-    "ner_predictor",
     model=model,
     tokenizer=tokenizer,
-    device="cpu"
 )
 def ner_predictor_gradio(input):

+from transformers import AutoTokenizer,AutoModel,BertTokenizer
 from transformers.pipelines import pipeline
 import gradio as gr
 from huggingface_hub import login
 import os
 login(os.environ["HF_Token"])
+model = AutoModel.from_pretrained(
+    "minskiter/resume-token-classification",
+    use_auth_token=True,
+    trust_remote_code=True
+)
+tokenizer = AutoTokenizer.from_pretrained(
+    "hfl/chinese-bert-wwm"
+)
 ner_predictor = pipeline(
+    task="nerpipe",
     model=model,
+    config=model.config,
     tokenizer=tokenizer,
+    device="cpu",
+    trust_remote_code=True
 )
 def ner_predictor_gradio(input):

models/bert/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- from .model_bert import BertCrfModel,BertCrfConfig

models/bert/configuration_bert.py DELETED Viewed

@@ -1,51 +0,0 @@
-from transformers import PretrainedConfig
class BertCrfConfig(PretrainedConfig):
    """Configuration for the ``bert_crf`` model type.

    Every keyword argument is stored unchanged as an attribute of the same
    name. ``pad_token_id`` and any extra ``**kwargs`` are forwarded to
    ``PretrainedConfig.__init__``.

    Besides the BERT-style hyperparameters, the config carries:

    * ``lstm_hidden_state`` -- hidden size of the LSTM that ``BertCrfModel``
      stacks on top of the encoder.
    * ``num_tags`` -- number of output tags.
    * ``tag2id`` / ``id2tag`` -- tag-name <-> id mappings. ``id2tag`` is keyed
      by the *string* form of the id. When omitted, a two-tag ``O``/``I``
      mapping is used.
    """

    model_type = "bert_crf"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        use_cache=True,
        classifier_dropout=None,
        lstm_hidden_state=300,
        num_tags=2,
        tag2id=None,
        id2tag=None,
        **kwargs
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.classifier_dropout = classifier_dropout
        self.lstm_hidden_state = lstm_hidden_state
        self.num_tags = num_tags
        # Build the default mappings per instance: a dict literal in the
        # signature would be one object shared (and mutated) by every config.
        self.tag2id = {"O": 0, "I": 1} if tag2id is None else tag2id
        self.id2tag = {"0": "O", "1": "I"} if id2tag is None else id2tag

models/bert/model_bert.py DELETED Viewed

@@ -1,41 +0,0 @@
-from transformers import PreTrainedModel,BertModel
-from torch import nn
-from transformers.configuration_utils import PretrainedConfig
-from ..crf import CRF
-from .configuration_bert import BertCrfConfig
class BertCrfModel(PreTrainedModel):
    """BERT encoder followed by a bidirectional LSTM, a linear layer and a CRF.

    ``forward`` returns ``(loss, (paths, labels))``:

    * ``loss`` -- the CRF loss when ``labels`` is given, otherwise ``None``.
    * ``paths`` -- one decoded tag-id list per batch item, with the first and
      last decoded positions removed.
    * ``labels`` -- the given labels moved to CPU, or ``None``.
    """

    config_class = BertCrfConfig

    def __init__(self, config, num_tags = None):
        super().__init__(config)
        # An explicit ``num_tags`` overrides the value stored on the config.
        if num_tags is not None:
            config.num_tags = num_tags
        lstm_width = config.lstm_hidden_state
        self.bert = BertModel(config=config, add_pooling_layer=False)
        self.lstm = nn.LSTM(
            input_size=config.hidden_size,
            hidden_size=lstm_width,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        self.crf = CRF(config.num_tags)
        # The LSTM is bidirectional, so its output is twice ``lstm_width`` wide.
        self.fc = nn.Linear(2 * lstm_width, config.num_tags)

    def forward(self, input_ids, attention_mask, token_type_ids, input_mask, labels=None):
        encoder_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        sequence_states, _ = self.lstm(encoder_out[0])
        emissions = self.fc(sequence_states)

        # The CRF receives the positions where ``input_mask`` is 0 as its mask.
        crf_mask = input_mask == 0

        loss = self.crf.loss(emissions, labels, crf_mask) if labels is not None else None
        _, best_path = self.crf(emissions, crf_mask)

        # Drop the first and last decoded tag of every path
        # (presumably the [CLS]/[SEP] positions -- confirm against the tokenizer).
        trimmed_paths = [path[1:-1] for path in best_path]
        gold = None if labels is None else labels.cpu()
        return loss, (trimmed_paths, gold)

models/crf/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- from .model_crf import CRF

models/crf/model_crf.py DELETED Viewed

@@ -1,166 +0,0 @@
-import torch
-import torch.nn as nn
def log_sum_exp(x):
    """Return ``log(sum(exp(x)))`` reduced over the last dimension of ``x``.

    Delegates to ``torch.logsumexp``, which is numerically stable in the same
    way as the identity ``max(x) + log(sum(exp(x - max(x))))``. Unlike a
    hand-written version of that identity, it returns ``-inf`` rather than NaN
    when every entry along the last dimension is ``-inf``.

    :param x: tensor of log-space scores.
    :return: tensor with the shape of ``x`` minus its last dimension.
    """
    return torch.logsumexp(x, dim=-1)
# Large negative log-score used to mark transitions/emissions as forbidden.
IMPOSSIBLE = -1e4


class CRF(nn.Module):
    """Linear-chain CRF over emission scores supplied by the caller.

    The module owns only a transition matrix; it has no projection layer, so
    callers pass per-tag emission scores of shape ``[B, L, num_tags]``.
    Two extra tags, START and STOP, are appended internally.

    :param num_tags: number of real tags. Do NOT count START/STOP; they are
        added internally as the last two tag indices.
    """

    def __init__(self, num_tags):
        super().__init__()
        self.num_tags = num_tags + 2
        self.start_idx = self.num_tags - 2
        self.stop_idx = self.num_tags - 1
        # transitions[i, j] is the score of moving from tag j to tag i.
        self.transitions = nn.Parameter(torch.randn(self.num_tags, self.num_tags), requires_grad=True)
        with torch.no_grad():
            # Nothing may transition into START or out of STOP.
            self.transitions[self.start_idx, :] = IMPOSSIBLE
            self.transitions[:, self.stop_idx] = IMPOSSIBLE

    def __get_emission_score(self, features):
        """Append IMPOSSIBLE emission columns for START and STOP: [B, L, C] -> [B, L, C + 2]."""
        batch, seq_len, _ = features.size()
        # new_full keeps the dtype and device of ``features`` so the
        # concatenation never mixes precisions or devices.
        filler = features.new_full((batch, seq_len, 2), IMPOSSIBLE)
        return torch.cat([features, filler], dim=-1)

    def forward(self, features, masks):
        """Decode the best tag sequence for every batch item.

        :param features: [B, L, C] emission scores over the real tags
        :param masks: [B, L]; non-zero marks positions to decode. Only the
            number of non-zero entries per row is used to size each path, so
            active positions are assumed to be a prefix -- confirm with callers.
        :return: (best_score, best_paths)
            best_score: [B] tensor
            best_paths: list of B lists of tag ids, one id per active position
        """
        features = self.__get_emission_score(features)  # [B, L, C] => [B, L, T]
        return self.__viterbi_decode(features, masks[:, :features.size(1)].float())

    def loss(self, features, ys, masks):
        """Negative log likelihood, averaged over the batch.

        :param features: [B, L, C] emission scores over the real tags
        :param ys: [B, L] gold tag ids; negative ids are replaced by 0
        :param masks: [B, L]; non-zero marks positions that contribute
        :return: scalar loss tensor
        """
        features = self.__get_emission_score(features)  # [B, L, C] => [B, L, T]
        L = features.size(1)
        masks_ = masks[:, :L].float()
        forward_score = self.__forward_algorithm(features, masks_)
        # Work on a copy so the caller's label tensor is not modified.
        ys = ys.clone().detach()
        ys[ys < 0] = 0
        gold_score = self.__score_sentence(features, ys[:, :L].long(), masks_)
        return (forward_score - gold_score).mean()

    def __score_sentence(self, features, tags, masks):
        """Score a given tag sequence.

        :param features: [B, L, T]
        :param tags: [B, L]
        :param masks: [B, L]
        :return: [B] score in log space
        """
        B, L, C = features.shape

        # Emission score of the given tag at every position.
        emit_scores = features.gather(dim=2, index=tags.unsqueeze(-1)).squeeze(-1)

        # Prepend START so every position has a previous tag: [B, L + 1].
        start_tag = torch.full((B, 1), self.start_idx, dtype=torch.long, device=tags.device)
        tags = torch.cat([start_tag, tags], dim=1)
        trans_scores = self.transitions[tags[:, 1:], tags[:, :-1]]

        # With START at index 0, the last active tag sits at index sum(mask).
        last_tag = tags.gather(dim=1, index=masks.sum(1).long().unsqueeze(1)).squeeze(1)  # [B]
        last_score = self.transitions[self.stop_idx, last_tag]

        return ((trans_scores + emit_scores) * masks).sum(1) + last_score

    def __viterbi_decode(self, features, masks):
        """Viterbi decoding.

        :param features: [B, L, T] emission scores including START/STOP columns
        :param masks: [B, L] float masks
        :return: (best_score, best_paths)
            best_score: [B]
            best_paths: list of B lists of tag ids
        """
        B, L, C = features.shape
        bps = torch.zeros(B, L, C, dtype=torch.long, device=features.device)  # back pointers

        # Same dtype as the emissions, so the in-place updates below are valid
        # for any floating-point precision.
        max_score = features.new_full((B, C), IMPOSSIBLE)  # [B, C]
        max_score[:, self.start_idx] = 0

        for t in range(L):
            mask_t = masks[:, t].unsqueeze(1)  # [B, 1]
            emit_score_t = features[:, t]  # [B, C]

            # [B, 1, C] + [C, C] => [B, C, C]; maximise over the previous tag.
            acc_score_t = max_score.unsqueeze(1) + self.transitions
            acc_score_t, bps[:, t, :] = acc_score_t.max(dim=-1)
            acc_score_t += emit_score_t
            # Keep the previous scores at masked-out positions.
            max_score = acc_score_t * mask_t + max_score * (1 - mask_t)

        # Transition to STOP.
        max_score += self.transitions[self.stop_idx]
        best_score, best_tag = max_score.max(dim=-1)

        # Follow the back pointers from the best final tag.
        best_paths = []
        bps = bps.tolist()
        for b in range(B):
            tag = best_tag[b].item()
            seq_len = int(masks[b, :].sum().item())
            path = [tag]
            for pointers in reversed(bps[b][:seq_len]):
                tag = pointers[tag]
                path.append(tag)
            # The walk ends on START: drop it and put the rest in order.
            best_paths.append(path[-2::-1])

        return best_score, best_paths

    def __forward_algorithm(self, features, masks):
        """Log partition function via the forward algorithm.

        :param features: [B, L, T]
        :param masks: [B, L] float masks
        :return: [B] score in log space
        """
        B, L, C = features.shape

        scores = features.new_full((B, C), IMPOSSIBLE)  # [B, C]
        scores[:, self.start_idx] = 0.
        trans = self.transitions.unsqueeze(0)  # [1, C, C]

        for t in range(L):
            emit_score_t = features[:, t].unsqueeze(2)  # [B, C, 1]
            # [B, 1, C] + [1, C, C] + [B, C, 1] => [B, C, C]
            score_t = scores.unsqueeze(1) + trans + emit_score_t
            score_t = log_sum_exp(score_t)  # [B, C]

            mask_t = masks[:, t].unsqueeze(1)  # [B, 1]
            scores = score_t * mask_t + scores * (1 - mask_t)
        return log_sum_exp(scores + self.transitions[self.stop_idx])

pipelines/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- from .ner_pipeline import NERPredictorPipe

pipelines/ner_pipeline.py DELETED Viewed

@@ -1,114 +0,0 @@
-from transformers import Pipeline
-from typing import Dict, Any, Union
-from transformers.pipelines.base import GenericTensor
-from transformers.modeling_outputs import ModelOutput
-import torch
class NERPredictorPipe(Pipeline):
    """Pipeline that tokenizes text, runs the model and groups tags into entity spans.

    The result of ``postprocess`` is a list (one entry per input) of lists of
    dicts with keys ``text``, ``start``, ``end`` and ``label``.
    """

    def _sanitize_parameters(self, **kwargs):
        # No call-time parameters are forwarded to any of the three stages.
        return {}, {}, {}

    def __token_preprocess(self, input, tokenizer, max_length=512):
        """Tokenize ``input``, padding/truncating to ``max_length``, as PyTorch tensors."""
        return tokenizer(
            input,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )

    def preprocess(self, sentence: Union[str, list], max_length=512) -> Dict[str, GenericTensor]:
        """Tokenize ``sentence`` and add the ``input_mask`` tensor the model expects.

        ``input_mask`` is 1 where ``input_ids`` is not positive and 0 elsewhere.
        All tensors are moved to ``self.device``.
        """
        input_tensors = self.__token_preprocess(
            sentence,
            self.tokenizer,
            max_length=max_length,
        )
        input_tensors["input_mask"] = (~(input_tensors["input_ids"] > 0)).long()
        for key in input_tensors:
            if input_tensors[key] is not None:
                input_tensors[key] = input_tensors[key].to(self.device)
        return input_tensors

    def _forward(self, input_tensors: Dict[str, GenericTensor]) -> ModelOutput:
        """Run the model without gradients; return ``(input_ids as lists, decoded paths)``."""
        self.model.eval()
        with torch.no_grad():
            _, (best_path, _) = self.model(**input_tensors)
        return (input_tensors["input_ids"].tolist(), best_path)

    def __format_output(self, start, end, text, label):
        """Build one entity record."""
        return {
            "text": text,
            "start": start,
            "end": end,
            "label": label,
        }

    def __span(self, input_ids, start, end, label):
        """Build the record for label positions ``start``..``end`` (inclusive).

        Label positions are shifted by one relative to ``input_ids`` (the
        decoded paths are assumed to omit the first token position -- confirm
        against the model's forward), hence the ``+1`` / ``+2`` slice bounds.
        """
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids[start + 1:end + 2])
        return self.__format_output(start, end, "".join(tokens), label)

    def postprocess(self, model_outputs: ModelOutput) -> Any:
        """Group per-token tags into entity spans.

        Tags are looked up in ``self.model.config.id2tag`` and are expected to
        look like ``<prefix>-<type>``:

        * ``B-`` starts a new entity.
        * ``I-`` / ``M-`` / ``E-`` extend the open entity, or start a new one
          when the type differs from the open entity's type.
        * ``S-`` is a single-token entity, reported with its own type.

        Any other tag (e.g. ``O``) is ignored.
        NOTE(review): an ignored tag does not close the open entity, so a
        later ``I-`` of the same type extends it across the gap -- confirm
        this is intended.

        :param model_outputs: ``(input_ids_list, label_ids_list)`` from ``_forward``
        :return: one list of entity dicts per batch item
        """
        input_ids_list = model_outputs[0]
        label_ids_list = model_outputs[1]
        id2tag = self.model.config.id2tag

        batch_slices = []
        for input_ids, label_ids in zip(input_ids_list, label_ids_list):
            labels = [id2tag[str(label_id)] for label_id in label_ids]
            slices = []
            past = "O"   # type of the currently open entity
            start = -1   # first label position of the open entity, -1 if none
            end = -1     # last label position of the open entity, -1 if none

            for i, label in enumerate(labels):
                # Everything after the first "-" is the entity type.
                kind = label.partition("-")[2]

                if label.startswith("B-"):
                    if start != -1 and end != -1:
                        slices.append(self.__span(input_ids, start, end, past))
                    start = i
                    end = i
                    past = kind
                elif label.startswith(("I-", "M-", "E-")):
                    if kind != past:
                        # Type changed without a B- tag: close and start anew.
                        if start != -1 and end != -1:
                            slices.append(self.__span(input_ids, start, end, past))
                        start = i
                        past = kind
                    end = i
                elif label.startswith("S-"):
                    if start != -1 and end != -1:
                        slices.append(self.__span(input_ids, start, end, past))
                    # Report the single-token entity under its own type, not
                    # the type of whatever entity preceded it.
                    slices.append(self.__span(input_ids, i, i, kind))
                    start = -1
                    end = -1
                    past = "O"

            if start != -1 and end != -1:
                slices.append(self.__span(input_ids, start, end, past))
            batch_slices.append(slices)
        return batch_slices

@@ -1,8 +0,0 @@
-from transformers.pipelines import PIPELINE_REGISTRY,AutoModel,AutoConfig
-from models.bert import BertCrfModel,BertCrfConfig
-from pipelines import NERPredictorPipe
def register():
    """Register the custom config, model and pipeline classes with ``transformers``.

    After this call, the ``"bert_crf"`` model type resolves to
    ``BertCrfConfig`` / ``BertCrfModel`` through the Auto classes, and the
    ``"ner_predictor"`` task resolves to ``NERPredictorPipe``.
    """
    AutoConfig.register("bert_crf", BertCrfConfig)
    AutoModel.register(BertCrfConfig, BertCrfModel)
    PIPELINE_REGISTRY.register_pipeline("ner_predictor", pipeline_class=NERPredictorPipe)