minskiter committed on
Commit
07eca22
1 Parent(s): 7d71864

feat(app.py): update app.py

Browse files
app.py CHANGED
@@ -1,18 +1,24 @@
1
- from transformers import BertTokenizer,AutoModel
2
  from transformers.pipelines import pipeline
3
- from register import register
4
  import gradio as gr
5
  from huggingface_hub import login
6
  import os
7
# Register the custom pipeline task, config and model classes (see register.py)
# so the "ner_predictor" task name below can be resolved.
register()

# Authenticate against the Hugging Face Hub with the token taken from the
# HF_Token environment variable (raises KeyError if it is not set).
login(os.environ["HF_Token"])

tokenizer = BertTokenizer.from_pretrained(
    "minskiter/resume_token_classification",
    use_auth_token=True,
)
model = AutoModel.from_pretrained(
    "minskiter/resume_token_classification",
    use_auth_token=True,
)

# Build the NER pipeline on CPU from the already-loaded model and tokenizer.
ner_predictor = pipeline(
    "ner_predictor",
    model=model,
    tokenizer=tokenizer,
    device="cpu",
)
17
 
18
  def ner_predictor_gradio(input):
 
1
+ from transformers import AutoTokenizer,AutoModel,BertTokenizer
2
  from transformers.pipelines import pipeline
 
3
  import gradio as gr
4
  from huggingface_hub import login
5
  import os
 
6
# Authenticate against the Hugging Face Hub with the token taken from the
# HF_Token environment variable (raises KeyError if it is not set).
login(os.environ["HF_Token"])

# trust_remote_code=True allows transformers to run model code shipped in the
# Hub repository instead of code bundled with this app.
model = AutoModel.from_pretrained(
    "minskiter/resume-token-classification",
    use_auth_token=True,
    trust_remote_code=True,
)

# The tokenizer is loaded from a different repository than the model.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-bert-wwm")

# Build the "nerpipe" pipeline on CPU from the already-loaded model, its
# config and the tokenizer.
ner_predictor = pipeline(
    task="nerpipe",
    model=model,
    config=model.config,
    tokenizer=tokenizer,
    device="cpu",
    trust_remote_code=True,
)
23
 
24
  def ner_predictor_gradio(input):
models/bert/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .model_bert import BertCrfModel,BertCrfConfig
 
 
models/bert/configuration_bert.py DELETED
@@ -1,51 +0,0 @@
1
- from transformers import PretrainedConfig
2
-
3
class BertCrfConfig(PretrainedConfig):
    """Configuration for the ``bert_crf`` model type.

    Holds the BERT encoder hyper-parameters together with the settings of the
    tagging head. Every argument is stored unchanged as an attribute of the
    same name; ``pad_token_id`` and any extra keyword arguments are forwarded
    to ``PretrainedConfig``.

    Args:
        lstm_hidden_state: hidden size of the LSTM placed on top of BERT
            (``BertCrfModel`` passes it to ``nn.LSTM``).
        num_tags: number of output tags (``BertCrfModel`` passes it to the
            CRF and to the final linear layer).
        tag2id: mapping from tag name to tag id. Defaults to
            ``{"O": 0, "I": 1}``.
        id2tag: mapping from the *string* form of a tag id to the tag name.
            Defaults to ``{"0": "O", "1": "I"}``.
        **kwargs: forwarded to ``PretrainedConfig.__init__``.

    The remaining arguments (``vocab_size`` ... ``classifier_dropout``) are
    the encoder settings read by ``BertModel`` when the model is built.
    """

    model_type = "bert_crf"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        use_cache=True,
        classifier_dropout=None,
        lstm_hidden_state=300,
        num_tags=2,
        tag2id=None,
        id2tag=None,
        **kwargs
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.classifier_dropout = classifier_dropout
        self.lstm_hidden_state = lstm_hidden_state
        self.num_tags = num_tags
        # The default mappings are created per instance. A dict literal in the
        # signature would be one shared object, so mutating it on one config
        # would silently change every other config built with defaults.
        self.tag2id = {"O": 0, "I": 1} if tag2id is None else tag2id
        self.id2tag = {"0": "O", "1": "I"} if id2tag is None else id2tag
51
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/bert/model_bert.py DELETED
@@ -1,41 +0,0 @@
1
- from transformers import PreTrainedModel,BertModel
2
- from torch import nn
3
- from transformers.configuration_utils import PretrainedConfig
4
- from ..crf import CRF
5
- from .configuration_bert import BertCrfConfig
6
-
7
class BertCrfModel(PreTrainedModel):
    """BERT encoder followed by a bidirectional LSTM, a linear layer and a CRF.

    Args:
        config: a ``BertCrfConfig``.
        num_tags: optional override; when given it is written to
            ``config.num_tags`` before the CRF and linear layers are built.

    ``forward`` returns ``(loss, (best_paths, labels))``:
        loss: CRF loss when ``labels`` is given, otherwise ``None``.
        best_paths: one decoded tag list per sequence, with the first and
            last decoded tag removed.
        labels: ``labels`` moved to CPU, or ``None`` when no labels are given.
    """

    config_class = BertCrfConfig

    def __init__(self, config, num_tags=None):
        super().__init__(config)
        if num_tags is not None:
            config.num_tags = num_tags
        self.bert = BertModel(config=config, add_pooling_layer=False)
        self.lstm = nn.LSTM(
            config.hidden_size,
            config.lstm_hidden_state,
            1,
            batch_first=True,
            bidirectional=True,
        )
        self.crf = CRF(config.num_tags)
        # The LSTM is bidirectional, so its output is twice the hidden size.
        self.fc = nn.Linear(config.lstm_hidden_state * 2, config.num_tags)

    def forward(self, input_ids, attention_mask, token_type_ids, input_mask, labels=None):
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        lstm_out, _ = self.lstm(encoded[0])
        emissions = self.fc(lstm_out)

        # The CRF treats positions where input_mask == 0 as active.
        active = input_mask == 0
        loss = None if labels is None else self.crf.loss(emissions, labels, active)
        _, decoded = self.crf(emissions, active)

        # Drop the first and last decoded tag of every sequence
        # (presumably the [CLS] / [SEP] positions; confirm against the tokenizer).
        trimmed = [path[1:-1] for path in decoded]
        gold = None if labels is None else labels.cpu()
        return loss, (trimmed, gold)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/crf/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .model_crf import CRF
 
 
models/crf/model_crf.py DELETED
@@ -1,166 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
-
5
def log_sum_exp(x):
    """Return log(sum(exp(x))) over the last dimension.

    The maximum is subtracted before exponentiating and added back afterwards:
    log(sum(exp(x))) = max(x) + log(sum(exp(x - max(x)))).
    """
    peak, _ = x.max(-1)
    shifted = x - peak.unsqueeze(-1)
    return peak + shifted.exp().sum(-1).log()
10
-
11
-
12
# Large negative score used for transitions and emissions that must never be chosen.
IMPOSSIBLE = -1e4


class CRF(nn.Module):
    """Linear-chain CRF over pre-computed emission scores.

    Two internal tags, START and STOP, are appended after the user tags; the
    emission scores passed in must NOT contain columns for them.

    :param num_tags: number of tags. DO NOT include START, STOP tags, they are added internally.
    """

    def __init__(self, num_tags):
        super().__init__()

        total = num_tags + 2
        self.num_tags = total
        self.start_idx = total - 2
        self.stop_idx = total - 1

        # transitions[i, j] is the score of moving from tag j to tag i.
        weights = torch.randn(total, total)
        weights[self.start_idx, :] = IMPOSSIBLE  # nothing may move into START
        weights[:, self.stop_idx] = IMPOSSIBLE   # nothing may move out of STOP
        self.transitions = nn.Parameter(weights, requires_grad=True)

    def __get_emission_score(self, features):
        """Append IMPOSSIBLE emission columns for START and STOP: [B, L, C] => [B, L, C + 2]."""
        batch, length, _ = features.size()
        filler = torch.full((batch, length, 1), IMPOSSIBLE).to(features.device)
        return torch.cat([features, filler, filler], dim=-1)

    def forward(self, features, masks):
        """Decode the best tag sequence for every batch item.

        :param features: [B, L, C], batch of unary scores
        :param masks: [B, L] masks, non-zero where a position is active
        :return: (best_score, best_paths)
            best_score: [B]
            best_paths: list of B tag lists
        """
        scores = self.__get_emission_score(features)  # [B, L, C] => [B, L, T]
        active = masks[:, :scores.size(1)].float()
        return self.__viterbi_decode(scores, active)

    def loss(self, features, ys, masks):
        """Mean negative log likelihood of the gold tags.

        B: batch size, L: sequence length, D: dimension
        :param features: [B, L, D]
        :param ys: tags, [B, L]; negative ids are replaced by 0 on a copy
        :param masks: masks for padding, [B, L]
        :return: loss
        """
        scores = self.__get_emission_score(features)  # [B, L, C] => [B, L, T]

        length = scores.size(1)
        active = masks[:, :length].float()
        partition = self.__forward_algorithm(scores, active)

        gold_tags = ys.clone().detach()
        gold_tags[gold_tags < 0] = 0
        gold = self.__score_sentence(scores, gold_tags[:, :length].long(), active)
        return (partition - gold).mean()

    def __score_sentence(self, features, tags, masks):
        """Score a given tag sequence.

        :param features: [B, L, C]
        :param tags: [B, L]
        :param masks: [B, L]
        :return: [B] score in the log space
        """
        batch, _, _ = features.shape

        # Emission score of the chosen tag at every position.
        emit = features.gather(dim=2, index=tags.unsqueeze(-1)).squeeze(-1)

        # Transition score into every position; START is prepended as tag 0.
        start_col = torch.full((batch, 1), self.start_idx, dtype=torch.long, device=tags.device)
        padded = torch.cat([start_col, tags], dim=1)  # [B, L+1]
        trans = self.transitions[padded[:, 1:], padded[:, :-1]]

        # Transition from the last active tag into STOP. Because of the
        # prepended START column, index == number of active positions.
        lengths = masks.sum(1).long().unsqueeze(1)
        final_tag = padded.gather(dim=1, index=lengths).squeeze(1)  # [B]
        stop_score = self.transitions[self.stop_idx, final_tag]

        return ((trans + emit) * masks).sum(1) + stop_score

    def __viterbi_decode(self, features, masks):
        """Decode tags with the Viterbi algorithm.

        :param features: [B, L, C], batch of unary scores
        :param masks: [B, L] masks
        :return: (best_score, best_paths)
            best_score: [B]
            best_paths: list of B tag lists
        """
        batch, length, width = features.shape
        device = features.device

        pointers = torch.zeros(batch, length, width, dtype=torch.long, device=device)  # back pointers

        # Running best score per tag, in log space; only START is reachable at first.
        running = torch.full((batch, width), IMPOSSIBLE, device=device)  # [B, C]
        running[:, self.start_idx] = 0

        for step in range(length):
            keep = masks[:, step].unsqueeze(1)  # [B, 1]

            # [B, 1, C] + [C, C] => [B, C, C]; maximise over the previous tag.
            candidate, pointers[:, step, :] = (running.unsqueeze(1) + self.transitions).max(dim=-1)
            candidate += features[:, step]
            # Masked-out positions keep the previous running score.
            running = candidate * keep + running * (1 - keep)

        # Transition to STOP.
        running += self.transitions[self.stop_idx]
        best_score, best_tag = running.max(dim=-1)

        # Follow the back pointers to recover each best path.
        pointer_table = pointers.cpu().numpy()
        best_paths = []
        for row in range(batch):
            tag = best_tag[row].item()
            steps = int(masks[row, :].sum().item())

            trail = [tag]
            for step_pointers in reversed(pointer_table[row, :steps]):
                tag = step_pointers[tag]
                trail.append(tag)
            # Drop the final entry (the START tag) and reverse the rest.
            best_paths.append(trail[-2::-1])

        return best_score, best_paths

    def __forward_algorithm(self, features, masks):
        """Compute the partition function with the forward algorithm.

        TRICK: log_sum_exp([x1, x2, x3, x4, ...]) = log_sum_exp([log_sum_exp([x1, x2]), log_sum_exp([x3, x4]), ...])
        :param features: features. [B, L, C]
        :param masks: [B, L] masks
        :return: [B], score in the log space
        """
        batch, length, width = features.shape

        alpha = torch.full((batch, width), IMPOSSIBLE, device=features.device)  # [B, C]
        alpha[:, self.start_idx] = 0.
        pairwise = self.transitions.unsqueeze(0)  # [1, C, C]

        for step in range(length):
            emit = features[:, step].unsqueeze(2)  # [B, C, 1]
            # [B, 1, C] + [1, C, C] + [B, C, 1] => [B, C, C] => [B, C]
            stepped = log_sum_exp(alpha.unsqueeze(1) + pairwise + emit)

            keep = masks[:, step].unsqueeze(1)  # [B, 1]
            alpha = stepped * keep + alpha * (1 - keep)
        return log_sum_exp(alpha + self.transitions[self.stop_idx])
165
-
166
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pipelines/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .ner_pipeline import NERPredictorPipe
 
 
pipelines/ner_pipeline.py DELETED
@@ -1,114 +0,0 @@
1
- from transformers import Pipeline
2
- from typing import Dict, Any, Union
3
- from transformers.pipelines.base import GenericTensor
4
- from transformers.modeling_outputs import ModelOutput
5
- import torch
6
-
7
class NERPredictorPipe(Pipeline):
    """Pipeline that tokenizes text, runs the tagging model and returns entity spans.

    ``postprocess`` returns, for every input sequence, a list of dicts with
    the keys ``text``, ``start``, ``end`` and ``label``. ``start`` and ``end``
    are inclusive positions in the decoded label sequence.
    """

    def _sanitize_parameters(self, **kwargs):
        # No call-time options are supported: preprocess, forward and
        # postprocess each receive an empty kwargs dict.
        return {}, {}, {}

    def __token_preprocess(self, input, tokenizer, max_length=512):
        """Tokenize ``input``, padding/truncating to ``max_length``, as PyTorch tensors."""
        return tokenizer(
            input,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )

    def preprocess(self, sentence: Union[str, list], max_length=512) -> Dict[str, GenericTensor]:
        """Tokenize ``sentence`` and add the ``input_mask`` tensor the model expects.

        ``input_mask`` is 1 where the token id is not greater than 0 and 0
        elsewhere. All tensors are moved to the pipeline device.
        """
        input_tensors = self.__token_preprocess(
            sentence,
            self.tokenizer,
            max_length=max_length,
        )
        input_tensors["input_mask"] = (~(input_tensors["input_ids"] > 0)).long()
        for key in input_tensors:
            if input_tensors[key] is not None:
                input_tensors[key] = input_tensors[key].to(self.device)
        return input_tensors

    def _forward(self, input_tensors: Dict[str, GenericTensor]) -> ModelOutput:
        """Run the model without gradients; return ``(input_ids as lists, best tag paths)``."""
        self.model.eval()
        with torch.no_grad():
            _, (best_path, _) = self.model(**input_tensors)
        return (input_tensors["input_ids"].tolist(), best_path)

    def __format_output(self, start, end, text, label):
        return {
            "text": text,
            "start": start,
            "end": end,
            "label": label,
        }

    def __make_slice(self, input_ids, start, end, label):
        """Build the output record for label positions ``start..end`` (inclusive)."""
        # Label position i is paired with input_ids[i + 1], hence the +1 / +2
        # offsets (the model drops the first decoded tag; see BertCrfModel.forward).
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids[start + 1:end + 2])
        return self.__format_output(start, end, ''.join(tokens), label)

    def postprocess(self, model_outputs: ModelOutput) -> Any:
        """Group per-token tags into entity spans.

        Tags are looked up in ``model.config.id2tag`` and interpreted by prefix:
        ``B-`` opens an entity, ``I-``/``M-``/``E-`` extend it (or open a new
        one when the entity type changes) and ``S-`` is a single-token entity.

        Returns:
            One list of span dicts per input sequence.
        """
        input_ids_list = model_outputs[0]
        label_ids_list = model_outputs[1]

        batch_slices = []
        for input_ids, label_ids in zip(input_ids_list, label_ids_list):
            labels = [self.model.config.id2tag[str(label_id)] for label_id in label_ids]
            slices = []
            past = "O"   # entity type of the currently open span
            start = -1   # -1 means no span is open
            end = -1
            for i, label in enumerate(labels):
                if label.startswith("B-"):
                    if start != -1 and end != -1:
                        slices.append(self.__make_slice(input_ids, start, end, past))
                    start = i
                    end = i
                    past = label[2:]
                elif label.startswith(("I-", "M-", "E-")):
                    cur = label[2:]
                    if cur != past:
                        # Entity type changed: close the open span and start a new one here.
                        if start != -1 and end != -1:
                            slices.append(self.__make_slice(input_ids, start, end, past))
                        start = i
                        past = cur
                    end = i
                elif label.startswith("S-"):
                    if start != -1 and end != -1:
                        slices.append(self.__make_slice(input_ids, start, end, past))
                    # Fix: label the single-token entity with its own type. The
                    # previous code reused `past`, i.e. the type of the preceding
                    # entity (or "O" when there was none).
                    slices.append(self.__make_slice(input_ids, i, i, label[2:]))
                    start = -1
                    end = -1
                    past = "O"
                # NOTE(review): any other tag (e.g. "O") neither closes nor resets
                # the open span, so a later I-/M-/E- tag of the same type extends
                # it across the gap. Kept as-is; confirm this is intended.
            if start != -1 and end != -1:
                slices.append(self.__make_slice(input_ids, start, end, past))
            batch_slices.append(slices)
        return batch_slices
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
register.py DELETED
@@ -1,8 +0,0 @@
1
- from transformers.pipelines import PIPELINE_REGISTRY,AutoModel,AutoConfig
2
- from models.bert import BertCrfModel,BertCrfConfig
3
- from pipelines import NERPredictorPipe
4
-
5
def register():
    """Register the custom config, model and pipeline with transformers.

    After this call the ``bert_crf`` model type resolves to ``BertCrfConfig``
    / ``BertCrfModel`` through the Auto classes, and the ``ner_predictor``
    task resolves to ``NERPredictorPipe``.
    """
    # The config must be registered before the model that is keyed on it.
    AutoConfig.register("bert_crf", BertCrfConfig)
    AutoModel.register(BertCrfConfig, BertCrfModel)
    PIPELINE_REGISTRY.register_pipeline("ner_predictor", pipeline_class=NERPredictorPipe)