arubenruben committed
Commit 9941bd6
1 Parent(s): 224f9e1

commit files to HF hub

Files changed (5):
  1. config.json +9 -0
  2. model.safetensors +1 -1
  3. srl.py +130 -0
  4. tokenizer.json +3 -15
  5. tokenizer_config.json +1 -1
config.json CHANGED
@@ -4,6 +4,15 @@
    "DebertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
+ "custom_pipelines": {
+   "srl": {
+     "impl": "srl.SRLPipeline",
+     "pt": [
+       "AutoModelForTokenClassification"
+     ],
+     "tf": []
+   }
+ },
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2077f067de2afd8c7a58df297697d135007c8f9ce497d047adbcfb15a3a448db
+ oid sha256:1c9f753a87fed0bc07db2ef6b4fabd8976f055b8a27155218dac9965f5bef6ef
  size 554618516
srl.py ADDED
@@ -0,0 +1,130 @@
+ import spacy
+ import numpy as np
+ from transformers import Pipeline
+
+
+ class SRLPipeline(Pipeline):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         spacy.prefer_gpu()
+
+         if not spacy.util.is_package("pt_core_news_sm"):
+             spacy.cli.download("pt_core_news_sm")
+
+         self.nlp = spacy.load("pt_core_news_sm")
+
+     def align_labels_with_tokens(self, tokenized_inputs, all_labels):
+         results = []
+
+         for i, labels in enumerate(all_labels):
+             word_ids = tokenized_inputs.word_ids(batch_index=i)
+             type_ids = tokenized_inputs[i].type_ids
+
+             num_special_tokens = len(
+                 [type_id for type_id in type_ids if type_id != 0])
+
+             if num_special_tokens > 0:
+                 word_ids = word_ids[:-num_special_tokens]
+
+             new_labels = []
+             current_word = None
+
+             for word_id in word_ids:
+
+                 if word_id != current_word:
+                     # Start of a new word!
+                     current_word = word_id
+                     label = -100 if word_id is None else labels[word_id]
+                     new_labels.append(label)
+                 elif word_id is None:
+                     # Special token
+                     new_labels.append(-100)
+                 else:
+                     """
+                     # Same word as previous token
+                     label = labels[word_id]
+                     # If the label is B-XXX we change it to I-XXX
+                     if label % 2 == 1:
+                         label += 1
+                     """
+                     new_labels.append(-100)
+
+             results.append(new_labels)
+
+         tokenized_inputs['labels'] = results
+
+         return tokenized_inputs
+
+     def _sanitize_parameters(self, **kwargs):
+         preprocess_kwargs = {}
+
+         if "verb" in kwargs:
+             preprocess_kwargs["verb"] = kwargs["verb"]
+
+         return preprocess_kwargs, {}, {}
+
+     def preprocess(self, text):
+
+         self.text = text
+
+         doc = self.nlp(text.strip())
+
+         self.label_names = self.model.config.id2label
+
+         # Extract list with verbs from the text
+         self.verbs = [token.text for token in doc if token.pos_ == "VERB"]
+
+         results = []
+
+         tokenized_input = [token.text for token in doc]
+         raw_labels = [0] * len(tokenized_input)
+
+         for verb in self.verbs:
+             tokenized_results = self.tokenizer(
+                 tokenized_input, [verb], truncation=True,
+                 is_split_into_words=True,
+                 return_tensors="pt", max_length=self.model.config.max_position_embeddings)
+
+             tokenized_results = self.align_labels_with_tokens(
+                 tokenized_inputs=tokenized_results, all_labels=[raw_labels])
+
+             self.labels = tokenized_results["labels"]
+
+             # Remove labels temporarily to avoid conflicts in the forward pass
+             tokenized_results.pop("labels")
+
+             results.append(tokenized_results)
+
+         return results
+
+     def _forward(self, batch_inputs):
+         results = []
+
+         for entry in batch_inputs:
+             results.append(self.model(**entry))
+
+         return results
+
+     def postprocess(self, batch_outputs):
+         outputs = []
+
+         for i, entry in enumerate(batch_outputs):
+             logits = entry.logits
+
+             predictions = np.argmax(logits, axis=-1).squeeze().tolist()
+
+             true_predictions = []
+
+             for prediction, label in zip(predictions, self.labels[0]):
+                 if label != -100:
+                     true_predictions.append(self.label_names[prediction])
+
+             outputs.append({
+                 "tokens": self.text.split(),
+                 "predictions": true_predictions,
+                 "verb": self.verbs[i]
+             })
+
+         return outputs
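
Note: per preprocess and postprocess above, the pipeline runs one forward pass per verb that spaCy tags in the input and returns one dict per verb, with "tokens" (the whitespace-split input text), "predictions" (the predicted role labels for the non-ignored word positions), and "verb". A hedged usage sketch, reusing the srl object from the config.json note above:

    # Each entry corresponds to one verb found by spaCy in the sentence.
    for frame in srl("A menina leu o livro na escola."):
        print(frame["verb"])         # the verb this frame describes
        print(frame["tokens"])       # whitespace-split input tokens
        print(frame["predictions"])  # predicted role labels for aligned words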
tokenizer.json CHANGED
@@ -1,19 +1,7 @@
  {
    "version": "1.0",
-   "truncation": {
-     "direction": "Right",
-     "max_length": 512,
-     "strategy": "LongestFirst",
-     "stride": 0
-   },
-   "padding": {
-     "strategy": "BatchLongest",
-     "direction": "Right",
-     "pad_to_multiple_of": null,
-     "pad_id": 0,
-     "pad_type_id": 0,
-     "pad_token": "[PAD]"
-   },
+   "truncation": null,
+   "padding": null,
    "added_tokens": [
      {
        "id": 0,
@@ -64,7 +52,7 @@
    "normalizer": null,
    "pre_tokenizer": {
      "type": "ByteLevel",
-     "add_prefix_space": true,
+     "add_prefix_space": false,
      "trim_offsets": true,
      "use_regex": true
    },
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
  {
    "add_bos_token": false,
-   "add_prefix_space": true,
+   "add_prefix_space": false,
    "added_tokens_decoder": {
      "0": {
        "content": "[PAD]",