commit files to HF hub

Browse files

Files changed (5) hide show

config.json +9 -0
model.safetensors +1 -1
srl.py +130 -0
tokenizer.json +3 -15
tokenizer_config.json +1 -1

config.json CHANGED Viewed

@@ -4,6 +4,15 @@
     "DebertaForTokenClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,

     "DebertaForTokenClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
+  "custom_pipelines": {
+    "srl": {
+      "impl": "srl.SRLPipeline",
+      "pt": [
+        "AutoModelForTokenClassification"
+      ],
+      "tf": []
+    }
+  },
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2077f067de2afd8c7a58df297697d135007c8f9ce497d047adbcfb15a3a448db
 size 554618516

 version https://git-lfs.github.com/spec/v1
+oid sha256:1c9f753a87fed0bc07db2ef6b4fabd8976f055b8a27155218dac9965f5bef6ef
 size 554618516

srl.py ADDED Viewed

	@@ -0,0 +1,130 @@

+import spacy
+import numpy as np
+from transformers import Pipeline
+class SRLPipeline(Pipeline):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        spacy.prefer_gpu()
+        if not spacy.util.is_package("pt_core_news_sm"):
+            spacy.cli.download("pt_core_news_sm")
+        self.nlp = spacy.load("pt_core_news_sm")
+    def align_labels_with_tokens(self, tokenized_inputs, all_labels):
+        results = []
+        for i, labels in enumerate(all_labels):
+            word_ids = tokenized_inputs.word_ids(batch_index=i)
+            type_ids = tokenized_inputs[i].type_ids
+            num_special_tokens = len(
+                [type_id for type_id in type_ids if type_id != 0])
+            if num_special_tokens > 0:
+                word_ids = word_ids[:-num_special_tokens]
+            new_labels = []
+            current_word = None
+            for word_id in word_ids:
+                if word_id != current_word:
+                    # Start of a new word!
+                    current_word = word_id
+                    label = -100 if word_id is None else labels[word_id]
+                    new_labels.append(label)
+                elif word_id is None:
+                    # Special token
+                    new_labels.append(-100)
+                else:
+                    """
+                    # Same word as previous token
+                    label = labels[word_id]
+                    # If the label is B-XXX we change it to I-XXX
+                    if label % 2 == 1:
+                        label += 1
+                    """
+                    new_labels.append(-100)
+            results.append(new_labels)
+        tokenized_inputs['labels'] = results
+        return tokenized_inputs
+    def _sanitize_parameters(self, **kwargs):
+        preprocess_kwargs = {}
+        if "verb" in kwargs:
+            preprocess_kwargs["verb"] = kwargs["verb"]
+        return preprocess_kwargs, {}, {}
+    def preprocess(self, text):
+        self.text = text
+        doc = self.nlp(text.strip())
+        self.label_names = self.model.config.id2label
+        # Extract list with verbs from the text
+        self.verbs = [token.text for token in doc if token.pos_ == "VERB"]
+        results = []
+        tokenized_input = [token.text for token in doc]
+        raw_labels = [0] * len(tokenized_input)
+        for verb in self.verbs:
+            tokenized_results = self.tokenizer(
+                tokenized_input, [verb], truncation=True,
+                is_split_into_words=True,
+                return_tensors="pt", max_length=self.model.config.max_position_embeddings)
+            tokenized_results = self.align_labels_with_tokens(
+                tokenized_inputs=tokenized_results, all_labels=[raw_labels])
+            self.labels = tokenized_results["labels"]
+            # Remove labels temporarily to avoid conflicts in the forward pass
+            tokenized_results.pop("labels")
+            results.append(tokenized_results)
+        return results
+    def _forward(self, batch_inputs):
+        results = []
+        for entry in batch_inputs:
+            results.append(self.model(**entry))
+        return results
+    def postprocess(self, batch_outputs):
+        outputs = []
+        for i, entry in enumerate(batch_outputs):
+            logits = entry.logits
+            predictions = np.argmax(logits, axis=-1).squeeze().tolist()
+            true_predictions = []
+            for prediction, label in zip(predictions, self.labels[0]):
+                if label != -100:
+                    true_predictions.append(self.label_names[prediction])
+            outputs.append({
+                "tokens": self.text.split(),
+                "predictions": true_predictions,
+                "verb": self.verbs[i]
+            })
+        return outputs

tokenizer.json CHANGED Viewed

@@ -1,19 +1,7 @@
 {
   "version": "1.0",
-  "truncation": {
-    "direction": "Right",
-    "max_length": 512,
-    "strategy": "LongestFirst",
-    "stride": 0
-  },
-  "padding": {
-    "strategy": "BatchLongest",
-    "direction": "Right",
-    "pad_to_multiple_of": null,
-    "pad_id": 0,
-    "pad_type_id": 0,
-    "pad_token": "[PAD]"
-  },
   "added_tokens": [
     {
       "id": 0,
@@ -64,7 +52,7 @@
   "normalizer": null,
   "pre_tokenizer": {
     "type": "ByteLevel",
-    "add_prefix_space": true,
     "trim_offsets": true,
     "use_regex": true
   },

 {
   "version": "1.0",
+  "truncation": null,
+  "padding": null,
   "added_tokens": [
     {
       "id": 0,
   "normalizer": null,
   "pre_tokenizer": {
     "type": "ByteLevel",
+    "add_prefix_space": false,
     "trim_offsets": true,
     "use_regex": true
   },

tokenizer_config.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "add_bos_token": false,
-  "add_prefix_space": true,
   "added_tokens_decoder": {
     "0": {
       "content": "[PAD]",

 {
   "add_bos_token": false,
+  "add_prefix_space": false,
   "added_tokens_decoder": {
     "0": {
       "content": "[PAD]",