arubenruben
/

NER-PT-BERT-CRF-HAREM-Default

@@ -1,13 +1,22 @@
 {
-  "_name_or_path": "/notebooks/src/hugging_face_pipeline/BERT-CRF/out/model",
   "architectures": [
     "BERT_CRF"
   ],
   "auto_map": {
-    "AutoConfig": "model.BERT_CRF_Config",
-    "AutoModelForTokenClassification": "model.BERT_CRF"
   },
   "bert_name": "neuralmind/bert-large-portuguese-cased",
   "id2label": {
     "0": "O",
     "1": "B-PESSOA",

 {
+  "_name_or_path": "arubenruben/PT-BERT-Large-CRF-HAREM-Default",
   "architectures": [
     "BERT_CRF"
   ],
   "auto_map": {
+    "AutoConfig": "arubenruben/PT-BERT-Large-CRF-HAREM-Default--model.BERT_CRF_Config",
+    "AutoModelForTokenClassification": "arubenruben/PT-BERT-Large-CRF-HAREM-Default--model.BERT_CRF"
   },
   "bert_name": "neuralmind/bert-large-portuguese-cased",
+  "custom_pipelines": {
+    "arubenruben/PT-BERT-Large-CRF-HAREM-Default-pipeline": {
+      "impl": "deploy_pipeline.BERT_CRF_Pipeline",
+      "pt": [
+        "AutoModelForTokenClassification"
+      ],
+      "tf": []
+    }
+  },
   "id2label": {
     "0": "O",
     "1": "B-PESSOA",

deploy_pipeline.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import torch
+from transformers import Pipeline
+from transformers import AutoTokenizer
+from transformers.pipelines import PIPELINE_REGISTRY
+from transformers import pipeline
+from transformers import AutoModelForTokenClassification
+from huggingface_hub import Repository
+import sys
+import os
+class TokenizeAndAlignLabelsStep():
+    # Adapted From : https://huggingface.co/docs/transformers/tasks/token_classification
+    def tokenize_and_align_labels(self, examples, tokenizer):
+        tokenized_inputs = tokenizer(examples, padding='max_length', max_length=512)
+        # Map tokens to their respective word.
+        word_ids = tokenized_inputs.word_ids()
+        previous_word_idx = None
+        labels_mask = []
+        for word_idx in word_ids:  # Set the special tokens to -100.
+            if word_idx is None:
+                labels_mask.append(False)
+            # Only label the first token of a given word.
+            elif word_idx != previous_word_idx:
+                labels_mask.append(True)
+            else:
+                labels_mask.append(False)
+            previous_word_idx = word_idx
+        tokenized_inputs["tokens"] = examples
+        tokenized_inputs["ner_tags"] = []
+        tokenized_inputs["labels"] = []
+        tokenized_inputs["labels_mask"] = labels_mask
+        return tokenized_inputs
+class BERT_CRF_Pipeline(Pipeline):
+    def _sanitize_parameters(self, **kwargs):
+        return {}, {}, {}
+    def preprocess(self, text):
+        tokenizer = AutoTokenizer.from_pretrained(
+            "neuralmind/bert-base-portuguese-cased", do_lower_case=False)
+        TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
+            examples=text, tokenizer=tokenizer)
+        return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(examples=text, tokenizer=tokenizer)
+    def _forward(self, tokenizer_results):
+        input_ids = torch.tensor(
+            tokenizer_results['input_ids'], dtype=torch.long).unsqueeze(0)
+        token_type_ids = torch.tensor(
+            tokenizer_results['token_type_ids'], dtype=torch.long).unsqueeze(0)
+        attention_mask = torch.tensor(
+            tokenizer_results['attention_mask'], dtype=torch.bool).unsqueeze(0)
+        labels_mask = torch.tensor(
+            tokenizer_results['labels_mask'], dtype=torch.bool).unsqueeze(0)
+        # input_ids, token_type_ids, attention_mask, labels, labels_mask
+        outputs = self.model(input_ids=input_ids, token_type_ids=token_type_ids,
+                             attention_mask=attention_mask, labels=None, labels_mask=labels_mask)
+        return outputs
+    def postprocess(self, model_outputs):
+        # From Ner_tags to Ner_labels
+        for i, label in enumerate(model_outputs[0]):
+            model_outputs[0][i] = self.model.config.id2label[label]
+        return model_outputs[0]
+def main():
+    PIPELINE_REGISTRY.register_pipeline("arubenruben/PT-BERT-Large-CRF-HAREM-Default-pipeline",
+                                        pipeline_class=BERT_CRF_Pipeline,
+                                        pt_model=AutoModelForTokenClassification,
+                                        )
+    classifier = pipeline("arubenruben/PT-BERT-Large-CRF-HAREM-Default-pipeline", model="arubenruben/PT-BERT-Large-CRF-HAREM-Default",
+                          device='cuda' if torch.cuda.is_available() else 'cpu', trust_remote_code=True)
+    out_path = os.path.join(sys.path[0], 'out', 'pipeline')
+    repo = Repository(
+        out_path, clone_from=f"arubenruben/PT-BERT-Large-CRF-HAREM-Default", use_auth_token=True)
+    # repo.git_pull()
+    classifier.save_pretrained(out_path)
+    repo.push_to_hub()