Shaltiel committed
Commit 7897807
1 Parent(s): f5f1fb9

Upload BertForSyntaxParsing.py

Files changed (1):
  1. BertForSyntaxParsing.py +38 -36
BertForSyntaxParsing.py CHANGED
@@ -2,11 +2,11 @@ import math
 from transformers.utils import ModelOutput
 import torch
 from torch import nn
-from typing import List, Tuple, Optional, Union
+from typing import Dict, List, Tuple, Optional, Union
 from dataclasses import dataclass
 from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast
 
-ALL_FUNCTION_LABELS = ["nsubj", "punct", "mark", "case", "fixed", "obl", "det", "amod", "acl:relcl", "nmod", "cc", "conj", "root", "compound", "cop", "compound:affix", "advmod", "nummod", "appos", "nsubj:pass", "nmod:poss", "xcomp", "obj", "aux", "parataxis", "advcl", "ccomp", "csubj", "acl", "obl:tmod", "csubj:pass", "dep", "dislocated", "nmod:tmod", "nmod:npmod", "flat", "obl:npmod", "goeswith", "reparandum", "orphan", "list", "discourse", "iobj", "vocative", "expl", "flat:name"]
+ALL_FUNCTION_LABELS = ["nsubj", "nsubj:cop", "punct", "mark", "mark:q", "case", "case:gen", "case:acc", "fixed", "obl", "det", "amod", "acl:relcl", "nmod", "cc", "conj", "root", "compound:smixut", "cop", "compound:affix", "advmod", "nummod", "appos", "nsubj:pass", "nmod:poss", "xcomp", "obj", "aux", "parataxis", "advcl", "ccomp", "csubj", "acl", "obl:tmod", "csubj:pass", "dep", "dislocated", "nmod:tmod", "nmod:npmod", "flat", "obl:npmod", "goeswith", "reparandum", "orphan", "list", "discourse", "iobj", "vocative", "expl", "flat:name"]
 
 @dataclass
 class SyntaxLogitsOutput(ModelOutput):
@@ -160,44 +160,46 @@ class BertForSyntaxParsing(BertPreTrainedModel):
         inputs = tokenizer(sentences, padding='longest', truncation=True, return_tensors='pt')
         inputs = {k:v.to(self.device) for k,v in inputs.items()}
         logits = self.forward(**inputs, return_dict=True, compute_syntax_mst=compute_mst).logits
-
-        outputs = []
-        for i in range(len(sentences)):
-            deps = logits.dependency_head_indices[i].tolist()
-            funcs = logits.function_logits.argmax(-1)[i].tolist()
-            toks = tokenizer.convert_ids_to_tokens(inputs['input_ids'][i])[1:-1] # ignore cls and sep
-
-            # first, create a mapping between each dependency index and the corresponding word index without wordpieces
-            idx_mapping = {-1:-1} # default root
-            real_idx = -1
-            for i in range(len(toks)):
-                if not toks[i].startswith('##'):
-                    real_idx += 1
-                idx_mapping[i] = real_idx
-
-            # build our tree, keeping track of the root idx; wordpieces are merged into the preceding token
-            tree = []
-            root_idx = 0
-            for i in range(len(toks)):
-                if toks[i].startswith('##'):
-                    tree[-1]['word'] += toks[i][2:]
-                    continue
-
-                dep_idx = deps[i + 1] - 1 # add 1 to skip [CLS] when indexing deps, subtract 1 to drop [CLS] from the head index
-                dep_head = 'root' if dep_idx == -1 else toks[dep_idx]
-                dep_func = ALL_FUNCTION_LABELS[funcs[i + 1]]
-
-                if dep_head == 'root': root_idx = len(tree)
-                tree.append(dict(word=toks[i], dep_head_idx=idx_mapping[dep_idx], dep_func=dep_func))
-            # fill in the head word for each entry
-            for d in tree:
-                d['dep_head'] = tree[d['dep_head_idx']]['word']
-
-            outputs.append(dict(tree=tree, root_idx=root_idx))
-        return outputs
+        return parse_logits(inputs, sentences, tokenizer, logits)
+
+def parse_logits(inputs: Dict[str, torch.Tensor], sentences: List[str], tokenizer: BertTokenizerFast, logits: SyntaxLogitsOutput):
+    outputs = []
+    for i in range(len(sentences)):
+        deps = logits.dependency_head_indices[i].tolist()
+        funcs = logits.function_logits.argmax(-1)[i].tolist()
+        toks = tokenizer.convert_ids_to_tokens(inputs['input_ids'][i])[1:-1] # ignore cls and sep
+
+        # first, create a mapping between each dependency index and the corresponding word index without wordpieces
+        idx_mapping = {-1:-1} # default root
+        real_idx = -1
+        for i in range(len(toks)):
+            if not toks[i].startswith('##'):
+                real_idx += 1
+            idx_mapping[i] = real_idx
+
+        # build our tree, keeping track of the root idx; wordpieces are merged into the preceding token
+        tree = []
+        root_idx = 0
+        for i in range(len(toks)):
+            if toks[i].startswith('##'):
+                tree[-1]['word'] += toks[i][2:]
+                continue
+
+            dep_idx = deps[i + 1] - 1 # add 1 to skip [CLS] when indexing deps, subtract 1 to drop [CLS] from the head index
+            dep_head = 'root' if dep_idx == -1 else toks[dep_idx]
+            dep_func = ALL_FUNCTION_LABELS[funcs[i + 1]]
+
+            if dep_head == 'root': root_idx = len(tree)
+            tree.append(dict(word=toks[i], dep_head_idx=idx_mapping[dep_idx], dep_func=dep_func))
+        # fill in the head word for each entry
+        for d in tree:
+            d['dep_head'] = tree[d['dep_head_idx']]['word']
+
+        outputs.append(dict(tree=tree, root_idx=root_idx))
+    return outputs
 
 def compute_mst_tree(attention_scores: torch.Tensor):
     # attention scores should be 3 dimensions - batch x seq x seq (if it is 2 - just unsqueeze)
     if attention_scores.ndim == 2: attention_scores = attention_scores.unsqueeze(0)
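The main change in this commit is that the tree-building logic moves out of predict into a module-level parse_logits function, so it can be reused on logits computed elsewhere. A minimal usage sketch, assuming a checkpoint that ships this module via trust_remote_code; the checkpoint name and sentence are placeholders, and predict's signature is inferred from its body in the diff:

from transformers import AutoModel, AutoTokenizer

# placeholder repo name -- substitute a checkpoint that bundles BertForSyntaxParsing.py
NAME = 'org/syntax-parsing-checkpoint'
tokenizer = AutoTokenizer.from_pretrained(NAME)
model = AutoModel.from_pretrained(NAME, trust_remote_code=True)

# predict() tokenizes, runs forward(), and (after this commit) delegates to parse_logits()
outputs = model.predict(['This is a sample sentence.'], tokenizer, compute_mst=True)

# each tree entry holds word, dep_head_idx, dep_func, plus the resolved dep_head word
for entry in outputs[0]['tree']:
    print(entry['word'], '->', entry['dep_head'], f"({entry['dep_func']})")

Since parse_logits is now a free function, the same post-processing can also be applied to a cached forward pass via parse_logits(inputs, sentences, tokenizer, logits).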
 
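The subtlest part of parse_logits is the wordpiece bookkeeping: heads are predicted per wordpiece, with [CLS] at index 0, while the output tree is per word. Below is a standalone sketch of the idx_mapping loop from the diff, run on a made-up token list (no model needed):

toks = ['the', 'quick', 'fo', '##x', 'jumps']  # hypothetical wordpieces, [CLS]/[SEP] already stripped

idx_mapping = {-1: -1}  # -1 is the artificial root, which maps to itself
real_idx = -1
for i in range(len(toks)):
    if not toks[i].startswith('##'):
        real_idx += 1  # a new word starts at this wordpiece
    idx_mapping[i] = real_idx

print(idx_mapping)  # {-1: -1, 0: 0, 1: 1, 2: 2, 3: 2, 4: 3}
# '##x' (wordpiece 3) maps to word 2, so a head prediction pointing at either
# piece of 'fox' resolves to the same dep_head_idx in the merged word list.

The second loop in parse_logits then glues the '##' pieces onto the previous word and uses idx_mapping to translate each predicted head into a dep_head_idx over the merged words.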