Shaltiel committed on
Commit
c397e3d
•
1 Parent(s): 406f5ef

Upload 2 files

Browse files
Files changed (2) hide show
  1. BertForJointParsing.py +26 -12
  2. BertForSyntaxParsing.py +1 -1
BertForJointParsing.py CHANGED
@@ -186,13 +186,15 @@ class BertForJointParsing(BertPreTrainedModel):
186
  morph_logits=morph_logits
187
  )
188
 
189
- def predict(self, sentences: Union[str, List[str]], tokenizer: BertTokenizerFast, padding='longest', truncation=True, compute_syntax_mst=True, per_token_ner=False, as_iahlt_ud=False, as_htb_ud=False):
190
  is_single_sentence = isinstance(sentences, str)
191
  if is_single_sentence:
192
  sentences = [sentences]
193
 
194
- if (as_htb_ud or as_iahlt_ud) and (self.prefix is None or self.morph is None or self.syntax is None or self.lex is None):
195
- raise ValueError("Cannot output UD format when any of the prefix,morph,syntax,lex heads aren't loaded.")
 
 
196
 
197
  # predict the logits for the sentence
198
  if self.prefix is not None:
@@ -233,8 +235,8 @@ class BertForJointParsing(BertPreTrainedModel):
233
  merge_token_list(final_output[sent_idx]['tokens'], map(itemgetter(1), parsed), 'ner')
234
  final_output[sent_idx]['ner_entities'] = aggregate_ner_tokens(parsed)
235
 
236
- if as_iahlt_ud or as_htb_ud:
237
- final_output = convert_output_to_ud(final_output, htb_extras=as_htb_ud)
238
 
239
  if is_single_sentence:
240
  final_output = final_output[0]
@@ -339,7 +341,10 @@ ud_suffix_to_htb_str = {
339
  'Gender=Fem|Number=Sing|Person=2': '_ืืช',
340
  'Gender=Masc|Number=Plur|Person=3': '_ื”ื'
341
  }
342
- def convert_output_to_ud(output_sentences, htb_extras=False):
 
 
 
343
  final_output = []
344
  for sent_idx, sentence in enumerate(output_sentences):
345
  # next, go through each word and insert it in the UD format. Store in a temp format for the post process
@@ -363,9 +368,9 @@ def convert_output_to_ud(output_sentences, htb_extras=False):
363
 
364
  # if there was an implicit heh, add it in dependent on the method
365
  if not 'ื”' in pre and intermediate_output[-1]['pos'] == 'ADP' and 'DET' in word['morph']['prefixes']:
366
- if htb_extras:
367
  intermediate_output.append(dict(word='ื”_', lex='ื”', pos='DET', dep=word_idx, func='det', feats='_'))
368
- else:
369
  intermediate_output[-1]['feats'] = 'Definite=Def|PronType=Art'
370
 
371
 
@@ -394,7 +399,7 @@ def convert_output_to_ud(output_sentences, htb_extras=False):
394
  s_word, s_lex = word['seg'][-1], word['lex']
395
  # update the word of the string and extract the string of the suffix!
396
  # for IAHLT:
397
- if not htb_extras:
398
  # we need to shorten the main word and extract the suffix
399
  # if it is longer than the lexeme - just take off the lexeme.
400
  if len(s_word) > len(s_lex):
@@ -407,7 +412,7 @@ def convert_output_to_ud(output_sentences, htb_extras=False):
407
  suf = s_word[idx:]
408
  intermediate_output[-1]['word'] = s_word[:idx]
409
  # for htb:
410
- else:
411
  # main word becomes the lexeme, the suffix is based on the features
412
  intermediate_output[-1]['word'] = (s_lex if s_lex != s_word else s_word[:-1]) + '_'
413
  suf_feats = word['morph']['suffix_feats']
@@ -438,6 +443,7 @@ def convert_output_to_ud(output_sentences, htb_extras=False):
438
  for idx,output in enumerate(intermediate_output[start:end], start + 1):
439
  # compute the actual dependency location
440
  dep = output['dep'] if output.get('absolute_dep', False) else idx_to_key[output['dep']]
 
441
  # and add the full ud string in
442
  cur_output.append('\t'.join([
443
  str(idx),
@@ -447,12 +453,20 @@ def convert_output_to_ud(output_sentences, htb_extras=False):
447
  output['pos'],
448
  output['feats'],
449
  str(dep),
450
- output['func'],
451
  '_', '_'
452
  ]))
453
  return final_output
454
 
455
-
 
 
 
 
 
 
 
 
456
  def ud_get_prefix_dep(pre, word, word_idx):
457
  does_follow_main = False
458
 
 
186
  morph_logits=morph_logits
187
  )
188
 
189
+ def predict(self, sentences: Union[str, List[str]], tokenizer: BertTokenizerFast, padding='longest', truncation=True, compute_syntax_mst=True, per_token_ner=False, output_style: Literal['json', 'ud', 'iahlt_ud'] = 'json'):
190
  is_single_sentence = isinstance(sentences, str)
191
  if is_single_sentence:
192
  sentences = [sentences]
193
 
194
+ if output_style not in ['json', 'ud', 'iahlt_ud']:
195
+ raise ValueError('output_style must be in json/ud/iahlt_ud')
196
+ if output_style in ['ud', 'iahlt_ud'] and (self.prefix is None or self.morph is None or self.syntax is None or self.lex is None):
197
+ raise ValueError("Cannot output UD format when any of the prefix,morph,syntax, and lex heads aren't loaded.")
198
 
199
  # predict the logits for the sentence
200
  if self.prefix is not None:
 
235
  merge_token_list(final_output[sent_idx]['tokens'], map(itemgetter(1), parsed), 'ner')
236
  final_output[sent_idx]['ner_entities'] = aggregate_ner_tokens(parsed)
237
 
238
+ if output_style in ['ud', 'iahlt_ud']:
239
+ final_output = convert_output_to_ud(final_output, style='htb' if output_style == 'ud' else 'iahlt')
240
 
241
  if is_single_sentence:
242
  final_output = final_output[0]
 
341
  'Gender=Fem|Number=Sing|Person=2': '_ืืช',
342
  'Gender=Masc|Number=Plur|Person=3': '_ื”ื'
343
  }
344
+ def convert_output_to_ud(output_sentences, style: Literal['htb', 'iahlt']):
345
+ if style not in ['htb', 'iahlt']:
346
+ raise ValueError('style must be htb/iahlt')
347
+
348
  final_output = []
349
  for sent_idx, sentence in enumerate(output_sentences):
350
  # next, go through each word and insert it in the UD format. Store in a temp format for the post process
 
368
 
369
  # if there was an implicit heh, add it in dependent on the method
370
  if not 'ื”' in pre and intermediate_output[-1]['pos'] == 'ADP' and 'DET' in word['morph']['prefixes']:
371
+ if style == 'htb':
372
  intermediate_output.append(dict(word='ื”_', lex='ื”', pos='DET', dep=word_idx, func='det', feats='_'))
373
+ elif style == 'iahlt':
374
  intermediate_output[-1]['feats'] = 'Definite=Def|PronType=Art'
375
 
376
 
 
399
  s_word, s_lex = word['seg'][-1], word['lex']
400
  # update the word of the string and extract the string of the suffix!
401
  # for IAHLT:
402
+ if style == 'iahlt':
403
  # we need to shorten the main word and extract the suffix
404
  # if it is longer than the lexeme - just take off the lexeme.
405
  if len(s_word) > len(s_lex):
 
412
  suf = s_word[idx:]
413
  intermediate_output[-1]['word'] = s_word[:idx]
414
  # for htb:
415
+ elif style == 'htb':
416
  # main word becomes the lexeme, the suffix is based on the features
417
  intermediate_output[-1]['word'] = (s_lex if s_lex != s_word else s_word[:-1]) + '_'
418
  suf_feats = word['morph']['suffix_feats']
 
443
  for idx,output in enumerate(intermediate_output[start:end], start + 1):
444
  # compute the actual dependency location
445
  dep = output['dep'] if output.get('absolute_dep', False) else idx_to_key[output['dep']]
446
+ func = normalize_dep_rel(output['func'], style)
447
  # and add the full ud string in
448
  cur_output.append('\t'.join([
449
  str(idx),
 
453
  output['pos'],
454
  output['feats'],
455
  str(dep),
456
+ func,
457
  '_', '_'
458
  ]))
459
  return final_output
460
 
461
def normalize_dep_rel(dep, style: Literal['htb', 'iahlt']):
    """Normalize a dependency-relation label according to the output style.

    For the 'iahlt' style, HTB-specific subtyped relations are collapsed to
    their plain UD base labels; any other label passes through unchanged.
    For the 'htb' style the label is always returned as-is.
    """
    if style != 'iahlt':
        return dep
    # IAHLT does not use these HTB subtypes — map them to their base relation.
    collapsed = {
        'compound:smixut': 'compound',
        'nsubj:cop': 'nsubj',
        'mark:q': 'mark',
        'case:gen': 'case',
        'case:acc': 'case',
    }
    return collapsed.get(dep, dep)
468
+
469
+
470
  def ud_get_prefix_dep(pre, word, word_idx):
471
  does_follow_main = False
472
 
BertForSyntaxParsing.py CHANGED
@@ -6,7 +6,7 @@ from typing import Dict, List, Tuple, Optional, Union
6
  from dataclasses import dataclass
7
  from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast
8
 
9
- ALL_FUNCTION_LABELS = ["nsubj", "punct", "mark", "case", "fixed", "obl", "det", "amod", "acl:relcl", "nmod", "cc", "conj", "root", "compound", "cop", "compound:affix", "advmod", "nummod", "appos", "nsubj:pass", "nmod:poss", "xcomp", "obj", "aux", "parataxis", "advcl", "ccomp", "csubj", "acl", "obl:tmod", "csubj:pass", "dep", "dislocated", "nmod:tmod", "nmod:npmod", "flat", "obl:npmod", "goeswith", "reparandum", "orphan", "list", "discourse", "iobj", "vocative", "expl", "flat:name"]
10
 
11
  @dataclass
12
  class SyntaxLogitsOutput(ModelOutput):
 
6
  from dataclasses import dataclass
7
  from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast
8
 
9
# Inventory of dependency-relation labels emitted by the syntax head.
# NOTE(review): order presumably maps each label to a model class index —
# do not reorder; confirm against the head's config before changing.
ALL_FUNCTION_LABELS = [
    "nsubj", "nsubj:cop", "punct", "mark", "mark:q", "case", "case:gen",
    "case:acc", "fixed", "obl", "det", "amod", "acl:relcl", "nmod", "cc",
    "conj", "root", "compound:smixut", "cop", "compound:affix", "advmod",
    "nummod", "appos", "nsubj:pass", "nmod:poss", "xcomp", "obj", "aux",
    "parataxis", "advcl", "ccomp", "csubj", "acl", "obl:tmod", "csubj:pass",
    "dep", "dislocated", "nmod:tmod", "nmod:npmod", "flat", "obl:npmod",
    "goeswith", "reparandum", "orphan", "list", "discourse", "iobj",
    "vocative", "expl", "flat:name",
]
10
 
11
  @dataclass
12
  class SyntaxLogitsOutput(ModelOutput):