vonewman committed on
Commit
fe3337c
·
1 Parent(s): f5b6e30

add align_word_ids

Browse files
Files changed (1) hide show
  1. app.py +29 -0
app.py CHANGED
@@ -26,6 +26,35 @@ def load_model():
26
  tokenizer = AutoTokenizer.from_pretrained("vonewman/wolof-finetuned-ner")
27
  return trainer, model, tokenizer
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def predict_ner_labels(model, tokenizer, sentence):
30
  use_cuda = torch.cuda.is_available()
31
  device = torch.device("cuda" if use_cuda else "cpu")
 
26
  tokenizer = AutoTokenizer.from_pretrained("vonewman/wolof-finetuned-ner")
27
  return trainer, model, tokenizer
28
 
29
def align_word_ids(texts, label_all_tokens=None, max_length=218):
    """Build a per-token label mask for ``texts`` from the tokenizer's word ids.

    ``texts`` is tokenized with the module-level ``tokenizer`` (padded and
    truncated to ``max_length``), and one integer is emitted per token:

    * ``-100`` for tokens that map to no word (word id ``None``);
    * ``1`` for the first token of each word;
    * ``1`` for the remaining tokens of a word when ``label_all_tokens`` is
      truthy, otherwise ``-100``.

    Args:
        texts: Input passed straight to ``tokenizer``.
            NOTE(review): ``word_ids()`` is called without a batch index, so
            presumably only the first sequence is used if a batch is given;
            confirm against the caller.
        label_all_tokens: Whether continuation sub-tokens get label ``1``.
            When ``None`` (the default) the module-level ``label_all_tokens``
            is used if it exists, otherwise ``False``.
        max_length: Padding/truncation length handed to the tokenizer.

    Returns:
        A list of ints (``1`` or ``-100``), one per token position.
    """
    if label_all_tokens is None:
        # The original read a free name inside a bare ``except`` and fell back
        # to -100 when it was undefined; do that lookup explicitly so real
        # errors are no longer swallowed.
        label_all_tokens = globals().get("label_all_tokens", False)

    tokenized_inputs = tokenizer(
        texts, padding="max_length", max_length=max_length, truncation=True
    )
    word_ids = tokenized_inputs.word_ids()

    # -100 is presumably the loss "ignore" index -- TODO confirm with training code.
    continuation_label = 1 if label_all_tokens else -100

    label_ids = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            # Token belongs to no word (word id is None).
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            # First token of a new word.
            label_ids.append(1)
        else:
            # Further sub-token of the same word.
            label_ids.append(continuation_label)
        previous_word_idx = word_idx

    return label_ids
56
+
57
+
58
  def predict_ner_labels(model, tokenizer, sentence):
59
  use_cuda = torch.cuda.is_available()
60
  device = torch.device("cuda" if use_cuda else "cpu")