DD0101 committed
Commit 6ee777c (parent: deccc68)

Changes in the preprocess() method of class MyPipeline (following Hugging Face updates)
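(Background, inferred from the diff and recent transformers releases rather than stated in the commit: TokenClassificationPipeline now processes its input in chunks, so preprocess() is expected to be a generator that yields one model-input dict per chunk, carrying the sentence on the first chunk and an is_last flag, instead of returning a single dict.)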

Files changed (1): app.py (+21 -13)
app.py CHANGED
@@ -11,20 +11,24 @@ os.mkdir('/home/user/app/vncorenlp')
 py_vncorenlp.download_model(save_dir='/home/user/app/vncorenlp')
 rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/home/user/app/vncorenlp')
 
+# I have to make some changes to the preprocess() method since Hugging Face changed some of its attributes
 class MyPipeline(TokenClassificationPipeline):
-    def preprocess(self, sentence, offset_mapping=None):
-        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
-
-        model_inputs = self.tokenizer(
+    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
+        tokenizer_params = preprocess_params.pop("tokenizer_params", {})
+        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
+        inputs = self.tokenizer(
             sentence,
             return_tensors=self.framework,
             truncation=truncation,
             return_special_tokens_mask=True,
             return_offsets_mapping=self.tokenizer.is_fast,
+            **tokenizer_params,
         )
+        inputs.pop("overflow_to_sample_mapping", None)
+        num_chunks = len(inputs["input_ids"])
 
-
-        length = len(model_inputs['input_ids'][0]) - 2
+        # Rebuild offset_mapping from the word-segmented tokens below
+        length = len(inputs['input_ids'][0]) - 2
         tokens = self.tokenizer.tokenize(sentence)
         seek = 0
         offset_mapping_list = [[(0, 0)]]
@@ -37,15 +41,19 @@ class MyPipeline(TokenClassificationPipeline):
             seek += len(tokens[i]) + 1
         offset_mapping_list[0].append((0, 0))
 
-        # if offset_mapping:
-        #     model_inputs["offset_mapping"] = offset_mapping
-
-        model_inputs['offset_mapping'] = offset_mapping_list
-        model_inputs["sentence"] = sentence
+        for i in range(num_chunks):
+            if self.framework == "tf":
+                model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
+            else:
+                model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}
+
+            model_inputs['offset_mapping'] = offset_mapping_list
+            model_inputs["sentence"] = sentence if i == 0 else None
+            model_inputs["is_last"] = i == num_chunks - 1
 
-        return model_inputs
+            yield model_inputs
 
-model_checkpoint = "DD0101/disfluency-base"
+model_checkpoint = "DD0101/disfluency-large"
 
 my_classifier = pipeline(
     "token-classification", model=model_checkpoint, aggregation_strategy="simple", pipeline_class=MyPipeline)