Disfluency-large

Sleeping

App Files Files Community

DD0101 committed on May 3, 2023

Commit

02758ba

•

1 Parent(s): 0d7455a

change preprocess() method of MyPipeline Class

Browse files

Files changed (1) hide show

app.py +21 -13

app.py CHANGED Viewed

@@ -11,20 +11,24 @@ os.mkdir('/home/user/app/vncorenlp')
 py_vncorenlp.download_model(save_dir='/home/user/app/vncorenlp')
 rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/home/user/app/vncorenlp')
 class MyPipeline(TokenClassificationPipeline):
-  def preprocess(self, sentence, offset_mapping=None):
-      truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
-      model_inputs = self.tokenizer(
           sentence,
           return_tensors=self.framework,
           truncation=truncation,
           return_special_tokens_mask=True,
           return_offsets_mapping=self.tokenizer.is_fast,
       )
-      length = len(model_inputs['input_ids'][0]) - 2
       tokens = self.tokenizer.tokenize(sentence)
       seek = 0
       offset_mapping_list = [[(0, 0)]]
@@ -37,13 +41,17 @@ class MyPipeline(TokenClassificationPipeline):
           seek += len(tokens[i]) + 1
       offset_mapping_list[0].append((0, 0))
-      # if offset_mapping:
-      #     model_inputs["offset_mapping"] = offset_mapping
-      model_inputs['offset_mapping'] = offset_mapping_list
-      model_inputs["sentence"] = sentence
-      return model_inputs
 model_checkpoint = "DD0101/disfluency-large"

 py_vncorenlp.download_model(save_dir='/home/user/app/vncorenlp')
 rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/home/user/app/vncorenlp')
+# I had to make some changes to the preprocess() method because Hugging Face changed some attributes
 class MyPipeline(TokenClassificationPipeline):
+  def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
+      tokenizer_params = preprocess_params.pop("tokenizer_params", {})
+      truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
+      inputs = self.tokenizer(
           sentence,
           return_tensors=self.framework,
           truncation=truncation,
           return_special_tokens_mask=True,
           return_offsets_mapping=self.tokenizer.is_fast,
+          **tokenizer_params,
       )
+      inputs.pop("overflow_to_sample_mapping", None)
+      num_chunks = len(inputs["input_ids"])
+      # Override preprocess method with these offset_mapping lines
+      length = len(inputs['input_ids'][0]) - 2
       tokens = self.tokenizer.tokenize(sentence)
       seek = 0
       offset_mapping_list = [[(0, 0)]]
           seek += len(tokens[i]) + 1
       offset_mapping_list[0].append((0, 0))
+      for i in range(num_chunks):
+          if self.framework == "tf":
+              model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
+          else:
+              model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}
+          model_inputs['offset_mapping'] = offset_mapping_list
+          model_inputs["sentence"] = sentence if i == 0 else None
+          model_inputs["is_last"] = i == num_chunks - 1
+          yield model_inputs
 model_checkpoint = "DD0101/disfluency-large"