DD0101 committed
Commit 6ee777c (parent: deccc68)

Changes in the preprocess() method of class MyPipeline (following Hugging Face updates)
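(Background, inferred from the diff and recent transformers releases rather than stated in the commit: TokenClassificationPipeline now processes its input in chunks, so preprocess() is expected to be a generator that yields one model-input dict per chunk, carrying the sentence on the first chunk and an is_last flag, instead of returning a single dict.)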

Files changed (1): app.py (+21 -13)
app.py CHANGED
@@ -11,20 +11,24 @@ os.mkdir('/home/user/app/vncorenlp')
 py_vncorenlp.download_model(save_dir='/home/user/app/vncorenlp')
 rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/home/user/app/vncorenlp')
 
+# I have to make some changes to the preprocess() method since Hugging Face changed some of its attributes
 class MyPipeline(TokenClassificationPipeline):
-    def preprocess(self, sentence, offset_mapping=None):
-        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
-
-        model_inputs = self.tokenizer(
+    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
+        tokenizer_params = preprocess_params.pop("tokenizer_params", {})
+        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
+        inputs = self.tokenizer(
             sentence,
             return_tensors=self.framework,
             truncation=truncation,
             return_special_tokens_mask=True,
             return_offsets_mapping=self.tokenizer.is_fast,
+            **tokenizer_params,
         )
+        inputs.pop("overflow_to_sample_mapping", None)
+        num_chunks = len(inputs["input_ids"])
 
-
-        length = len(model_inputs['input_ids'][0]) - 2
+        # Rebuild offset_mapping from the word-segmented tokens below
+        length = len(inputs['input_ids'][0]) - 2
         tokens = self.tokenizer.tokenize(sentence)
         seek = 0
         offset_mapping_list = [[(0, 0)]]
@@ -37,15 +41,19 @@ class MyPipeline(TokenClassificationPipeline):
             seek += len(tokens[i]) + 1
         offset_mapping_list[0].append((0, 0))
 
-        # if offset_mapping:
-        #     model_inputs["offset_mapping"] = offset_mapping
-
-        model_inputs['offset_mapping'] = offset_mapping_list
-        model_inputs["sentence"] = sentence
+        for i in range(num_chunks):
+            if self.framework == "tf":
+                model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
+            else:
+                model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}
+
+            model_inputs['offset_mapping'] = offset_mapping_list
+            model_inputs["sentence"] = sentence if i == 0 else None
+            model_inputs["is_last"] = i == num_chunks - 1
 
-        return model_inputs
+            yield model_inputs
 
-model_checkpoint = "DD0101/disfluency-base"
+model_checkpoint = "DD0101/disfluency-large"
 
 my_classifier = pipeline(
     "token-classification", model=model_checkpoint, aggregation_strategy="simple", pipeline_class=MyPipeline)