ClementRomac (HF staff) committed
Commit 327431a
1 Parent(s): af34ccc

Upload processor

Files changed (1)
  1. processor.py +28 -13
processor.py CHANGED
@@ -6,24 +6,39 @@ class GIAProcessor(GitProcessor):
         super().__init__(image_processor, tokenizer)
         self._block_size = 1024
 
-    def _group_texts(self, examples):
-        # Concatenate all texts.
-        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
-        total_length = len(concatenated_examples[list(examples.keys())[0]])
-        # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
-        # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
-        total_length = (total_length // self._block_size) * self._block_size
-        # Split by chunks of max_len.
-        result = {
-            k: [t[i: i + self._block_size] for i in range(0, total_length, self._block_size)]
-            for k, t in concatenated_examples.items()
+    def _cut_text(self, examples):
+        results = {
+            "input_ids": [],
+            "attention_mask": []
         }
-        return result
+        for i in range(len(examples["input_ids"])):
+            _input_size = len(examples["input_ids"][i])
+            for j in range(_input_size // self._block_size):
+                results["input_ids"].append(examples["input_ids"][i][j*self._block_size:(j+1)*self._block_size])
+                results["attention_mask"].append(examples["attention_mask"][i][j * self._block_size:(j + 1) * self._block_size])
+
+        return results
+
+    # def _group_texts(self, examples):
+    #     # Concatenate all texts.
+    #     concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+    #     total_length = len(concatenated_examples[list(examples.keys())[0]])
+    #     # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
+    #     # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
+    #     if total_length > self._block_size:
+    #         total_length = (total_length // self._block_size) * self._block_size
+    #
+    #     # Split by chunks of max_len.
+    #     result = {
+    #         k: [t[i: i + self._block_size] for i in range(0, total_length, self._block_size)]
+    #         for k, t in concatenated_examples.items()
+    #     }
+    #     return result
+
 
     def __call__(self, examples, return_tensors=None, **kwargs):
         if "text" in examples and not "images" in examples:
             encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
-            encoding = self._group_texts(encoded_text)
+            encoding = self._cut_text(encoded_text)
         elif "text" in examples and "images" in examples:
             encoding = super().__call__(examples["text"], examples["images"], return_tensors, **kwargs)
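For reference, here is a minimal standalone sketch of the new chunking behaviour. The cut_text helper and the fake_encoding dict below are invented for illustration; only the 1024 block size and the input_ids / attention_mask layout mirror the _cut_text method in the diff above.

# Standalone sketch of the chunking done by GIAProcessor._cut_text (illustration only).
block_size = 1024

def cut_text(examples, block_size=block_size):
    # Same structure as the method above: each example is split on its own.
    results = {"input_ids": [], "attention_mask": []}
    for i in range(len(examples["input_ids"])):
        input_size = len(examples["input_ids"][i])
        # Only full blocks are kept; the tail shorter than block_size
        # (and any example shorter than block_size) is dropped.
        for j in range(input_size // block_size):
            results["input_ids"].append(examples["input_ids"][i][j * block_size:(j + 1) * block_size])
            results["attention_mask"].append(examples["attention_mask"][i][j * block_size:(j + 1) * block_size])
    return results

# Invented "tokenizer output": one 2500-token example and one 300-token example.
fake_encoding = {
    "input_ids": [list(range(2500)), list(range(300))],
    "attention_mask": [[1] * 2500, [1] * 300],
}

chunks = cut_text(fake_encoding)
print(len(chunks["input_ids"]))               # 2: two full 1024-token blocks from the first example
print([len(c) for c in chunks["input_ids"]])  # [1024, 1024]: the 452-token tail and the whole 300-token example are dropped

The practical difference from the commented-out _group_texts is visible in the diff: the old code concatenated all examples (chain(*examples[k])) and chunked the single long sequence, so only the end of the whole batch was truncated, while _cut_text chunks each example independently, so every example loses its own remainder shorter than 1024 tokens.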