Commit 327431a
Parent(s): af34ccc
Upload processor

processor.py  CHANGED  (+28 -13)
@@ -6,24 +6,39 @@ class GIAProcessor(GitProcessor):
         super().__init__(image_processor, tokenizer)
         self._block_size = 1024
 
-    def _group_texts(self, examples):
-        # Concatenate all texts.
-        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
-        total_length = len(concatenated_examples[list(examples.keys())[0]])
-        # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
-        # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
-        total_length = (total_length // self._block_size) * self._block_size
-        # Split by chunks of max_len.
-        result = {
-            k: [t[i: i + self._block_size] for i in range(0, total_length, self._block_size)]
-            for k, t in concatenated_examples.items()
+    def _cut_text(self, examples):
+        results = {
+            "input_ids": [],
+            "attention_mask": []
         }
-        return result
+        for i in range(len(examples["input_ids"])):
+            _input_size = len(examples["input_ids"][i])
+            for j in range(_input_size // self._block_size):
+                results["input_ids"].append(examples["input_ids"][i][j*self._block_size:(j+1)*self._block_size])
+                results["attention_mask"].append(examples["attention_mask"][i][j * self._block_size:(j + 1) * self._block_size])
+
+        return results
+
+    # def _group_texts(self, examples):
+    #     # Concatenate all texts.
+    #     concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+    #     total_length = len(concatenated_examples[list(examples.keys())[0]])
+    #     # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
+    #     # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
+    #     if total_length > self._block_size:
+    #         total_length = (total_length // self._block_size) * self._block_size
+    #
+    #     # Split by chunks of max_len.
+    #     result = {
+    #         k: [t[i: i + self._block_size] for i in range(0, total_length, self._block_size)]
+    #         for k, t in concatenated_examples.items()
+    #     }
+    #     return result
 
     def __call__(self, examples, return_tensors=None, **kwargs):
         if "text" in examples and not "images" in examples:
            encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
-            encoding = self._group_texts(encoded_text)
+            encoding = self._cut_text(encoded_text)
         elif "text" in examples and "images" in examples:
             encoding = super().__call__(examples["text"], examples["images"], return_tensors, **kwargs)
 
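
For context, a minimal standalone sketch of the chunking behaviour this commit introduces with _cut_text (the sketch and its names are illustrative, not part of the repository): each tokenized example is sliced independently into non-overlapping blocks of _block_size (1024) tokens, and any trailing remainder shorter than a full block is dropped, so an example shorter than 1024 tokens contributes nothing. The commented-out _group_texts, by contrast, first concatenated all examples into one token stream before chunking, so blocks could span example boundaries and short examples were still used.

# Standalone sketch of the block-chunking introduced by _cut_text (illustrative only).
# `examples` is assumed to look like the batched output of a Hugging Face tokenizer:
# a dict with per-example lists of token ids and attention-mask values.

def cut_text(examples, block_size=1024):
    results = {"input_ids": [], "attention_mask": []}
    for ids, mask in zip(examples["input_ids"], examples["attention_mask"]):
        # Keep only full blocks; the trailing remainder (< block_size tokens) is dropped.
        for j in range(len(ids) // block_size):
            results["input_ids"].append(ids[j * block_size:(j + 1) * block_size])
            results["attention_mask"].append(mask[j * block_size:(j + 1) * block_size])
    return results

# Example: a 2500-token example yields two 1024-token blocks (452 tokens dropped);
# an 800-token example is shorter than one block and is dropped entirely.
batch = {
    "input_ids": [list(range(2500)), list(range(800))],
    "attention_mask": [[1] * 2500, [1] * 800],
}
chunks = cut_text(batch)
print(len(chunks["input_ids"]), len(chunks["input_ids"][0]))  # 2 1024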