import gzip
import json
import random
import shutil

import numpy as np
import torch


class TransformersTokenizerWrapper:
    """Callable adapter that turns a batch of texts into lists of token strings.

    The wrapped object is called with the batch and must return a mapping
    with an ``"input_ids"`` entry; it must also provide
    ``_convert_id_to_token`` and ``convert_tokens_to_string``.
    """

    def __init__(self, tokenizer):
        # Kept under the original attribute name so external code reading
        # ``wrapper.T`` keeps working.
        self.T = tokenizer

    def __call__(self, texts):
        """Tokenize ``texts`` and return one list of token strings per text.

        The first and last token of every sequence are dropped and each
        remaining token is converted back to a string and stripped of
        surrounding whitespace.
        """
        token_ids_batch = self.T(texts)["input_ids"]
        tokens_batch = [
            [self.T._convert_id_to_token(token_id) for token_id in ids]
            for ids in token_ids_batch
        ]
        # tokens[1:-1] removes the first and last position of each sequence
        # (presumably the special begin/end tokens -- confirm for the
        # tokenizer in use).
        tokens_batch = [
            [self.T.convert_tokens_to_string(t).strip() for t in tokens[1:-1]]
            for tokens in tokens_batch
        ]
        return tokens_batch


def set_random_seed(seed):
    """Seed the torch, ``random`` and numpy random number generators."""
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)


def ask_rmdir(dir):
    """Interactively ask for confirmation, then recursively delete ``dir``.

    The directory is removed only when the user types exactly ``yes``.
    """
    val = input(
        f"WARNING: Proceed with deleting this directory: {dir} ? (yes|no) "
    )
    if val == "yes":
        shutil.rmtree(dir)


def load_numpy(path):
    """Load and return the object stored at ``path`` with ``np.load``."""
    with open(path, "rb") as f:
        x = np.load(f)
    return x


def save_numpy(x, path):
    """Store ``x`` at ``path`` with ``np.save``."""
    with open(path, "wb") as f:
        np.save(f, x)


def batchify(items, batch_size):
    """Yield consecutive slices of ``items`` holding up to ``batch_size`` elements.

    ``items`` must support ``len()`` and slicing; the last batch may be
    shorter than ``batch_size``.
    """
    for i in range(0, len(items), batch_size):
        yield items[i:i + batch_size]


def move_generator(items, idx):
    """Advance the iterator ``items`` in place so that ``idx`` elements are skipped.

    Nothing is consumed when ``idx`` is 0. The effect is only visible to the
    caller when ``items`` is an iterator/generator; for a re-iterable such as
    a list a fresh iterator is advanced and discarded.
    """
    if idx == 0:
        return
    for i, _ in enumerate(items):
        # i is zero-based, so element number ``idx`` has index ``idx - 1``.
        if i >= idx - 1:
            break


def read_json(path):
    """Parse the JSON document stored at ``path`` and return the result."""
    # Explicit UTF-8: JSON files are UTF-8 by convention, while open()'s
    # default depends on the platform locale.
    with open(path, encoding="utf-8") as f:
        obj = json.load(f)
    return obj


def write_json(obj, path):
    """Serialize ``obj`` as JSON into ``path``, overwriting existing content."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f)


def write_jsonl(items, path, mode):
    """Write ``items`` to ``path`` as JSON Lines (one JSON document per line).

    Args:
        items: iterable of JSON-serializable objects.
        path: destination file.
        mode: text mode passed to ``open`` (e.g. ``"w"`` or ``"a"``).

    An empty ``items`` writes nothing. Previously a lone newline was emitted
    in that case, leaving a blank line that ``read_jsonl`` could not parse.
    """
    with open(path, mode, encoding="utf-8") as f:
        # Serialize everything before writing so a non-serializable item
        # does not leave a partially written batch behind.
        lines = [json.dumps(x) + "\n" for x in items]
        f.writelines(lines)


def read_jsonl(path):
    """Lazily yield one parsed JSON object per non-blank line of ``path``."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. produced by appending empty batches
            # with older versions of write_jsonl).
            if not line.strip():
                continue
            yield json.loads(line)


def read_jsonl_gz(path):
    """Lazily yield one parsed JSON object per non-blank line of a gzip file.

    The file is read in binary mode; ``json.loads`` accepts the raw bytes.
    """
    with gzip.open(path) as f:
        for line in f:
            if not line.strip():
                continue
            yield json.loads(line)