# NOTE(review): removed copy/paste artifacts ("Spaces:", "Runtime error") that
# were not part of the original module and made the file unparseable.
import gzip
import itertools
import json
import random
import shutil

import numpy as np
import torch
class TransformersTokenizerWrapper:
    """Adapter exposing a HuggingFace-style tokenizer as a plain callable.

    Calling the wrapper with a batch of texts returns, per text, the list of
    decoded token strings. The first and last token of every sequence are
    dropped (presumably the special BOS/EOS tokens added by the tokenizer —
    TODO confirm against the tokenizer in use), and each remaining token is
    rendered back to a whitespace-stripped string.
    """

    def __init__(self, tokenizer):
        # Underlying tokenizer; must provide __call__, _convert_id_to_token
        # and convert_tokens_to_string.
        self.T = tokenizer

    def __call__(self, texts):
        encoded = self.T(texts)["input_ids"]
        result = []
        for ids in encoded:
            raw_tokens = [self.T._convert_id_to_token(token_id) for token_id in ids]
            # Skip the leading and trailing token, then decode each token
            # individually and strip surrounding whitespace.
            cleaned = [
                self.T.convert_tokens_to_string(tok).strip()
                for tok in raw_tokens[1:-1]
            ]
            result.append(cleaned)
        return result
def set_random_seed(seed):
    """Seed torch, NumPy, and Python's ``random`` module for reproducibility."""
    # The three generators are independent; seeding order does not matter.
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
def ask_rmdir(dir):
    """Recursively delete *dir*, but only after the user confirms with 'yes'."""
    answer = input(
        f"WARNING: Proceed with deleting this directory: {dir} ? (yes|no) "
    )
    # Anything other than the exact string "yes" aborts the deletion.
    if answer == "yes":
        shutil.rmtree(dir)
def load_numpy(path):
    """Load and return the array stored at *path* in NumPy ``.npy`` format."""
    with open(path, "rb") as fh:
        return np.load(fh)
def save_numpy(x, path):
    """Serialize array *x* to *path* in NumPy ``.npy`` format (overwriting)."""
    with open(path, "wb") as fh:
        np.save(fh, x)
def batchify(items, batch_size):
    """Yield consecutive slices of *items* of length *batch_size*.

    The final slice may be shorter when ``len(items)`` is not a multiple of
    ``batch_size``. An empty sequence yields nothing.
    """
    start = 0
    while start < len(items):
        yield items[start:start + batch_size]
        start += batch_size
def move_generator(items, idx):
    """Advance the iterator *items* past its first *idx* elements.

    Consumes and discards exactly ``idx`` items (fewer if the iterator is
    exhausted first); ``idx == 0`` leaves the iterator untouched. Returns
    ``None``. Raises ``ValueError`` if *idx* is negative.
    """
    # Standard "consume n items" idiom: islice(items, idx, idx) yields
    # nothing itself but pulls idx items from the underlying iterator when
    # advanced; next(..., None) triggers that without raising on exhaustion.
    next(itertools.islice(items, idx, idx), None)
def read_json(path):
    """Parse the JSON document stored at *path* and return the result."""
    with open(path) as fh:
        return json.load(fh)
def write_json(obj, path):
    """Serialize *obj* as JSON and write it to *path*, overwriting any file."""
    with open(path, 'w') as fh:
        json.dump(obj, fh)
def write_jsonl(items, path, mode):
    """Write *items* to *path* in JSON Lines format, one object per line.

    *mode* is passed straight to ``open`` ('w' to overwrite, 'a' to append).
    A trailing newline is always emitted, so repeated append calls stay
    line-aligned.
    """
    with open(path, mode) as fh:
        fh.write("\n".join(json.dumps(item) for item in items) + "\n")
def read_jsonl(path):
    """Lazily yield one parsed JSON object per line of the file at *path*."""
    with open(path) as fh:
        yield from map(json.loads, fh)
def read_jsonl_gz(path):
    """Lazily yield one parsed JSON object per line of the gzip file at *path*."""
    with gzip.open(path) as fh:
        yield from map(json.loads, fh)