from __future__ import annotations

from typing import TYPE_CHECKING, Sequence

import joblib
from tqdm import tqdm

if TYPE_CHECKING:
    from pathlib import Path

__all__ = ["serialize", "deserialize"]


def _chunk_path(path: Path, index: int) -> Path:
    """Return the file that holds chunk ``index`` of the data saved under ``path``.

    The final suffix of ``path`` is replaced: chunk 0 is stored in
    ``<stem>.pkl`` and every later chunk in ``<stem>.<index>.pkl``.
    """
    return path.with_suffix(f".{index}.pkl" if index else ".pkl")


def serialize(data: Sequence[str], path: Path, max_size: int = 100000) -> None:
    """Serialize data to one or more joblib files

    The data is split into consecutive chunks of at most ``max_size``
    elements. The final suffix of ``path`` is replaced, so the first chunk is
    written to ``<stem>.pkl`` and following chunks to ``<stem>.1.pkl``,
    ``<stem>.2.pkl``, etc. Empty data writes no file.

    Args:
        data: The data to serialize
        path: The path to save the serialized data
        max_size: The maximum size a chunk can be (in elements)

    Raises:
        ValueError: If ``max_size`` is smaller than 1
    """
    if max_size < 1:
        # A negative step would yield an empty range and silently write
        # nothing; zero would fail inside range() with a less helpful message.
        raise ValueError(f"max_size must be a positive integer, got {max_size}")

    # Iterate over start offsets rather than pre-built slices so that only one
    # chunk copy is alive at a time; tqdm still gets the total from the range.
    starts = range(0, len(data), max_size)
    for index, start in enumerate(tqdm(starts)):
        with _chunk_path(path, index).open("wb") as f:
            joblib.dump(data[start : start + max_size], f, compress=3)

    # deserialize() reads every consecutively numbered chunk that exists, so
    # chunks left over from an earlier, larger save must not survive this one.
    stale_index = len(starts)
    while (stale := _chunk_path(path, stale_index)).exists():
        stale.unlink()
        stale_index += 1


def deserialize(path: Path) -> Sequence[str]:
    """Deserialize data from the file(s) written by ``serialize``

    Chunks are read in order (``<stem>.pkl``, ``<stem>.1.pkl``, ...) until the
    next numbered file does not exist, and are concatenated into one list.
    If not even the first file exists, an empty list is returned.

    NOTE: joblib files are pickle-based; only load files from trusted sources.

    Args:
        path: The path to the serialized data

    Returns:
        The deserialized data
    """
    data: list[str] = []
    index = 0
    while (chunk_file := _chunk_path(path, index)).exists():
        with chunk_file.open("rb") as f:
            data.extend(joblib.load(f))
        index += 1
    return data