from __future__ import annotations from typing import TYPE_CHECKING, Sequence import joblib from tqdm import tqdm if TYPE_CHECKING: from pathlib import Path __all__ = ["serialize", "deserialize"] def serialize(data: Sequence[str], path: Path, max_size: int = 100000, show_progress: bool = False) -> None: """Serialize data to a file Args: data: The data to serialize path: The path to save the serialized data max_size: The maximum size a chunk can be (in elements) show_progress: Whether to show a progress bar """ # first file is path, next chunks have ".1", ".2", etc. appended for i, chunk in enumerate( tqdm( [data[i : i + max_size] for i in range(0, len(data), max_size)], unit="chunk", disable=not show_progress, ), ): fd = path.with_suffix(f".{i}.pkl" if i else ".pkl") with fd.open("wb") as f: joblib.dump(chunk, f, compress=3) def deserialize(path: Path) -> Sequence[str]: """Deserialize data from a file Args: path: The path to the serialized data Returns: The deserialized data """ data = [] i = 0 while (fd := path.with_suffix(f".{i}.pkl" if i else ".pkl")).exists(): with fd.open("rb") as f: data.extend(joblib.load(f)) i += 1 return data