import glob
import json
import logging
from typing import Any, Callable, Iterable, List, Mapping, Optional, Union

from tqdm.auto import tqdm


def resolve_globs(glob_paths: Union[str, Iterable[str]]) -> List[str]:
    """Returns filepaths corresponding to input filepath pattern(s)."""
    filepaths = []
    if isinstance(glob_paths, str):
        glob_paths = [glob_paths]
    for path in glob_paths:
        filepaths.extend(glob.glob(path))
    return filepaths


def read_jsonlines(filename: str) -> Iterable[Mapping[str, Any]]:
    """Yields Python dicts read from a jsonlines file."""
    with open(filename) as fp:
        for line in tqdm(fp.readlines(), desc=f"Reading JSON lines from {filename}", unit="lines"):
            try:
                yield json.loads(line)
            except json.JSONDecodeError as ex:
                logging.error(f'Input text: "{line}"')
                logging.error(ex.args)
                raise ex


def hf_read_jsonlines(
    filename: str,
    n: Optional[int] = None,
    minimal_questions: bool = False,
    unique_questions: bool = False,
) -> Callable[[], Iterable[Mapping[str, Any]]]:
    """Returns a generator function that yields Python dicts read from a jsonlines file.

    Optionally reads only the first ``n`` lines, skips duplicate questions
    (by ``qc_id``), and/or strips each example down to a minimal set of question fields.
    """
    # Count lines up front: O(n) time, but no extra memory.
    with open(filename) as fp:
        num_lines = sum(1 for _ in fp)
    if n is None:
        n = num_lines

    # A generator function (rather than a generator) is returned: yielding from a
    # `with open(...)` block at this scope appeared to close the file before the
    # caller finished iterating, so the file is opened inside the generator instead.
    def line_generator():
        unique_qc_ids = set()
        # readlines() is not lazy (it returns the full list of lines), so only
        # the JSON decoding below happens lazily.
        with open(filename) as fp:
            lines = fp.readlines()[:n]
        for line in tqdm(
            lines,
            desc=f"Reading JSON lines from {filename}",
            unit="lines",
        ):
            try:
                full_example = json.loads(line)
                if unique_questions:
                    qc_id = full_example["object"]["qc_id"]
                    if qc_id in unique_qc_ids:
                        continue
                    unique_qc_ids.add(qc_id)
                if not minimal_questions:
                    example = full_example
                else:
                    q_object = full_example["object"]
                    q_object.pop("question_info")
                    example = {
                        "object": {
                            "answer": q_object["answer"],
                            "clue_spans": q_object["clue_spans"],
                            "qc_id": q_object["qc_id"],
                            "question_text": q_object["question_text"],
                        }
                    }
                yield example
            except json.JSONDecodeError as ex:
                logging.error(f'Input text: "{line}"')
                logging.error(ex.args)
                raise ex

    return line_generator


def load_jsonlines(filename: str) -> List[Mapping[str, Any]]:
    """Returns a list of Python dicts read from a jsonlines file."""
    return list(read_jsonlines(filename))


def write_jsonlines(
    objs: Iterable[Mapping[str, Any]], filename: str, to_dict: Callable = lambda x: x
):
    """Writes an iterable of Python Mappings to the input file in jsonlines format."""
    with open(filename, "w") as fp:
        for obj in tqdm(objs, desc=f"Writing JSON lines at {filename}"):
            fp.write(json.dumps(to_dict(obj)))
            fp.write("\n")


def write_lst_json(
    objs: List[Mapping[str, Any]], filename: str, to_dict: Callable = lambda x: x
):
    """Writes a list of Python Mappings to the input file as a single JSON list."""
    with open(filename, "w") as fp:
        fp.write("[\n")
        num_rows = len(objs)
        for i, obj in tqdm(
            enumerate(objs), desc=f"Writing list of JSON objs at {filename}", total=num_rows
        ):
            fp.write(json.dumps(to_dict(obj)))
            if i != num_rows - 1:
                fp.write(",\n")
        fp.write("]\n")


def read_json(filename: str) -> Mapping[str, Any]:
    """Returns a Python dict representation of the JSON object at the input file."""
    with open(filename) as fp:
        return json.load(fp)


def write_json(obj: Mapping[str, Any], filename: str, indent: Optional[int] = None):
    """Writes a Python Mapping to the input file in JSON format."""
    with open(filename, "w") as fp:
        json.dump(obj, fp, indent=indent)


def print_json(d: Mapping[str, Any], indent: int = 4):
    """Pretty-prints a Python Mapping as indented JSON."""
    print(json.dumps(d, indent=indent))
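

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): round-trips a couple of records
    # through write_jsonlines / load_jsonlines. The path "example.jsonl" is a
    # placeholder, not a file this module assumes to exist.
    records = [{"id": 1, "text": "hello"}, {"id": 2, "text": "world"}]
    write_jsonlines(records, "example.jsonl")
    assert load_jsonlines("example.jsonl") == records
    print_json(records[0])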