nevi1's picture
Upload 244 files
73f4c20
import os
import glob
import json
import logging
from typing import Any, Mapping, Iterable, Union, List, Callable, Optional
from tqdm.auto import tqdm
def resolve_globs(glob_paths: Union[str, Iterable[str]]):
    """Expand one or more glob patterns into a flat list of matching paths.

    Args:
        glob_paths: A single pattern string, or an iterable of patterns.

    Returns:
        The matches of every pattern concatenated in pattern order. Paths
        matched by more than one pattern appear once per matching pattern.
    """
    # A bare string is treated as one pattern, not as an iterable of characters.
    patterns = [glob_paths] if isinstance(glob_paths, str) else glob_paths
    return [match for pattern in patterns for match in glob.glob(pattern)]
def read_jsonlines(filename: str) -> Iterable[Mapping[str, Any]]:
    """Lazily yield the JSON value parsed from each line of the input file.

    The file is streamed line by line instead of being loaded into memory
    up front, so the progress bar shows a running line count without a total.

    Args:
        filename: Path of the JSON Lines file to read.

    Yields:
        The value decoded from each line, in file order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The offending line
            and the decoder's arguments are logged before the error is
            re-raised.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(filename, encoding="utf-8") as fp:
        for line in tqdm(fp, desc=f"Reading JSON lines from {filename}", unit="lines"):
            # Only the decode can raise JSONDecodeError, so keep the try minimal.
            try:
                example = json.loads(line)
            except json.JSONDecodeError as ex:
                logging.error(f'Input text: "{line}"')
                logging.error(ex.args)
                raise
            yield example
def hf_read_jsonlines(
    filename: str,
    n: Optional[int] = None,
    minimal_questions: Optional[bool] = False,
    unique_questions: Optional[bool] = False,
) -> Callable[[], Iterable[Mapping[str, Any]]]:
    """Build a zero-argument generator function over a JSON Lines file.

    The file is opened once immediately to count its lines (so a missing
    file fails at call time), and again each time the returned function is
    called.

    Args:
        filename: Path of the JSON Lines file to read.
        n: Read at most the lines selected by the slice ``lines[:n]``.
            ``None`` means all lines.
        minimal_questions: If true, each yielded example holds only an
            ``"object"`` dict with the keys ``"answer"``, ``"clue_spans"``,
            ``"qc_id"`` and ``"question_text"``. Otherwise the parsed line
            is yielded unchanged.
        unique_questions: If true, skip every example whose
            ``["object"]["qc_id"]`` was already seen earlier in the same pass.

    Returns:
        A callable taking no arguments. Each call returns a fresh generator
        over the selected examples.

    Raises:
        OSError: If the file cannot be opened for the initial line count.
        json.JSONDecodeError: Raised while iterating, if a line is not valid
            JSON. The offending line is logged before re-raising.
    """
    from itertools import islice

    # One cheap pass to count lines: O(lines) time, constant memory.
    with open(filename, encoding="utf-8") as fp:
        num_lines = sum(1 for _ in fp)
    if n is None:
        n = num_lines
    # Number of lines the list slice ``lines[:n]`` would select. This keeps
    # slice semantics for an oversized or negative ``n``.
    limit = len(range(num_lines)[:n])

    def line_generator():
        seen_qc_ids = set()
        # The ``with`` lives inside the generator so the handle stays open
        # exactly as long as the generator is being consumed, and is closed
        # when it finishes or is discarded.
        with open(filename, encoding="utf-8") as fp:
            for line in tqdm(
                islice(fp, limit),
                total=limit,
                desc=f"Reading JSON lines from {filename}",
                unit="lines",
            ):
                try:
                    full_example = json.loads(line)
                except json.JSONDecodeError as ex:
                    logging.error(f'Input text: "{line}"')
                    logging.error(ex.args)
                    raise

                if unique_questions:
                    qc_id = full_example["object"]["qc_id"]
                    if qc_id in seen_qc_ids:
                        continue
                    seen_qc_ids.add(qc_id)

                if not minimal_questions:
                    yield full_example
                    continue

                # Project the question down to the four retained fields.
                q_object = full_example["object"]
                yield {
                    "object": {
                        "answer": q_object["answer"],
                        "clue_spans": q_object["clue_spans"],
                        "qc_id": q_object["qc_id"],
                        "question_text": q_object["question_text"],
                    }
                }

    return line_generator
def load_jsonlines(filename: str) -> List[Mapping[str, Any]]:
    """Read a JSON Lines file fully and return its parsed lines as a list.

    Args:
        filename: Path of the JSON Lines file to read.

    Returns:
        Every value produced by ``read_jsonlines(filename)``, in file order.
    """
    records: List[Mapping[str, Any]] = []
    records.extend(read_jsonlines(filename))
    return records
def write_jsonlines(
    objs: Iterable[Mapping[str, Any]], filename: str, to_dict: Callable = lambda x: x
):
    """Serialize each object to its own line of JSON in the given file.

    Args:
        objs: Objects to write, consumed in iteration order.
        filename: Destination path; opened in ``"w"`` mode, so any existing
            content is replaced.
        to_dict: Applied to each object before ``json.dumps``. Defaults to
            the identity function.
    """
    with open(filename, "w") as out:
        for item in tqdm(objs, desc=f"Writing JSON lines at {filename}"):
            record = json.dumps(to_dict(item))
            out.write(record + "\n")
def write_lst_json(
    objs: Iterable[Mapping[str, Any]], filename: str, to_dict: Callable = lambda x: x
):
    """Write the objects to the file as a single JSON array.

    The output is an opening bracket on its own line, then one serialized
    element per line separated by commas, with the closing bracket directly
    after the last element.

    Args:
        objs: Objects to write. Any iterable is accepted, including
            generators; its length is not needed.
        filename: Destination path; opened in ``"w"`` mode, so any existing
            content is replaced.
        to_dict: Applied to each object before ``json.dumps``. Defaults to
            the identity function.
    """
    with open(filename, "w") as fp:
        fp.write("[\n")
        for i, obj in tqdm(enumerate(objs), desc=f"Writing list of JSON objs at {filename}"):
            # Emit the separator before every element except the first, so
            # the total count never has to be known in advance.
            if i:
                fp.write(",\n")
            fp.write(json.dumps(to_dict(obj)))
        fp.write("]\n")
def read_json(filename: str) -> Mapping[str, Any]:
    """Parse the JSON document stored in the given file and return it.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The value decoded by ``json.load``.

    Raises:
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(filename, encoding="utf-8") as fp:
        return json.load(fp)
def write_json(obj: Mapping[str, Any], filename: str, indent: Optional[int] = None):
    """Serialize the object as JSON into the given file.

    Args:
        obj: The value to serialize.
        filename: Destination path; opened in ``"w"`` mode, so any existing
            content is replaced.
        indent: Passed through to ``json.dump`` as its ``indent`` argument.
            Defaults to ``None``.
    """
    with open(filename, "w") as fp:
        json.dump(obj, fp, indent=indent)
def print_json(d, indent=4):
    """Print the JSON serialization of ``d`` to standard output.

    Args:
        d: The value to serialize with ``json.dumps``.
        indent: Indentation passed to ``json.dumps``. Defaults to 4.
    """
    rendered = json.dumps(d, indent=indent)
    print(rendered)