#!/usr/bin/env python # coding=utf-8 ''' This script is used to reformat the downloaded datasets into the format that can be used by the model. Here we use jsonl for the converted data. Each line in the jsonl file is a json object formatted as follows: { "dataset": "dataset_name", "id": "unique_id", "messages": [ {"role": "system", "content": "message_text"}, # optional {"role": "user", "content": "message_text"}, {"role": "assistant", "content": "message_text"}, {"role": "user", "content": "message_text"}, {"role": "assistant", "content": "message_text"}, ... ], } ''' import json import random import re import os import pandas as pd import argparse from instruction_encode_templates import encode_instruction_example, encode_few_shot_example def convert_super_ni_data(data_dir, output_dir, zero_shot_examples_per_task=60, few_shot_examples_per_task=20, n_few_shot=2): os.makedirs(output_dir, exist_ok=True) train_tasks = [] with open(os.path.join(data_dir, "splits", "xlingual", "train_tasks.txt"), "r") as fin: for line in fin: if not "_mmmlu_" in line: # skip mmlu to avoid test leakage train_tasks.append(line.strip()) with open(os.path.join(output_dir, "super_ni_data.jsonl"), "w") as fout: for task in train_tasks: with open(os.path.join(data_dir, "tasks", f"{task}.json"), "r") as fin: task_data = json.load(fin) instruction = task_data["Definition"][0] if zero_shot_examples_per_task + few_shot_examples_per_task < len(task_data["Instances"]): instances = random.sample(task_data["Instances"], k=zero_shot_examples_per_task+few_shot_examples_per_task) else: instances = task_data["Instances"] for instance in instances[:zero_shot_examples_per_task]: encoded_example = encode_instruction_example( instruction=instruction, input=instance["input"], output=instance["output"][0], random_template=True, eos_token=None ) fout.write(json.dumps({ "dataset": "super_ni", "id": f"super_ni_{instance['id']}", "messages": [ {"role": "user", "content": encoded_example["prompt"]}, {"role": "assistant", "content": encoded_example["completion"]}, ] }) + "\n") for instance in instances[zero_shot_examples_per_task:]: if n_few_shot < len(task_data["Positive Examples"]): examplars = random.sample(task_data["Positive Examples"], k=n_few_shot) else: examplars = task_data["Positive Examples"] encoded_example = encode_few_shot_example( instruction=instruction, examplars=examplars, input=instance["input"], output=instance["output"][0], eos_token=None ) fout.write(json.dumps({ "dataset": "super_ni", "id": f"super_ni_{instance['id']}", "messages": [ {"role": "user", "content": encoded_example["prompt"]}, {"role": "assistant", "content": encoded_example["completion"]}, ] }) + "\n") def convert_cot_data(data_dir, output_dir, num_zero_shot_examples=50000, num_few_shot_examples=50000): os.makedirs(output_dir, exist_ok=True) examples = [] if num_few_shot_examples > 0: with open(os.path.join(data_dir, "cot_zsopt.jsonl"), "r") as fin: zero_shot_examples = [json.loads(line) for line in fin] if num_zero_shot_examples < len(zero_shot_examples): zero_shot_examples = random.sample(zero_shot_examples, k=num_zero_shot_examples) examples.extend(zero_shot_examples) if num_few_shot_examples > 0: with open(os.path.join(data_dir, "cot_fsopt.jsonl"), "r") as fin: few_shot_examples = [json.loads(line) for line in fin] if num_few_shot_examples < len(few_shot_examples): few_shot_examples = random.sample(few_shot_examples, k=num_few_shot_examples) examples.extend(few_shot_examples) output_path = os.path.join(output_dir, "cot_data.jsonl") with open(output_path, "w") as fout: for idx, example in enumerate(examples): prompt = example["inputs"] if not prompt.endswith("\n") and not prompt.rstrip().endswith(":"): prompt += "\n" completion = example["targets"] fout.write(json.dumps({ "dataset": "cot", "id": f"cot_{idx}", "messages": [ {"role": "user", "content": prompt}, {"role": "assistant", "content": completion}, ] }) + "\n") def convert_flan_v2_data(data_dir, output_dir): os.makedirs(output_dir, exist_ok=True) examples = [] with open(os.path.join(data_dir, "flan_v2_resampled_100k.jsonl"), "r") as fin: for line in fin: examples.append(json.loads(line)) output_path = os.path.join(output_dir, "flan_v2_data.jsonl") with open(output_path, "w") as fout: for idx, example in enumerate(examples): prompt = example["inputs"] if not prompt.endswith("\n") and not prompt.rstrip().endswith(":"): prompt += "\n" completion = example["targets"] fout.write(json.dumps({ "dataset": "flan_v2", "id": f"flan_v2_{idx}", "messages": [ {"role": "user", "content": prompt}, {"role": "assistant", "content": completion}, ] }) + "\n") def convert_dolly_data(data_dir, output_dir): os.makedirs(output_dir, exist_ok=True) examples = [] with open(os.path.join(data_dir, "databricks-dolly-15k.jsonl"), "r") as fin: for line in fin: examples.append(json.loads(line)) output_path = os.path.join(output_dir, "dolly_data.jsonl") with open(output_path, "w") as fout: for idx, example in enumerate(examples): encoded_example = encode_instruction_example( instruction=example["instruction"], input=example["context"], output=example["response"], random_template=True, eos_token=None ) fout.write(json.dumps({ "dataset": "dolly", "id": f"dolly_{idx}", "messages": [ {"role": "user", "content": encoded_example["prompt"]}, {"role": "assistant", "content": encoded_example["completion"]}, ] }) + "\n") def convert_self_instruct_data(data_dir, output_dir): os.makedirs(output_dir, exist_ok=True) examples = [] with open(os.path.join(data_dir, "all_instances_82K.jsonl"), "r") as fin: for line in fin: examples.append(json.loads(line)) output_path = os.path.join(output_dir, "self_instruct_data.jsonl") with open(output_path, "w") as fout: for idx, example in enumerate(examples): encoded_example = encode_instruction_example( instruction=example["instruction"], input=example["input"], output=example["output"], random_template=True, eos_token=None ) fout.write(json.dumps({ "dataset": "self_instruct", "id": f"self_instruct_{idx}", "messages": [ {"role": "user", "content": encoded_example["prompt"]}, {"role": "assistant", "content": encoded_example["completion"]}, ] }) + "\n") def convert_unnatural_instructions_data(data_dir, output_dir): os.makedirs(output_dir, exist_ok=True) instance_cnt = 0 with open(os.path.join(data_dir, "core_data.jsonl"), "r") as fin, open((os.path.join(output_dir, "unnatural_instructions_data.jsonl")), "w") as fout: for line in fin: task_data = json.loads(line) instruction = task_data["instruction"] for instance in task_data["instances"]: if instance["constraints"] and instance["constraints"].lower() not in ["none", "none."]: instance_instruction = instruction + "\n" + instance["constraints"] else: instance_instruction = instruction encoded_example = encode_instruction_example( instruction=instance_instruction, input=instance["input"], output=instance["output"], random_template=True, eos_token=None ) fout.write(json.dumps({ "dataset": "unnatural_instructions", "id": f"unnatural_instructions_{instance_cnt}", "messages": [ {"role": "user", "content": encoded_example["prompt"]}, {"role": "assistant", "content": encoded_example["completion"]}, ] }) + "\n") instance_cnt += 1 def convert_stanford_alpaca_data(data_dir, output_dir): os.makedirs(output_dir, exist_ok=True) examples = [] with open(os.path.join(data_dir, "alpaca_data.json"), "r") as fin: examples.extend(json.load(fin)) output_path = os.path.join(output_dir, "stanford_alpaca_data.jsonl") with open(output_path, "w") as fout: for idx, example in enumerate(examples): encoded_example = encode_instruction_example( instruction=example["instruction"], input=example["input"], output=example["output"], random_template=True, eos_token=None ) fout.write(json.dumps({ "dataset": "stanford_alpaca", "id": f"stanford_alpaca_{idx}", "messages": [ {"role": "user", "content": encoded_example["prompt"]}, {"role": "assistant", "content": encoded_example["completion"]}, ] }) + "\n") def convert_code_alpaca_data(data_dir, output_dir): os.makedirs(output_dir, exist_ok=True) examples = [] with open(os.path.join(data_dir, "code_alpaca_20k.json"), "r") as fin: examples.extend(json.load(fin)) output_path = os.path.join(output_dir, "code_alpaca_data.jsonl") with open(output_path, "w") as fout: for idx, example in enumerate(examples): encoded_example = encode_instruction_example( instruction=example["instruction"], input=example["input"], output=example["output"], random_template=True, eos_token=None ) fout.write(json.dumps({ "dataset": "code_alpaca", "id": f"code_alpaca_{idx}", "messages": [ {"role": "user", "content": encoded_example["prompt"]}, {"role": "assistant", "content": encoded_example["completion"]}, ] }) + "\n") def convert_gpt4_alpaca_data(data_dir, output_dir, load_en=True, load_zh=False): os.makedirs(output_dir, exist_ok=True) examples = [] if load_en: with open(os.path.join(data_dir, "alpaca_gpt4_data.json"), "r") as fin: examples.extend(json.load(fin)) if load_zh: with open(os.path.join(data_dir, "alpaca_gpt4_data_zh.json"), "r") as fin: examples.extend(json.load(fin)) output_path = os.path.join(output_dir, "gpt4_alpaca_data.jsonl") with open(output_path, "w") as fout: for idx, example in enumerate(examples): encoded_example = encode_instruction_example( instruction=example["instruction"], input=example["input"], output=example["output"], random_template=True, eos_token=None ) fout.write(json.dumps({ "dataset": "gpt4_alpaca", "id": f"gpt4_alpaca_{idx}", "messages": [ {"role": "user", "content": encoded_example["prompt"]}, {"role": "assistant", "content": encoded_example["completion"]}, ] }) + "\n") def convert_sharegpt_data(data_dir, output_dir): os.makedirs(output_dir, exist_ok=True) examples = [] with open(os.path.join(data_dir, "sharegpt_html_cleaned_and_split.json"), "r") as fin: examples.extend(json.load(fin)) output_path = os.path.join(output_dir, "sharegpt_data.jsonl") with open(output_path, "w") as fout: invalid_cnt = 0 for idx, example in enumerate(examples): messages = [] valid = True for message in example["conversations"]: if message["from"] == "human" or message["from"] == "user": messages.append({ "role": "user", "content": message["value"] }) elif message["from"] == "gpt" or message["from"] == "chatgpt": messages.append({ "role": "assistant", "content": message["value"] }) elif message["from"] == "system": valid = False invalid_cnt += 1 break elif message["from"] == "bing": valid = False invalid_cnt += 1 break else: raise ValueError(f"Unknown message sender: {message['from']}") if messages and valid: fout.write(json.dumps({ "dataset": "sharegpt", "id": f"sharegpt_{example['id']}", "messages": messages }) + "\n") print(f"# of invalid examples in sharegpt data: {invalid_cnt}") def convert_baize_data(data_dir, output_dir): os.makedirs(output_dir, exist_ok=True) examples = [] for source in ["alpaca", "medical", "quora", "stackoverflow"]: with open(os.path.join(data_dir, f"{source}_chat_data.json"), "r") as fin: examples.extend(json.load(fin)) output_path = os.path.join(output_dir, "baize_data.jsonl") with open(output_path, "w") as fout: for idx, example in enumerate(examples): # split example["input"] by [|Human|] and [|AI|] messages = [] rounds = example["input"].split("[|Human|]")[1:] for round in rounds: if not round.strip() or "[|AI|]" not in round: continue human, assistant = round.split("[|AI|]") messages.append({ "role": "user", "content": human.strip() }) messages.append({ "role": "assistant", "content": assistant.strip() }) fout.write(json.dumps({ "dataset": "baize", "id": f"baize_{idx}", "messages": messages }) + "\n") def convert_oasst1_data(data_dir, output_dir): ''' For OASST1, because it's in a tree structure, where every user input might get multiple replies, we have to save every path from the root node to the assistant reply (including both leaf node and intemediate node). This results in some of the messages being duplicated among different paths (instances). Be careful when using this dataset for training. Ideally, you should only minimize the loss of the last message in each path. ''' os.makedirs(output_dir, exist_ok=True) conversations = [] with open(os.path.join(data_dir, "2023-04-12_oasst_ready.trees.jsonl"), "r") as fin: for line in fin: conversations.append(json.loads(line)) output_path = os.path.join(output_dir, "oasst1_data.jsonl") # we filter out the sequences that mention the creator information filter_strings = [ "LAION", "Open Asssistant", "OpenAssistant", ] # tranvers the conversation tree, and collect all valid sequences def dfs(reply, messages, valid_sequences): if any([filter_string in reply["text"] for filter_string in filter_strings]): return if reply["role"] == "assistant": messages.append( {"role": "assistant", "content": reply["text"]} ) if not reply["replies"]: # leaf node valid_sequences.append(messages[:]) else: for child in reply["replies"]: dfs(child, messages, valid_sequences) messages.pop() elif reply["role"] == "prompter": messages.append( {"role": "user", "content": reply["text"]} ) for child in reply["replies"]: dfs(child, messages, valid_sequences) messages.pop() else: raise ValueError(f"Unknown role: {reply['role']}") with open(output_path, "w") as fout: example_cnt = 0 for _, conversation in enumerate(conversations): valid_sequences = [] dfs(conversation["prompt"], [], valid_sequences) for sequence in valid_sequences: fout.write(json.dumps({ "dataset": "oasst1", "id": f"oasst1_{example_cnt}", "messages": sequence }) + "\n") example_cnt += 1 def convert_lima_data(data_dir, output_dir): os.makedirs(output_dir, exist_ok=True) examples = [] with open(os.path.join(data_dir, "train.jsonl"), "r") as fin: for line in fin: examples.append(json.loads(line)) output_path = os.path.join(output_dir, "lima_data.jsonl") with open(output_path, "w") as fout: for idx, example in enumerate(examples): messages = [] if not len(example["conversations"]) % 2 == 0: print(f"Waring: example {idx} in LIMA has odd number of messages. Cutting off the last message.") example["conversations"] = example["conversations"][:-1] for i in range(0, len(example["conversations"]), 2): messages.append({ "role": "user", "content": example["conversations"][i] }) messages.append({ "role": "assistant", "content": example["conversations"][i+1] }) fout.write(json.dumps({ "dataset": "lima", "id": f"lima_{idx}", "messages": messages, }) + "\n") def convert_wizardlm_data(data_dir, output_dir): os.makedirs(output_dir, exist_ok=True) examples = [] with open(os.path.join(data_dir, "WizardLM_evol_instruct_V2_143k.json"), "r") as fin: examples = json.load(fin) output_path = os.path.join(output_dir, "wizardlm_data.jsonl") with open(output_path, "w") as fout: for idx, example in enumerate(examples): messages = [] assert len(example["conversations"]) % 2 == 0 for i in range(0, len(example["conversations"]), 2): assert example["conversations"][i]["from"] == "human" assert example["conversations"][i+1]["from"] == "gpt" messages.append({ "role": "user", "content": example["conversations"][i]["value"] }) messages.append({ "role": "assistant", "content": example["conversations"][i+1]["value"] }) fout.write(json.dumps({ "dataset": "wizardlm", "id": f"wizardlm_{example['idx']}", "messages": messages, }) + "\n") def convert_open_orca_data(data_dir, output_dir, num_gpt4_examples=100000, num_gpt35_examples=0): os.makedirs(output_dir, exist_ok=True) examples = [] df = pd.read_parquet(os.path.join(data_dir, "1M-GPT4-Augmented.parquet")) gpt4_examples = [row.to_dict() for _, row in df.iterrows()] random.shuffle(gpt4_examples) examples.extend(gpt4_examples[:num_gpt4_examples]) df = pd.read_parquet(os.path.join(data_dir, "3_5M-GPT3_5-Augmented.parquet")) gpt35_examples = [row.to_dict() for _, row in df.iterrows()] random.shuffle(gpt35_examples) examples.extend(gpt35_examples[:num_gpt35_examples]) output_path = os.path.join(output_dir, "open_orca_data.jsonl") with open(output_path, "w") as fout: for idx, example in enumerate(examples): messages = [ {"role": "system", "content": example["system_prompt"]}, {"role": "user", "content": example["question"]}, {"role": "assistant", "content": example["response"]} ] fout.write(json.dumps({ "dataset": "open_orca", "id": f"open_orca_{example['id']}", "messages": messages, }) + "\n") if __name__ == "__main__": arg_parser = argparse.ArgumentParser() arg_parser.add_argument("--raw_data_dir", type=str, default="data/downloads") arg_parser.add_argument("--output_dir", type=str, default="data/processed") arg_parser.add_argument("--seed", type=int, default=42) args = arg_parser.parse_args() random.seed(args.seed) # get the subfolder names in raw_data_dir subfolders = [f for f in os.listdir(args.raw_data_dir) if os.path.isdir(os.path.join(args.raw_data_dir, f))] # all supported datasets supported_datasets = [] all_funcs = [func_name for func_name in globals() if callable(globals()[func_name])] for func_name in all_funcs: if re.match(r"convert_.+_data", func_name): supported_datasets.append(func_name[8:-5]) # check if the subfolder names are supported datasets valid_subfolders = [] for subfolder in subfolders: if subfolder not in supported_datasets: print(f"Warning: {subfolder} in the raw data folder is not a supported dataset. We will skip it.") else: valid_subfolders.append(subfolder) # prepare data for each dataset statistics = {} for subfolder in valid_subfolders: print(f"Processing {subfolder} data...") globals()[f"convert_{subfolder}_data"](os.path.join(args.raw_data_dir, subfolder), os.path.join(args.output_dir, subfolder))