|
import torch |
|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
import os |
|
import json |
|
from tqdm import tqdm |
|
|
|
# Set CUDA_VISIBLE_DEVICES to "0" for this process and its children
# (presumably to pin any GPU work to the first device -- confirm on
# multi-GPU hosts).
os.environ.update(CUDA_VISIBLE_DEVICES="0")

# Absolute directory that contains this script; the train_data input and
# output paths are built relative to it.
current_path = os.path.split(os.path.abspath(__file__))[0]
|
|
|
def replate_newline(text: str) -> str:
    r"""Escape line feeds so that *text* fits on a single physical line.

    Every line-feed character in *text* is replaced by the two-character
    sequence backslash + ``n``. All other characters are left untouched.

    Args:
        text: The string to escape.

    Returns:
        A copy of *text* containing no line-feed characters.
    """
    # Splitting on the line feed and re-joining with the literal
    # backslash-n marker is equivalent to a plain replace.
    segments = text.split("\n")
    return "\\n".join(segments)
|
|
|
def _format_record(record: dict) -> str:
    """Render one dataset record as a single-line training prompt.

    The record must provide the keys ``"instruction"``, ``"input"`` and
    ``"output"``. When ``"input"`` is the empty string the shorter
    instruction-only template is used; otherwise the template with an
    extra input section is used. Field values are passed through
    ``replate_newline`` so the result contains no real line feeds (the
    template separators are raw strings, i.e. literal backslash-n).

    Args:
        record: One element of the loaded JSON list.

    Returns:
        The prompt text, without a trailing line feed.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    if record["input"] == "":
        parts = [
            r"<s>\n",
            r"以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。\n[SEP]\n",
            r"指示:\n",
            replate_newline(record["instruction"]),
            r"\n[SEP]\n応答:\n",
            replate_newline(record["output"]),
            r"\n</s>",
        ]
    else:
        parts = [
            r"<s>\n",
            r"以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n[SEP]\n",
            r"指示:\n",
            replate_newline(record["instruction"]),
            r"\n[SEP]\n入力:\n",
            replate_newline(record["input"]),
            r"\n[SEP]\n応答:\n",
            replate_newline(record["output"]),
            r"\n</s>",
        ]
    return "".join(parts)


if __name__ == '__main__':
    # Input dataset and output text file both live in ./train_data next to
    # this script.
    source_path = current_path + "/train_data/databricks-dolly-15k-ja.json"
    target_path = current_path + '/train_data/databricks-dolly-15k-ja.txt'

    # Context manager guarantees the input handle is closed (the previous
    # version never closed it).
    with open(source_path, 'r', encoding="utf-8_sig") as json_file:
        json_load = json.load(json_file)

    # Show the first record as a quick sanity check; skip it for an empty
    # dataset instead of failing before any output is produced.
    if json_load:
        print(json_load[0])

    # The output handle is closed even if formatting a record raises.
    with open(target_path, 'w', encoding="utf-8_sig") as f:
        for v in tqdm(json_load):
            # One record per physical line.
            f.write(_format_record(v))
            f.write("\n")