inu-ai committed on
Commit
7dcdc0b
1 Parent(s): 9b295e6

Upload make_train_data_from_json.py

Browse files
train_data/make_train_data_from_json.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Convert an instruction-tuning JSON file into one-record-per-line text.

Usage: python script.py <json filename>

The JSON file is expected to hold a list of objects with "instruction",
"output" and (optionally) "input" string fields. Each object is rendered
through a prompt template and written as a single line to
``databricks-dolly-15k-ja.txt`` in the current working directory.
"""
import json
import sys

# The templates are raw strings on purpose: every backslash-n below is the
# literal two-character sequence (backslash, "n"), not a real line break, so
# that one record always occupies exactly one physical line of the output.
INPUT_PROMPT = r'<s>\n以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n[SEP]\n指示:\n{instruction}\n[SEP]\n入力:\n{input}\n[SEP]\n応答:\n{output}\n</s>'
NO_INPUT_PROMPT = r'<s>\n以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。\n[SEP]\n指示:\n{instruction}\n[SEP]\n応答:\n{output}\n</s>'

# Destination file, created (or overwritten) in the current directory.
OUTPUT_FILENAME = 'databricks-dolly-15k-ja.txt'


def escape_newlines(text: str) -> str:
    """Return *text* with every line break replaced by a literal backslash-n.

    CRLF and bare CR are folded into LF first; otherwise a stray carriage
    return would survive into the output and be seen as a line break by any
    reader that uses universal-newline mode, splitting one record in two.
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    return unified.replace("\n", "\\n")


def format_record(record: dict) -> str:
    """Render one dataset record as a single-line training example.

    Args:
        record: Mapping with "instruction" and "output" strings and an
            optional "input" string. A missing, ``None`` or empty "input"
            selects the template without an input section.

    Returns:
        The filled-in prompt, containing no real line-break characters.

    Raises:
        KeyError: If "instruction" or "output" is missing from *record*.
    """
    instruction = escape_newlines(record["instruction"])
    output = escape_newlines(record["output"])
    # ``.get`` tolerates records that omit the key entirely; ``or ""`` also
    # maps an explicit null onto the "no input" case.
    context = record.get("input") or ""
    if context:
        return INPUT_PROMPT.format(
            instruction=instruction,
            input=escape_newlines(context),
            output=output,
        )
    return NO_INPUT_PROMPT.format(instruction=instruction, output=output)


def main(argv=None) -> int:
    """Run the conversion and return a process exit status.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``. Exactly one positional argument (the JSON file
            path) is required.

    Returns:
        0 on success, 1 when the argument count is wrong.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        print("Usage: python script.py <json filename>", file=sys.stderr)
        return 1

    # Load the JSON file.
    with open(argv[1], 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Report how many records were read.
    print(len(data))

    with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as output_file:
        output_file.writelines(format_record(d) + '\n' for d in data)
    return 0


if __name__ == "__main__":
    sys.exit(main())