File size: 2,907 Bytes
8bb9743 13e968d 8bb9743 13e968d 4e8ab50 13e968d 4e8ab50 13e968d 4e8ab50 13e968d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
---
library_name: transformers
datasets:
- hiroki-rad/elyza_generated_data-3031
language:
- ja
- en
base_model:
- google/gemma-2-2b
---
```python
import os
import random
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import json
# Load the evaluation tasks from a JSONL file (one JSON object per line).
file_path = 'elyza-tasks-100-TV_0.jsonl'
# pandas parses each line into one DataFrame row; the code below reads the
# "input" and "task_id" columns — assumes the file provides both (TODO confirm).
data = pd.read_json(file_path, lines=True)
def set_seed(seed):
    """Seed every random-number source in use for reproducible runs.

    Covers Python's ``random``, hash randomization, NumPy, and PyTorch
    (CPU and all CUDA devices), and pins cuDNN to deterministic kernels.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    # Trade cuDNN autotuning speed for bit-for-bit determinism.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
# Fix all RNG seeds before any model/tokenizer work so runs are reproducible.
set_seed(42)
# Fine-tuned Gemma-2-2B checkpoint hosted on the Hugging Face Hub;
# from_pretrained downloads it on first use (network + disk side effects).
model_name = "hiroki-rad/google-gemma-2-2b-128-ft-3000"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",      # use the dtype stored in the checkpoint
    device_map="auto",       # let accelerate place layers on available devices
)
def generate_text(data):
    """Build the instruction prompt for one task row and return the model's answer.

    ``data`` is a row namedtuple from ``DataFrame.itertuples()``; only its
    ``input`` field is read. Relies on the module-level ``tokenizer`` and
    ``model`` objects. Returns the decoded completion with the prompt and
    special tokens stripped.
    """
    prompt = f"""## 指示:あなたは優秀な日本人の問題解決のエキスパートです。以下のステップで質問に取り組んでください:\n\n1. 質問の種類を特定する(事実確認/推論/創造的回答/計算など)\n2. 重要な情報や制約条件を抽出する\n3. 解決に必要なステップを明確にする\n4. 回答を組み立てる
質問をよく読んで、冷静に考え、考えをステップバイステップで考えをまとめてましょう。それをもう一度じっくり考えて、思考のプロセスを整理してください。質問に対して適切な回答を簡潔に出力してください。
質問:{data.input}\n回答:"""
    # Tokenize and move the resulting tensors to the model's device.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Gemma's generate() does not accept token_type_ids; drop it if present.
    encoded.pop('token_type_ids', None)
    generated = model.generate(
        **encoded,
        max_new_tokens=2048,
        do_sample=True,
        top_p=0.95,
        temperature=0.9,
        repetition_penalty=1.1,
    )
    # Slice off the prompt tokens so only the newly generated answer remains.
    prompt_len = len(encoded['input_ids'][0])
    return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
# Run inference once per task row, preserving the input order so that
# results[i] corresponds to data row i.
results = [generate_text(row) for row in tqdm(data.itertuples(), position=0)]
# Pair each task_id with its generated answer and emit JSON Lines output.
# (Rewritten from an index-based `for i in range(len(data))` + `.iloc` loop:
# zip over the column and the results list is clearer and avoids repeated
# positional lookups.)
jsonl_data = []
for task_id, output in zip(data["task_id"], results):
    jsonl_data.append({
        # pandas yields numpy integer scalars, which json.dump cannot
        # serialize — coerce to a plain Python int up front.
        "task_id": int(task_id),
        "output": output,
    })

# Write one JSON object per line; ensure_ascii=False keeps Japanese text
# human-readable in the output file.
with open("gemma2-output.jsonl", "w", encoding="utf-8") as outfile:
    for entry in jsonl_data:
        json.dump(entry, outfile, ensure_ascii=False)
        outfile.write('\n')