---
library_name: transformers
datasets:
- elyza/ELYZA-tasks-100
license: apache-2.0
language:
- ja
base_model:
- llm-jp/llm-jp-3-13b-instruct
---
|
|
|
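# Model Card for yottan-wywy/llm-jp-3-13b-instruct-finetune_1217

<!-- Provide a quick summary of what the model is/does. -->

A fine-tuned version of [llm-jp/llm-jp-3-13b-instruct](https://huggingface.co/llm-jp/llm-jp-3-13b-instruct), instruction-tuned on the [elyza/ELYZA-tasks-100](https://huggingface.co/datasets/elyza/ELYZA-tasks-100) dataset for Japanese tasks.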
|
|
|
## Required Libraries and Their Versions

- trl==0.12.2
- transformers<4.47.0
- tokenizers==0.21.0
- bitsandbytes==0.45.0
- peft==0.14.0
- datasets==3.2.0
|
|
|
## Usage

Google Colaboratory(L4 GPU)にて実行

```py
|
# Standard library
import gc
import json
import os

# Third-party
import bitsandbytes as bnb
import torch
from datasets import load_dataset
from peft import (
    LoraConfig,
    PeftModel,
    get_peft_model,
)
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging,
)
from trl import SFTTrainer

# Google Colab only: gives access to the secrets stored for the notebook.
from google.colab import userdata

# Copy the Colab secret named "HF_TOKEN" into the HF_TOKEN environment
# variable of this process.
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")
|
```

```py
|
# Prepare the inference data.
# NOTE: `datasets` is the list of task records read by the inference loop.
# The name is kept for compatibility even though it is also the name of the
# `datasets` package.
datasets = []

inference_data_path = '/content/drive/MyDrive/your_path'

# Decode as UTF-8 explicitly: the task file is expected to contain Japanese
# text, and the platform default encoding is not guaranteed to be UTF-8.
with open(
    f"{inference_data_path}/elyza-tasks-100-TV_0.jsonl", "r", encoding="utf-8"
) as f:
    # One JSON object may span several physical lines, so stripped lines are
    # accumulated until the buffer ends with "}" and is then parsed as one
    # record. json.loads raises json.JSONDecodeError if the buffer is not a
    # complete JSON document at that point.
    item = ""
    for line in f:
        line = line.strip()
        item += line
        if item.endswith("}"):
            datasets.append(json.loads(item))
            item = ""
|
|
|
# Load the fine-tuned model and its tokenizer.
new_model_id = "yottan-wywy/llm-jp-3-13b-instruct-finetune_1217"

# Quantization settings: 4-bit weights, "nf4" quantization type,
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(new_model_id, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    new_model_id,
    device_map="auto",
    quantization_config=bnb_config,
)
|
```

```py
|
# Run inference on every task record and collect the generated answers.
results = []
system_text = "以下は、タスクを説明する指示です。要求を適切に満たす回答を**簡潔に**書きなさい。"
for data in tqdm(datasets):
    input_text = data["input"]

    # Prompt layout, one element per line: an empty first line, the system
    # text, the instruction header, the task input, and the response header
    # followed by a trailing newline.
    prompt = (
        f"\n{system_text}\n"
        f"### 指示\n{input_text}\n"
        f"### 応答\n"
    )

    tokenized_input = tokenizer.encode(
        prompt,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(model.device)
    prompt_length = tokenized_input.size(1)

    with torch.no_grad():
        generated = model.generate(
            tokenized_input,
            # Single unpadded sequence, so every position is attended to.
            attention_mask=torch.ones_like(tokenized_input),
            max_new_tokens=100,
            do_sample=False,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Take the first returned sequence and drop its first `prompt_length`
    # positions (the prompt) before decoding.
    output = tokenizer.decode(
        generated[0][prompt_length:],
        skip_special_tokens=True,
    )

    results.append({"task_id": data["task_id"], "input": input_text, "output": output})
|
|
|
```
|
|
|
|
|
## Model Details

- **Model type:** Transformer-based Language Model
|
|
|
## Datasets

### Instruction tuning

| Language | Dataset | Description |
|:---|:---|:---|
| Japanese | [elyza/ELYZA-tasks-100](https://huggingface.co/datasets/elyza/ELYZA-tasks-100) | A manually constructed instruction dataset |
|
|
|
## License

[Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
|
|