---
base_model:
- oxygen65/llm-jp-3-13b-finetune-2
- llm-jp/llm-jp-3-13b
tags:
- text-generation-inference
- transformers
- unsloth
- llama
- trl
license: cc-by-nc-sa-4.0
language:
- ja
datasets:
- elyza/ELYZA-tasks-100
---
# How to Use
## 1. Load the model and tokenizer
```python
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
import torch
from tqdm import tqdm
import json

model_name = "oxygen65/llm-jp-3-13b-finetune-3"

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
```
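Before running the full evaluation, a quick generation check confirms the 4-bit model and tokenizer are wired up correctly. This is a minimal sketch: the sample instruction is illustrative only, and the `### 指示` / `### 回答` format matches the prompt template used in the inference step below.
```python
# Quick smoke test: one greedy generation using the same prompt format as step 4.
prompt = "### 指示\n日本で一番高い山は何ですか?\n### 回答\n"  # illustrative sample instruction
tokenized = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(tokenized, max_new_tokens=64, do_sample=False, eos_token_id=tokenizer.eos_token_id)[0]
print(tokenizer.decode(out[tokenized.size(1):], skip_special_tokens=True))
```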
## 2. Load the evaluation dataset
```python
tasks = []
with open("./elyza-tasks-100-TV_0.jsonl", "r") as f:
    item = ""
    for line in f:
        line = line.strip()
        item += line
        # A JSON object may span several lines; accumulate until it closes.
        if item.endswith("}"):
            tasks.append(json.loads(item))
            item = ""
```
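The retrievers and prompt builder in the next step also read from `sample_tasks`, a pool of solved examples (inputs with reference outputs) used for in-context learning. It is not loaded in this card; a minimal sketch, assuming the `elyza/ELYZA-tasks-100` dataset listed in the metadata and the Hugging Face `datasets` library, would be:
```python
# Sketch: load the few-shot example pool referenced below as `sample_tasks`.
# Assumption: elyza/ELYZA-tasks-100 (see the card metadata), whose "test" split
# exposes the "input" and "output" columns accessed by the retriever code.
from datasets import load_dataset

sample_tasks = load_dataset("elyza/ELYZA-tasks-100", split="test")
```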
## 3. Set up retrievers
If `rank_bm25` and the other retriever dependencies used below are not installed in your environment:
```bash
!pip install rank_bm25 nltk sentence-transformers scikit-learn
```
```python
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk
import numpy as np

# Download the required tokenizer data (first run only)
nltk.download('punkt')
nltk.download('punkt_tab')

def search_similar_documents_bm25(query, sample_tasks):
    # Tokenize (BM25 expects tokenized documents)
    tokenized_documents = [word_tokenize(doc) for doc in sample_tasks['input']]
    # Build the BM25 index
    bm25 = BM25Okapi(tokenized_documents)
    tokenized_query = word_tokenize(query)
    # Score every document against the query
    doc_scores = bm25.get_scores(tokenized_query)
    # Sort by score, highest first
    sorted_indexes = np.argsort(doc_scores)[::-1]
    indexes = []
    for i in range(len(doc_scores)):
        if doc_scores[sorted_indexes[i]] < 20.0:
            break
        else:
            indexes.append(sorted_indexes[i])
    return indexes

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

SentTF = SentenceTransformer('all-MiniLM-L6-v2')

def search_similar_documents_neural_retriever(query, sample_tasks):
    global SentTF
    emb1 = SentTF.encode([query])
    emb2 = SentTF.encode(sample_tasks['input'])
    # Compute the query-to-document cosine similarities
    # (slow, so ideally precompute the document embeddings)
    similarity_matrix = cosine_similarity(emb1, emb2)
    # Sort by similarity, highest first
    sorted_indexes = np.argsort(similarity_matrix[0])[::-1]
    indexes = []
    for i in range(len(sample_tasks['input'])):
        if similarity_matrix[0][sorted_indexes[i]] < 0.75:
            break
        else:
            indexes.append(sorted_indexes[i])
    return indexes

def create_icl_prompt(input, sample_tasks, task_id):
    # Merge the BM25 and embedding-based hits into one set of few-shot examples
    indexes_bm25 = search_similar_documents_bm25(input, sample_tasks)
    indexes_neu = search_similar_documents_neural_retriever(input, sample_tasks)
    indexes = list(set(indexes_bm25 + indexes_neu))
    icl_prompt = ""
    if indexes == []:
        return ""
    icl_prompt = f"""## 例題\n"""
    for i in range(len(indexes)):
        icl_prompt += f"""### 指示
{sample_tasks["input"][indexes[i]]}
### 回答
{sample_tasks["output"][indexes[i]]}
"""
    icl_prompt += f"""
## 本題: 以下の指示に従って回答してください。step by stepで回答してください。
"""
    return icl_prompt

# Example: build the few-shot block for one evaluation task
create_icl_prompt(tasks[2]["input"], sample_tasks, 0)
```
## 4. Inference
```python
# llmjp
import re
pattern = r"^以下.*$"

# Build the prompts and generate
sys_prompt = ""
icl_prompt = ""
results = []
loop = 0
for data in tqdm(tasks):
    task_id = data["task_id"]
    input = data["input"]
    # Few-shot block for in-context learning
    icl_prompt = create_icl_prompt(input, sample_tasks, task_id)
    prompt = f"""{sys_prompt}{icl_prompt}### 指示
{input}
### 回答
"""
    tokenized_input = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            tokenized_input,
            max_new_tokens=512,
            do_sample=False,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,
        )[0]
    output = tokenizer.decode(outputs[tokenized_input.size(1):], skip_special_tokens=True)
    # If the answer is a single line that merely restates the instruction
    # (starts with "以下"), resample with a low temperature and try again.
    while True:
        line = output.splitlines()
        if len(line) == 1 and re.match(pattern, line[0]):
            print(f"#========================= Unexpected answer =========================#\n {line}")
            outputs = model.generate(
                tokenized_input,
                max_new_tokens=512,
                do_sample=True,
                temperature=0.4,
                repetition_penalty=1.2
            )[0]
            output = tokenizer.decode(outputs[tokenized_input.size(1):], skip_special_tokens=True)
        else:
            break
    results.append({"task_id": data["task_id"], "input": input, "output": output})
    print(f"task_id: {data['task_id']}, prompt: {prompt}, output: {output}")
```
## 5. Dump results
```python
import re
model_name = re.sub(".*/", "", model_name)
with open(f"./{model_name}-outputs.jsonl", 'w', encoding='utf-8') as f:
    for result in results:
        json.dump(result, f, ensure_ascii=False)  # ensure_ascii=False keeps Japanese text readable
        f.write('\n')
```
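To sanity-check the dump, the JSONL file can be read back and compared against `results` (a small verification sketch using the same file name built above):
```python
# Reload the dumped file and confirm every record parses and is accounted for.
with open(f"./{model_name}-outputs.jsonl", "r", encoding="utf-8") as f:
    reloaded = [json.loads(line) for line in f]
assert len(reloaded) == len(results)
print(reloaded[0]["task_id"], reloaded[0]["output"][:50])
```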
# Uploaded model
- **Developed by:** oxygen65
This llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)