|
--- |
|
library_name: transformers |
|
datasets: |
|
- elyza/ELYZA-tasks-100 |
|
language: |
|
- ja |
|
base_model: |
|
- llm-jp/llm-jp-3-13b |
|
pipeline_tag: text-generation |
|
--- |
|
|
|
|
|
## How to Use
|
```python |
|
# ライブラリのインストール |
|
!pip install -U langchain-community langchain-huggingface vllm triton fugashi unidic-lite |
|
# インストール |
|
import json |
|
import pandas as pd |
|
from tqdm import tqdm |
|
from transformers import pipeline |
|
from langchain_community.llms import VLLM |
|
from langchain.prompts import PromptTemplate |
|
from langchain_core.runnables import RunnablePassthrough |
|
from langchain.schema.output_parser import StrOutputParser |
|
|
|
|
|
# GitHub repositoryのclone |
|
!git clone https://github.com/y-hiroki-radiotech/llm-final-task.git |
|
%cd llm-final-task |
|
|
|
# タスク別に設定したプロンプトを使うために、PromptStockクラスをインスタンス化 |
|
from prompt import PromptStock |
|
prompt_stock = PromptStock() |
|
|
|
# Load the task file (JSON Lines: one JSON object per line) into a pandas
# DataFrame.
file_path = "elyza-tasks-100-TV_0.jsonl"  # point this at your JSONL file
data = pd.read_json(file_path, lines=True)
|
|
|
# Attach a task label to every input. The tasks are grouped into 8 categories,
# and the label is used later to pick the matching prompt.
model_name = "hiroki-rad/bert-base-classification-ft"
classify_pipe = pipeline(model=model_name, device="cuda:0")

# Each pipeline call returns a list of predictions; keep only the label of the
# first prediction for each input text.
results: list[str] = [
    classify_pipe(text)[0]["label"] for text in data["input"]
]

data["label"] = results
|
|
|
# Load the answer-generation model through vLLM.
# NOTE(review): LangChain's VLLM integration docs pass engine options such as
# quantization through `vllm_kwargs={"quantization": "awq"}` -- confirm that
# this top-level keyword actually reaches the vLLM engine.
model_name = "hiroki-rad/llm-jp-llm-jp-3-13b-16-ft"
llm = VLLM(model=model_name, quantization="awq")
|
|
|
# Prompt template: {context} receives the task-specific prompt and {input}
# receives the question itself.
template = """
ユーザー: 質問を良く読んで、適切な回答をしてください。
{context}
質問:{input}
回答:"""

prompt = PromptTemplate(
    input_variables=["context", "input"],
    template=template,
    template_format="f-string",
)

# Build the chain: the input dict is passed through unchanged, rendered into
# the prompt, sent to the LLM, and the result is parsed into a plain string.
vllm_chain = prompt | llm
chain = RunnablePassthrough() | vllm_chain | StrOutputParser()
|
|
|
# Generate one answer per row, showing a progress bar while doing so.
outputs = []
total_rows = len(data)

with tqdm(
    data.itertuples(),
    total=total_rows,
    desc="Processing rows",
    position=0,
    leave=True,
) as rows:
    for row in rows:
        # Pick the prompt that matches the label predicted for this row.
        task_prompt = prompt_stock.get_prompt(row.label)
        answer = chain.invoke({"context": task_prompt, "input": row.input})
        outputs.append(answer)
|
|
|
# Write the results as JSON Lines: one {"task_id", "output"} object per row.
# task_id is cast to a built-in int because the values coming out of the
# DataFrame column may be NumPy integers, which json cannot serialize.
# strict=True makes a length mismatch between rows and answers fail loudly
# instead of silently dropping entries.
jsonl_data = [
    {"task_id": int(task_id), "output": answer}
    for task_id, answer in zip(data["task_id"], outputs, strict=True)
]

# Open with an explicit UTF-8 encoding and disable ASCII escaping so the
# Japanese answers are stored as readable text rather than \uXXXX sequences.
with open("output.jsonl", "w", encoding="utf-8") as outfile:
    for entry in jsonl_data:
        json.dump(entry, outfile, ensure_ascii=False)
        outfile.write("\n")