In [None]:
!pip install transformers datasets trl bitsandbytes peft
!pip install datasets
!pip install -U accelerate

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting trl
  Downloading trl-0.11.4-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.14-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import torch
import torchvision
torchvision.disable_beta_transforms_warning()
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model

# W&B 비활성화

In [None]:
import os

# Set WANDB_MODE to "disabled" for this process so Weights & Biases logging
# stays off during training.
os.environ.update({"WANDB_MODE": "disabled"})

In [None]:
# Release cached CUDA memory and print the allocator summary.
# Guarded so the cell does not raise on a runtime without a GPU, and printed
# so the multi-line report is readable rather than shown as an escaped repr.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
else:
    print("CUDA is not available; skipping GPU memory summary.")




# 환경 변수 로드 및 Google Colab 환경 설정

In [None]:
# Optional local .env file that may define HF_TOKEN. The default is the
# original author's path; set KRX_ENV_FILE to point somewhere else.
env_file = os.environ.get(
    "KRX_ENV_FILE", 'C:/Users/yd170/OneDrive/바탕 화면/Coding/KRX.env'
)
if os.path.exists(env_file):
    # load_dotenv is not imported anywhere else in the notebook, so import it
    # here; the import is guarded because python-dotenv is not installed by
    # the setup cell.
    try:
        from dotenv import load_dotenv
    except ImportError:
        print("python-dotenv is not installed; skipping", env_file)
    else:
        load_dotenv(env_file)

# Use HF_TOKEN from the environment if present; otherwise pass None instead of
# a placeholder string, so no bogus credential is sent to the Hub.
hf_token = os.getenv("HF_TOKEN") or None

model_name = "Qwen/Qwen2-1.5B"
# NOTE(review): the SFTTrainer cell passes max_seq_length=1024 literally and
# does not read this value — confirm which limit is intended.
max_seq_length = 2048

# 양자화 설정 (4비트 양자화 사용)

In [None]:
# Quantization settings handed to the model loader: 4-bit weights using the
# "nf4" quantization type with double quantization, and float16 as the
# compute dtype.
bnb_config = BitsAndBytesConfig(
    **{
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.float16,
    }
)

# 모델 및 토크나이저 로드 (GPU 사용하도록 설정)

In [None]:
# Load the tokenizer and the quantized base model.
# The same token is forwarded to both calls so that a gated or private
# checkpoint authenticates consistently (previously only the tokenizer
# received it).
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    token=hf_token,
)

# LoRA 설정 추가

In [None]:
from peft import prepare_model_for_kbit_training

# The base model was loaded with a 4-bit quantization config, so prepare it
# for k-bit training before attaching adapters, as the PEFT quantization
# guide recommends.
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention query/value projections.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",  # declare the task so PEFT wraps the model as a causal LM
)
# NOTE: this rebinds `model`; re-running the cell without reloading the base
# model would wrap an already-wrapped model.
model = get_peft_model(model, lora_config)

# 두 개의 데이터셋 로드 및 병합

In [None]:
# Fetch the "train" split of each instruction dataset.
first_dataset, second_dataset = (
    load_dataset(repo_id, split="train")
    for repo_id in (
        "amphora/krx-sample-instructions",
        "Cartinoe5930/web_text_synthetic_dataset_50k",
    )
)

# Combine both into the single dataset used for training.
dataset = concatenate_datasets([first_dataset, second_dataset])

# Training prompt template. It has two positional `{}` slots, which
# formatting_prompts_func fills with (instruction, response) in that order.
# NOTE(review): the preamble mentions "an input that provides further context"
# but the template has no input section — confirm the wording is intended.
prompt_format = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""

# End-of-sequence token string of the loaded tokenizer; appended to every
# formatted training example in formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Build the training text for each row of a batch.

    The instruction is taken from the "prompt" column or, when that is absent
    or None for a given row, from the "question" column. The choice is made
    per row rather than per batch, so a batch that carries both columns (with
    None in the one a row does not use) no longer formats the literal string
    "None" as the instruction.
    NOTE(review): this assumes the two source datasets use different
    instruction column names — confirm against their actual schemas.

    Args:
        examples: Mapping of column name to a list of values (a batched
            `map` input). Must contain "response" and at least one of
            "prompt" / "question".

    Returns:
        dict with one key, "formatted_text": a list of strings, each being
        `prompt_format` filled with (instruction, response) plus `EOS_TOKEN`.

    Raises:
        KeyError: if "response" is missing, or neither "prompt" nor
            "question" is present.
    """
    has_prompt = "prompt" in examples
    has_question = "question" in examples
    if "response" not in examples or not (has_prompt or has_question):
        raise KeyError("The dataset fields do not match the expected format.")

    outputs = examples["response"]
    # Pad a missing column with None so the per-row fallback below is uniform.
    prompts = examples["prompt"] if has_prompt else [None] * len(outputs)
    questions = examples["question"] if has_question else [None] * len(outputs)

    texts = []
    for prompt, question, output in zip(prompts, questions, outputs):
        # "prompt" wins when present, matching the original branch order.
        instr = prompt if prompt is not None else question
        texts.append(prompt_format.format(instr, output) + EOS_TOKEN)
    return {"formatted_text": texts}

# Run the formatter over the merged dataset in batches; it contributes the
# "formatted_text" column.
dataset = dataset.map(function=formatting_prompts_func, batched=True)

# 모델 학습 설정

In [None]:
# Training hyperparameters for the fine-tuning run.
# (A stray `d` after per_device_train_batch_size made this cell a SyntaxError;
# it has been removed.)
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # accumulate 8 micro-batches per optimizer step
    max_steps=100,
    logging_steps=10,
    learning_rate=2e-5,
    seed=42,
    save_steps=100,
    fp16=True,  # mixed precision to reduce memory use
    report_to="none",
)

# GPU 메모리 관리 최적화 환경 변수 설정

In [None]:
# Ask the PyTorch CUDA allocator for expandable segments.
# NOTE(review): the model has already been loaded with device_map="auto" by
# the time this cell runs; the setting may only take effect if it is in place
# before CUDA is first used — consider moving it to the top of the notebook.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

# SFTTrainer 초기화

In [None]:
# Collect the trainer inputs: the LoRA-wrapped model, its tokenizer, the
# merged dataset, and the column holding the formatted prompt text.
# NOTE(review): the recorded output warns that some of these arguments are
# deprecated in favour of SFTConfig.
sft_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="formatted_text",
    max_seq_length=1024,
    args=training_args,
)
trainer = SFTTrainer(**sft_kwargs)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs


# 학습

In [None]:
# Mount Google Drive at /content/drive; the save step later in the notebook
# writes the model and tokenizer under "/content/drive/My Drive/".
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Run the fine-tuning loop configured on `trainer`.
print("모델 학습 시작...")
trainer.train()
print("모델 학습 완료.")

# Write the model and then the tokenizer into the same Drive directory,
# announcing each step before and after.
save_dir = "/content/drive/My Drive/KRX_Qwen2_1_5B"
save_steps = (
    (model, "모델 저장 중...", "모델 저장 완료."),
    (tokenizer, "토크나이저 저장 중...", "토크나이저 저장 완료."),
)
for artifact, start_msg, done_msg in save_steps:
    print(start_msg)
    artifact.save_pretrained(save_dir)
    print(done_msg)


모델 학습 시작...


Step,Training Loss
10,1.8213
20,1.8044
30,1.7777
40,1.8279
50,1.7705
60,1.7346
70,1.8031
80,1.6932
90,1.7279
100,1.7092


모델 학습 완료.
모델 저장 중...
모델 저장 완료.
토크나이저 저장 중...
토크나이저 저장 완료.


# 학습된 모델 로드

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from google.colab import drive

# Make sure Google Drive is mounted before reading from it.
drive.mount('/content/drive')

# Directory on Drive that the training step saved into; from here on
# `model_name` holds this path rather than a Hub model id.
model_name = "/content/drive/My Drive/KRX_Qwen2_1_5B"

# Load the tokenizer first, then the model, both from the saved directory.
tokenizer, model = (
    loader.from_pretrained(model_name)
    for loader in (AutoTokenizer, AutoModelForCausalLM)
)

print("모델과 토크나이저가 성공적으로 로드되었습니다.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


모델과 토크나이저가 성공적으로 로드되었습니다.


# 추론을 위한 프롬프트 설정

In [None]:
# Inference prompt. It reuses the exact preamble and section layout of the
# training template, including the newline after "### Response:", so the
# fine-tuned model sees the same context it was trained on. The previous
# finance-specific preamble never appeared in the training text.
# Only the instruction slot remains; the model generates the response.
# NOTE: this rebinds `prompt_format`, which the training formatter reads as a
# global — re-run the training template cell before formatting data again.
prompt_format = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
"""

# 예제 프롬프트

In [None]:
# Example instruction, rendered into the inference prompt and tokenized.
instruction = "선물옵션에 대해 설명해줘."
prompt = prompt_format.format(instruction)
# Move the encoded tensors to the model's device so generation works whether
# the model sits on CPU or GPU.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# 텍스트 생성

In [None]:
# Generate a continuation of the prompt.
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=True,          # without sampling, temperature/top_k are ignored
    temperature=0.7,         # controls diversity
    top_k=50,                # sample only from the 50 most likely tokens
    repetition_penalty=1.2,  # penalty to reduce repetition
    use_cache=True,
    # Set explicitly; the recorded run fell back to eos_token_id with a warning.
    pad_token_id=tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


# 결과 출력

In [None]:
# Full decoded sequence (prompt + continuation), kept under its original name.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Separate the continuation by token count rather than by character count of
# the decoded text: decoding is not guaranteed to reproduce the prompt string
# exactly, so slicing `response` by len(prompt) can cut in the wrong place.
prompt_token_count = inputs["input_ids"].shape[-1]
generated_text = tokenizer.decode(
    outputs[0][prompt_token_count:], skip_special_tokens=True
)

# Print only the part produced after the prompt.
print(generated_text.strip())

Orange price at the port of origin?

Yes, that's correct! The term "orange" in this context refers to the orange fruit itself rather than its value as an investment asset.
