Spaces:
Build error
Build error
File size: 4,657 Bytes
9b4ef96 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
#!/usr/bin/env python3
"""
Convert dataset from instruction/input/output format to text format for fine-tuning
"""
import json
import os
from pathlib import Path
def convert_dataset(input_file, output_file):
    """Convert a JSONL dataset from instruction format to text format.

    Every non-blank line of ``input_file`` is parsed as a JSON object with
    optional ``instruction``, ``input``, ``output`` and ``metadata`` keys.
    For each record a ``text`` field is built in the
    "### Instruction / ### Input / ### Response" layout (the Input section
    is omitted when the input is empty or whitespace) and written, together
    with the source fields, as one JSON object per line to ``output_file``.

    Blank lines are skipped silently. Lines that are not valid JSON, or
    that do not decode to a JSON object, are reported and skipped.

    Args:
        input_file: Path of the source JSONL file.
        output_file: Path of the JSONL file to write. Missing parent
            directories are created.

    Returns:
        ``output_file``, unchanged.
    """
    print(f"π Converting dataset from {input_file} to {output_file}")

    converted_data = []
    # Iterate the file lazily instead of materialising it with readlines().
    with open(input_file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records.
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Warning: Invalid JSON at line {i}: {e}")
                continue

            if not isinstance(data, dict):
                # Valid JSON but not a record (list, number, ...): .get()
                # would raise AttributeError, so report and skip instead.
                print(
                    f"⚠️ Warning: Expected a JSON object at line {i}, "
                    f"got {type(data).__name__}"
                )
                continue

            # Normalise the fields so JSON null / non-string values neither
            # crash .strip() nor end up as the literal text "None".
            instruction = _as_text(data.get('instruction'))
            input_text = _as_text(data.get('input'))
            output = _as_text(data.get('output'))
            metadata = data.get('metadata')
            if metadata is None:
                metadata = {}

            if input_text.strip():
                # Record carries extra context: instruction + input format.
                training_text = (
                    f"### Instruction:\n{instruction}\n\n"
                    f"### Input:\n{input_text}\n\n"
                    f"### Response:\n{output}"
                )
            else:
                # No input: simple instruction format.
                training_text = (
                    f"### Instruction:\n{instruction}\n\n"
                    f"### Response:\n{output}"
                )

            converted_data.append({
                "text": training_text,
                "instruction": instruction,
                "input": input_text,
                "output": output,
                "metadata": metadata,
            })

    # Make sure the destination directory exists before writing.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + '\n'
            for item in converted_data
        )

    print(f"✅ Converted {len(converted_data)} samples")
    print(f"π Saved to: {output_file}")
    return output_file


def _as_text(value):
    """Return ``value`` as a string, mapping ``None`` to the empty string."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    return str(value)
def create_training_config(model_name, dataset_path):
    """Write the training configuration to ``configs/training_config.yaml``.

    Args:
        model_name: Model identifier such as ``"org/model"``. Its last
            path component names the local directory under ``./models/``.
        dataset_path: Path of the training dataset, stored verbatim in the
            configuration.

    Returns:
        The path of the YAML file that was written.

    Raises:
        ImportError: If PyYAML is not installed.
    """
    # Imported locally so that importing this module does not require
    # PyYAML, and first so that a missing package fails before any
    # directory is created.
    import yaml

    # Drop trailing slashes so "org/model/" still yields "model" rather
    # than an empty directory name.
    local_name = model_name.rstrip('/').split('/')[-1]

    config = {
        "model_name": model_name,
        "model_path": f"./models/{local_name}",
        "dataset_path": dataset_path,
        "max_length": 2048,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 40,
        "repetition_penalty": 1.1,
        "lora_config": {
            "r": 16,
            "lora_alpha": 32,
            "lora_dropout": 0.1,
            "target_modules": [
                "q_proj", "v_proj", "k_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ],
        },
        "training_config": {
            "learning_rate": 2e-4,
            "batch_size": 4,
            "gradient_accumulation_steps": 4,
            "num_epochs": 3,
            "warmup_steps": 100,
            "save_steps": 500,
            "eval_steps": 500,
        },
    }

    config_path = "configs/training_config.yaml"
    os.makedirs("configs", exist_ok=True)
    with open(config_path, 'w', encoding='utf-8') as f:
        yaml.dump(config, f, default_flow_style=False, allow_unicode=True)

    print(f"✅ Created training config: {config_path}")
    return config_path
def main():
    """Run the conversion pipeline and print a short summary.

    Converts the hard-coded input dataset to the training text format,
    writes the training configuration, prints the follow-up steps and
    previews the first converted sample. Returns early, after printing a
    message, when the input file does not exist.
    """
    print("π Dataset Converter for Textilindo AI")
    print("=" * 50)

    # Input and output files
    input_file = "data/lora_dataset_20250829_113330.jsonl"
    output_file = "data/textilindo_training_data.jsonl"

    # Nothing to do without the source dataset.
    if not os.path.exists(input_file):
        print(f"β Input file not found: {input_file}")
        return

    # Convert dataset
    converted_file = convert_dataset(input_file, output_file)

    # Create training config
    model_name = "meta-llama/llama-3.2-1b-instruct"  # Lightweight model for testing
    create_training_config(model_name, converted_file)

    print("\nπ Dataset conversion complete!")
    print("\nπ Next steps:")
    print("1. Run fine-tuning: python scripts/finetune_lora.py")
    print("2. Test the model: python scripts/test_model.py")
    print("3. Deploy to Novita AI (manual process for now)")

    # Show sample of converted data
    print("\nπ Sample converted data:")
    with open(output_file, 'r', encoding='utf-8') as f:
        first_line = f.readline()

    if not first_line.strip():
        # The output file is empty when no input line could be converted;
        # json.loads("") would raise JSONDecodeError here.
        print("No samples were converted; nothing to preview.")
        return

    sample = json.loads(first_line)
    print(f"Text length: {len(sample['text'])} characters")
    print(f"Instruction: {sample['instruction'][:100]}...")
    print(f"Output: {sample['output'][:100]}...")
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|