#!/usr/bin/env python3
"""
Convert dataset from instruction/input/output format to text format for fine-tuning
"""
import json
import os

import yaml
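
# For reference, the converter expects one JSON object per line (JSONL). A
# hypothetical input record looks like this (field values are placeholders,
# not actual dataset content):
#   {"instruction": "...", "input": "...", "output": "...", "metadata": {}}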


def convert_dataset(input_file, output_file):
    """Convert dataset from instruction format to text format"""
    print(f"🔄 Converting dataset from {input_file} to {output_file}")

    # Read input dataset
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    converted_data = []
    for i, line in enumerate(lines, 1):
        line = line.strip()
        if not line:
            # Skip blank lines instead of reporting them as invalid JSON
            continue
        try:
            data = json.loads(line)

            # Extract fields
            instruction = data.get('instruction', '')
            input_text = data.get('input', '')
            output = data.get('output', '')
            metadata = data.get('metadata', {})

            # Create training text in instruction-following format
            if input_text.strip():
                # If there's input, use instruction + input format
                training_text = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
            else:
                # If no input, use simple instruction format
                training_text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"

            # Add to converted data
            converted_data.append({
                "text": training_text,
                "instruction": instruction,
                "input": input_text,
                "output": output,
                "metadata": metadata
            })
        except json.JSONDecodeError as e:
            print(f"⚠️ Warning: Invalid JSON at line {i}: {e}")
            continue

    # Save converted dataset
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f"✅ Converted {len(converted_data)} samples")
    print(f"📁 Saved to: {output_file}")
    return output_file
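
# For reference, a record with a non-empty "input" renders to the following
# Alpaca-style prompt (angle brackets mark placeholders):
#
#   ### Instruction:
#   <instruction>
#
#   ### Input:
#   <input>
#
#   ### Response:
#   <output>
#
# Records without an "input" omit the "### Input:" block.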


def create_training_config(model_name, dataset_path):
    """Create training configuration file"""
    config = {
        "model_name": model_name,
        "model_path": f"./models/{model_name.split('/')[-1]}",
        "dataset_path": dataset_path,
        "max_length": 2048,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 40,
        "repetition_penalty": 1.1,
        "lora_config": {
            "r": 16,
            "lora_alpha": 32,
            "lora_dropout": 0.1,
            "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj",
                               "gate_proj", "up_proj", "down_proj"]
        },
        "training_config": {
            "learning_rate": 2e-4,
            "batch_size": 4,
            "gradient_accumulation_steps": 4,
            "num_epochs": 3,
            "warmup_steps": 100,
            "save_steps": 500,
            "eval_steps": 500
        }
    }

    config_path = "configs/training_config.yaml"
    os.makedirs("configs", exist_ok=True)
    with open(config_path, 'w', encoding='utf-8') as f:
        yaml.dump(config, f, default_flow_style=False, allow_unicode=True)

    print(f"✅ Created training config: {config_path}")
    return config_path
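
# Note: PyYAML's yaml.dump sorts mapping keys alphabetically by default
# (sort_keys=True), so configs/training_config.yaml begins roughly like:
#
#   dataset_path: data/textilindo_training_data.jsonl
#   lora_config:
#     lora_alpha: 32
#     lora_dropout: 0.1
#     r: 16
#     ...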


def main():
    print("🚀 Dataset Converter for Textilindo AI")
    print("=" * 50)

    # Input and output files
    input_file = "data/lora_dataset_20250829_113330.jsonl"
    output_file = "data/textilindo_training_data.jsonl"

    # Check if input file exists
    if not os.path.exists(input_file):
        print(f"❌ Input file not found: {input_file}")
        return

    # Convert dataset
    converted_file = convert_dataset(input_file, output_file)

    # Create training config
    model_name = "meta-llama/Llama-3.2-1B-Instruct"  # Lightweight model for testing
    config_path = create_training_config(model_name, converted_file)

    print("\n🎉 Dataset conversion complete!")
    print("\n📋 Next steps:")
    print("1. Run fine-tuning: python scripts/finetune_lora.py")
    print("2. Test the model: python scripts/test_model.py")
    print("3. Deploy to Novita AI (manual process for now)")

    # Show a sample of the converted data (guard against an empty output file)
    print("\n📄 Sample converted data:")
    with open(output_file, 'r', encoding='utf-8') as f:
        first_line = f.readline()
    if first_line:
        sample = json.loads(first_line)
        print(f"Text length: {len(sample['text'])} characters")
        print(f"Instruction: {sample['instruction'][:100]}...")
        print(f"Output: {sample['output'][:100]}...")


if __name__ == "__main__":
    main()
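
# Usage: run from the project root so the relative data/ and configs/ paths
# resolve (requires PyYAML: pip install pyyaml):
#   python convert_dataset.py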