#!/usr/bin/env python3
"""
Convert dataset from instruction/input/output format to text format for fine-tuning
"""
import json
import os

import yaml


def convert_dataset(input_file, output_file):
    """Convert dataset from instruction format to text format"""
    print(f"šŸ”„ Converting dataset from {input_file} to {output_file}")

    # Read input dataset
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    converted_data = []
    for i, line in enumerate(lines, 1):
        try:
            data = json.loads(line.strip())

            # Extract fields
            instruction = data.get('instruction', '')
            input_text = data.get('input', '')
            output = data.get('output', '')
            metadata = data.get('metadata', {})

            # Create training text in instruction-following format
            if input_text.strip():
                # If there's input, use instruction + input format
                training_text = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
            else:
                # If no input, use simple instruction format
                training_text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"

            # Add to converted data
            converted_data.append({
                "text": training_text,
                "instruction": instruction,
                "input": input_text,
                "output": output,
                "metadata": metadata
            })
        except json.JSONDecodeError as e:
            print(f"āš ļø Warning: Invalid JSON at line {i}: {e}")
            continue

    # Save converted dataset
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f"āœ… Converted {len(converted_data)} samples")
    print(f"šŸ“ Saved to: {output_file}")
    return output_file


def create_training_config(model_name, dataset_path):
    """Create training configuration file"""
    config = {
        "model_name": model_name,
        "model_path": f"./models/{model_name.split('/')[-1]}",
        "dataset_path": dataset_path,
        "max_length": 2048,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 40,
        "repetition_penalty": 1.1,
        "lora_config": {
            "r": 16,
            "lora_alpha": 32,
            "lora_dropout": 0.1,
            "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj",
                               "gate_proj", "up_proj", "down_proj"]
        },
        "training_config": {
            "learning_rate": 2e-4,
            "batch_size": 4,
            "gradient_accumulation_steps": 4,
            "num_epochs": 3,
            "warmup_steps": 100,
            "save_steps": 500,
            "eval_steps": 500
        }
    }

    config_path = "configs/training_config.yaml"
    os.makedirs("configs", exist_ok=True)
    with open(config_path, 'w', encoding='utf-8') as f:
        yaml.dump(config, f, default_flow_style=False, allow_unicode=True)

    print(f"āœ… Created training config: {config_path}")
    return config_path
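
# --- Illustrative sketch (not called anywhere in this script) -----------------
# Shows how the YAML written by create_training_config() could be consumed by a
# LoRA fine-tuning script such as the scripts/finetune_lora.py referenced in
# main(). That script is not part of this file, so the `peft` usage below is an
# assumption about how the config might be read back, not its actual contents.
def load_lora_config_example(config_path="configs/training_config.yaml"):
    """Load the generated YAML and build a peft LoraConfig from it (sketch)."""
    from peft import LoraConfig  # assumes `pip install peft`

    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # Map the "lora_config" section of the YAML onto peft's LoraConfig.
    lora = config["lora_config"]
    return LoraConfig(
        r=lora["r"],
        lora_alpha=lora["lora_alpha"],
        lora_dropout=lora["lora_dropout"],
        target_modules=lora["target_modules"],
        task_type="CAUSAL_LM",
    )
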
def main():
    print("šŸš€ Dataset Converter for Textilindo AI")
    print("=" * 50)

    # Input and output files
    input_file = "data/lora_dataset_20250829_113330.jsonl"
    output_file = "data/textilindo_training_data.jsonl"

    # Check if input file exists
    if not os.path.exists(input_file):
        print(f"āŒ Input file not found: {input_file}")
        return

    # Convert dataset
    converted_file = convert_dataset(input_file, output_file)

    # Create training config
    model_name = "meta-llama/Llama-3.2-1B-Instruct"  # Lightweight model for testing
    config_path = create_training_config(model_name, converted_file)

    print("\nšŸŽ‰ Dataset conversion complete!")
    print("\nšŸ“‹ Next steps:")
    print("1. Run fine-tuning: python scripts/finetune_lora.py")
    print("2. Test the model: python scripts/test_model.py")
    print("3. Deploy to Novita AI (manual process for now)")

    # Show a sample of the converted data (guard against an empty output file)
    print("\nšŸ“„ Sample converted data:")
    with open(output_file, 'r', encoding='utf-8') as f:
        first_line = f.readline()
    if first_line:
        sample = json.loads(first_line)
        print(f"Text length: {len(sample['text'])} characters")
        print(f"Instruction: {sample['instruction'][:100]}...")
        print(f"Output: {sample['output'][:100]}...")


if __name__ == "__main__":
    main()
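
# --- Illustrative sketch (not called anywhere in this script) -----------------
# Shows how the converted JSONL could be loaded for fine-tuning with the Hugging
# Face `datasets` library. This mirrors what scripts/finetune_lora.py is
# presumably expected to do with dataset_path from the config; that script is
# not shown here, so treat this as an assumption about its behavior.
def load_training_data_example(dataset_path="data/textilindo_training_data.jsonl"):
    """Load the converted JSONL as a Hugging Face Dataset (sketch)."""
    from datasets import load_dataset  # assumes `pip install datasets`

    # Each JSONL line becomes one row; the "text" column holds the full
    # prompt+response string used for causal-LM fine-tuning, while the
    # instruction/input/output/metadata columns are kept for inspection.
    return load_dataset("json", data_files=dataset_path, split="train")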