#!/usr/bin/env python3
"""
Convert dataset from instruction/input/output format to text format for fine-tuning
"""

import json
import os

import yaml  # PyYAML, used to write the training config

def convert_dataset(input_file, output_file):
    """Convert dataset from instruction format to text format"""
    
    print(f"πŸ”„ Converting dataset from {input_file} to {output_file}")
    
    # Read input dataset
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    converted_data = []
    
    for i, line in enumerate(lines, 1):
        # Skip blank lines (e.g. a trailing newline) instead of warning on them
        if not line.strip():
            continue
        try:
            data = json.loads(line)
            
            # Extract fields
            instruction = data.get('instruction', '')
            input_text = data.get('input', '')
            output = data.get('output', '')
            metadata = data.get('metadata', {})
            
            # Build the training text using the Alpaca-style prompt template
            if input_text.strip():
                # If there's input, use instruction + input format
                training_text = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
            else:
                # If no input, use simple instruction format
                training_text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
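
            # For the hypothetical record shown at the top of this file, the
            # with-input branch renders training_text as:
            #   ### Instruction:
            #   Summarize the fabric specs
            #
            #   ### Input:
            #   Cotton, 150 gsm
            #
            #   ### Response:
            #   Lightweight 150 gsm cotton.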
            
            # Add to converted data
            converted_data.append({
                "text": training_text,
                "instruction": instruction,
                "input": input_text,
                "output": output,
                "metadata": metadata
            })
            
        except json.JSONDecodeError as e:
            print(f"⚠️  Warning: Invalid JSON at line {i}: {e}")
            continue
    
    # Save converted dataset
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')
    
    print(f"βœ… Converted {len(converted_data)} samples")
    print(f"πŸ“ Saved to: {output_file}")
    
    return output_file

def create_training_config(model_name, dataset_path):
    """Create training configuration file"""
    
    config = {
        "model_name": model_name,
        "model_path": f"./models/{model_name.split('/')[-1]}",
        "dataset_path": dataset_path,
        "max_length": 2048,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 40,
        "repetition_penalty": 1.1,
        
        "lora_config": {
            "r": 16,
            "lora_alpha": 32,
            "lora_dropout": 0.1,
            "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
        },
        
        "training_config": {
            "learning_rate": 2e-4,
            "batch_size": 4,
            "gradient_accumulation_steps": 4,
            "num_epochs": 3,
            "warmup_steps": 100,
            "save_steps": 500,
            "eval_steps": 500
        }
    }
    
    config_path = "configs/training_config.yaml"
    os.makedirs("configs", exist_ok=True)
    
    with open(config_path, 'w', encoding='utf-8') as f:
        yaml.dump(config, f, default_flow_style=False, allow_unicode=True)
    
    print(f"βœ… Created training config: {config_path}")
    return config_path
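
# yaml.dump sorts keys alphabetically by default, so the generated file looks
# roughly like this (abridged):
#   dataset_path: data/textilindo_training_data.jsonl
#   lora_config:
#     lora_alpha: 32
#     lora_dropout: 0.1
#     r: 16
#     target_modules:
#     - q_proj
#     - v_proj
#   max_length: 2048
#   model_name: meta-llama/Llama-3.2-1B-Instruct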

def main():
    print("πŸš€ Dataset Converter for Textilindo AI")
    print("=" * 50)
    
    # Input and output files
    input_file = "data/lora_dataset_20250829_113330.jsonl"
    output_file = "data/textilindo_training_data.jsonl"
    
    # Check if input file exists
    if not os.path.exists(input_file):
        print(f"❌ Input file not found: {input_file}")
        return
    
    # Convert dataset
    converted_file = convert_dataset(input_file, output_file)
    
    # Create training config
    model_name = "meta-llama/llama-3.2-1b-instruct"  # Lightweight model for testing
    config_path = create_training_config(model_name, converted_file)
    
    print("\nπŸŽ‰ Dataset conversion complete!")
    print("\nπŸ“‹ Next steps:")
    print("1. Run fine-tuning: python scripts/finetune_lora.py")
    print("2. Test the model: python scripts/test_model.py")
    print("3. Deploy to Novita AI (manual process for now)")
    
    # Show sample of converted data
    print(f"\nπŸ“„ Sample converted data:")
    with open(output_file, 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline())
        print(f"Text length: {len(sample['text'])} characters")
        print(f"Instruction: {sample['instruction'][:100]}...")
        print(f"Output: {sample['output'][:100]}...")

if __name__ == "__main__":
    main()
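
# Run from the project root so the relative data/ and configs/ paths resolve;
# the script's own path below is hypothetical:
#   python scripts/convert_dataset.py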