#!/usr/bin/env python3
"""
Convert dataset from instruction/input/output format to text format for fine-tuning
"""

import json
import os

import yaml  # PyYAML, used to write the training config

def convert_dataset(input_file, output_file):
    """Convert dataset from instruction format to text format"""
    
    print(f"πŸ”„ Converting dataset from {input_file} to {output_file}")
    
    # Read input dataset
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    converted_data = []
    
    for i, line in enumerate(lines, 1):
        # Skip blank lines (e.g. a trailing newline) instead of warning on them
        if not line.strip():
            continue
        try:
            data = json.loads(line)
            
            # Extract fields
            instruction = data.get('instruction', '')
            input_text = data.get('input', '')
            output = data.get('output', '')
            metadata = data.get('metadata', {})
            
            # Build the training text using the Alpaca-style prompt template
            if input_text.strip():
                # If there's input, use instruction + input format
                training_text = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
            else:
                # If no input, use simple instruction format
                training_text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
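
            # For the hypothetical record shown at the top of this file, the
            # with-input branch renders training_text as:
            #   ### Instruction:
            #   Summarize the fabric specs
            #
            #   ### Input:
            #   Cotton, 150 gsm
            #
            #   ### Response:
            #   Lightweight 150 gsm cotton.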
            
            # Add to converted data
            converted_data.append({
                "text": training_text,
                "instruction": instruction,
                "input": input_text,
                "output": output,
                "metadata": metadata
            })
            
        except json.JSONDecodeError as e:
            print(f"⚠️  Warning: Invalid JSON at line {i}: {e}")
            continue
    
    # Save converted dataset
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')
    
    print(f"βœ… Converted {len(converted_data)} samples")
    print(f"πŸ“ Saved to: {output_file}")
    
    return output_file

def create_training_config(model_name, dataset_path):
    """Create training configuration file"""
    
    config = {
        "model_name": model_name,
        "model_path": f"./models/{model_name.split('/')[-1]}",
        "dataset_path": dataset_path,
        "max_length": 2048,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 40,
        "repetition_penalty": 1.1,
        
        "lora_config": {
            "r": 16,
            "lora_alpha": 32,
            "lora_dropout": 0.1,
            "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
        },
        
        "training_config": {
            "learning_rate": 2e-4,
            "batch_size": 4,
            "gradient_accumulation_steps": 4,
            "num_epochs": 3,
            "warmup_steps": 100,
            "save_steps": 500,
            "eval_steps": 500
        }
    }
    
    config_path = "configs/training_config.yaml"
    os.makedirs("configs", exist_ok=True)
    
    with open(config_path, 'w', encoding='utf-8') as f:
        yaml.dump(config, f, default_flow_style=False, allow_unicode=True)
    
    print(f"βœ… Created training config: {config_path}")
    return config_path
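
# yaml.dump sorts keys alphabetically by default, so the generated file looks
# roughly like this (abridged):
#   dataset_path: data/textilindo_training_data.jsonl
#   lora_config:
#     lora_alpha: 32
#     lora_dropout: 0.1
#     r: 16
#     target_modules:
#     - q_proj
#     - v_proj
#   max_length: 2048
#   model_name: meta-llama/Llama-3.2-1B-Instruct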

def main():
    print("πŸš€ Dataset Converter for Textilindo AI")
    print("=" * 50)
    
    # Input and output files
    input_file = "data/lora_dataset_20250829_113330.jsonl"
    output_file = "data/textilindo_training_data.jsonl"
    
    # Check if input file exists
    if not os.path.exists(input_file):
        print(f"❌ Input file not found: {input_file}")
        return
    
    # Convert dataset
    converted_file = convert_dataset(input_file, output_file)
    
    # Create training config
    model_name = "meta-llama/llama-3.2-1b-instruct"  # Lightweight model for testing
    config_path = create_training_config(model_name, converted_file)
    
    print("\nπŸŽ‰ Dataset conversion complete!")
    print("\nπŸ“‹ Next steps:")
    print("1. Run fine-tuning: python scripts/finetune_lora.py")
    print("2. Test the model: python scripts/test_model.py")
    print("3. Deploy to Novita AI (manual process for now)")
    
    # Show sample of converted data
    print(f"\nπŸ“„ Sample converted data:")
    with open(output_file, 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline())
        print(f"Text length: {len(sample['text'])} characters")
        print(f"Instruction: {sample['instruction'][:100]}...")
        print(f"Output: {sample['output'][:100]}...")

if __name__ == "__main__":
    main()
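
# Run from the project root so the relative data/ and configs/ paths resolve;
# the script's own path below is hypothetical:
#   python scripts/convert_dataset.py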