#!/usr/bin/env python3
"""
Convert dataset from instruction/input/output format to text format for fine-tuning
"""
import json
import os
from pathlib import Path
def convert_dataset(input_file, output_file):
    """Convert a JSONL dataset from instruction/input/output records to
    instruction-following ``text`` records.

    Each input line must be a JSON object with optional keys
    ``instruction``, ``input``, ``output`` and ``metadata``.  Every
    output record keeps those fields and adds a ``text`` field formatted
    as an "### Instruction / ### Input / ### Response" prompt.

    Args:
        input_file: Path to the source JSONL file.
        output_file: Path the converted JSONL file is written to.

    Returns:
        The ``output_file`` path, so the call can be chained.
    """
    print(f"🔄 Converting dataset from {input_file} to {output_file}")

    # Read input dataset
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    converted_data = []
    for i, line in enumerate(lines, 1):
        line = line.strip()
        if not line:
            # Blank lines are not data errors — skip them silently instead
            # of emitting a spurious "Invalid JSON" warning.
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"⚠️ Warning: Invalid JSON at line {i}: {e}")
            continue

        # Extract fields; all are optional in the source records.
        instruction = data.get('instruction', '')
        input_text = data.get('input', '')
        output = data.get('output', '')
        metadata = data.get('metadata', {})

        # Build the training prompt; the "### Input" section is included
        # only when the record actually carries input text.
        if input_text.strip():
            training_text = (
                f"### Instruction:\n{instruction}\n\n"
                f"### Input:\n{input_text}\n\n"
                f"### Response:\n{output}"
            )
        else:
            training_text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"

        converted_data.append({
            "text": training_text,
            "instruction": instruction,
            "input": input_text,
            "output": output,
            "metadata": metadata,
        })

    # Ensure the destination directory exists before writing.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Save converted dataset as JSONL (one object per line).
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f"✅ Converted {len(converted_data)} samples")
    print(f"📁 Saved to: {output_file}")
    return output_file
def create_training_config(model_name, dataset_path, config_path="configs/training_config.yaml"):
    """Write a YAML training configuration for LoRA fine-tuning.

    Args:
        model_name: Hugging Face model identifier (e.g. ``"org/model"``);
            its last path component names the local model directory.
        dataset_path: Path to the converted JSONL training dataset.
        config_path: Destination for the YAML file (directories are
            created as needed). Defaults to the project's standard
            ``configs/training_config.yaml``.

    Returns:
        The path of the written config file.
    """
    config = {
        "model_name": model_name,
        "model_path": f"./models/{model_name.split('/')[-1]}",
        "dataset_path": dataset_path,
        "max_length": 2048,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 40,
        "repetition_penalty": 1.1,
        "lora_config": {
            "r": 16,
            "lora_alpha": 32,
            "lora_dropout": 0.1,
            "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
        },
        "training_config": {
            "learning_rate": 2e-4,
            "batch_size": 4,
            "gradient_accumulation_steps": 4,
            "num_epochs": 3,
            "warmup_steps": 100,
            "save_steps": 500,
            "eval_steps": 500
        }
    }

    # Create the parent directory of wherever the config is going,
    # not just the hard-coded "configs" folder.
    os.makedirs(os.path.dirname(config_path) or ".", exist_ok=True)

    # Local import: PyYAML is only needed when actually writing the config.
    import yaml
    with open(config_path, 'w', encoding='utf-8') as f:
        yaml.dump(config, f, default_flow_style=False, allow_unicode=True)

    print(f"✅ Created training config: {config_path}")
    return config_path
def main():
    """Convert the raw LoRA dataset and emit a matching training config."""
    print("🚀 Dataset Converter for Textilindo AI")
    print("=" * 50)

    # Input and output files (project-relative, fixed for this pipeline).
    input_file = "data/lora_dataset_20250829_113330.jsonl"
    output_file = "data/textilindo_training_data.jsonl"

    if not os.path.exists(input_file):
        print(f"❌ Input file not found: {input_file}")
        return

    # Convert dataset
    converted_file = convert_dataset(input_file, output_file)

    # Create training config — lightweight model for testing.
    model_name = "meta-llama/llama-3.2-1b-instruct"
    create_training_config(model_name, converted_file)

    print("\n🎉 Dataset conversion complete!")
    print("\n📋 Next steps:")
    print("1. Run fine-tuning: python scripts/finetune_lora.py")
    print("2. Test the model: python scripts/test_model.py")
    print("3. Deploy to Novita AI (manual process for now)")

    # Show a sample of the converted data. Guard against an empty output
    # file (zero converted samples), where json.loads('') would raise.
    print("\n📝 Sample converted data:")
    with open(output_file, 'r', encoding='utf-8') as f:
        first_line = f.readline()
    if not first_line.strip():
        print("(no samples were converted)")
        return
    sample = json.loads(first_line)
    print(f"Text length: {len(sample['text'])} characters")
    print(f"Instruction: {sample['instruction'][:100]}...")
    print(f"Output: {sample['output'][:100]}...")


if __name__ == "__main__":
    main()