#!/usr/bin/env python3
"""
Convert dataset from instruction/input/output format to text format for fine-tuning
"""
import json
import os

import yaml
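
# For reference, the converter expects one JSON object per line (JSONL). A
# hypothetical input record looks like this (field values are placeholders,
# not actual dataset content):
#   {"instruction": "...", "input": "...", "output": "...", "metadata": {}}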


def convert_dataset(input_file, output_file):
    """Convert dataset from instruction format to text format"""
    print(f"🔄 Converting dataset from {input_file} to {output_file}")

    # Read input dataset
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    converted_data = []
    for i, line in enumerate(lines, 1):
        line = line.strip()
        if not line:
            # Skip blank lines instead of reporting them as invalid JSON
            continue
        try:
            data = json.loads(line)

            # Extract fields
            instruction = data.get('instruction', '')
            input_text = data.get('input', '')
            output = data.get('output', '')
            metadata = data.get('metadata', {})

            # Create training text in instruction-following format
            if input_text.strip():
                # If there's input, use instruction + input format
                training_text = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
            else:
                # If no input, use simple instruction format
                training_text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"

            # Add to converted data
            converted_data.append({
                "text": training_text,
                "instruction": instruction,
                "input": input_text,
                "output": output,
                "metadata": metadata
            })
        except json.JSONDecodeError as e:
            print(f"⚠️ Warning: Invalid JSON at line {i}: {e}")
            continue

    # Save converted dataset
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f"✅ Converted {len(converted_data)} samples")
    print(f"📁 Saved to: {output_file}")
    return output_file
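
# For reference, a record with a non-empty "input" renders to the following
# Alpaca-style prompt (angle brackets mark placeholders):
#
#   ### Instruction:
#   <instruction>
#
#   ### Input:
#   <input>
#
#   ### Response:
#   <output>
#
# Records without an "input" omit the "### Input:" block.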


def create_training_config(model_name, dataset_path):
    """Create training configuration file"""
    config = {
        "model_name": model_name,
        "model_path": f"./models/{model_name.split('/')[-1]}",
        "dataset_path": dataset_path,
        "max_length": 2048,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 40,
        "repetition_penalty": 1.1,
        "lora_config": {
            "r": 16,
            "lora_alpha": 32,
            "lora_dropout": 0.1,
            "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj",
                               "gate_proj", "up_proj", "down_proj"]
        },
        "training_config": {
            "learning_rate": 2e-4,
            "batch_size": 4,
            "gradient_accumulation_steps": 4,
            "num_epochs": 3,
            "warmup_steps": 100,
            "save_steps": 500,
            "eval_steps": 500
        }
    }

    config_path = "configs/training_config.yaml"
    os.makedirs("configs", exist_ok=True)
    with open(config_path, 'w', encoding='utf-8') as f:
        yaml.dump(config, f, default_flow_style=False, allow_unicode=True)

    print(f"✅ Created training config: {config_path}")
    return config_path
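
# Note: PyYAML's yaml.dump sorts mapping keys alphabetically by default
# (sort_keys=True), so configs/training_config.yaml begins roughly like:
#
#   dataset_path: data/textilindo_training_data.jsonl
#   lora_config:
#     lora_alpha: 32
#     lora_dropout: 0.1
#     r: 16
#     ...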


def main():
    print("🚀 Dataset Converter for Textilindo AI")
    print("=" * 50)

    # Input and output files
    input_file = "data/lora_dataset_20250829_113330.jsonl"
    output_file = "data/textilindo_training_data.jsonl"

    # Check if input file exists
    if not os.path.exists(input_file):
        print(f"❌ Input file not found: {input_file}")
        return

    # Convert dataset
    converted_file = convert_dataset(input_file, output_file)

    # Create training config
    model_name = "meta-llama/Llama-3.2-1B-Instruct"  # Lightweight model for testing
    config_path = create_training_config(model_name, converted_file)

    print("\n🎉 Dataset conversion complete!")
    print("\n📋 Next steps:")
    print("1. Run fine-tuning: python scripts/finetune_lora.py")
    print("2. Test the model: python scripts/test_model.py")
    print("3. Deploy to Novita AI (manual process for now)")

    # Show a sample of the converted data (guard against an empty output file)
    print("\n📄 Sample converted data:")
    with open(output_file, 'r', encoding='utf-8') as f:
        first_line = f.readline()
    if first_line:
        sample = json.loads(first_line)
        print(f"Text length: {len(sample['text'])} characters")
        print(f"Instruction: {sample['instruction'][:100]}...")
        print(f"Output: {sample['output'][:100]}...")


if __name__ == "__main__":
    main()
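
# Usage: run from the project root so the relative data/ and configs/ paths
# resolve (requires PyYAML: pip install pyyaml):
#   python convert_dataset.py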