# After saving processed_data.json
import json
import os

from utils import load_data, tokenize, build_vocab, save_vocab


def prepare_training_data(processed_data, vocab_path='vocab.json'):
    """Tokenize the processed entries, build a vocabulary, and save it."""
    tokenized_texts = []
    for entry in processed_data:
        # Entries may be a single string or a list of strings.
        if isinstance(entry, str):
            tokenized_texts.append(tokenize(entry))
        elif isinstance(entry, list):
            for item in entry:
                if isinstance(item, str):
                    tokenized_texts.append(tokenize(item))
    vocab = build_vocab(tokenized_texts)
    save_vocab(vocab, vocab_path)
    return tokenized_texts, vocab


if __name__ == "__main__":
    data = load_data()
    tokenized_texts, vocab = prepare_training_data(data)

    # Save tokenized data, creating the output directory if it does not exist yet.
    os.makedirs('data/processed', exist_ok=True)
    with open('data/processed/tokenized_data.json', 'w', encoding='utf-8') as f:
        json.dump(tokenized_texts, f, ensure_ascii=False, indent=4)
    print("Data processing complete. Tokenized data saved to data/processed/tokenized_data.json")
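

# Illustration only: a minimal sketch of how a downstream training step might
# load the artifacts written above. It assumes save_vocab writes vocab_path as
# a {token: id} JSON mapping and that an '<unk>' entry may be present; both
# are assumptions about utils, not guarantees about its actual format.
def load_training_ids(tokenized_path='data/processed/tokenized_data.json',
                      vocab_path='vocab.json', unk_token='<unk>'):
    with open(vocab_path, encoding='utf-8') as f:
        vocab = json.load(f)
    with open(tokenized_path, encoding='utf-8') as f:
        tokenized_texts = json.load(f)
    # Map each token to its vocabulary id, falling back to the unknown id.
    unk_id = vocab.get(unk_token, 0)
    return [[vocab.get(token, unk_id) for token in tokens] for tokens in tokenized_texts]
# Example usage (not executed here): id_sequences = load_training_ids()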