import os
from typing import Iterator

from hindi_bpe import HindiBPE
from tqdm import tqdm

# Training targets checked after every chunk in main().
_MAX_VOCAB_SIZE = 5000
_MIN_COMPRESSION_RATIO = 3.2
# Number of leading characters of each chunk used to measure compression.
_SAMPLE_CHARS = 10000


def load_processed_data_in_chunks(
    file_path: str,
    max_sentences: int = 1_000_000,
    chunk_size: int = 10_000,
) -> Iterator[str]:
    """Yield the sentences of a text file as space-joined chunks.

    Each line of the file is stripped; lines that are empty after stripping
    are skipped and do not count toward ``max_sentences``.

    Args:
        file_path: Path of the UTF-8 text file to read, one sentence per line.
        max_sentences: Maximum number of non-empty lines to read in total.
        chunk_size: Number of sentences joined into each yielded chunk.
            Values below 1 behave like 1 (every sentence is its own chunk).

    Yields:
        Strings made of up to ``chunk_size`` sentences joined by single
        spaces. The final chunk may hold fewer sentences.

    Raises:
        OSError: If the file cannot be opened (raised on first iteration).
    """
    buffer = []
    sentence_count = 0

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Reading sentences"):
            if sentence_count >= max_sentences:
                break

            line = line.strip()
            if not line:
                continue

            buffer.append(line)
            sentence_count += 1

            if len(buffer) >= chunk_size:
                yield ' '.join(buffer)
                buffer = []

    if buffer:  # Flush the last, partially filled chunk.
        yield ' '.join(buffer)


def main():
    """Train a HindiBPE model chunk by chunk and print a round-trip check.

    Training stops early once a chunk's sample reaches the minimum
    compression ratio while the vocabulary stays below the maximum size.
    Afterwards a few fixed Hindi sentences are encoded and decoded, and the
    result of comparing each decoded string with its original is printed.
    """
    # Initialize paths (relative to the current working directory).
    data_dir = os.path.join("..", "data")
    processed_file = os.path.join(data_dir, "hi_processed.txt")

    # Check if processed data exists
    if not os.path.exists(processed_file):
        print("Processed data not found. Please run download_data.py first.")
        return

    # Initialize BPE
    print("Initializing BPE tokenizer...")
    print("Training Parameters:")
    print("1. Using first 1 million sentences")
    print(f"2. Vocabulary size must be < {_MAX_VOCAB_SIZE} tokens")
    print(f"3. Compression ratio must be ≥ {_MIN_COMPRESSION_RATIO}")
    bpe = HindiBPE()

    print("\nTraining BPE model...")
    is_first_chunk = True
    # Stays None when the input produced no usable chunk, so the final
    # report can tell "never trained" apart from a measured ratio.
    compression_ratio = None

    for chunk in load_processed_data_in_chunks(processed_file):
        if not chunk.strip():
            continue

        bpe.train_on_chunk(chunk, is_first_chunk=is_first_chunk)
        is_first_chunk = False

        # Check if we've met both requirements on a sample of this chunk.
        test_text = chunk[:_SAMPLE_CHARS]
        compression_ratio = bpe.get_compression_ratio(test_text)
        vocab_size = len(bpe.vocab)

        print("\nCurrent status:")
        print(f"Vocabulary size: {vocab_size} tokens")
        print(f"Compression ratio: {compression_ratio:.2f}")

        if compression_ratio >= _MIN_COMPRESSION_RATIO:
            if vocab_size < _MAX_VOCAB_SIZE:
                print("\nSuccess! Met all requirements:")
                print(f"1. Vocabulary size: {vocab_size} tokens (< {_MAX_VOCAB_SIZE})")
                print(f"2. Compression ratio: {compression_ratio:.2f} (≥ {_MIN_COMPRESSION_RATIO})")
                break
            else:
                print("\nWarning: Need to reduce vocabulary size while maintaining compression ratio")

    if compression_ratio is None:
        # Previously this case crashed with UnboundLocalError below.
        print("\nNo training data found in the processed file; nothing to report.")
        return

    print("\nFinal Results:")
    print(f"Vocabulary size: {len(bpe.vocab)} tokens")
    print(f"Compression ratio: {compression_ratio:.2f}")

    # Test the model with various Hindi texts
    test_cases = [
        "नमस्ते भारत",
        "मैं हिंदी सीख रहा हूं",
        "यह एक परीक्षण वाक्य है",
        "भारत एक विशाल देश है",
        "मुझे हिंदी भाषा बहुत पसंद है",
    ]

    print("\nTesting encoding/decoding on multiple examples:")
    for i, test_text in enumerate(test_cases, 1):
        print(f"\nTest case {i}:")
        print(f"Original: {test_text}")
        encoded = bpe.encode(test_text)
        print(f"Encoded: {encoded}")
        decoded = bpe.decode(encoded)
        print(f"Decoded: {decoded}")
        print(f"Matches: {'✓' if decoded == test_text else '✗'}")


if __name__ == "__main__":
    main()