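"""Train the Hindi BPE tokenizer on the preprocessed corpus.

Streams up to 1 million sentences from data/hi_processed.txt in 10,000-sentence
chunks, trains HindiBPE incrementally, and stops as soon as the two targets are
met: a vocabulary below 5,000 tokens and a compression ratio of at least 3.2.
A few sample Hindi sentences are then round-tripped through encode/decode as a
sanity check.
"""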
import os
from typing import Iterator

from hindi_bpe import HindiBPE
from tqdm import tqdm

def load_processed_data_in_chunks(file_path: str, max_sentences: int = 1_000_000) -> Iterator[str]:
    """Yield the corpus as space-joined chunks of up to 10,000 sentences, stopping after max_sentences."""
    buffer = []
    sentence_count = 0
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Reading sentences"):
            if sentence_count >= max_sentences:
                break
                
            line = line.strip()
            if not line:
                continue
                
            buffer.append(line)
            sentence_count += 1
            
            if len(buffer) >= 10000:  # Process in chunks of 10K sentences
                yield ' '.join(buffer)
                buffer = []
    
    if buffer:  # Don't forget the last chunk
        yield ' '.join(buffer)
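
# Hypothetical standalone use of the chunk loader (not part of the training flow below):
#     for chunk in load_processed_data_in_chunks("../data/hi_processed.txt", max_sentences=50_000):
#         print(len(chunk))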

def main():
    # Initialize paths
    data_dir = os.path.join("..", "data")
    processed_file = os.path.join(data_dir, "hi_processed.txt")
    
    # Check if processed data exists
    if not os.path.exists(processed_file):
        print("Processed data not found. Please run download_data.py first.")
        return
    
    # Initialize BPE
    print("Initializing BPE tokenizer...")
    print("Training Parameters:")
    print("1. Using first 1 million sentences")
    print("2. Vocabulary size must be < 5000 tokens")
    print("3. Compression ratio must be ≥ 3.2")
    bpe = HindiBPE()
    
    print("\nTraining BPE model...")
    is_first_chunk = True
    compression_ratio = 0.0  # keeps the final report valid even if the corpus yields no usable chunks
    
    for chunk in load_processed_data_in_chunks(processed_file):
        if not chunk.strip():
            continue
            
        bpe.train_on_chunk(chunk, is_first_chunk=is_first_chunk)
        is_first_chunk = False
        
        # Check if we've met both requirements
        test_text = chunk[:10000]  # Use a sample of text
        compression_ratio = bpe.get_compression_ratio(test_text)
        vocab_size = len(bpe.vocab)
        
        print(f"\nCurrent status:")
        print(f"Vocabulary size: {vocab_size} tokens")
        print(f"Compression ratio: {compression_ratio:.2f}")
        
        if compression_ratio >= 3.2:
            if vocab_size < 5000:
                print("\nSuccess! Met all requirements:")
                print(f"1. Vocabulary size: {vocab_size} tokens (< 5000)")
                print(f"2. Compression ratio: {compression_ratio:.2f} (≥ 3.2)")
                break
            else:
                print("\nWarning: Need to reduce vocabulary size while maintaining compression ratio")
    
    print("\nFinal Results:")
    print(f"Vocabulary size: {len(bpe.vocab)} tokens")
    print(f"Compression ratio: {compression_ratio:.2f}")
    
    # Test the model with various Hindi texts
    test_cases = [
        "नमस्ते भारत",  # "Hello, India"
        "मैं हिंदी सीख रहा हूं",  # "I am learning Hindi"
        "यह एक परीक्षण वाक्य है",  # "This is a test sentence"
        "भारत एक विशाल देश है",  # "India is a vast country"
        "मुझे हिंदी भाषा बहुत पसंद है"  # "I like the Hindi language very much"
    ]
    
    print("\nTesting encoding/decoding on multiple examples:")
    for i, test_text in enumerate(test_cases, 1):
        print(f"\nTest case {i}:")
        print(f"Original: {test_text}")
        encoded = bpe.encode(test_text)
        print(f"Encoded: {encoded}")
        decoded = bpe.decode(encoded)
        print(f"Decoded: {decoded}")
        print(f"Matches: {'✓' if decoded == test_text else '✗'}")

if __name__ == "__main__":
    main()