# Hindi-BPE/src/train_bpe.py
import os
from typing import Iterator

from hindi_bpe import HindiBPE
from tqdm import tqdm
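
# HindiBPE comes from this repo's hindi_bpe module. From its use below it is
# assumed to expose train_on_chunk(text, is_first_chunk=...),
# get_compression_ratio(text), encode(text), decode(ids), and a `vocab`
# container whose length is the vocabulary size.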
def load_processed_data_in_chunks(file_path: str, max_sentences: int = 1_000_000) -> Iterator[str]:
    """Yield the corpus as space-joined chunks of up to 10,000 sentences,
    reading at most max_sentences non-empty lines in total."""
    buffer = []
    sentence_count = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Reading sentences"):
            if sentence_count >= max_sentences:
                break
            line = line.strip()
            if not line:
                continue
            buffer.append(line)
            sentence_count += 1
            if len(buffer) >= 10000:  # Process in chunks of 10K sentences
                yield ' '.join(buffer)
                buffer = []
    if buffer:  # Don't forget the last chunk
        yield ' '.join(buffer)
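
# Usage sketch: the generator streams the corpus, so the full file never sits
# in memory at once. For example (file name and count are illustrative):
#
#     for chunk in load_processed_data_in_chunks("hi_processed.txt", 100):
#         print(len(chunk))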
def main():
    # Initialize paths
    data_dir = os.path.join("..", "data")
    processed_file = os.path.join(data_dir, "hi_processed.txt")

    # Check that the processed data exists
    if not os.path.exists(processed_file):
        print("Processed data not found. Please run download_data.py first.")
        return

    # Initialize BPE
    print("Initializing BPE tokenizer...")
    print("Training Parameters:")
    print("1. Using first 1 million sentences")
    print("2. Vocabulary size must be < 5000 tokens")
    print("3. Compression ratio must be ≥ 3.2")
    bpe = HindiBPE()

    print("\nTraining BPE model...")
    is_first_chunk = True
    compression_ratio = 0.0  # Defined up front so the final report works even if no chunk is yielded
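
    # The loop below trains incrementally: each 10K-sentence chunk extends the
    # model, and training stops early once both targets (vocab < 5000 tokens,
    # compression ratio ≥ 3.2) hold on a sample of the latest chunk.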
    for chunk in load_processed_data_in_chunks(processed_file):
        if not chunk.strip():
            continue

        bpe.train_on_chunk(chunk, is_first_chunk=is_first_chunk)
        is_first_chunk = False

        # Check whether both requirements have been met
        test_text = chunk[:10000]  # Use a sample of the text
        compression_ratio = bpe.get_compression_ratio(test_text)
        vocab_size = len(bpe.vocab)

        print("\nCurrent status:")
        print(f"Vocabulary size: {vocab_size} tokens")
        print(f"Compression ratio: {compression_ratio:.2f}")

        if compression_ratio >= 3.2:
            if vocab_size < 5000:
                print("\nSuccess! Met all requirements:")
                print(f"1. Vocabulary size: {vocab_size} tokens (< 5000)")
                print(f"2. Compression ratio: {compression_ratio:.2f} (≥ 3.2)")
                break
            else:
                print("\nWarning: Need to reduce vocabulary size while maintaining the compression ratio")
print("\nFinal Results:")
print(f"Vocabulary size: {len(bpe.vocab)} tokens")
print(f"Compression ratio: {compression_ratio:.2f}")
# Test the model with various Hindi texts
test_cases = [
"नमस्ते भारत",
"मैं हिंदी सीख रहा हूं",
"यह एक परीक्षण वाक्य है",
"भारत एक विशाल देश है",
"मुझे हिंदी भाषा बहुत पसंद है"
]
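
    # Each case below is a round trip: encode() followed by decode() must
    # reproduce the original string exactly for a lossless tokenizer; the
    # ✓/✗ marker reports whether the round trip succeeded.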
print("\nTesting encoding/decoding on multiple examples:")
for i, test_text in enumerate(test_cases, 1):
print(f"\nTest case {i}:")
print(f"Original: {test_text}")
encoded = bpe.encode(test_text)
print(f"Encoded: {encoded}")
decoded = bpe.decode(encoded)
print(f"Decoded: {decoded}")
print(f"Matches: {'✓' if decoded == test_text else '✗'}")

if __name__ == "__main__":
    main()
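
# Usage sketch: run from the src directory so the relative ../data path used
# above resolves correctly:
#
#     python train_bpe.py
#
# This reads ../data/hi_processed.txt, which download_data.py is expected to
# have produced beforehand.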