Spaces:
Running
Running
import os
from typing import Iterator

from hindi_bpe import HindiBPE
from tqdm import tqdm
def load_processed_data_in_chunks(
    file_path: str,
    max_sentences: int = 1_000_000,
    chunk_size: int = 10_000,
) -> Iterator[str]:
    """Stream a text file as space-joined chunks of non-empty lines.

    Each line is stripped of surrounding whitespace. Blank lines are skipped
    and do not count toward ``max_sentences``.

    Args:
        file_path: Path to a UTF-8 text file, read one line at a time.
        max_sentences: Maximum number of non-empty lines to read in total.
        chunk_size: Number of lines joined into each yielded chunk.

    Yields:
        Up to ``chunk_size`` stripped lines joined by a single space. The
        final chunk may contain fewer lines.

    Raises:
        ValueError: If ``chunk_size`` is less than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    buffer = []
    sentence_count = 0
    with open(file_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Reading sentences"):
            if sentence_count >= max_sentences:
                break
            line = line.strip()
            if not line:
                continue
            buffer.append(line)
            sentence_count += 1
            if len(buffer) >= chunk_size:
                yield " ".join(buffer)
                buffer = []
    # Flush whatever is left over after the file ends or the limit is hit.
    if buffer:
        yield " ".join(buffer)
def main():
    """Train a HindiBPE tokenizer on the processed corpus and report results.

    Training proceeds chunk by chunk. After each chunk the compression ratio
    is measured on a prefix of that chunk, and training stops early once the
    ratio reaches the target while the vocabulary is below the size limit.
    Finally, a few fixed Hindi sentences are encoded and decoded and the
    round-trip result is printed.
    """
    max_vocab_size = 5000
    min_compression_ratio = 3.2

    # NOTE: this path is relative to the current working directory.
    data_dir = os.path.join("..", "data")
    processed_file = os.path.join(data_dir, "hi_processed.txt")

    if not os.path.exists(processed_file):
        print("Processed data not found. Please run download_data.py first.")
        return

    print("Initializing BPE tokenizer...")
    print("Training Parameters:")
    print("1. Using first 1 million sentences")
    print(f"2. Vocabulary size must be < {max_vocab_size} tokens")
    print(f"3. Compression ratio must be ≥ {min_compression_ratio}")
    bpe = HindiBPE()

    print("\nTraining BPE model...")
    is_first_chunk = True
    # Stays None when the loader yields no chunks (e.g. an empty file), so
    # the final report below can detect that nothing was trained.
    compression_ratio = None
    for chunk in load_processed_data_in_chunks(processed_file):
        if not chunk.strip():
            continue
        bpe.train_on_chunk(chunk, is_first_chunk=is_first_chunk)
        is_first_chunk = False

        # Measure progress on the first 10000 characters of the chunk that
        # was just used for training.
        test_text = chunk[:10000]
        compression_ratio = bpe.get_compression_ratio(test_text)
        vocab_size = len(bpe.vocab)
        print("\nCurrent status:")
        print(f"Vocabulary size: {vocab_size} tokens")
        print(f"Compression ratio: {compression_ratio:.2f}")

        if compression_ratio >= min_compression_ratio:
            if vocab_size < max_vocab_size:
                print("\nSuccess! Met all requirements:")
                print(f"1. Vocabulary size: {vocab_size} tokens (< {max_vocab_size})")
                print(
                    f"2. Compression ratio: {compression_ratio:.2f} "
                    f"(≥ {min_compression_ratio})"
                )
                break
            else:
                print("\nWarning: Need to reduce vocabulary size while maintaining compression ratio")

    if compression_ratio is None:
        # Previously this path crashed with UnboundLocalError when printing
        # the final compression ratio.
        print(f"\nNo sentences found in {processed_file}; nothing was trained.")
        return

    print("\nFinal Results:")
    print(f"Vocabulary size: {len(bpe.vocab)} tokens")
    print(f"Compression ratio: {compression_ratio:.2f}")

    # Round-trip a few Hindi sentences through the trained tokenizer.
    test_cases = [
        "नमस्ते भारत",
        "मैं हिंदी सीख रहा हूं",
        "यह एक परीक्षण वाक्य है",
        "भारत एक विशाल देश है",
        "मुझे हिंदी भाषा बहुत पसंद है",
    ]
    print("\nTesting encoding/decoding on multiple examples:")
    for i, sample in enumerate(test_cases, 1):
        print(f"\nTest case {i}:")
        print(f"Original: {sample}")
        encoded = bpe.encode(sample)
        print(f"Encoded: {encoded}")
        decoded = bpe.decode(encoded)
        print(f"Decoded: {decoded}")
        print(f"Matches: {'✓' if decoded == sample else '✗'}")
# Run the training script only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()