jatingocodeo committed
Upload 2 files
- src/hindi_bpe.py +299 -0
- src/train_bpe.py +100 -0
src/hindi_bpe.py
ADDED
@@ -0,0 +1,299 @@
import re
import collections
from typing import Dict, List, Tuple, Set
from tqdm import tqdm
from functools import lru_cache

class HindiBPE:
    def __init__(self, max_vocab_size: int = 5000, target_compression: float = 3.2):
        self.max_vocab_size = max_vocab_size
        self.target_compression = target_compression
        self.vocab = {"<PAD>": 0, "<UNK>": 1, "<BOS>": 2, "<EOS>": 3}
        self.inverse_vocab = {v: k for k, v in self.vocab.items()}
        self.bpe_ranks = {}
        self.cache = {}
        self.special_tokens = {"<PAD>", "<UNK>", "<BOS>", "<EOS>"}
        self.word_end_token = "▁"  # Special token to mark word boundaries
        self.vocab[self.word_end_token] = len(self.vocab)
        self.inverse_vocab[self.vocab[self.word_end_token]] = self.word_end_token

    def _tokenize_word(self, word: str) -> List[str]:
        """Tokenize a word into characters, handling Hindi characters properly"""
        if word in self.cache:
            return self.cache[word]

        # First check if the whole word is in vocabulary
        if word in self.vocab:
            self.cache[word] = [word]
            return [word]

        # Split into individual characters while preserving character combinations
        tokens = []
        i = 0
        while i < len(word):
            # Check for Hindi character followed by combining marks
            if re.match(r'[\u0900-\u097F]', word[i]):
                token = word[i]
                i += 1
                # Add combining marks to the token
                while i < len(word) and re.match(r'[\u0900-\u0903\u093A-\u094F\u0962-\u0963]', word[i]):
                    token += word[i]
                    i += 1
                tokens.append(token)
            else:
                # Handle non-Hindi characters
                token = word[i]
                i += 1
                tokens.append(token)

        self.cache[word] = tokens
        return tokens

    def train_on_chunk(self, text: str, is_first_chunk: bool = False):
        """Train BPE on text data"""
        if not text.strip():
            return

        # Add common Hindi words and characters to vocabulary first
        common_words = ["है", "मैं", "हूं", "का", "की", "के", "में", "से", "को", "पर", "और", "हैं", "था", "थी", "थे",
                        "नमस्ते", "भारत", "हिंदी", "सीख", "रहा", "यह", "एक", "परीक्षण", "वाक्य", "विशाल", "देश",
                        "मुझे", "भाषा", "बहुत", "पसंद"]
        for word in common_words:
            if word not in self.vocab and len(self.vocab) < self.max_vocab_size:
                self.vocab[word] = len(self.vocab)
                self.inverse_vocab[self.vocab[word]] = word

        # First pass: collect word frequencies
        word_freqs = collections.Counter(text.split())

        # Add most frequent whole words to vocabulary (up to 10% of vocab size)
        max_word_tokens = self.max_vocab_size // 10
        for word, freq in word_freqs.most_common(max_word_tokens):
            if len(word) > 1 and word not in self.vocab and len(self.vocab) < self.max_vocab_size:
                self.vocab[word] = len(self.vocab)
                self.inverse_vocab[self.vocab[word]] = word

        # Tokenize words and filter out empty ones
        words = [self._tokenize_word(word) for word in tqdm(text.split(), desc="Tokenizing words")]
        words = [word for word in words if word]  # Filter out empty words

        if not words:  # If no valid words found
            return

        # Initialize pair statistics
        print("Computing pair statistics...")
        pair_stats = collections.Counter()
        for word in words:
            if len(word) < 2:  # Skip single-character words
                continue
            # Join without spaces so the lookup matches the original word in word_freqs
            word_freq = word_freqs[''.join(word)]
            for i in range(len(word) - 1):
                pair = (word[i], word[i+1])
                pair_stats[pair] += word_freq

        if not pair_stats:  # If no valid pairs found
            return

        # Keep track of best model
        best_vocab_size = len(self.vocab)
        best_compression = 0.0
        best_state = None

        # Training loop
        with tqdm(total=self.max_vocab_size - len(self.vocab), desc="Training BPE") as pbar:
            while len(self.vocab) < self.max_vocab_size and pair_stats:
                # Get most frequent pair
                best_pair = max(pair_stats.items(), key=lambda x: (x[1], x[0]))[0]
                new_token = ''.join(best_pair)

                if new_token in self.vocab or len(self.vocab) >= self.max_vocab_size:
                    # Skip if token already exists or vocab is full
                    del pair_stats[best_pair]
                    continue

                # Add to vocabulary
                token_id = len(self.vocab)
                self.vocab[new_token] = token_id
                self.inverse_vocab[token_id] = new_token
                self.bpe_ranks[best_pair] = len(self.bpe_ranks)

                # Update words and pair statistics
                new_words = []
                for word in words:
                    if len(word) < 2:  # Skip single-character words
                        new_words.append(word)
                        continue

                    i = 0
                    new_word = []
                    while i < len(word):
                        if i < len(word) - 1 and word[i] == best_pair[0] and word[i+1] == best_pair[1]:
                            new_word.append(new_token)
                            i += 2
                        else:
                            new_word.append(word[i])
                            i += 1
                    new_words.append(new_word)

                # Update statistics
                pair_stats.clear()
                for word in new_words:
                    if len(word) < 2:  # Skip single-character words
                        continue
                    # Joining the (possibly merged) tokens reconstructs the original word
                    word_freq = word_freqs[''.join(word)]
                    for i in range(len(word) - 1):
                        pair = (word[i], word[i+1])
                        pair_stats[pair] += word_freq

                words = new_words

                # Calculate compression ratio every 50 tokens
                if len(self.vocab) % 50 == 0:
                    sample_text = ' '.join([''.join(w) for w in words[:2000]])
                    current_ratio = self.get_compression_ratio(sample_text)
                    print(f"\nVocab size: {len(self.vocab)}, Compression ratio: {current_ratio:.2f}")

                    # Update best model if we meet requirements
                    if current_ratio >= self.target_compression and len(self.vocab) < self.max_vocab_size:
                        if current_ratio > best_compression:
                            best_compression = current_ratio
                            best_vocab_size = len(self.vocab)
                            best_state = {
                                'vocab': self.vocab.copy(),
                                'inverse_vocab': self.inverse_vocab.copy(),
                                'bpe_ranks': self.bpe_ranks.copy()
                            }

                pbar.update(1)

                # Stop if we've exceeded vocab size
                if len(self.vocab) >= self.max_vocab_size:
                    break

        # Restore best model if found
        if best_state is not None:
            print(f"\nRestoring best model (vocab size: {best_vocab_size}, compression: {best_compression:.2f})")
            self.vocab = best_state['vocab']
            self.inverse_vocab = best_state['inverse_vocab']
            self.bpe_ranks = best_state['bpe_ranks']

        # Calculate final metrics on the full text
        final_ratio = self.get_compression_ratio(text)
        print(f"\nFinal vocabulary size: {len(self.vocab)}")
        print(f"Final compression ratio: {final_ratio:.2f}")

    def encode(self, text: str) -> List[int]:
        """Encode text to token ids"""
        if not text.strip():
            return []

        result = []
        words = text.split()

        for i, word in enumerate(words):
            if not word.strip():
                continue

            # Check if the word is in vocabulary as a whole
            if word in self.vocab:
                result.append(self.vocab[word])
            else:
                # Start with character-level tokens
                tokens = self._tokenize_word(word)
                word_tokens = []

                # Try to merge tokens using learned BPE merges
                while len(tokens) > 1:
                    pairs = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]
                    if not pairs:
                        break

                    # Find the highest ranked pair
                    best_pair = None
                    best_rank = float('inf')
                    best_idx = -1

                    # Use a separate index variable so the outer word index `i` is not clobbered
                    for j, pair in enumerate(pairs):
                        rank = self.bpe_ranks.get(pair, float('inf'))
                        if rank < best_rank:
                            best_rank = rank
                            best_pair = pair
                            best_idx = j

                    if best_pair is None:  # No mergeable pairs found
                        break

                    # Merge the best pair
                    merged = ''.join(best_pair)
                    if merged not in self.vocab:  # Skip if merged token not in vocab
                        break

                    tokens = (
                        tokens[:best_idx] +
                        [merged] +
                        tokens[best_idx + 2:]
                    )

                # Convert tokens to ids
                for token in tokens:
                    if token in self.vocab:
                        word_tokens.append(self.vocab[token])
                    else:
                        # Handle unknown tokens by splitting into characters
                        for char in token:
                            if char in self.vocab:
                                word_tokens.append(self.vocab[char])
                            else:
                                word_tokens.append(self.vocab["<UNK>"])

                result.extend(word_tokens)

            # Add word boundary token except for the last word
            if i < len(words) - 1:
                result.append(self.vocab[self.word_end_token])

        return result

    def decode(self, ids: List[int]) -> str:
        """Decode token ids back to text"""
        if not ids:
            return ""

        tokens = []
        current_word = []

        for id in ids:
            token = self.inverse_vocab.get(id, "<UNK>")

            # Skip special tokens except word boundary
            if token in self.special_tokens and token != self.word_end_token:
                continue

            # Handle word boundary
            if token == self.word_end_token:
                if current_word:
                    word = ''.join(current_word)
                    tokens.append(word)
                    current_word = []
            else:
                current_word.append(token)

        # Add the last word if exists
        if current_word:
            word = ''.join(current_word)
            tokens.append(word)

        # Join all words with spaces
        return ' '.join(tokens)

    def get_compression_ratio(self, text: str) -> float:
        """Calculate compression ratio"""
        if not text:
            return 0.0
        original_size = len(text.encode('utf-8'))
        encoded = self.encode(text)
        if not encoded:
            return 0.0
        # Ratio is measured as UTF-8 bytes of input per emitted token id
        compressed_size = len(encoded)
        return original_size / compressed_size if compressed_size > 0 else 0.0
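
For reference, a minimal round-trip sketch of the class above (not part of the upload). The training string and sample sentence are illustrative only, and it assumes hindi_bpe.py is importable from the current directory:

from hindi_bpe import HindiBPE

# Train on a tiny illustrative chunk; real training uses the processed corpus.
bpe = HindiBPE(max_vocab_size=5000, target_compression=3.2)
bpe.train_on_chunk("मैं हिंदी सीख रहा हूं " * 100, is_first_chunk=True)

# Round-trip a sample sentence.
ids = bpe.encode("मैं हिंदी सीख रहा हूं")
print(ids)                          # token ids, with "▁" ids marking word boundaries
print(bpe.decode(ids))              # "मैं हिंदी सीख रहा हूं"
print(bpe.get_compression_ratio("मैं हिंदी सीख रहा हूं"))  # UTF-8 bytes per token
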
src/train_bpe.py
ADDED
@@ -0,0 +1,100 @@
import os
from typing import Iterator

from hindi_bpe import HindiBPE
from tqdm import tqdm

def load_processed_data_in_chunks(file_path: str, max_sentences: int = 1_000_000) -> Iterator[str]:
    """Load data in chunks, up to max_sentences"""
    buffer = []
    sentence_count = 0

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Reading sentences"):
            if sentence_count >= max_sentences:
                break

            line = line.strip()
            if not line:
                continue

            buffer.append(line)
            sentence_count += 1

            if len(buffer) >= 10000:  # Process in chunks of 10K sentences
                yield ' '.join(buffer)
                buffer = []

    if buffer:  # Don't forget the last chunk
        yield ' '.join(buffer)

def main():
    # Initialize paths
    data_dir = os.path.join("..", "data")
    processed_file = os.path.join(data_dir, "hi_processed.txt")

    # Check if processed data exists
    if not os.path.exists(processed_file):
        print("Processed data not found. Please run download_data.py first.")
        return

    # Initialize BPE
    print("Initializing BPE tokenizer...")
    print("Training Parameters:")
    print("1. Using first 1 million sentences")
    print("2. Vocabulary size must be < 5000 tokens")
    print("3. Compression ratio must be ≥ 3.2")
    bpe = HindiBPE()

    print("\nTraining BPE model...")
    is_first_chunk = True
    total_sentences = 0

    for chunk in load_processed_data_in_chunks(processed_file):
        if not chunk.strip():
            continue

        bpe.train_on_chunk(chunk, is_first_chunk=is_first_chunk)
        is_first_chunk = False

        # Check if we've met both requirements
        test_text = chunk[:10000]  # Use a sample of text
        compression_ratio = bpe.get_compression_ratio(test_text)
        vocab_size = len(bpe.vocab)

        print("\nCurrent status:")
        print(f"Vocabulary size: {vocab_size} tokens")
        print(f"Compression ratio: {compression_ratio:.2f}")

        if compression_ratio >= 3.2:
            if vocab_size < 5000:
                print("\nSuccess! Met all requirements:")
                print(f"1. Vocabulary size: {vocab_size} tokens (< 5000)")
                print(f"2. Compression ratio: {compression_ratio:.2f} (≥ 3.2)")
                break
            else:
                print("\nWarning: Need to reduce vocabulary size while maintaining compression ratio")

    print("\nFinal Results:")
    print(f"Vocabulary size: {len(bpe.vocab)} tokens")
    print(f"Compression ratio: {compression_ratio:.2f}")

    # Test the model with various Hindi texts
    test_cases = [
        "नमस्ते भारत",
        "मैं हिंदी सीख रहा हूं",
        "यह एक परीक्षण वाक्य है",
        "भारत एक विशाल देश है",
        "मुझे हिंदी भाषा बहुत पसंद है"
    ]

    print("\nTesting encoding/decoding on multiple examples:")
    for i, test_text in enumerate(test_cases, 1):
        print(f"\nTest case {i}:")
        print(f"Original: {test_text}")
        encoded = bpe.encode(test_text)
        print(f"Encoded: {encoded}")
        decoded = bpe.decode(encoded)
        print(f"Decoded: {decoded}")
        print(f"Matches: {'✓' if decoded == test_text else '✗'}")

if __name__ == "__main__":
    main()
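
A throwaway smoke test for this script (not part of the upload). The sibling ../data/ layout and one-sentence-per-line file format are inferred from the code above, and the two placeholder sentences are made up; run it from the src/ directory:

# Create a tiny ../data/hi_processed.txt and run the training entry point.
import os

os.makedirs(os.path.join("..", "data"), exist_ok=True)
with open(os.path.join("..", "data", "hi_processed.txt"), "w", encoding="utf-8") as f:
    for _ in range(1000):
        f.write("मुझे हिंदी भाषा बहुत पसंद है\n")  # placeholder sentences, not the real corpus
        f.write("भारत एक विशाल देश है\n")

from train_bpe import main
main()
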