whisper-small-belarusian / src /belarusian_text_normalizer.py
ales's picture
upd src
c4adc54
import re
import regex
import unicodedata
from typing import Iterable
class BelarusianTextNormalizer:
    """
    Basic text normalizer for Belarusian transcripts.

    Based on transformers.models.whisper.english_normalizer.BasicTextNormalizer
    but with support not to remove certain characters,
    e.g. the apostrophe (') - a symbol from the Belarusian alphabet - which
    BasicTextNormalizer would strip.

    Calling an instance lowercases the text, drops bracketed / parenthesized
    fragments, replaces marks, symbols and punctuation (except the allowed
    symbols) with spaces and collapses runs of whitespace into a single space.
    """

    def __init__(self, split_letters: bool = False):
        """
        Args:
            split_letters: when True, the normalized text is additionally
                split into grapheme clusters separated by single spaces.
        """
        self.split_letters = split_letters
        # Characters that must survive cleaning even though Unicode
        # classifies them as punctuation.
        self.allowed_symbols = ("'",)

    @staticmethod
    def clean(s: str, allowed_symbols: Iterable[str] = None):
        """
        Replace markers, symbols and punctuation with a space, keeping diacritics.

        The text is NFKC-normalized first, so a base letter followed by a
        combining mark is composed into a single letter whenever a precomposed
        form exists; such letters are kept. Every remaining character whose
        Unicode general category starts with M (mark), S (symbol) or
        P (punctuation) is replaced by a single space, unless it is listed in
        ``allowed_symbols``.

        Args:
            s: text to clean.
            allowed_symbols: characters to keep even if they are marks,
                symbols or punctuation. ``None`` means "keep none of them".
                Any iterable of single-character strings is accepted,
                including one-shot iterators and generators.

        Returns:
            The cleaned, NFKC-normalized string. Whitespace is not collapsed.
        """
        # Materialize once: a one-shot iterable would otherwise be consumed by
        # the first membership test, silently dropping the allowed symbols for
        # the rest of the string. A frozenset also gives O(1) lookups.
        keep = frozenset(allowed_symbols) if allowed_symbols is not None else frozenset()
        return "".join(
            " " if unicodedata.category(c)[0] in "MSP" and c not in keep else c
            for c in unicodedata.normalize("NFKC", s)
        )

    def __call__(self, s: str):
        """
        Normalize ``s`` and return the normalized string.

        Note that leading / trailing whitespace is collapsed to a single space
        but not stripped.
        """
        s = s.lower()
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        # Lowercase again: NFKC normalization inside clean() may map
        # compatibility characters to uppercase letters.
        s = self.clean(s, allowed_symbols=self.allowed_symbols).lower()

        if self.split_letters:
            # \X matches one extended grapheme cluster, so a letter stays
            # together with any combining marks attached to it.
            s = " ".join(regex.findall(r"\X", s, regex.U))

        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space
        return s