whisper-small-belarusian / src /belarusian_text_normalizer.py

upd src

c4adc54 almost 2 years ago

1.44 kB

	import re
	import regex
	import unicodedata

	from typing import Iterable


	class BelarusianTextNormalizer:
	    """
	    Basic text normalizer that keeps selected punctuation characters.

	    Based on transformers.models.whisper.english_normalizer.BasicTextNormalizer
	    but with support not to remove certain characters,
	    e.g. apostrophe (') - a symbol from Belarusian alphabet - was removed
	    using BasicTextNormalizer.

	    NOTE(review): only the ASCII apostrophe is whitelisted by default; a
	    typographic apostrophe such as U+2019 is punctuation and is therefore
	    replaced with a space - confirm that this matches the transcripts in use.
	    """

	    # Patterns are compiled once at class creation instead of on every call.
	    _BRACKETED_RE = re.compile(r"[<\[][^>\]]*[>\]]")  # text inside <...> or [...]
	    _PARENTHESIZED_RE = re.compile(r"\(([^)]+?)\)")  # text inside (...)
	    _WHITESPACE_RE = re.compile(r"\s+")

	    def __init__(self, split_letters: bool = False):
	        """
	        Args:
	            split_letters: if True, ``__call__`` separates every grapheme
	                cluster of the normalized text with a space.
	        """
	        self.split_letters = split_letters
	        # Symbols that ``__call__`` asks ``clean`` to leave untouched.
	        self.allowed_symbols = ("'",)

	    @staticmethod
	    def clean(s: str, allowed_symbols: Iterable[str] = None) -> str:
	        """
	        Replace markers, symbols and punctuation with a space, keeping diacritics.

	        The text is NFKC-normalized first, so base letter + combining mark
	        sequences that have a precomposed form become a single letter and
	        survive; every remaining character whose Unicode category starts with
	        M, S or P becomes one space unless it is listed in ``allowed_symbols``.

	        Args:
	            s: text to clean.
	            allowed_symbols: single characters that must be kept as is.
	                ``None`` means nothing is exempt.

	        Returns:
	            The NFKC-normalized text with each disallowed character replaced
	            by a single space (runs of spaces are not collapsed here).
	        """
	        # Materialize once: the argument may be a one-shot iterator, which
	        # repeated ``in`` tests would exhaust after the first lookup.
	        keep = frozenset(allowed_symbols) if allowed_symbols is not None else frozenset()
	        return "".join(
	            c if c in keep or unicodedata.category(c)[0] not in "MSP" else " "
	            for c in unicodedata.normalize("NFKC", s)
	        )

	    def __call__(self, s: str) -> str:
	        """
	        Normalize ``s``.

	        Steps: lower-case, drop text enclosed in ``<>``, ``[]`` or ``()``,
	        blank out symbols/punctuation (except ``self.allowed_symbols``),
	        optionally split into grapheme clusters, and collapse whitespace runs
	        into single spaces. Leading/trailing spaces are not stripped.

	        Args:
	            s: text to normalize.

	        Returns:
	            The normalized text.
	        """
	        s = s.lower()
	        s = self._BRACKETED_RE.sub("", s)  # remove words between brackets
	        s = self._PARENTHESIZED_RE.sub("", s)  # remove words between parenthesis
	        # Lower-case again: the NFKC normalization inside clean() can yield
	        # upper-case letters from compatibility characters.
	        s = self.clean(s, allowed_symbols=self.allowed_symbols).lower()

	        if self.split_letters:
	            # \X matches one extended grapheme cluster (needs the `regex` module).
	            s = " ".join(regex.findall(r"\X", s, regex.U))

	        # replace any successive whitespace characters with a space
	        return self._WHITESPACE_RE.sub(" ", s)