dodo12

Runtime error

App Files Files Community

dodo12 / whisper /normalizers /basic.py

next-social

Duplicate from maxmax20160403/sovits5.0

27d3bc5 over 1 year ago

raw

history blame contribute delete

1.88 kB

	import re
	import unicodedata

	import regex

	# Non-ASCII letters that "NFKD" normalization does not split into a base
	# letter plus combining marks, so each one is mapped by hand to an ASCII
	# replacement string. remove_symbols_and_diacritics() looks up every
	# NFKD-normalized character in this table.
	# NOTE(review): "Þ" maps to lower-case "th" while every other capital letter
	# here maps to an upper-case replacement -- confirm whether that is intended.
	ADDITIONAL_DIACRITICS = {
	"œ": "oe",
	"Œ": "OE",
	"ø": "o",
	"Ø": "O",
	"æ": "ae",
	"Æ": "AE",
	"ß": "ss",
	"ẞ": "SS",
	"đ": "d",
	"Đ": "D",
	"ð": "d",
	"Ð": "D",
	"þ": "th",
	"Þ": "th",
	"ł": "l",
	"Ł": "L",
	}


	def remove_symbols_and_diacritics(s: str, keep=""):
	    """
	    Strip diacritics and blank out marks, symbols and punctuation.

	    The input is first decomposed with "NFKD" normalization, then every
	    resulting character is handled by the first rule that applies:

	    1. characters contained in ``keep`` are passed through unchanged;
	    2. characters listed in ``ADDITIONAL_DIACRITICS`` are replaced by their
	       manual mapping;
	    3. non-spacing marks (category "Mn") are dropped;
	    4. any other mark, symbol or punctuation (major category "M", "S" or
	       "P") becomes a single space;
	    5. everything else is kept as is.

	    Note that ``keep`` is matched against the decomposed characters, not
	    against the characters of the original string.
	    """
	    pieces = []
	    for ch in unicodedata.normalize("NFKD", s):
	        if ch in keep:
	            pieces.append(ch)
	            continue
	        if ch in ADDITIONAL_DIACRITICS:
	            pieces.append(ADDITIONAL_DIACRITICS[ch])
	            continue

	        category = unicodedata.category(ch)
	        if category == "Mn":
	            # combining diacritic left over from the decomposition: drop it
	            continue
	        pieces.append(" " if category[0] in "MSP" else ch)
	    return "".join(pieces)


	def remove_symbols(s: str):
	    """
	    Blank out marks, symbols and punctuation while keeping diacritics.

	    The input is composed with "NFKC" normalization; every character whose
	    Unicode major category is "M", "S" or "P" is then replaced by a single
	    space, and all other characters are returned unchanged.
	    """
	    composed = unicodedata.normalize("NFKC", s)
	    pieces = []
	    for ch in composed:
	        major_category = unicodedata.category(ch)[0]
	        pieces.append(" " if major_category in "MSP" else ch)
	    return "".join(pieces)


	class BasicTextNormalizer:
	    """
	    Callable text normalizer: lower-cases, removes bracketed spans and
	    symbols, and collapses whitespace.

	    ``remove_diacritics`` selects the cleaning function: when true,
	    ``remove_symbols_and_diacritics`` is used, otherwise ``remove_symbols``.
	    ``split_letters`` additionally separates every ``\\X`` match of the
	    ``regex`` module with a space.
	    """

	    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
	        if remove_diacritics:
	            self.clean = remove_symbols_and_diacritics
	        else:
	            self.clean = remove_symbols
	        self.split_letters = split_letters

	    def __call__(self, s: str):
	        """Return the normalized form of ``s``."""
	        lowered = s.lower()
	        # drop any span opened by "<" or "[" and closed by ">" or "]"
	        without_brackets = re.sub(r"[<\[][^>\]]*[>\]]", "", lowered)
	        # drop non-empty spans enclosed in parentheses
	        without_parens = re.sub(r"\(([^)]+?)\)", "", without_brackets)
	        # lower-case again because the cleaning step re-normalizes the text
	        text = self.clean(without_parens).lower()

	        if self.split_letters:
	            units = regex.findall(r"\X", text, regex.U)
	            text = " ".join(units)

	        # squeeze every run of whitespace down to one space
	        return re.sub(r"\s+", " ", text)