Spaces:

fireredteam
/

FireRedTTS

Running on Zero

App Files Files Community

FireRedTTS / fireredtts /modules /text_normalizer /normalize.py

hhguo

update

37ced70 about 2 months ago

raw

history blame

6.68 kB

	import re
	import regex
	import inflect
	import unicodedata
	from lingua import Language, LanguageDetectorBuilder
	from builtins import str as unicode

	from tn.chinese.normalizer import Normalizer as ZhNormalizer
	from tn.english.normalizer import Normalizer as EnNormalizer

	from fireredtts.modules.text_normalizer.regex_common import *
	from fireredtts.modules.text_normalizer.utils import *


	def preprocess_text(sentence):
	    """Clean raw input text before language-specific normalization.

	    Steps, in order:

	    1. Drop bytes that do not round-trip through UTF-8, Unicode format
	       characters (category Cf, except U+200D), private-use characters
	       (Co) and U+FFFD; turn NBSP into a plain space and Unicode
	       line/paragraph separators into newlines.
	    2. Strip combining marks (accents) via NFD decomposition.
	    3. Remove kaomoji and convert full-width characters to half-width,
	       keeping "。，：" for conversion after number normalization.
	    4. Turn newlines/tabs/carriage returns into "，", collapse dot runs and
	       ellipsis variants into "…", and collapse repeated spaces.
	    5. Map characters through ``symbol_reduction``, decide whether each space
	       becomes "，" or stays a space, and drop characters rejected by
	       ``is_valid_char``.

	    Args:
	        sentence: The raw input string.

	    Returns:
	        The cleaned string.
	    """
	    # Round-trip through UTF-8, ignoring undecodable bytes.
	    sentence = bytes(sentence, "utf-8").decode("utf-8", "ignore")
	    # Raw strings keep "\p" from being read as a (invalid) Python string
	    # escape; the regex engine itself interprets \p{..} and \u200d.
	    # V1 enables the "--" set-difference syntax: every Cf character except
	    # U+200D (zero width joiner) is removed.
	    sentence = regex.sub(r"[\p{Cf}--[\u200d]]", "", sentence, flags=regex.V1)
	    sentence = regex.sub(r"\p{Co}", "", sentence)
	    sentence = sentence.replace("\u00a0", " ")
	    sentence = sentence.replace("\ufffd", "")
	    sentence = regex.sub(r"\p{Zl}", "\n", sentence)
	    sentence = regex.sub(r"\p{Zp}", "\n", sentence)

	    # Strip accents: decompose, then drop the combining marks (category Mn).
	    sentence = "".join(
	        ch
	        for ch in unicodedata.normalize("NFD", sentence)
	        if unicodedata.category(ch) != "Mn"
	    )

	    sentence = strip_kaomoji(sentence)
	    # full to half with exemption (to be converted after number TN): 。，：
	    sentence = f2b(sentence, exemption="。，：")

	    # clean spaces
	    sentence = sentence.replace("\n", "，")
	    sentence = sentence.replace("\t", "，")
	    sentence = sentence.replace("\r", "，")
	    sentence = re.sub(r"[。.]{3,}", "…", sentence)
	    sentence = re.sub(r"[…⋯]{1,}", "…", sentence)
	    sentence = re.sub(r"[ ]+", " ", sentence)
	    sentence = sentence.strip()

	    # punctuation reduction
	    last_idx = len(sentence) - 1
	    kept = []
	    for idx, char in enumerate(sentence):
	        if char in symbol_reduction:
	            char = symbol_reduction[char]

	        if char == " ":
	            # The sentence was stripped above, so a space at either edge can
	            # only come from symbol_reduction (presumably it can map a
	            # symbol to " " -- verify against its definition). Such a space
	            # lacks a neighbour on one side, so skip it instead of indexing
	            # out of range.
	            if idx == 0 or idx == last_idx:
	                continue
	            prev_char = sentence[idx - 1]
	            next_char = sentence[idx + 1]
	            # A space leading into Chinese text, preceded by Chinese text or
	            # by one of the characters in '") ', becomes a comma.
	            if is_chinese(next_char) and (
	                is_chinese(prev_char) or prev_char in '") '
	            ):
	                kept.append("，")
	            else:
	                kept.append(" ")
	            continue

	        if is_valid_char(char):
	            kept.append(char)

	    return re.sub(r"[ ]+", " ", "".join(kept))


	def rettt(sentence):
	    """Expand common English abbreviations and symbols in ``sentence``.

	    Month and weekday abbreviations, street/title abbreviations and a few
	    chat shorthands are replaced by their spoken form. Dotted letter
	    abbreviations (a.m., p.m., b.c., a.d., Ph.D) become underscore-joined
	    upper-case tokens such as ``A_M``. ``@`` becomes `` at ``.

	    Replacements are plain substring/regex substitutions applied in a fixed
	    order, so they also fire inside longer words.

	    Args:
	        sentence: The text to process.

	    Returns:
	        The text with abbreviations expanded.
	    """
	    # Order matters: the longer spelling has to be tried before its prefix
	    # ("Sept." before "Sep.").
	    leading_literals = (
	        ("&nd", "and"),
	        ("Jan.", "january"),
	        ("Feb.", "february"),
	        ("Mar.", "march"),
	        ("Apr.", "april"),
	        ("May.", "may"),
	        ("Jun.", "june"),
	        ("Jul.", "july"),
	        ("Aug.", "august"),
	        ("Sept.", "september"),
	        ("Sep.", "september"),
	        ("Oct.", "october"),
	        ("Nov.", "november"),
	        ("Dec.", "december"),
	        ("Mon.", "monday"),
	        ("Tues.", "tuesday"),
	        ("Wed.", "wednesday"),
	        ("Thur.", "thursday"),
	        ("Fri.", "friday"),
	        ("Sat.", "saturday"),
	    )
	    for abbreviation, expansion in leading_literals:
	        sentence = sentence.replace(abbreviation, expansion)

	    # An input that is exactly "Sun." is left untouched.
	    if sentence != "Sun.":
	        sentence = sentence.replace("Sun.", "sunday")

	    # "St." followed by a capitalised word reads as "saint"; any remaining
	    # " St." reads as "street", so the saint rule must run first.
	    pattern_expansions = (
	        (r" St\. ([A-Z])", r" saint \1"),
	        (r" St\.", " street"),
	        (r" Rd\.", " road"),
	        (r"[Aa]\.[Mm]\.", "A_M"),
	        (r"[Pp]\.[Mm]\.", "P_M"),
	        (r"[Bb]\.[Cc]\.", "B_C"),
	        (r"[Aa]\.[Dd]\.", "A_D"),
	    )
	    for pattern, expansion in pattern_expansions:
	        sentence = re.sub(pattern, expansion, sentence)

	    # "btw." is handled before "btw" so its trailing dot is consumed too.
	    trailing_literals = (
	        ("Mr.", "mister"),
	        ("Ms.", "miss"),
	        ("Mrs.", "misses"),
	        ("Ph.D", "P_H_D"),
	        ("i.e.", "that is"),
	        ("e.g.", "for example"),
	        ("btw.", "by the way"),
	        ("btw", "by the way"),
	        ("b.t.w.", "by the way"),
	        ("@", " at "),
	    )
	    for abbreviation, expansion in trailing_literals:
	        sentence = sentence.replace(abbreviation, expansion)
	    return sentence


	class TextNormalizer:
	    """Normalize English/Chinese text into the plain form used downstream.

	    Holds a language detector restricted to English and Chinese, one text
	    normalizer per language, and an ``inflect`` engine used as a fallback for
	    spelling out digits left in English text.
	    """

	    def __init__(self):
	        self.language_detector = LanguageDetectorBuilder.from_languages(
	            Language.ENGLISH, Language.CHINESE
	        ).build()
	        self.zh_normalizer = ZhNormalizer()
	        self.en_normalizer = EnNormalizer()
	        self.inflect_parser = inflect.engine()
	        # Maps a detected language to the token returned by tn().
	        self.lang2token = {Language.ENGLISH: "en", Language.CHINESE: "zh"}

	    @staticmethod
	    def _spell_capitals(text):
	        """Spell out runs of 2-4 capital letters, e.g. "USA" -> "U S A"."""
	        # Split on capital runs OR single spaces. The capture group keeps the
	        # separators in the result so the capital runs can be rewritten.
	        pieces = re.split(r"([A-Z]{2,4}|[ ])", text)
	        spelled = []
	        for piece in pieces:
	            # Space separators are re-created by the join below; empty pieces
	            # (produced around adjacent separators) would only add doubled
	            # spaces.
	            if not piece or piece == " ":
	                continue
	            if re.match(r"[A-Z]{2,4}", piece):
	                piece = " ".join(piece)
	            spelled.append(piece)
	        return " ".join(spelled)

	    @staticmethod
	    def _finalize(text):
	        """Lower-case, reduce punctuation to "," / "." and end with a period."""
	        # post TN full to half
	        text = text.replace("。", ".")
	        text = text.replace("，", ",")
	        text = text.replace("：", ":")

	        # model limitations
	        text = text.lower().strip()
	        text = text.replace('"', "")
	        text = text.replace("·", " ")
	        text = re.sub("[…~、！，？：；!?:;]+", ",", text)
	        text = re.sub("[,]+", ",", text)
	        # Any trailing mix of commas, periods and spaces becomes one period.
	        text = re.sub(r"[,. ]+$", ".", text)
	        if text and not text.endswith("."):
	            text += "."
	        return text

	    def tn(self, text):
	        """Normalize ``text`` and report its language token.

	        Args:
	            text: The raw input string.

	        Returns:
	            A ``(normalized_text, text_lang)`` tuple, where ``text_lang`` is
	            looked up in ``self.lang2token`` and defaults to "zh".
	        """
	        text = preprocess_text(text)
	        text = rettt(text)  # regex replacements
	        # for non chinese languages
	        language = self.language_detector.detect_language_of(text)
	        # enforce chinese if text contains any chinese character
	        if contains_chinese(text):
	            language = Language.CHINESE
	        # NOTE: the token is fixed here, before the override below, so text
	        # made of upper-case letters and digits is normalized by the Chinese
	        # normalizer while keeping the token chosen from detection. Likewise,
	        # a detection result missing from lang2token yields "zh" but still
	        # takes the English branch.
	        text_lang = self.lang2token.get(language, "zh")

	        if is_upper_eng_and_digit(text):
	            language = Language.CHINESE

	        if language == Language.CHINESE:
	            text = self.zh_normalizer.normalize(text)
	            text = text.replace("\n", "")
	            text = re.sub(r"[，,]+$", "。", text)
	        else:
	            # Keep only ASCII letters, digits, spaces and a small set of
	            # punctuation before English normalization.
	            text = re.sub(r"[^ 0-9A-Za-z\[\]'.,:?!_\-]", "", text)
	            text = self.en_normalizer.normalize(text)
	            # fallback number normalization
	            pieces = re.split(r"(\d+)", text)
	            text = "".join(
	                self.inflect_parser.number_to_words(p) if p.isnumeric() else p
	                for p in pieces
	                if p
	            )

	        # cleanup: underscores (e.g. from tokens such as "A_M") become spaces
	        text = text.replace("_", " ")
	        text = re.sub(r"[ ]+", " ", text)

	        # spell capital words
	        text = self._spell_capitals(text)

	        return self._finalize(text), text_lang