import textwrap
import re

from src.utils import flatten_list, have_emoji, have_langid


def setup_nltk():
    import nltk  # used to split text into sentences
    nltk.download("punkt")


# If the installation instructions were followed, punkt should already be
# downloaded; keep this call commented out so air-gapped setups keep working.
# setup_nltk()


sentence_keys = ['sentence_list', 'index']


def init_sentence_state():
    # Fresh state for incremental sentence extraction from a streamed response
    return dict(sentence_list=[], index=0)


def unpack_state(sentence_state):
    return tuple(sentence_state[key] for key in sentence_keys)


def pack_state(sentence_state, *args):
    # Mutate in place so the caller's dict reference stays valid
    # (ok to lose the reference to the old list objects)
    for keyi, key in enumerate(sentence_keys):
        sentence_state[key] = args[keyi]
    return sentence_state
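
# Round-trip sketch (illustrative only): unpack_state and pack_state mirror each
# other over sentence_keys, updating the dict in place so callers can keep the
# same reference across calls:
#   state = init_sentence_state()            # {'sentence_list': [], 'index': 0}
#   sentence_list, index = unpack_state(state)
#   state = pack_state(state, sentence_list, index)  # same dict object returned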


def split_sentences(sentence, n=250):
    """
    Split text on spaces into smaller chunks, each at most n characters,
    while preserving whitespace characters such as new lines.
    n=250 because XTTS warns: [!] Warning: The text length exceeds the character
    limit of 250 for language 'en', this might cause truncated audio.
    """
    # Split on whitespace while keeping the whitespace tokens in the list
    words = re.split(r'(\s+)', sentence)
    sentences = []
    current_sentence = []
    current_length = 0
    for word in words:
        # Skip empty strings, which occur with consecutive whitespace
        if word == '':
            continue
        if word.isspace():
            if word == '\n':
                # A newline ends the current chunk and starts a new one
                sentences.append("".join(current_sentence))
                current_sentence = []
                current_length = 0
            else:
                # Other whitespace stays inside the current chunk
                current_sentence.append(word)
                current_length += len(word)
        else:
            # Check whether adding the next word would exceed the limit
            if current_length + len(word) > n:
                if current_sentence:
                    sentences.append("".join(current_sentence))
                    current_sentence = [word]
                    current_length = len(word)
                else:
                    # The word itself is longer than n and there is no current chunk
                    sentences.append(word)
                    current_length = 0
            else:
                current_sentence.append(word)
                current_length += len(word)
    # Add the last chunk if anything remains
    if current_sentence:
        sentences.append("".join(current_sentence))
    return sentences
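
# Hand-traced examples (illustrative, not a test suite): non-newline whitespace
# is kept inside chunks, and a newline always forces a break:
#   split_sentences("a b c", n=3)   -> ['a b ', 'c']
#   split_sentences("one\ntwo")     -> ['one', 'two']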


def _get_sentences(response, verbose=False, min_start=15, max_length=250):
    # No mutation of characters allowed here, only breaking apart or merging
    import nltk
    # Refuse to tokenize the first min_start characters into a sentence, so
    # language detection has enough text to work with and the logic stays simple
    sentences = nltk.sent_tokenize(response[min_start:])
    # Split any overly long sentences
    sentences = flatten_list([split_sentences(x, max_length) for x in sentences])
    # Drop empty sentences
    sentences = [x for x in sentences if x.strip()]
    # Restore the first min_start characters if set
    if sentences and min_start > 0:
        sentences[0] = response[:min_start] + sentences[0]
    elif min_start > 0:
        sentences.append(response[:min_start])
    return sentences
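
# Illustrative: with min_start=0 this is plain nltk sentence tokenization, while
# the default min_start=15 glues the first 15 characters back onto the first
# detected sentence:
#   _get_sentences("Hello there. How are you?", min_start=0)
#   -> ['Hello there.', 'How are you?']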


def get_sentence(response, sentence_state, is_final=False, verbose=False):
    # Get state items
    sentence_list, index = unpack_state(sentence_state)
    sentences = _get_sentences(response[index:], min_start=15 if index == 0 else 0, verbose=verbose)
    if len(sentences) >= 2:
        # Detected a newly completed sentence; advance the index past it
        index_delta = response[index:].index(sentences[0])
        index += index_delta + len(sentences[0])
        sentence_list.append(sentences[0])
        # Only clean the returned copy, to avoid mis-handling the sentence index
        cleaned_sentence = clean_sentence(sentences[0], verbose=verbose)
        return cleaned_sentence, pack_state(sentence_state, sentence_list, index), False
    elif is_final:
        # Generation is finished, so just return whatever remains as the last sentence
        cleaned_sentence = clean_sentence(' '.join(sentences), verbose=verbose)
        sentence_list.append(' '.join(sentences))
        return cleaned_sentence, pack_state(sentence_state, sentence_list, index), True
    else:
        # No complete sentence yet; the caller should accumulate more of the response
        return None, pack_state(sentence_state, sentence_list, index), True
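
# Return protocol (summary of the branches above): callers receive a triple
# (cleaned_sentence, sentence_state, done):
#   - cleaned_sentence is a speakable sentence, or None if none is complete yet
#   - done=False means another complete sentence may already be buffered, so call
#     get_sentence again immediately with the same response
#   - done=True means wait for more of the response (or, with is_final=True,
#     that this was the final sentence)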


def clean_sentence(sentence, verbose=False):
    if sentence is None or len(sentence) == 0:
        if verbose:
            print("empty sentence")
        return ''
    # Remove code blocks, code spans, and parentheticals
    sentence = re.sub(r"```.*?```", "", sentence, flags=re.DOTALL)
    sentence = re.sub(r"`.*?`", "", sentence, flags=re.DOTALL)
    sentence = re.sub(r"\(.*?\)", "", sentence, flags=re.DOTALL)
    # Remove leftover marks
    sentence = sentence.replace("```", "")
    sentence = sentence.replace("...", " ")
    sentence = sentence.replace("(", " ")
    sentence = sentence.replace(")", " ")
    # Expand abbreviations and names the TTS would mispronounce
    sentence = sentence.replace("Dr. ", "Doctor ")
    sentence = sentence.replace(" w/ ", " with ")
    for h2o_name in ('H2O.ai', 'H2O.AI', 'h2o.ai'):
        sentence = sentence.replace(h2o_name, "aych two oh ae eye.")
    # Filter out emojis
    if have_emoji:
        import emoji
        sentence = ''.join([x for x in sentence if not emoji.is_emoji(x)])
    # Fix floating-point expressions, e.g. "1.5" is spoken as "1 dot 5"
    sentence = re.sub(r'(\d+)\.(\d+)', r"\1 dot \2", sentence)
    # Fix last bad characters
    sentence = re.sub(r"([^\x00-\x7F]|\w)(\.|\。|\?|\!)", r"\1\2", sentence)
    sentence = sentence.strip()
    # Drop leading punctuation left behind by the removals above
    if sentence.startswith(('. ', '? ', '! ', ', ')):
        sentence = sentence[2:]
    if sentence.startswith(('.', '?', '!', ',')):
        sentence = sentence[1:]
    # Spell out bare numbered-list items so they are spoken naturally
    number_words = {'1.': 'One', '2.': 'Two', '3.': 'Three', '4.': 'Four', '5.': 'Five',
                    '6.': 'Six', '7.': 'Seven', '8.': 'Eight', '9.': 'Nine', '10.': 'Ten'}
    sentence = number_words.get(sentence, sentence)
    if len(sentence) == 0:
        if verbose:
            print("EMPTY SENTENCE after processing")
        return ''
    if verbose:
        print("Sentence for speech: %s" % sentence)
    return sentence
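
# Rough hand-traced example (extra internal spaces can remain after removals):
#   clean_sentence("Dr. Smith (cardiology) said `x=1` costs 3.5 dollars.")
#   -> "Doctor Smith  said  costs 3 dot 5 dollars."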


def detect_language(prompt, supported_languages, verbose=False):
    if not have_langid:
        # Without the langid package, just assume English
        return "en"
    import langid
    # Fast language autodetection
    if len(prompt) > 15:
        language_predicted = langid.classify(prompt)[0].strip()  # strip needed, as there is a space at the end!
        if language_predicted == "zh":
            # XTTS uses zh-cn for Chinese
            language_predicted = "zh-cn"
        if language_predicted not in supported_languages:
            print(f"Detected a language not supported by XTTS: {language_predicted}, switching to English for now")
            language = "en"
        else:
            language = language_predicted
        if verbose:
            print(f"Language: predicted sentence language: {language_predicted}, using language for XTTS: {language}")
    else:
        # Language is hard to detect reliably in a short prompt, so default to English
        language = "en"
        if verbose:
            print("Language: prompt is short or language autodetect disabled, using English for XTTS")
    return language
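

# A minimal, hand-written usage sketch (not part of the module's API): it
# simulates an LLM token stream and assumes the nltk punkt data is already
# installed (see setup_nltk above) and that this file is run from the repo
# root so that `from src.utils import ...` resolves.
if __name__ == "__main__":
    state = init_sentence_state()
    response = ""
    for chunk in ["Hello there. ", "This is a second sentence. ", "And a third one."]:
        response += chunk
        while True:
            sentence, state, done = get_sentence(response, state)
            if sentence:
                print("speak:", sentence)
            if done:
                # No further complete sentence buffered; wait for more tokens
                break
    # Flush whatever remains once generation has finished
    sentence, state, done = get_sentence(response, state, is_final=True)
    if sentence:
        print("speak:", sentence)
    # supported_languages here is an illustrative subset of what XTTS accepts
    print("language:", detect_language(response, ["en", "fr", "zh-cn"], verbose=True))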