|
|
|
|
|
|
|
|
|
|
|
def V(c):
    """
    Tell whether ``c`` is a single Russian vowel letter.

    :param c: string to classify
    :return: True only when ``c`` is exactly one character taken from the
        vowel letters listed below (upper or lower case); False otherwise,
        including for the empty string and for longer strings
    """
    # A bare ``c in text`` is a substring test: it is true for u"" and for
    # runs such as u"АЕ".  Requiring one character makes this a real
    # per-letter classifier.
    return len(c) == 1 and c in u"АЕЁИОУЫЭЮЯаеёиоуыэюя"
|
|
|
|
|
def C(c):
    """
    Tell whether ``c`` is a single Russian consonant letter.

    :param c: string to classify
    :return: True only when ``c`` is exactly one character taken from the
        consonant letters listed below (upper or lower case); False
        otherwise, including for the empty string and for longer strings
    """
    # A bare ``c in text`` is a substring test: it is true for u"" and for
    # runs such as u"БВ".  Requiring one character makes this a real
    # per-letter classifier.
    return len(c) == 1 and c in u"БВГДЖЗКЛМНПРСТФХЦЧШЩбвгджзклмнпрстфхцчшщ"
|
|
|
|
|
def S(c):
    """
    Tell whether ``c`` is the single letter ``Й`` / ``й``.

    :param c: string to classify
    :return: True only when ``c`` is exactly one character and that
        character is ``Й`` or ``й``; False otherwise, including for the
        empty string and for longer strings
    """
    # A bare ``c in text`` is a substring test: it is true for u"" and for
    # u"Йй".  Requiring one character makes this a real per-letter check.
    return len(c) == 1 and c in u"Йй"
|
|
|
|
|
def M(c):
    """
    Tell whether ``c`` is a single hard-sign or soft-sign letter.

    :param c: string to classify
    :return: True only when ``c`` is exactly one character out of ``Ъ``,
        ``Ь``, ``ъ``, ``ь``; False otherwise, including for the empty
        string and for longer strings
    """
    # A bare ``c in text`` is a substring test: it is true for u"" and for
    # runs such as u"ЪЬ".  Requiring one character makes this a real
    # per-letter check.
    return len(c) == 1 and c in u"ЪЬъь"
|
|
|
|
|
def BEG(c):
    """
    Check for the opening-bracket marker.

    :param c: item to compare
    :return: True when ``c`` equals ``[``, False otherwise
    """
    opening_marker = u"["
    return c == opening_marker
|
|
|
|
|
def END(c):
    """
    Check for the closing-bracket marker.

    :param c: item to compare
    :return: True when ``c`` equals ``]``, False otherwise
    """
    closing_marker = u"]"
    return c == closing_marker
|
|
|
|
|
def split(s):
    """
    Group the characters of ``s`` into pieces by repeatedly applying
    ``apply1``.

    The string is wrapped as ``[`` + s + ``]`` and exploded into a list of
    single characters.  At each position the tail of the list is handed to
    ``apply1``: a ``None`` answer moves the position forward by one item;
    any other answer replaces the consumed items with the returned pieces
    and moves the position forward by the returned step.

    :param s: unicode string to split
    :return: the resulting item list without its first and last items
    """
    chunks = list(u"[" + s + u"]")
    pos = 0
    while pos < len(chunks):
        outcome = apply1(chunks[pos:])
        if outcome is None:
            pos += 1
            continue
        pieces, consumed, step = outcome
        # Swap the consumed items for the regrouped pieces in place.
        chunks[pos:pos + consumed] = pieces
        pos += step
    return chunks[1:-1]
|
|
|
|
|
def apply1(s):
    """
    Try to match one grouping rule against the start of the context ``s``.

    The rules are a nested decision tree: the item at index ``k`` is
    classified with ``V``/``C``/``S``/``M``/``BEG``/``END`` only after the
    items before it have been classified, and the first rule reached that
    matches decides the result.

    :param s: list of items to inspect from index 0 onwards; ``split`` passes
        the tail of its item list starting at the current position
    :return: ``None`` when no rule matches (the function falls off the end);
        otherwise a tuple ``(pieces, consumed, step)`` where ``pieces`` is
        the replacement list (leading items concatenated into one string,
        the remaining ones kept separate), ``consumed`` is how many leading
        items of ``s`` the pieces replace, and ``step`` is how far ``split``
        advances its position (1 for the letter rules, 2 for the ``BEG``
        rules, which keep ``s[0]`` as a piece of its own)
    """
    # NOTE(review): items are indexed without any length check.  Each deeper
    # index is read only after the previous item matched a letter class, so
    # this presumably relies on the context ending with the "]" marker that
    # split appends -- confirm before calling with other inputs.

    # ---- Context starts with a consonant --------------------------------
    if C(s[0]):
        if V(s[1]):
            if C(s[2]):
                if V(s[3]):
                    return ([s[0]+s[1], s[2], s[3]], 4, 1)
                if C(s[3]):
                    if V(s[4]):
                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1)
                    if C(s[4]):
                        if C(s[5]):
                            # END / not END together cover every s[6].
                            if END(s[6]):
                                return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1)
                            if not END(s[6]):
                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1)
                        if V(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1)
                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1)
                        if M(s[5]):
                            if END(s[6]):
                                return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1)
                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1)
                    if M(s[4]):
                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1)
                        if C(s[5]):
                            if V(s[6]):
                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1)
                        if V(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1)
                if END(s[3]):
                    return ([s[0]+s[1]+s[2], s[3]], 4, 1)
                if M(s[3]):
                    if C(s[4]):
                        if not END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1)
                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1)
                        # NOTE(review): unreachable -- the not END / END pair
                        # above returns for every possible s[5].
                        if C(s[5]):
                            if C(s[6]):
                                if END(s[7]):
                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1)
                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1)
                    if V(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1)
            if END(s[2]):
                return ([s[0]+s[1], s[2]], 3, 1)
            if S(s[2]):
                if C(s[3]):
                    if V(s[4]):
                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1)
                    if C(s[4]):
                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1)
                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1)
                    if M(s[4]):
                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1)
                if END(s[3]):
                    return ([s[0]+s[1]+s[2]], 3, 1) if False else ([s[0]+s[1]+s[2], s[3]], 4, 1)
                # Fallback for consonant + vowel + S when nothing above matched.
                return ([s[0]+s[1]+s[2]], 3, 1)
            if V(s[2]):
                return ([s[0]+s[1], s[2]], 3, 1)
        if C(s[1]):
            if C(s[2]):
                if V(s[3]):
                    if C(s[4]):
                        if C(s[5]):
                            if V(s[6]):
                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1)
                            if M(s[6]):
                                if END(s[7]):
                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1)
                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1)
                        if V(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1)
                        if M(s[5]):
                            if C(s[6]):
                                if M(s[7]):
                                    if END(s[8]):
                                        return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6]+s[7], s[8]], 9, 1)
                            # Fallback once s[5] is an M item.
                            return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]], 6, 1)
                    if S(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3]+s[4]], 5, 1)
                    if V(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1)
                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1)
                    # Fallback for three consonants + vowel.
                    return ([s[0]+s[1]+s[2]+s[3]], 4, 1)
                if C(s[3]):
                    if V(s[4]):
                        if S(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]], 6, 1)
                        # Fallback for four consonants + vowel.
                        return ([s[0]+s[1]+s[2]+s[3]+s[4]], 5, 1)
            if V(s[2]):
                if C(s[3]):
                    if C(s[4]):
                        if V(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1)
                        if C(s[5]):
                            if C(s[6]):
                                if END(s[7]):
                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1)
                            # Fallback once s[5] is a consonant.
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1)
                        if M(s[5]):
                            if V(s[6]):
                                return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 1)
                            if C(s[6]):
                                if V(s[7]):
                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6], s[7]], 8, 1)
                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1)
                    if M(s[4]):
                        if not C(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1)
                        if C(s[5]):
                            if V(s[6]):
                                return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 1)
                            if C(s[6]):
                                if V(s[7]):
                                    return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5], s[6], s[7]], 8, 1)
                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1)
                    if V(s[4]):
                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1)
                if V(s[3]):
                    if C(s[4]):
                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1)
                    # Fallback for two consonants + two vowels.
                    return ([s[0]+s[1]+s[2], s[3]], 4, 1)
                if S(s[3]):
                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1)
                    if C(s[4]):
                        if V(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1)
                        if C(s[5]):
                            if C(s[6]):
                                if END(s[7]):
                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 1)
                    # Fallback once s[3] is an S item.
                    return ([s[0]+s[1]+s[2]+s[3]], 4, 1)
                if END(s[3]):
                    return ([s[0]+s[1]+s[2], s[3]], 4, 1)
            if M(s[2]):
                if V(s[3]):
                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1)
                    if C(s[4]):
                        if V(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1)
        if M(s[1]):
            if V(s[2]):
                if C(s[3]):
                    if V(s[4]):
                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1)
                    if C(s[4]):
                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1)
                        if V(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3], s[4], s[5]], 6, 1)
                        if C(s[5]):
                            if C(s[6]):
                                if V(s[7]):
                                    return ([s[0]+s[1]+s[2]+s[3]+s[4]+s[5], s[6], s[7]], 8, 1)
                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1)
                if END(s[3]):
                    return ([s[0]+s[1]+s[2], s[3]], 4, 1)
            if C(s[2]):
                if V(s[3]):
                    if S(s[4]):
                        if END(s[5]):
                            return ([s[0]+s[1]+s[2]+s[3]+s[4], s[5]], 6, 1)
                    if V(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1)

    # ---- Context starts with a vowel ------------------------------------
    if V(s[0]):
        if C(s[1]):
            if C(s[2]):
                if END(s[3]):
                    return ([s[0]+s[1]+s[2], s[3]], 4, 1)
                if V(s[3]):
                    return ([s[0]+s[1], s[2], s[3]], 4, 1)
                if C(s[3]):
                    if V(s[4]):
                        if C(s[5]):
                            return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1)
                        # Fallback when s[5] is not a consonant.
                        return ([s[0]+s[1], s[2], s[3], s[4]], 5, 1)
                    if C(s[4]):
                        if V(s[5]):
                            return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1)
                if M(s[3]):
                    if END(s[4]):
                        return ([s[0]+s[1]+s[2]+s[3], s[4]], 5, 1)
            if V(s[2]):
                return ([s[0], s[1], s[2]], 3, 1)
            if M(s[2]):
                if END(s[3]):
                    return ([s[0]+s[1]+s[2], s[3]], 4, 1)
                if C(s[3]):
                    if C(s[4]):
                        if V(s[5]):
                            return ([s[0]+s[1]+s[2], s[3], s[4], s[5]], 6, 1)
                    if V(s[4]):
                        return ([s[0]+s[1]+s[2], s[3], s[4]], 5, 1)
                if V(s[3]):
                    return ([s[0]+s[1]+s[2], s[3]], 4, 1)
            if END(s[2]):
                return ([s[0]+s[1], s[2]], 3, 1)
            # Fallback for vowel + consonant when nothing above matched.
            return ([s[0]+s[1]], 2, 1)
        if END(s[1]):
            return ([s[0], s[1]], 2, 1)
        if V(s[1]):
            return ([s[0], s[1]], 2, 1)
        if S(s[1]):
            if END(s[2]):
                return ([s[0]+s[1], s[2]], 3, 1)
            if C(s[2]):
                if V(s[3]):
                    return ([s[0]+s[1], s[2], s[3]], 4, 1)

    # ---- Context starts with the "[" marker ------------------------------
    # These rules keep s[0] as its own piece and report a step of 2, so the
    # caller moves past both the marker and the first grouped piece.
    if BEG(s[0]):
        if C(s[1]):
            if C(s[2]):
                if V(s[3]):
                    if C(s[4]):
                        if END(s[5]):
                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2)
                        if C(s[5]):
                            if END(s[6]):
                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2)
                            if M(s[6]):
                                if END(s[7]):
                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2)
                    if S(s[4]):
                        if END(s[5]):
                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2)
                    if END(s[4]):
                        return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2)
                if END(s[3]):
                    return ([s[0], s[1]+s[2], s[3]], 4, 2)
                if C(s[3]):
                    if C(s[4]):
                        if V(s[5]):
                            if C(s[6]):
                                if END(s[7]):
                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2)
                    if V(s[4]):
                        if C(s[5]):
                            if M(s[6]):
                                if END(s[7]):
                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2)
                        if END(s[5]):
                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2)
            if V(s[2]):
                if C(s[3]):
                    if C(s[4]):
                        if M(s[5]):
                            if END(s[6]):
                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2)
                        if END(s[5]):
                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2)
                    if M(s[4]):
                        if C(s[5]):
                            if C(s[6]):
                                if END(s[7]):
                                    return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5]+s[6], s[7]], 8, 2)
                if S(s[3]):
                    if C(s[4]):
                        if END(s[5]):
                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2)
            if END(s[2]):
                return ([s[0], s[1], s[2]], 3, 2)
            if M(s[2]):
                if C(s[3]):
                    if V(s[4]):
                        if END(s[5]):
                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2)
                        if C(s[5]):
                            if END(s[6]):
                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2)
                            if V(s[6]):
                                return ([s[0], s[1]+s[2]+s[3]+s[4], s[5], s[6]], 7, 2)
                if V(s[3]):
                    if END(s[4]):
                        return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2)
                    if S(s[4]):
                        if END(s[5]):
                            return ([s[0], s[1]+s[2]+s[3]+s[4], s[5]], 6, 2)
                    if C(s[4]):
                        if M(s[5]):
                            if END(s[6]):
                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2)
        if V(s[1]):
            if C(s[2]):
                if M(s[3]):
                    if END(s[4]):
                        return ([s[0], s[1]+s[2]+s[3], s[4]], 5, 2)
                if END(s[3]):
                    return ([s[0], s[1]+s[2], s[3]], 4, 2)
                if C(s[3]):
                    if C(s[4]):
                        if C(s[5]):
                            if END(s[6]):
                                return ([s[0], s[1]+s[2]+s[3]+s[4]+s[5], s[6]], 7, 2)
        if S(s[1]):
            if V(s[2]):
                if C(s[3]):
                    if V(s[4]):
                        return ([s[0], s[1]+s[2], s[3], s[4]], 5, 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script demo: print the pieces of one sample word joined by "|".
    print(u"|".join(split(u"спросил")))
|
|
|
def split_word(word):
    """
    Break one word into syllables by delegating to ``split``.

    :param word: unicode string holding a Russian word
    :return: list of unicode strings, one per syllable
    """
    syllables = split(word)
    return syllables
|
|
|
|
|
def split_words(words):
    """
    Flatten a list of words into one token list of syllables, with
    single-space tokens separating the words.

    :param words: list of words (unicode strings)
    :return: list of tokens - syllables and spaces
    """
    result = []
    for current in words:
        parts = split(current)
        # A separator goes in only once earlier tokens have been emitted.
        if result:
            result.append(u' ')
        result += parts
    return result
|
|