File size: 977 Bytes
7694c84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54

# Reserved marker tokens. They are also placed at the front of `symbols` below.
PADDING_TOKEN = "_pad_"
EOS_TOKEN = "_eos_"
DOUBLING_TOKEN = "_dbl_"
SEPARATOR_TOKEN = "_+_"

# The word separator and the eos token, grouped in that order.
EOS_TOKENS = [SEPARATOR_TOKEN, EOS_TOKEN]

# The symbol table is assembled from three groups. The position of every
# entry in the final `symbols` list is significant, so the groups must be
# concatenated in exactly this order and entries must not be reordered.

# Special (non-phoneme) tokens.
_SPECIAL_TOKENS = [
    PADDING_TOKEN,    # padding
    EOS_TOKEN,        # eos-token
    "_sil_",          # silence
    DOUBLING_TOKEN,   # doubling
    SEPARATOR_TOKEN,  # word separator
]

# Consonants, each annotated with its letter name.
_CONSONANTS = [
    "<",  # hamza
    "b",  # baa'
    "t",  # taa'
    "^",  # thaa'
    "j",  # jiim
    "H",  # Haa'
    "x",  # xaa'
    "d",  # daal
    "*",  # dhaal
    "r",  # raa'
    "z",  # zaay
    "s",  # siin
    "$",  # shiin
    "S",  # Saad
    "D",  # Daad
    "T",  # Taa'
    "Z",  # Zhaa'
    "E",  # 3ayn
    "g",  # ghain
    "f",  # faa'
    "q",  # qaaf
    "k",  # kaaf
    "l",  # laam
    "m",  # miim
    "n",  # nuun
    "h",  # haa'
    "w",  # waaw
    "y",  # yaa'
    "v",  # /v/ for loanwords, e.g. u'fydyw' -> u'v i0 d y uu1'
]

# Vowels: the three short vowels followed by their long counterparts.
_VOWELS = [
    "a", "u", "i",     # short
    "aa", "uu", "ii",  # long
]

# Full symbol inventory: special tokens, then consonants, then vowels.
symbols = _SPECIAL_TOKENS + _CONSONANTS + _VOWELS