File size: 4,978 Bytes
7f7e246
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d056fd8
 
 
 
7f7e246
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d056fd8
7f7e246
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d056fd8
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
---
license: cc-by-nc-4.0
language:
- ar
pipeline_tag: token-classification
datasets:
- guymorlan/levanti
---

# Levanti Transliterator

This model converts diacritics in Palestinian colloquial Arabic to their estimated pronunciation via Hebrew vowels. It can be used to transliterate diacritized Palestinian Arabic text into Hebrew or English. The model is trained on a special subset of the Levanti dataset (to be released later).
The model is fine-tuned from Google's [CANINE-s](https://huggingface.co/google/canine-s) character-level LM with a token classification head.
Each token (letter) of the input is classified into one of 7 classes: 'O' if it is not a diacritic, or one of 6 Hebrew vowels (see `model.config.id2label`).

# Diacritizer
This model can be used in conjunction with [Levanti Diacritizer](https://huggingface.co/guymorlan/levanti_arabic2diacritics), which adds diacritics to raw Palestinian Arabic text.


# Example Usage

```python
from transformers import CanineForTokenClassification, AutoTokenizer
import torch

# Load the fine-tuned token-classification model and its tokenizer; both are
# fetched by the same repository id.
model = CanineForTokenClassification.from_pretrained("guymorlan/levanti_diacritics2translit")
tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_diacritics2translit")

def diacritics2hebrew_vowels(text, model, tokenizer):
    """Replace the diacritics in *text* with the labels predicted by *model*.

    Args:
        text: Diacritized input string.
        model: Token-classification model; its ``config.label2id`` and
            ``config.id2label`` tables are used to decode the predictions.
        tokenizer: Tokenizer matching *model*; called with
            ``return_tensors="pt"``.

    Returns:
        A string built character by character: positions predicted as ``"O"``
        keep the original character, every other position is replaced by the
        label the model assigned to it.
    """
    encoded = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        label_ids = model(**encoded).logits.argmax(-1).tolist()

    # Keep the first sequence of the batch and strip its first and last
    # positions (CLS and SEP) so predictions line up with the characters.
    per_character = label_ids[0][1:-1]

    return "".join(
        char if label == model.config.label2id["O"] else model.config.id2label[label]
        for label, char in zip(per_character, text)
    )

# Convert Arabic diacritics to Hebrew diacritics (Tsere, Holam, Patah, Shva, Kubutz, Hiriq)
text = "لَازِم نِعْطِي رَشَّات وِقَائِيِّة لِلشَّجَر "
heb_vowels = diacritics2hebrew_vowels(text, model, tokenizer)
heb_vowels
```
```
Out[1]: 'لַازֵم نִعְطִي رַشַّات وִقַائִيֵّة لִلشַّجַر '
```

```python
# Arabic character -> Hebrew rendering, consulted by to_taatik. Characters
# that are not keys of this table are copied to the output unchanged.
arabic_to_hebrew = {
    # Regular letters. Alef and the hamza forms all map to Hebrew alef; some
    # values are a Hebrew letter followed by an apostrophe.
    "ا": "א", "أ": "א", "إ": "א", "ء": "א", "ئ": "א", "ؤ": "א", 
    "آ": "אא", "ى": "א", "ب": "ב", "ت": "ת", "ث": "ת'", "ج": "ג'", 
    "ح": "ח", "خ": "ח'", "د": "ד", "ذ": "ד'", "ر": "ר", "ز": "ז", 
    "س": "ס", "ش": "ש", "ص": "צ", "ض": "צ'", "ط": "ט", "ظ": "ט'", 
    "ع": "ע", "غ": "ע'", "ف": "פ", "ق": "ק", "ك": "כ", "ل": "ל", 
    "م": "מ", "ن": "נ", "ه": "ה", "و": "ו", "ي": "י", "ة": "ה",
    # Special characters: the Arabic comma, then the short-vowel diacritics
    # fatha, damma and kasra mapped to patah, kubutz and hiriq.
    "،": ",", "َ": "ַ", "ُ": "ֻ", "ِ": "ִ",
}

# Hebrew final (sofit) forms that to_taatik substitutes for these letters
# when they close a word.
final_letters = {
    "ن": "ן", "م": "ם", "ص": "ץ", "ض": "ץ'", "ف": "ף",
}

def to_taatik(arabic, letter_map=None, final_map=None):
    """Transliterate Arabic letters into Hebrew letters (Taatik).

    A letter that has a final (sofit) form is written in that form when it
    closes a word, i.e. when it is the last letter of the input or is followed
    by a space, a period or an Arabic comma. Combining marks attached to the
    letter (vowel points, shadda) are skipped when looking for that boundary,
    so a word-final letter that carries a diacritic still gets its final form.

    Args:
        arabic: Text to transliterate. Characters without a mapping are
            copied through unchanged.
        letter_map: Optional character -> Hebrew table. Defaults to the
            module-level ``arabic_to_hebrew``.
        final_map: Optional character -> final-form table. Defaults to the
            module-level ``final_letters``.

    Returns:
        The transliterated string.
    """
    # Local import keeps the snippet self-contained.
    from unicodedata import combining

    if letter_map is None:
        letter_map = arabic_to_hebrew
    if final_map is None:
        final_map = final_letters

    word_breaks = {" ", ".", "،"}
    length = len(arabic)
    taatik = []
    for index, letter in enumerate(arabic):
        if letter in final_map:
            # Look past any combining marks that belong to this letter.
            following = index + 1
            while following < length and combining(arabic[following]):
                following += 1
            if following == length or arabic[following] in word_breaks:
                taatik.append(final_map[letter])
                continue
        taatik.append(letter_map.get(letter, letter))
    return "".join(taatik)

# Convert the consonants to produce a full Hebrew transliteration (Taatik)
to_taatik(heb_vowels)
```

```
Out[2]: "לַאזֵם נִעְטִי רַשַّאת וִקַאאִיֵّה לִלשַّג'ַר "
```

```python
# Character -> Latin rendering, consulted by to_translit. Holds Arabic
# letters and diacritics as well as the Hebrew vowel marks; characters that
# are not keys of this table are copied to the output unchanged.
arabic_to_english = {
    "ا": "a", "أ": "a", "إ": "a", "ء": "a", "ئ": "a", "ؤ": "a",
    "آ": "aa", "ى": "a", "ب": "b", "ت": "t", "ث": "th", "ج": "j",
    "ح": "h", "خ": "kh", "د": "d", "ذ": "dh", "ر": "r", "ز": "z",
    "س": "s", "ش": "sh", "ص": "s", "ض": "d", "ط": "t", "ظ": "z",
    "ع": "a", "غ": "gh", "ف": "f", "ق": "q", "ك": "k", "ل": "l",
    "م": "m", "ن": "n", "ه": "h", "و": "w", "ي": "y", "ة": "h",
    "َ": "a", "ُ": "u", "ِ": "i",
    "،": ",",
    "ֹ": "o",  # holam
    "ַ": "a",  # patah
    "ִ": "i",  # hiriq
    "ְ": "",   # shva (produces no output)
    "ֻ": "u",  # kubutz
    'ֵ': "e",  # tsere
    "ّ": "SHADDA"  # shadda: sentinel, never emitted; to_translit upper-cases an earlier entry instead
}

# Characters to_translit steps over when deciding which entry a shadda
# applies to. Besides the vowel marks, the list also holds both comma forms.
vowels = ["،", ",", "َ", "ַ", "ُ", "ֻ", "ِ", "ִ", 'ֵ']


def to_translit(arabic, mapping=None, vowel_marks=None):
    """Transliterate text into Latin characters.

    A shadda is not written out. Instead, the rendering of the entry it
    applies to is upper-cased: normally the previous entry, or the one before
    it when the previous entry is a vowel mark. A shadda with no such entry
    (for example at the very start of the input) is ignored instead of
    raising ``IndexError``.

    Args:
        arabic: Text to transliterate. Characters without a mapping are
            copied through unchanged.
        mapping: Optional character -> Latin table in which the value
            ``"SHADDA"`` marks the shadda. Defaults to the module-level
            ``arabic_to_english``.
        vowel_marks: Optional collection of characters to step over when
            locating the entry a shadda applies to. Defaults to the
            module-level ``vowels``.

    Returns:
        The transliterated string.
    """
    if mapping is None:
        mapping = arabic_to_english
    if vowel_marks is None:
        vowel_marks = vowels

    translit = []  # [source character, Latin rendering] pairs
    for letter in arabic:
        if letter not in mapping:
            translit.append([letter, letter])
            continue

        latin = mapping[letter]
        if latin != "SHADDA":
            translit.append([letter, latin])
            continue

        # Shadda: find the entry to upper-case, stepping over one vowel mark.
        target = len(translit) - 1
        if target >= 0 and translit[target][0] in vowel_marks:
            target -= 1
        if target >= 0:
            translit[target][1] = translit[target][1].upper()

    return "".join(latin for _, latin in translit)

# Convert the letters to a Latin representation (English transliteration)
to_translit(heb_vowels)
```
```
Out[3]: 'laazem niatiy raSHaat wiqaaaiYeh lilSHajar '
```

# Attribution
Created by Guy Mor-Lan.<br>
Contact: guy.mor AT mail.huji.ac.il