guymorlan committed on
Commit
7f7e246
1 Parent(s): d272bab

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +133 -3
README.md CHANGED
@@ -1,3 +1,133 @@
1
- ---
2
- license: cc-by-nc-4.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-4.0
3
+ language:
4
+ - ar
5
+ pipeline_tag: token-classification
6
+ datasets:
7
+ - guymorlan/levanti
8
+ ---
9
+
10
+ # Levanti Transliterator
11
+
12
+ This model converts diacritics in Palestinian colloquial Arabic to their estimated pronunciation via Hebrew vowels. It can be used to transliterate diacritized Palestinian Arabic text into Hebrew or English. The model is trained on a special subset of the Levanti dataset (to be released later).
13
+ The model is fine-tuned from Google's [CANINE-s](https://huggingface.co/google/canine-s) character-level LM with a token classification head.
14
+ Each token (letter) of the input is classified into one of 7 classes: 'O' if not a diacritic, or one of 6 Hebrew vowels (see `model.config.id2label`).
15
+
16
+ # Example Usage
17
+
18
+ ```python
19
+ from transformers import CanineForTokenClassification, AutoTokenizer
20
+ import torch
21
+
22
+ model = CanineForTokenClassification.from_pretrained("guymorlan/levanti_diacritics2translit")
23
+ tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_diacritics2translit")
24
+
25
def diacritics2hebrew_vowels(text, model, tokenizer):
    """Replace the diacritics in `text` with the vowel labels predicted by `model`.

    Args:
        text: Input string; it is walked character by character.
        model: Token-classification model. Its `config.label2id` and
            `config.id2label` are used to decode predictions; the label "O"
            means "keep the input character".
        tokenizer: Callable producing the keyword inputs for `model`.
            Presumably yields one token per character plus a leading CLS and
            a trailing SEP -- confirm against the tokenizer in use.

    Returns:
        A string in which every character predicted as something other than
        "O" is replaced by its predicted label; other characters are copied
        unchanged.
    """
    encoded = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**encoded).logits

    # First sequence of the batch, without its first and last positions
    # (CLS and SEP), so that predictions pair up with the characters of `text`.
    label_ids = logits.argmax(-1).tolist()[0][1:-1]

    config = model.config
    return "".join(
        char if label_id == config.label2id["O"] else config.id2label[label_id]
        for label_id, char in zip(label_ids, text)
    )
40
+
41
+ # to convert Arabic diacritics to Hebrew diacritics (Tsere, Holam, Patah, Shva, Kubutz, Hiriq)
42
+ text = "لَازِم نِعْطِي رَشَّات وِقَائِيِّة لِلشَّجَر "
43
+ heb_vowels = diacritics2hebrew_vowels(text, model, tokenizer)
44
+ heb_vowels
45
+ ```
46
+ ```
47
+ Out[1]: 'لַازֵم نִعְطִي رַشַّات وִقַائִيֵّة لִلشַّجַر '
48
+ ```
49
+
50
+ ```python
51
import unicodedata

# Arabic character -> Hebrew rendering. Characters missing from this table
# are copied to the output unchanged by to_taatik().
arabic_to_hebrew = {
    # regular letters
    "ا": "א", "أ": "א", "إ": "א", "ء": "א", "ئ": "א", "ؤ": "א",
    "آ": "אא", "ى": "א", "ب": "ב", "ت": "ת", "ث": "ת'", "ج": "ג'",
    "ح": "ח", "خ": "ח'", "د": "ד", "ذ": "ד'", "ر": "ר", "ز": "ז",
    "س": "ס", "ش": "ש", "ص": "צ", "ض": "צ'", "ط": "ט", "ظ": "ט'",
    "ع": "ע", "غ": "ע'", "ف": "פ", "ق": "ק", "ك": "כ", "ل": "ל",
    "م": "מ", "ن": "נ", "ه": "ה", "و": "ו", "ي": "י", "ة": "ה",
    # special characters (combining marks are written as escapes because
    # bare combining characters are easy to misread or lose in an editor)
    "،": ",",
    "\u064e": "\u05b7",  # fatha -> patah
    "\u064f": "\u05bb",  # damma -> kubutz
    "\u0650": "\u05b4",  # kasra -> hiriq
}

# Hebrew final forms, used when the Arabic letter is the last one of a word.
final_letters = {
    "ن": "ן", "م": "ם", "ص": "ץ", "ض": "ץ'", "ف": "ף",
}


def _ends_word(text, index):
    """Return True if nothing word-like follows ``text[index]``."""
    if index == len(text) - 1:
        return True
    # Letters (L*) and combining marks (M*) continue the current word; every
    # other category (whitespace, punctuation, digits, control characters
    # such as a newline) terminates it.
    return unicodedata.category(text[index + 1])[0] not in "LM"


def to_taatik(arabic):
    """Transliterate Arabic letters in `arabic` into Hebrew letters (Taatik).

    Each character found in `arabic_to_hebrew` is replaced by its Hebrew
    rendering; all other characters (for example Hebrew vowel points that are
    already present) are copied unchanged. A letter listed in `final_letters`
    is written in its final form when it is the last character of the string
    or is followed by a character that is neither a letter nor a combining
    mark. A letter followed by a diacritic therefore keeps its regular form.

    Args:
        arabic: The string to transliterate.

    Returns:
        The transliterated string.
    """
    taatik = []
    for index, letter in enumerate(arabic):
        if letter in final_letters and _ends_word(arabic, index):
            taatik.append(final_letters[letter])
        else:
            taatik.append(arabic_to_hebrew.get(letter, letter))
    return "".join(taatik)
80
+
81
+ # to convert consonants and create a full Hebrew transliteration (Taatik)
82
+ to_taatik(heb_vowels)
83
+ ```
84
+
85
+ ```
86
+ Out[2]: "לַאזֵם נִעְטִי רַשַّאת וִקַאאִיֵّה לִלשַّג'ַר "
87
+ ```
88
+
89
+ ```python
90
# Character -> Latin rendering. Characters missing from this table are copied
# to the output unchanged by to_translit(). The value "SHADDA" is a sentinel,
# not output text: it tells to_translit() to upper-case the preceding consonant.
arabic_to_english = {
    "ا": "a", "أ": "a", "إ": "a", "ء": "a", "ئ": "a", "ؤ": "a",
    "آ": "aa", "ى": "a", "ب": "b", "ت": "t", "ث": "th", "ج": "j",
    "ح": "h", "خ": "kh", "د": "d", "ذ": "dh", "ر": "r", "ز": "z",
    "س": "s", "ش": "sh", "ص": "s", "ض": "d", "ط": "t", "ظ": "z",
    "ع": "a", "غ": "gh", "ف": "f", "ق": "q", "ك": "k", "ل": "l",
    "م": "m", "ن": "n", "ه": "h", "و": "w", "ي": "y", "ة": "h",
    # Combining marks are written as escapes because bare combining
    # characters are easy to misread or lose in an editor.
    "\u064e": "a",  # fatha
    "\u064f": "u",  # damma
    "\u0650": "i",  # kasra
    "،": ",",
    "\u05b9": "o",  # holam
    "\u05b7": "a",  # patah
    "\u05b4": "i",  # hiriq
    "\u05b0": "",  # shva
    "\u05bb": "u",  # kubutz
    "\u05b5": "e",  # tsere
    "\u0651": "SHADDA",  # shadda
}

# Characters a shadda may follow while still belonging to the consonant before
# them. Holam and shva are included so that every Hebrew vowel point in
# arabic_to_english is treated the same way.
vowels = [
    "،", ",",
    "\u064e", "\u05b7",  # fatha, patah
    "\u064f", "\u05bb",  # damma, kubutz
    "\u0650", "\u05b4",  # kasra, hiriq
    "\u05b5",  # tsere
    "\u05b9",  # holam
    "\u05b0",  # shva
]


def to_translit(arabic):
    """Transliterate `arabic` into Latin characters.

    Characters are mapped through `arabic_to_english`; characters without an
    entry are copied unchanged. A shadda produces no output of its own.
    Instead the rendering of the consonant it belongs to is upper-cased: the
    previous character, or the one before it when the previous character is
    listed in `vowels`. A shadda with no such preceding character (for
    example at the very start of the text) is dropped.

    Args:
        arabic: The string to transliterate.

    Returns:
        The transliterated string.
    """
    translit = []  # [source character, Latin rendering] pairs
    for letter in arabic:
        latin = arabic_to_english.get(letter, letter)
        if latin != "SHADDA":
            translit.append([letter, latin])
            continue

        # Locate the entry to double, stepping over a single vowel sign.
        target = len(translit) - 1
        if target >= 0 and translit[target][0] in vowels:
            target -= 1
        if target >= 0:
            translit[target][1] = translit[target][1].upper()

    return "".join(latin for _, latin in translit)
127
+
128
+ # to convert letters to Latin representation (English transliteration)
129
+ to_translit(heb_vowels)
130
+ ```
131
+ ```
132
+ Out[3]: 'laazem niatiy raSHaat wiqaaaiYeh lilSHajar '
133
+ ```