Guy Mor-Lan committed on
Commit
46f657a
1 Parent(s): e0e4f9c

add models

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ embedding_data.csv filter=lfs diff=lfs merge=lfs -text
37
+ embeddings.npy filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #%%
2
+ import gradio as gr
3
+ from dotenv import load_dotenv
4
+
5
+ from translate import run_translate
6
+ from diacritize import diacritize, diacritize_if_not_already
7
+ from translit import taatik
8
+ from semsearch import update_df
9
+ from tts import get_audio
10
+ from consts import CSS, ABOUT, JS_FUNC
11
+ load_dotenv()
12
+
13
+ with gr.Blocks(title = "Levanti - כלי תרגום לערבית מדוברת",
14
+ css=CSS,
15
+ theme="default") as demo:
16
+ # gr.HTML("<h2><span style='color: #2563eb'>Levantine Arabic</span> Translator</h2>")
17
+ gr.HTML("<h2><span dir='rtl'><span style='color: #2563eb'>Levanti</span>ne Translator</span></h2>כלי תרגום לערבית מדוברת")
18
+ with gr.Tab('תרגום', elem_id="tab1"):
19
+ with gr.Row():
20
+ with gr.Column():
21
+ input_text = gr.Textbox(label="קלט",
22
+ info = "עברית או ערבית מדוברת",
23
+ placeholder="הזינו טקסט בערבית או עברית",
24
+ lines=2,
25
+ elem_id="input",
26
+ rtl=True)
27
+
28
+ gr.Examples(["רציתי ללכת אתמול לחנות, אבל ירד גשם", "خلينا ندور على مطعم تاني"],
29
+ input_text,
30
+ label="דוגמאות")
31
+
32
+ btn = gr.Button("תרגום")
33
+ with gr.Row():
34
+ dialect = gr.Radio(["פלסטיני", "סורי", "לבנוני", "מצרי"],
35
+ label = "להג",
36
+ info="משפיע על תרגום לערבית",
37
+ value="פלסטיני")
38
+
39
+ # gr.Markdown("Built by [Guy Mor-Lan](mailto:guy.mor@mail.huji.ac.il). Pronunciation model is specifically tailored to urban Palestinian Arabic. Text-to-speech uses Microsoft Azure's API and may provide different result from the transliterated pronunciation.")
40
+ gr.Markdown("נוצר על ידי [גיא מור-לן](mailto:guy.mor@mail.huji.ac.il) כחלק מפרויקט [Levanti](https://huggingface.co/datasets/guymorlan/levanti). השמע מופק באמצעות Azure TTS על בסיס הניקוד המנובא ויוריסטיקות.", elem_id="footer", rtl=True)
41
+
42
+ with gr.Column():
43
+ with gr.Group(elem_id="grp"):
44
+ gr.HTML("<div dir='rtl'>תרגום</div>")
45
+ # gr.Markdown("תרגום", elem_id="diacritized")
46
+ translation_output = gr.HTML("<br>", visible=True, label="תרגום", elem_id="main")
47
+
48
+ hidden_arabic = gr.Textbox(lines=1, elem_id="trans", visible=False)
49
+
50
+ diacritized_output = gr.Textbox(label="ניקוד (ניסיוני)", lines=1, elem_id="diacritized",
51
+ rtl=True, interactive=False)
52
+ taatik_output = gr.Textbox(label="תעתיק (ניסיוני)", lines=1, elem_id="taatik",
53
+ text_align="right", rtl=True, interactive=False)
54
+ # diacritized_output = gr.HTML("<br>", label="ניקוד")
55
+ # taatik_output = gr.HTML("<br>", label="תעתיק")
56
+
57
+ audio = gr.Audio(label="שמע (Azure)", interactive=False,
58
+ autoplay=True)
59
+ audio_button = gr.Button("צור שמע")
60
+ audio_button.click(get_audio, inputs=[diacritized_output], outputs=[audio])
61
+
62
+
63
+ btn.click(run_translate, inputs=[input_text, dialect],
64
+ outputs=[translation_output, hidden_arabic], api_name="en2ar",
65
+ js="function jump(x, y){document.getElementById('main').scrollIntoView(); return [x, y];}")
66
+
67
+ input_text.submit(run_translate, inputs=[input_text, dialect],
68
+ outputs=[translation_output, hidden_arabic], scroll_to_output=True)
69
+ hidden_arabic.change(diacritize, inputs=[hidden_arabic], outputs=[diacritized_output])
70
+ diacritized_output.change(taatik, inputs=[diacritized_output], outputs=[taatik_output])
71
+ # pal.change(get_transliteration, inputs=[pal, include], outputs=[pal_translit]);
72
+ # include.change(toggle_visibility, inputs=[include], outputs=[pal_translit, sy, lb, eg])
73
+ with gr.Row():
74
+ # import pandas as pd
75
+ # ex_df = pd.DataFrame({"text": ["שלום", "כיצד ניתן לעזור לך?", "איפה נמצא המסעדה הכי טובה בעיר?"]})
76
+ # ex_df = ex_df.style.set_properties(**{
77
+ # 'font-family': 'Arial, sans-serif',
78
+ # 'text-align': 'right'
79
+ # })
80
+ # bla_df = gr.DataFrame(ex_df, visible=True, elem_id="nearest", wrap =True)
81
+ nearest_df = gr.DataFrame(headers=["ערבית", "עברית", "מאומת"], visible=False, wrap=True,
82
+ elem_id="nearest", label="תוצאות קרובות מתוך קורפוס Levanti", height=300)
83
+
84
+ hidden_arabic.change(update_df, inputs=[hidden_arabic], outputs=[nearest_df])
85
+
86
+ with gr.Tab("ניקוד ותעתוק", elem_id="tab2"):
87
+ with gr.Row():
88
+ with gr.Column():
89
+ diac_text = gr.Textbox(label="קלט", placeholder="הזינו טקסט בערבית", lines=1,
90
+ info = "בשביל תעתוק בלבד, הזינו טקסט ערבי מנוקד",
91
+ elem_id="diac_input", rtl=True)
92
+ gr.Examples(["خلينا ندور على مطعم تاني", "قَدِيْش حَقّ الْبَنْدُورَة؟"], diac_text,
93
+ label="דוגמאות", elem_id="diac_ex")
94
+ btn2 = gr.Button("שליחה")
95
+
96
+ with gr.Column():
97
+ diacritized_output2 = gr.Textbox(label="ניקוד", lines=1,
98
+ elem_id="diacritized2", rtl=True)
99
+ taatik_output2 = gr.Textbox(label="תעתיק", lines=1,
100
+ elem_id="taatik2", rtl=True)
101
+
102
+ # input_text.submit(run_translate, inputs=[input_text, dialect],
103
+ # outputs=[translation_output], scroll_to_output=True)
104
+ # hidden_arabic.change(diacritize, inputs=[hidden_arabic], outputs=[diacritized_output])
105
+ # diacritized_output.change(taatik, inputs=[diacritized_output], outputs=[taatik_output])
106
+ btn2.click(diacritize_if_not_already, inputs=[diac_text], outputs=[diacritized_output2])
107
+ diac_text.submit(diacritize_if_not_already, inputs=[diac_text], outputs=[diacritized_output2])
108
+ diacritized_output2.change(taatik, inputs=[diacritized_output2], outputs=[taatik_output2])
109
+ with gr.Tab("אודות", elem_id="tab3"):
110
+ with gr.Row():
111
+ gr.HTML("<h2>אודות</h2>")
112
+ gr.Markdown(ABOUT, elem_id="about", rtl=True)
113
+
114
+
115
+
116
+ demo.launch(ssl_verify=False)
ar_he/README.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-4.0
3
+ datasets:
4
+ - guymorlan/levanti
5
+ language:
6
+ - ar
7
+ - he
8
+ pipeline_tag: translation
9
+ widget:
10
+ - text: بدي أروح ع الدكان بكرا
11
+ ---
12
+
13
+ # Levanti (colloquial Levantine Arabic -> Hebrew) translator
14
+
15
+ Trained on the [Levanti](https://huggingface.co/datasets/guymorlan/levanti) dataset by fine-tuning [Helsinki-NLP/opus-mt-ar-he](https://huggingface.co/Helsinki-NLP/opus-mt-ar-he) for 8 epochs.
16
+ The model supports Palestinian, Jordanian, Syrian, Lebanese and Egyptian dialects.
17
+
18
+
19
+ # Example usage
20
+
21
+ ```python
22
+ from transformers import pipeline
23
+ trans = pipeline("translation", "guymorlan/levanti_translate_ar_he")
24
+ trans("بدي أروح ع الدكان بكرا")
25
+ ```
26
+ ```
27
+ Out[1]: [{'translation_text': 'אני רוצה ללכת לחנות מחר'}]
28
+ ```
29
+
30
+ # Attribution
31
+ Created by Guy Mor-Lan.<br>
32
+ Contact: guy.mor AT mail.huji.ac.il
ar_he/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Helsinki-NLP/opus-mt-ar-he",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "swish",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "MarianMTModel"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bad_words_ids": [
12
+ [
13
+ 63333
14
+ ]
15
+ ],
16
+ "bos_token_id": 0,
17
+ "classif_dropout": 0.0,
18
+ "classifier_dropout": 0.0,
19
+ "d_model": 512,
20
+ "decoder_attention_heads": 8,
21
+ "decoder_ffn_dim": 2048,
22
+ "decoder_layerdrop": 0.0,
23
+ "decoder_layers": 6,
24
+ "decoder_start_token_id": 63333,
25
+ "decoder_vocab_size": 63334,
26
+ "dropout": 0.1,
27
+ "encoder_attention_heads": 8,
28
+ "encoder_ffn_dim": 2048,
29
+ "encoder_layerdrop": 0.0,
30
+ "encoder_layers": 6,
31
+ "eos_token_id": 0,
32
+ "extra_pos_embeddings": 63334,
33
+ "forced_eos_token_id": 0,
34
+ "id2label": {
35
+ "0": "LABEL_0",
36
+ "1": "LABEL_1",
37
+ "2": "LABEL_2"
38
+ },
39
+ "init_std": 0.02,
40
+ "is_encoder_decoder": true,
41
+ "label2id": {
42
+ "LABEL_0": 0,
43
+ "LABEL_1": 1,
44
+ "LABEL_2": 2
45
+ },
46
+ "max_length": 512,
47
+ "max_position_embeddings": 512,
48
+ "model_type": "marian",
49
+ "normalize_before": false,
50
+ "normalize_embedding": false,
51
+ "num_beams": 4,
52
+ "num_hidden_layers": 6,
53
+ "pad_token_id": 63333,
54
+ "scale_embedding": true,
55
+ "share_encoder_decoder_embeddings": true,
56
+ "static_position_embeddings": true,
57
+ "torch_dtype": "float32",
58
+ "transformers_version": "4.38.1",
59
+ "use_cache": true,
60
+ "vocab_size": 63334
61
+ }
ar_he/generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 63333
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 63333,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 4,
13
+ "pad_token_id": 63333,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.38.1"
16
+ }
ar_he/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50672992291e20bc714b766615195a72d09b3950362adc5a0097b6bbfc5b630d
3
+ size 306544408
ar_he/source.spm ADDED
Binary file (899 kB). View file
 
ar_he/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
ar_he/target.spm ADDED
Binary file (896 kB). View file
 
ar_he/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "63333": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": true,
29
+ "eos_token": "</s>",
30
+ "model_max_length": 512,
31
+ "pad_token": "<pad>",
32
+ "return_tensors": "pt",
33
+ "separate_vocabs": false,
34
+ "source_lang": "ara",
35
+ "sp_model_kwargs": {},
36
+ "target_lang": "heb",
37
+ "tokenizer_class": "MarianTokenizer",
38
+ "unk_token": "<unk>"
39
+ }
ar_he/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
ar_he_ct2/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_source_bos": false,
3
+ "add_source_eos": false,
4
+ "bos_token": "<s>",
5
+ "decoder_start_token": "</s>",
6
+ "eos_token": "</s>",
7
+ "layer_norm_epsilon": null,
8
+ "multi_query_attention": false,
9
+ "unk_token": "<unk>"
10
+ }
ar_he_ct2/model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa9558eb1d4c737e67e2d8238eb5c56e15721911cf75a23e8906225cc417c58a
3
+ size 307573250
ar_he_ct2/shared_vocabulary.json ADDED
The diff for this file is too large to render. See raw diff
 
colorize.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import numpy as np
3
+
4
+
5
def generate_diverging_colors(num_colors, palette='Set3'):  # courtesy of ChatGPT
    """Return ``num_colors`` hex colour codes sampled from a matplotlib colormap.

    Args:
        num_colors: Number of colours to generate.
        palette: Name of the matplotlib colormap to sample from.

    Returns:
        A list of six-digit lowercase hex strings without a leading ``#``
        (callers prepend it themselves). Empty when ``num_colors <= 0``.
    """
    # Nothing to sample; avoid asking matplotlib for an empty colormap.
    if num_colors <= 0:
        return []

    # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which is deprecated
    # since matplotlib 3.7 and removed in 3.9; the second argument resamples
    # the colormap to ``num_colors`` entries.
    cmap = plt.get_cmap(palette, num_colors)

    # Calling the colormap with integer indices yields one RGBA row per index.
    colors_rgb = cmap(np.arange(num_colors))

    # Pack the 0-1 float channels into a 24-bit integer and format it as hex.
    return [
        format(int(color[0] * 255) << 16 | int(color[1] * 255) << 8 | int(color[2] * 255), '06x')
        for color in colors_rgb
    ]
16
+
17
+
18
def _colored_token_html(token_ids, label_prefix, colordict, tokenizer, skip_positions=()):
    """Render token ids as coloured ``<span>`` elements wrapped in one sized span."""
    spans = []
    for pos, token_id in enumerate(token_ids):
        if pos in skip_positions:
            continue
        token = tokenizer.convert_ids_to_tokens([token_id])[0]
        label = f"{label_prefix}_{pos}"
        # NOTE(review): tokens are inserted unescaped. Escaping would make the
        # special tokens (e.g. "<pad>", "</s>") visible in the rendered HTML,
        # so the original behaviour is kept; confirm inputs are trusted.
        if label in colordict:
            spans.append(f"<span style='color: #{colordict[label]}'>{token}</span>")
        else:
            spans.append(f"<span style='color: --color-text-body'>{token}</span>")
    html = "".join(spans).replace("▁", " ")
    return f"<span style='font-size: 25px'>{html}</span>"


def align_words(outputs, tokenizer, encoder_input_ids, decoder_input_ids,
                threshold=0.4, skip_first_src=True, skip_second_src=False,
                layer=2, head=6):
    """Colour-code input and output tokens according to cross-attention.

    For every decoder position, the encoder positions whose attention weight
    in ``outputs.cross_attentions[layer][0][head]`` exceeds ``threshold`` are
    treated as aligned with it. Consecutive decoder tokens are merged into one
    group when they share an aligned encoder position or when the token does
    not start with the "▁" word-boundary marker. Each group receives a colour.

    Note that the internal labels are named ``src_*`` for decoder positions
    and ``trg_*`` for encoder positions.

    Args:
        outputs: Model output exposing ``cross_attentions``; presumably shaped
            (batch, heads, decoder_len, encoder_len) per layer - TODO confirm.
        tokenizer: Object providing ``convert_ids_to_tokens``.
        encoder_input_ids: Batched encoder token ids; only row 0 is used.
        decoder_input_ids: Batched decoder token ids; only row 0 is used.
        threshold: Minimum attention weight for a token pair to count as aligned.
        skip_first_src: Omit encoder position 0 from the rendered HTML.
        skip_second_src: Omit encoder position 1 from the rendered HTML.
        layer: Index of the decoder layer whose cross-attention is read.
        head: Index of the attention head that is read.

    Returns:
        A ``(srchtml, tgthtml)`` tuple of HTML strings for the encoder-side
        and decoder-side tokens respectively.
    """
    special_tokens = ("</s>", "<pad>", "<unk>", "<s>")

    # One entry per decoder position:
    # [[decoder position], [encoder positions above the threshold]].
    alignment = []
    for i, tok in enumerate(outputs.cross_attentions[layer][0][head]):
        alignment.append([[i], (tok > threshold).nonzero().squeeze(-1).tolist()])

    # Merge entries that belong to the same word / aligned group.
    merged = []
    for entry in alignment:
        token = tokenizer.convert_ids_to_tokens([decoder_input_ids[0][entry[0]]])[0]
        if token in special_tokens:
            continue
        if not merged:
            merged.append(entry)
            continue

        # Overlap with the previous group's encoder positions.
        tomerge = any(x in merged[-1][1] for x in entry[1])
        # A token without the leading "▁" continues the previous word.
        # startswith (rather than token[0]) also tolerates an empty token.
        if not token.startswith("▁"):
            tomerge = True

        if tomerge:
            merged[-1][0] += entry[0]
            merged[-1][1] += entry[1]
        else:
            merged.append(entry)

    # Assign a colour index to every label, reusing the index of any label
    # in the group that has already been coloured.
    colordict = {}
    ncolors = 0
    for entry in merged:
        src_tok = [f"src_{x}" for x in entry[0]]
        trg_tok = [f"trg_{x}" for x in entry[1]]
        all_tok = src_tok + trg_tok
        newcolor = None
        for t in all_tok:
            if t in colordict:
                newcolor = colordict[t]
                break
        # Compare against None explicitly: colour index 0 is a valid, falsy
        # value and must be reused rather than replaced by a fresh colour.
        if newcolor is None:
            newcolor = ncolors
            ncolors += 1
        for t in all_tok:
            if t not in colordict:
                colordict[t] = newcolor

    # Translate colour indices into hex codes.
    colors = generate_diverging_colors(ncolors, palette="Set2")
    id_to_color = {i: c for i, c in enumerate(colors)}
    for k, v in colordict.items():
        colordict[k] = id_to_color[v]

    tgthtml = _colored_token_html(decoder_input_ids[0], "src", colordict, tokenizer)

    skipped = set()
    if skip_first_src:
        skipped.add(0)
    if skip_second_src:
        skipped.add(1)
    srchtml = _colored_token_html(encoder_input_ids[0], "trg", colordict, tokenizer,
                                  skip_positions=skipped)
    return srchtml, tgthtml
consts.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CSS = """
2
+
3
+ @import url('https://fonts.googleapis.com/css2?family=Noto+Sans+Hebrew:wght@400;700&family=Noto+Naskh+Arabic:wght@400;700&display=swap');
4
+
5
+ #taatik textarea {
6
+ font-size: 25px;
7
+ font-family: 'Noto Sans Hebrew', 'Noto Naskh Arabic', 'SBL Hebrew', 'David CLM', 'FrankRuehl CLM', 'Narkisim', 'Arial', 'Arial Unicode MS', sans-serif;
8
+ }
9
+
10
+ @font-face {
11
+ font-family: 'Noto Sans Hebrew';
12
+ src: url('https://fonts.gstatic.com/s/notosanshebrew/v40/or3HQ7v33eiDlKj4557q0OGCZa662.woff2') format('woff2');
13
+ unicode-range: U+0590-05FF, U+200C-2010, U+20AA, U+25CC, U+FB1D-FB4F;
14
+ }
15
+
16
+ @font-face {
17
+ font-family: 'Noto Naskh Arabic';
18
+ src: url('https://fonts.gstatic.com/s/notonaskharabic/v30/RrQ5bpV-9Dd1b1OAGA6M9PkyDuVBePeKNaxcsss0Y7bwvc5Urqjc.woff2') format('woff2');
19
+ unicode-range: U+0600-06FF, U+0750-077F, U+0870-088E, U+0890-0891, U+0898-08E1, U+08E3-08FF, U+200C-200E, U+2010-2011, U+204F, U+2E41, U+FB50-FDFF, U+FE70-FE74, U+FE76-FEFC;
20
+ }
21
+
22
+ :root { direction: rtl; }
23
+ #liter textarea, #trans textarea { font-size: 25px;}
24
+ #grp { padding: 10px; }
25
+ #trans textarea { direction: rtl; }
26
+ #taatik { direction: rtl; }
27
+ #about { direction: rtl; }
28
+ #tab1 { direction: rtl; }
29
+ #tab2 { direction: rtl; }
30
+ #footer { direction: rtl; }
31
+ #input {direction: rtl;}
32
+ #diac_input {direction: rtl;}
33
+ #diacritized { direction: rtl; }
34
+ #diacritized2 { direction: rtl; }
35
+ #taatik2 { direction: rtl; }
36
+ #diacritized textarea { font-size: 25px;}
37
+ #diacritized2 textarea { font-size: 25px;}
38
+ #taatik2 textarea { font-size: 25px;}
39
+ #input textarea { font-size: 20px;}
40
+ #diac_input textarea { font-size: 20px;}
41
+ #check { border-style: none !important; }
42
+ #nearest { font-family: 'SBL Hebrew', 'David CLM', 'FrankRuehl CLM', 'Narkisim', 'Arial'; }
43
+ :root {--button-secondary-background-focus: #2563eb !important;
44
+ --button-secondary-background-base: #2563eb !important;
45
+ --button-secondary-background-hover: linear-gradient(to bottom right, #0692e8, #5859c2);
46
+ --button-secondary-text-color-base: white !important;
47
+ --button-secondary-text-color-hover: white !important;
48
+ --button-secondary-background-focus: rgb(51 122 216 / 70%) !important;
49
+ --button-secondary-text-color-focus: white !important}
50
+ .dark {--button-secondary-background-base: #2563eb !important;
51
+ --button-secondary-background-focus: rgb(51 122 216 / 70%) !important;
52
+ --button-secondary-background-hover: linear-gradient(to bottom right, #0692e8, #5859c2)}
53
+ .feather-music { stroke: #2563eb; }
54
+
55
+ .dataframe {
56
+ font-family: 'Arial', 'Helvetica', sans-serif !important;
57
+ }
58
+ .dataframe th, .dataframe td {
59
+ font-family: inherit !important;
60
+
61
+ }
62
+
63
+ .gradio-container .dataframe {
64
+ font-family: Arial, sans-serif !important;
65
+
66
+ }
67
+
68
+ /* Target the table header cells */
69
+ .table th .cell-wrap {
70
+ text-align: right !important;
71
+ }
72
+
73
+ /* Target the span inside the header cells */
74
+ .table th .cell-wrap span {
75
+ text-align: right !important;
76
+ display: block;
77
+ font-family: Arial, sans-serif !important;
78
+ }
79
+
80
+ /* Ensure the sort button doesn't interfere with alignment */
81
+ .table th .cell-wrap .sort-button {
82
+ float: left;
83
+ }
84
+
85
+ /* Target the table body cells */
86
+ .table td {
87
+ text-align: right !important;
88
+ }
89
+
90
+ /* Target the span inside the body cells */
91
+ .table td .cell-wrap span {
92
+ text-align: right !important;
93
+ display: block;
94
+ font-family: Arial, sans-serif !important;
95
+ font-size: 20px;
96
+ }
97
+
98
+ """
99
+
100
+ ABOUT = """
101
+ כלי זה נוצר על ידי גיא מור-לן כחלק מפרויקט Levanti. הכלי מתרגם טקסט בעברית לדיאלקטים השונים של ערבית מדוברת, ולהפך, באמצעות מודלים יעודיים של בינה מלאכותית. כמו כן הכלי תומך בניקוד אוטומטי, המרה של הניקוד לתעתוק והפקת שמע (פיצ'רים ניסיוניים). לפרטים נוספים, גישה לדאטה ולמודלים, ולגרסה האנגלית של כלי ראו את [דף הפרויקט בהאגינג פייס](https://huggingface.co/datasets/guymorlan/levanti).
102
+ מעוניינים לתרום לפרויקט? מצאתם טעות? מוזמנים ליצור קשר [כאן](mailto:guy.mor@mail.huji.ac.il). מוזמנים לבדוק גם את אתר האחות [סרטונים בערבית](https://videosinarabic.com/).
103
+ """
104
+
105
+ JS_FUNC = """
106
+ function refresh() {
107
+ const url = new URL(window.location);
108
+
109
+ if (url.searchParams.get('__theme') !== 'dark') {
110
+ url.searchParams.set('__theme', 'dark');
111
+ window.location.href = url.href;
112
+ }
113
+ }
114
+ """
diacritize.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #%%
2
+ from transformers import RobertaForTokenClassification, AutoTokenizer
3
+ model = RobertaForTokenClassification.from_pretrained("guymorlan/levanti_arabic2diacritics")
4
+ tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_arabic2diacritics")
5
+
6
+ #%%
7
# Index in the classifier's per-character output -> Arabic diacritic mark.
label2diacritic = {0: 'ّ',  # SHADDA
                   1: 'َ',  # FATHA
                   2: 'ِ',  # KASRA
                   3: 'ُ',  # DAMMA
                   4: 'ْ'}  # SUKUN


def arabic2diacritics(text, model, tokenizer):
    """Insert the diacritics predicted by ``model`` after each character of ``text``.

    Each prediction row is thresholded at 0.5 independently per label, so a
    character may receive several marks. Vowel marks (labels 1-4) are emitted
    first and shadda (label 0) last.

    Args:
        text: Input string.
        model: Callable returning an object with a ``logits`` attribute when
            invoked with the tokenizer output as keyword arguments.
        tokenizer: Callable producing the model inputs for ``text``.

    Returns:
        ``text`` with the predicted diacritic characters interleaved.
    """
    import torch  # local import: this module does not import torch at top level

    tokens = tokenizer(text, return_tensors="pt")
    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**tokens).logits
    preds = (logits.sigmoid() > 0.5)[0][1:-1]  # remove preds for BOS and EOS

    # NOTE(review): zipping predictions with characters assumes the tokenizer
    # yields exactly one token per character - confirm for the model in use.
    pieces = []
    for flags, char in zip(preds, text):
        pieces.append(char)
        pieces.extend(label2diacritic[i] for i in range(1, 5) if flags[i])
        # check shadda last
        if flags[0]:
            pieces.append(label2diacritic[0])

    return "".join(pieces)
29
+
30
+
31
def diacritize(text):
    """Diacritize ``text`` with the module-level model and tokenizer."""
    result = arabic2diacritics(text, model=model, tokenizer=tokenizer)
    return result
33
+
34
def diacritize_if_not_already(text):
    """Diacritize ``text`` unless it already contains a known diacritic mark.

    Text holding at least one character from ``label2diacritic`` is returned
    unchanged; otherwise the module-level model and tokenizer are applied.
    """
    known_marks = set(label2diacritic.values())
    if known_marks.isdisjoint(text):
        return arabic2diacritics(text, model, tokenizer)
    return text
39
+ #%%
40
+ # text = "بديش اروح عالمدرسة بكرا"
41
+ # arabic2diacritics(text, model, tokenizer)
42
+ # %%
embedding_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2819479f36c2c7667febd5745f148c1c82b2c691ef1e04defec96e9c70c7b71b
3
+ size 88125517
embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08d31408285b68d57a39850295e26e616291065ca3953fb6db9494e0b66ae61c
3
+ size 319545472
he_ar/README.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-4.0
3
+ datasets:
4
+ - guymorlan/levanti
5
+ language:
6
+ - ar
7
+ - he
8
+ pipeline_tag: translation
9
+ widget:
10
+ - text: P אני רוצה ללכת מחר לחנות
11
+ ---
12
+
13
+ # Levanti (Hebrew -> colloquial Levantine Arabic) translator
14
+
15
+ Trained on the [Levanti](https://huggingface.co/datasets/guymorlan/levanti) dataset by fine-tuning [Helsinki-NLP/opus-mt-he-ar](https://huggingface.co/Helsinki-NLP/opus-mt-he-ar) for 8 epochs.
16
+ This model is trained to support dialect conditional generation by utilizing the first token (followed by a space) as an indicator of the desired dialect:
17
+ * **P** for Palestinian
18
+ * **L** for Lebanese
19
+ * **S** for Syrian
20
+ * **E** for Egyptian
21
+
22
+ # Example usage
23
+
24
+ ```python
25
+ from transformers import pipeline
26
+ trans = pipeline("translation", "guymorlan/levanti_translate_he_ar")
27
+ trans("P אני רוצה ללכת מחר לחנות")
28
+ ```
29
+ ```
30
+ Out[1]: [{'translation_text': 'بدي أروح ع الدكان بكرا'}]
31
+ ```
32
+
33
+ # Attribution
34
+ Created by Guy Mor-Lan.<br>
35
+ Contact: guy.mor AT mail.huji.ac.il
he_ar/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Helsinki-NLP/opus-mt-he-ar",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "swish",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "MarianMTModel"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bad_words_ids": [
12
+ [
13
+ 63303
14
+ ]
15
+ ],
16
+ "bos_token_id": 0,
17
+ "classif_dropout": 0.0,
18
+ "classifier_dropout": 0.0,
19
+ "d_model": 512,
20
+ "decoder_attention_heads": 8,
21
+ "decoder_ffn_dim": 2048,
22
+ "decoder_layerdrop": 0.0,
23
+ "decoder_layers": 6,
24
+ "decoder_start_token_id": 63303,
25
+ "decoder_vocab_size": 63304,
26
+ "dropout": 0.1,
27
+ "encoder_attention_heads": 8,
28
+ "encoder_ffn_dim": 2048,
29
+ "encoder_layerdrop": 0.0,
30
+ "encoder_layers": 6,
31
+ "eos_token_id": 0,
32
+ "extra_pos_embeddings": 63304,
33
+ "forced_eos_token_id": 0,
34
+ "id2label": {
35
+ "0": "LABEL_0",
36
+ "1": "LABEL_1",
37
+ "2": "LABEL_2"
38
+ },
39
+ "init_std": 0.02,
40
+ "is_encoder_decoder": true,
41
+ "label2id": {
42
+ "LABEL_0": 0,
43
+ "LABEL_1": 1,
44
+ "LABEL_2": 2
45
+ },
46
+ "max_length": 512,
47
+ "max_position_embeddings": 512,
48
+ "model_type": "marian",
49
+ "normalize_before": false,
50
+ "normalize_embedding": false,
51
+ "num_beams": 4,
52
+ "num_hidden_layers": 6,
53
+ "pad_token_id": 63303,
54
+ "scale_embedding": true,
55
+ "share_encoder_decoder_embeddings": true,
56
+ "static_position_embeddings": true,
57
+ "torch_dtype": "float32",
58
+ "transformers_version": "4.38.1",
59
+ "use_cache": true,
60
+ "vocab_size": 63304
61
+ }
he_ar/generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 63303
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 63303,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 4,
13
+ "pad_token_id": 63303,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.38.1"
16
+ }
he_ar/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:334b98a0c0db5cd649d23521bf3c7ce41d92238092c7b90811434961390ab7b2
3
+ size 306482848
he_ar/source.spm ADDED
Binary file (896 kB). View file
 
he_ar/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
he_ar/target.spm ADDED
Binary file (899 kB). View file
 
he_ar/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "</s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "63303": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": true,
29
+ "eos_token": "</s>",
30
+ "model_max_length": 512,
31
+ "pad_token": "<pad>",
32
+ "return_tensors": "pt",
33
+ "separate_vocabs": false,
34
+ "source_lang": "heb",
35
+ "sp_model_kwargs": {},
36
+ "target_lang": "ara",
37
+ "tokenizer_class": "MarianTokenizer",
38
+ "unk_token": "<unk>"
39
+ }
he_ar/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
he_ar_ct2/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_source_bos": false,
3
+ "add_source_eos": false,
4
+ "bos_token": "<s>",
5
+ "decoder_start_token": "</s>",
6
+ "eos_token": "</s>",
7
+ "layer_norm_epsilon": null,
8
+ "multi_query_attention": false,
9
+ "unk_token": "<unk>"
10
+ }
he_ar_ct2/model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2ca03c089d8803b9963cfca86b1a760cb56e1f3f776743d21b33aa747bc94e8
3
+ size 307511690
he_ar_ct2/shared_vocabulary.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers==4.42.3
2
+ torch==1.13.1
3
+ sentencepiece==0.1.97
4
+ sacremoses==0.0.53
5
+ pandas==1.5.1
6
+ azure-cognitiveservices-speech==1.38.0
7
+ matplotlib==3.7.0
8
+ python-dotenv
9
+ gradio==4.37.2
10
+ ctranslate2==4.1.0
semsearch.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import pandas as pd
4
+ import translate
5
+ import gradio as gr
6
+
7
+ data = pd.read_csv("./embedding_data.csv")
8
+ embeddings = np.load("./embeddings.npy")
9
+
10
def normalize_vector(v):
    """Scale ``v`` to unit Euclidean norm; a zero-norm input is returned as is."""
    length = np.linalg.norm(v)
    return v if length == 0 else v / length
15
+
16
+
17
def embed_one(model, tokenizer, text, normalize=True):
    """Embed ``text`` as the mean of the encoder's last hidden states.

    Args:
        model: Object exposing ``model.encoder``, called with the tokenizer output.
        tokenizer: Callable producing the encoder inputs (truncation enabled).
        text: Text to embed.
        normalize: When true, the vector is passed through ``normalize_vector``.

    Returns:
        A NumPy vector for the first (only) item of the batch.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        hidden_states = model.model.encoder(**encoded).last_hidden_state
        pooled = hidden_states.mean(axis=1)
        vector = pooled.detach().numpy()[0]

    return normalize_vector(vector) if normalize else vector
27
+
28
+
29
def knn(query_embedding, embeddings, df, k=5, hebrew=True):
    """Return the ``k`` rows of ``df`` most similar to ``query_embedding``.

    Similarity is the dot product between each row of ``embeddings`` and the
    query (cosine similarity if both sides are unit-normalised - presumably
    the case for the stored embeddings; verify against how they were built).

    Args:
        query_embedding: Query vector.
        embeddings: Matrix with one embedding per row of ``df``.
        df: DataFrame with ``arabic``, ``validated`` and ``hebrew``/``english`` columns.
        k: Number of neighbours to return; ``k <= 0`` yields an empty frame.
        hebrew: Select the ``hebrew`` column when true, ``english`` otherwise.

    Returns:
        A DataFrame of the selected rows, most similar first.
    """
    sims = np.dot(embeddings, query_embedding.T)
    if k <= 0:
        # Without this guard the slice [-0:] would select *every* row.
        select = np.array([], dtype=int)
    else:
        select = np.argsort(sims, axis=0)[-k:][::-1].ravel()

    translation_column = "hebrew" if hebrew else "english"
    return df.iloc[select][["arabic", translation_column, "validated"]]
37
+
38
def run_knn(text, k=5):
    """Embed Arabic ``text`` and look up its ``k`` nearest corpus rows.

    The query is embedded with the Arabic-source model and tokenizer from the
    ``translate`` module and matched against the module-level ``embeddings``
    and ``data``; the Hebrew column is returned.
    """
    print(text)
    query_vector = embed_one(
        translate.model_from_ar,
        translate.tokenizer_from_ar,
        text,
    )
    neighbours = knn(query_vector, embeddings, data, k=k, hebrew=True)
    return neighbours
43
+
44
+
45
+ def style_dataframe(df):
46
+ styled_df = df.style.set_properties(**{
47
+ 'font-family': 'Arial, sans-serif',
48
+ 'font-size': '20px',
49
+ 'text-align': 'right',
50
+ 'direction': 'rtl',
51
+ 'align': 'right'
52
+ }).set_table_styles([
53
+ {'selector': 'th', 'props': [('text-align', 'right')]}
54
+ ])
55
+ return styled_df
56
+
57
+
58
+ def style_dataframe(df):
59
+ return df.style.set_table_styles([
60
+ {'selector': 'thead', 'props': [('text-align', 'right')]},
61
+ {'selector': '.index_name', 'props': [('text-align', 'right')]},
62
+ ]).set_properties(**{
63
+ 'text-align': 'right',
64
+ }) # Replace 'column_name' with your actual column name
65
+
66
+
67
def update_df(hidden_arabic):
    """Build the nearest-neighbour table shown for the given Arabic text.

    Looks up the 100 closest corpus rows, replaces the ``validated`` values
    with check/cross emoji, renames the columns to their Hebrew labels, and
    returns a visible ``gr.DataFrame`` holding the styled result.
    """
    df = run_knn(hidden_arabic, 100)
    # run_knn returns the result of chained indexing; build a new frame with
    # assign() rather than setting a column on it in place, which can trigger
    # pandas' SettingWithCopyWarning.
    df = df.assign(
        # truthy -> check mark, falsy -> cross
        validated=df["validated"].apply(lambda x: "✅" if x else "❌")
    ).rename(columns={
        "validated": "מאומת",
        "arabic": "ערבית",
        "hebrew": "עברית",
    })
    styled_df = style_dataframe(df)
    return gr.DataFrame(value=styled_df, visible=True)
translate.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import MarianMTModel, AutoTokenizer
3
+ import ctranslate2
4
+ from colorize import align_words
5
+ import logging
6
+
7
+ # Create a logger
8
+ logger = logging.getLogger()
9
+ logger.setLevel(logging.INFO) # Set to debug to capture all levels of logs
10
+ file_handler = logging.FileHandler('app.log', mode='a') # 'a' mode appends to the file
11
+ file_handler.setLevel(logging.INFO)
12
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
13
+ file_handler.setFormatter(formatter)
14
+ logger.addHandler(file_handler)
15
+
16
+ model_to_ar = MarianMTModel.from_pretrained("./he_ar/", output_attentions=True)
17
+ model_from_ar = MarianMTModel.from_pretrained("./ar_he/", output_attentions=True)
18
+ model_to_ar_ct2 = ctranslate2.Translator("./he_ar_ct2/")
19
+ model_from_ar_ct2 = ctranslate2.Translator("./ar_he_ct2/")
20
+
21
+ tokenizer_to_ar = AutoTokenizer.from_pretrained("./he_ar/")
22
+ tokenizer_from_ar = AutoTokenizer.from_pretrained("./ar_he/")
23
+ print("Done loading models")
24
+
25
+ dialect_map = {
26
+ "Palestinian": "P",
27
+ "Syrian": "S",
28
+ "Lebanese": "L",
29
+ "Egyptian": "E",
30
+ "פלסטיני": "P",
31
+ "סורי": "S",
32
+ "לבנוני": "L",
33
+ "מצרי": "E"
34
+ }
35
+
36
+
37
def translate(text, ct_model, hf_model, tokenizer, to_arabic=True,
              threshold=None, layer=2, head=6):
    """Translate ``text`` and build a word-alignment HTML view.

    The CTranslate2 model produces the translation; the Hugging Face model is
    then run once on the (source, translation) pair so that ``align_words``
    can use its outputs to render source and target HTML.

    Args:
        text: Source sentence.
        ct_model: CTranslate2 translator used for decoding.
        hf_model: Hugging Face seq2seq model run on the finished translation.
        tokenizer: Tokenizer shared by both models.
        to_arabic: Forwarded to ``align_words`` as ``skip_first_src``
            (presumably to skip the dialect prefix token - confirm in
            ``align_words``).
        threshold: Alignment threshold. ``None`` picks a default from the
            input length; an explicit value, including ``0``, is honoured.
        layer: Forwarded to ``align_words``.
        head: Forwarded to ``align_words``.

    Returns:
        ``(html, arabic)`` where ``html`` is the right-to-left alignment
        markup and ``arabic`` is whichever of output / input is Arabic.
    """
    logger.info("Translating: %s", text)
    inp_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(text))
    out_tokens = ct_model.translate_batch([inp_tokens])[0].hypotheses[0]
    out_string = tokenizer.convert_tokens_to_string(out_tokens)

    encoder_input_ids = torch.tensor(
        tokenizer.convert_tokens_to_ids(inp_tokens)).unsqueeze(0)
    # The decoder input is the translation wrapped in <pad> ... </s>.
    decoder_input_ids = torch.tensor(
        tokenizer.convert_tokens_to_ids(["<pad>"] + out_tokens + ['</s>'])
    ).unsqueeze(0)

    # Inference only: no autograd graph is needed for the alignment pass.
    with torch.no_grad():
        colorization_output = hf_model(input_ids=encoder_input_ids,
                                       decoder_input_ids=decoder_input_ids)

    if threshold is None:
        if len(inp_tokens) < 10:
            threshold = 0.05
        elif len(inp_tokens) < 20:
            threshold = 0.10
        else:
            threshold = 0.05

    srchtml, tgthtml = align_words(colorization_output,
                                   tokenizer,
                                   encoder_input_ids,
                                   decoder_input_ids,
                                   threshold,
                                   skip_first_src=to_arabic,
                                   skip_second_src=False,
                                   layer=layer,
                                   head=head)

    html = f"<div style='direction: rtl'>{srchtml}<br><br>{tgthtml}</div>"

    # An empty/blank model output cannot be Arabic; checking it first keeps
    # the character-ratio test from running on an empty string.
    output_is_arabic = bool(out_string.strip()) and is_arabic(out_string)
    arabic = out_string if output_is_arabic else text
    return html, arabic
74
+
75
+
76
+ #%%
77
+
78
+
79
def is_arabic(text):
    """Return True if more than half of the visible characters are Arabic.

    Whitespace of any kind is ignored. A character counts as Arabic when it
    lies in the Unicode block U+0600-U+06FF. Empty or whitespace-only text
    returns False instead of dividing by zero.
    """
    chars = "".join(text.split())
    if not chars:
        return False

    arabic_chars = sum(1 for c in chars if "\u0600" <= c <= "\u06FF")
    return arabic_chars / len(chars) > 0.5
88
+
89
def run_translate(text, dialect=None):
    """Translate ``text`` in whichever direction its script implies.

    Arabic input is translated with the from-Arabic models. Any other input
    is translated to Arabic, optionally prefixed with a dialect code looked
    up in ``dialect_map`` (unknown dialect values are used verbatim).

    Args:
        text: Input sentence.
        dialect: Optional dialect name or code for the to-Arabic direction.

    Returns:
        ``""`` for empty or whitespace-only input, otherwise the
        ``(html, arabic)`` pair produced by ``translate``.
    """
    # NOTE(review): the empty case returns a bare string while the normal
    # case returns a 2-tuple; kept for compatibility - confirm callers cope.
    if not text or not text.strip():
        # Whitespace-only text has no characters to classify or translate.
        return ""

    if is_arabic(text):
        return translate(text, model_from_ar_ct2, model_from_ar, tokenizer_from_ar,
                         to_arabic=False, threshold=None, layer=2, head=1)

    if dialect in dialect_map:
        dialect = dialect_map[dialect]

    text = f"{dialect} {text}" if dialect else text
    return translate(text, model_to_ar_ct2, model_to_ar, tokenizer_to_ar,
                     to_arabic=True, threshold=None, layer=2, head=6)
translit.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #%%
2
+ from transformers import CanineForTokenClassification, AutoTokenizer
3
+ import re
4
+ import torch
5
+
6
+ # instantiate module logger
7
+ import logging
8
+ logger = logging.getLogger(__name__)
9
+ logger.setLevel(logging.INFO)
10
+
11
+ model = CanineForTokenClassification.from_pretrained("guymorlan/levanti_diacritics2translit")
12
+ tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_diacritics2translit")
13
+
14
+ #%%
15
+
16
def diacritics2hebrew_vowels(text, model, tokenizer):
    """Replace characters of ``text`` according to the model's predictions.

    The token-classification model yields one label id per position. The
    first and last positions (CLS and SEP) are dropped and the rest are
    paired with the characters of ``text`` in order. A character predicted
    as ``"O"`` is kept unchanged; any other prediction replaces the
    character with the predicted label string.
    """
    encoded = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**encoded).logits

    # Best label per position for the single sequence, without CLS and SEP.
    label_ids = logits.argmax(-1).tolist()[0][1:-1]

    converted = "".join(
        model.config.id2label[label_id]
        if label_id != model.config.label2id["O"]
        else char
        for label_id, char in zip(label_ids, text)
    )

    logger.warning("Done converting to Hebrew vowels")
    return converted
34
+
35
+ #%%
36
+
37
+
38
+ arabic_to_english = {
39
+ "ا": "a", "أ": "a", "إ": "a", "ء": "a", "ئ": "a", "ؤ": "a",
40
+ "آ": "aa", "ى": "a", "ب": "b", "ت": "t", "ث": "th", "ج": "j",
41
+ "ح": "h", "خ": "kh", "د": "d", "ذ": "dh", "ر": "r", "ز": "z",
42
+ "س": "s", "ش": "sh", "ص": "s", "ض": "d", "ط": "t", "ظ": "z",
43
+ "ع": "a", "غ": "gh", "ف": "f", "ق": "q", "ك": "k", "ل": "l",
44
+ "م": "m", "ن": "n", "ه": "h", "و": "w", "ي": "y", "ة": "h",
45
+ "َ": "a", "ُ": "u", "ِ": "i",
46
+ "،": ",",
47
+ "ֹ": "o", # holam
48
+ "ַ": "a", # patah
49
+ "ִ": "i", # hiriq
50
+ "ְ": "", # shva
51
+ "ֻ": "u", # kubutz
52
+ 'ֵ': "e",
53
+ "ّ": "SHADDA" # shadda
54
+ }
55
+
56
+ arabic_to_hebrew = {
57
+ # regular letters
58
+ "ا": "א", "أ": "א", "إ": "א", "ء": "א", "ئ": "א", "ؤ": "א",
59
+ "آ": "אא", "ى": "א", "ب": "בּ", "ت": "ת", "ث": "ת'", "ج": "ג'",
60
+ "ح": "ח", "خ": "ח'", "د": "ד", "ذ": "ד'", "ر": "ר", "ز": "ז",
61
+ "س": "ס", "ش": "ש", "ص": "צ", "ض": "צ'", "ط": "ט", "ظ": "ט'",
62
+ "ع": "ע", "غ": "ע'", "ف": "פ", "ق": "ק", "ك": "כּ", "ل": "ל",
63
+ "م": "מ", "ن": "נ", "ه": "ה", "و": "ו", "ي": "י", "ة": "ה",
64
+ # special characters
65
+ "،": ",", "َ": "ַ", "ُ": "ֻ", "ِ": "ִ",
66
+ "؟": "?", "؛": ";", "ـ": "",
67
+ # shadda to \u0598
68
+ "ّ": "\u0598",
69
+ }
70
+
71
+ vowels = ["،", ",", "َ", "ַ", "ُ", "ֻ", "ِ", "ִ", 'ֵ']
72
+
73
+ final_letters = {
74
+ "ن": "ן", "م": "ם", "ص": "ץ", "ض": "ץ'", "ف": "ף",
75
+ }
76
+
77
def reorder_hebrew_nikkud(input_string):
    """Move a vowel mark from after an apostrophe to before it.

    Some Arabic letters are transliterated as a Hebrew letter plus an
    apostrophe. When a vowel mark follows such a pair, the mark is moved
    back so it sits on the letter instead of on the apostrophe:
    (letter, apostrophe, mark) becomes (letter, mark, apostrophe).
    """
    marks = (
        "ֹ",  # holam
        "ַ",  # patah
        "ִ",  # hiriq
        "ְ",  # shva
        "ֻ",  # kubutz
        "ֵ",  # tsere
        "ّ",  # shadda
    )
    # Group 1: character in the Hebrew block, 2: apostrophe, 3: one mark.
    pattern = r'([\u0590-\u05FF])(\')([' + ''.join(marks) + '])'
    return re.sub(pattern, r'\1\3\2', input_string)
98
+
99
def reverse_holam_shadda_vav(input_string):
    """Rewrite (holam, shadda stand-in, vav) as (shadda stand-in, vav, holam).

    Done for readability of the rendered text. The shadda is represented by
    the replacement character U+0598 rather than the Arabic shadda itself.
    """
    return re.sub(r'(\u05B9)(\u0598)(\u05D5)', r'\2\3\1', input_string)
107
+
108
def to_taatik(arabic):
    """Transliterate Arabic text into Hebrew letters.

    Each character is mapped through ``arabic_to_hebrew``; characters with
    no mapping pass through unchanged. A letter that has a Hebrew final form
    uses it when it is the last character of the input or is directly
    followed by a space, a period or an Arabic comma. The joined result is
    post-processed by ``reverse_holam_shadda_vav`` and then
    ``reorder_hebrew_nikkud``.
    """
    pieces = []
    last_index = len(arabic) - 1
    for index, letter in enumerate(arabic):
        ends_word = index == last_index or arabic[index + 1] in {" ", ".", "،"}
        if ends_word and letter in final_letters:
            pieces.append(final_letters[letter])
        else:
            pieces.append(arabic_to_hebrew.get(letter, letter))

    joined = reverse_holam_shadda_vav("".join(pieces))
    result = reorder_hebrew_nikkud(joined)
    logger.warning("Done converting to taatik")
    return result
125
+
126
+
127
+
128
+
129
def to_translit(arabic):
    """Transliterate Arabic text into Latin characters.

    Each character is mapped through ``arabic_to_english``; characters with
    no mapping pass through unchanged. A shadda emits nothing itself:
    it upper-cases the transliteration of the letter it applies to. That is
    the previous entry, or the one before it when the previous entry is a
    vowel mark listed in ``vowels``.

    A shadda with no letter to attach to (at the very start of the text, or
    right after a leading vowel mark) is ignored instead of raising
    ``IndexError``.
    """
    translit = []  # [source character, transliteration] pairs
    for letter in arabic:
        mapped = arabic_to_english.get(letter, letter)
        if mapped != "SHADDA":
            translit.append([letter, mapped])
            continue

        if not translit:
            # Nothing precedes the shadda, so there is nothing to double.
            continue

        target = -1
        if translit[-1][0] in vowels:
            if len(translit) < 2:
                # Only a vowel mark precedes the shadda: no consonant to mark.
                continue
            target = -2
        translit[target][1] = translit[target][1].upper()

    return "".join(pair[1] for pair in translit)
145
+
146
+
147
+ # %%
148
+
149
def taatik(text):
    """Convert ``text`` to Hebrew-letter transliteration in two steps.

    The module-level model and tokenizer first rewrite the text via
    ``diacritics2hebrew_vowels``; the result is then passed to
    ``to_taatik``.
    """
    with_hebrew_vowels = diacritics2hebrew_vowels(text, model, tokenizer)
    return to_taatik(with_hebrew_vowels)
151
+
152
+ # text = "لَازِم نِعْطِي رَشَّات وِقَائِيِّة لِلشَّجَر "
153
+ # heb_vowels = diacritics2hebrew_vowels(text, model, tokenizer)
154
+ # #%%
155
+ # to_taatik(heb_vowels)
156
+ # #%%
157
+ # to_translit(heb_vowels)
158
+ # # %%
tts.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #%%
2
+ import azure.cognitiveservices.speech as speechsdk
3
+ import re
4
+ import os
5
+ import hashlib
6
+ import random
7
+
8
+ speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'),
9
+ region=os.environ.get('SPEECH_REGION'))
10
+
11
def do_cleanup(dir='wavs', num_files=100):
    """Trim ``dir`` to at most ``num_files`` files, deleting the oldest first.

    Files are ranked by modification time so the most recently written
    entries survive; ``os.listdir`` order is arbitrary and must not decide
    what gets deleted. Subdirectories are ignored and a missing directory is
    treated as already clean.

    Args:
        dir: Directory to trim.
        num_files: Maximum number of files to keep.
    """
    try:
        names = os.listdir(dir)
    except FileNotFoundError:
        return

    paths = [os.path.join(dir, name) for name in names]
    paths = [path for path in paths if os.path.isfile(path)]

    excess = len(paths) - num_files
    if excess <= 0:
        return

    paths.sort(key=os.path.getmtime)
    for path in paths[:excess]:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already removed, e.g. by a concurrent cleanup.
            pass
16
+
17
def add_sukun(text):
    """Append a sukun to every word that ends in a bare Arabic letter.

    The text is split on whitespace. For each word, a sukun is inserted
    after the final character when that character is an Arabic letter or a
    shadda. If the word ends in a punctuation mark, the character before
    the mark is examined instead and the sukun goes between the two. Words
    are re-joined with single spaces.
    """
    sukun = 'ْ'
    # Letters that receive a sukun, plus the shadda.
    arabic_letters = 'اأإآةبتثجحخدذرزسشصضطظعغفقكلمنهوي' + 'ّ'
    punctuation = '.,;!?،؛؟'

    def process_word(word):
        last = word[-1]
        if last in punctuation:
            # Inspect the character in front of the trailing punctuation.
            if len(word) > 1 and word[-2] in arabic_letters and word[-2] != sukun:
                return word[:-1] + sukun + last
            return word
        if last in arabic_letters and last != sukun:
            return word + sukun
        return word

    tokens = re.findall(r'\S+|[.,;!?،؛؟]', text)
    return ' '.join(map(process_word, tokens))
40
+
41
def get_ssml(text, voice='de-DE-SeraphinaMultilingualNeural'):
    """Wrap ``text`` in an SSML document that reads it as ``ar-SA``.

    ``text`` is XML-escaped and ``voice`` is emitted as a quoted attribute,
    so characters such as ``&``, ``<`` or ``"`` cannot produce malformed
    SSML.

    Args:
        text: Plain text to synthesize (not SSML markup).
        voice: Name of the synthesis voice.

    Returns:
        The SSML document as a string.
    """
    from xml.sax.saxutils import escape, quoteattr

    return (
        '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="ar-SA">'
        f'<voice name={quoteattr(voice)}>'
        f'<lang xml:lang="ar-SA">{escape(text)}</lang>'
        '</voice></speak>'
    )
43
+
44
+
45
def get_audio(input_text, voice='de-DE-FlorianMultilingualNeural', use_ssml=True):
    """Synthesize ``input_text`` to a WAV file and return its path.

    The text is first passed through ``add_sukun``. Results are cached under
    ``wavs/`` in a file named after an MD5 digest of the voice, the SSML
    flag and the processed text, so different voices never share a file.

    Args:
        input_text: Arabic text to speak.
        voice: Voice name used when ``use_ssml`` is true.
        use_ssml: Send an SSML document (true) or the raw text (false).

    Returns:
        Path of the WAV file, or ``None`` when synthesis did not complete.
        In that case any partial file is removed so the failure is not
        cached.
    """
    input_text = add_sukun(input_text)
    cache_key = f"{voice}|{use_ssml}|{input_text}"
    digest = hashlib.md5(cache_key.encode()).hexdigest()
    wav_path = f"wavs/{digest}.wav"

    if os.path.exists(wav_path):
        return wav_path

    # The output directory may not exist on a fresh deployment.
    os.makedirs("wavs", exist_ok=True)

    audio_config = speechsdk.audio.AudioOutputConfig(filename=wav_path)
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm
    )

    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config,
                                                     audio_config=audio_config)
    if use_ssml:
        ssml = get_ssml(input_text, voice=voice)
        result = speech_synthesizer.speak_ssml_async(ssml).get()
    else:
        result = speech_synthesizer.speak_text_async(input_text).get()

    succeeded = result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted
    if succeeded:
        print("Speech synthesized for text [{}]".format(input_text))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

    if not succeeded:
        # Do not leave an empty/partial file behind: it would be served from
        # the cache on every later request for the same text.
        try:
            os.remove(wav_path)
        except OSError:
            pass

    # Clean up the wavs folder at random, on average once every 50 calls.
    if random.randint(1, 50) == 1:
        do_cleanup()

    return wav_path if succeeded else None