Guy Mor-Lan committed
Commit
e35836c
1 Parent(s): 909a784
app.py ADDED
@@ -0,0 +1,106 @@
+ #%%
+ import gradio as gr
+ from dotenv import load_dotenv
+
+ from translate import run_translate
+ from diacritize import diacritize, diacritize_if_not_already
+ from translit import taatik, translit
+ from semsearch import update_df
+ from tts import get_audio
+ from consts import CSS, ABOUT, JS_FUNC
+
+ load_dotenv()
+
+ with gr.Blocks(title="Levanti - Levantine Arabic Translation Tools",
+                css=CSS,
+                theme="default") as demo:
+     # gr.HTML("<h2><span style='color: #2563eb'>Levantine Arabic</span> Translator</h2>")
+     gr.HTML("<h2><span><span style='color: #2563eb'>Levanti</span>ne Translator</span></h2>Levantine Arabic Translation Tools")
+     with gr.Tab('Translation', elem_id="tab1"):
+         with gr.Row():
+             with gr.Column():
+                 input_text = gr.Textbox(label="Input",
+                                         info="Colloquial Arabic or English",
+                                         placeholder="Enter text in Arabic or English",
+                                         lines=2,
+                                         elem_id="input")
+
+                 gr.Examples(["I called him two times, he's not picking up", "خلينا ندور على مطعم تاني"],
+                             input_text,
+                             label="Examples")
+
+                 btn = gr.Button("Translate")
+                 with gr.Row():
+                     dialect = gr.Radio(["Palestinian", "Syrian", "Lebanese", "Egyptian"],
+                                        label="Dialect",
+                                        info="Affects translation to Arabic",
+                                        value="Palestinian")
+
+                 # gr.Markdown("Built by [Guy Mor-Lan](mailto:guy.mor@mail.huji.ac.il). Pronunciation model is specifically tailored to urban Palestinian Arabic. Text-to-speech uses Microsoft Azure's API and may provide different results from the transliterated pronunciation.")
+                 gr.Markdown("Created by [Guy Mor-Lan](mailto:guy.mor@mail.huji.ac.il) as part of the [Levanti](https://huggingface.co/datasets/guymorlan/levanti) project. Audio is produced using Azure TTS with predicted diacritics and heuristics.", elem_id="footer")
+
+             with gr.Column():
+                 with gr.Group(elem_id="grp"):
+                     gr.HTML("Translation")
+                     # gr.Markdown("תרגום", elem_id="diacritized")
+                     translation_output = gr.HTML("<br>", visible=True, label="Translation", elem_id="main")
+
+                     hidden_arabic = gr.Textbox(lines=1, elem_id="trans", visible=False)
+
+                     diacritized_output = gr.Textbox(label="Diacritization (experimental)", lines=1,
+                                                     elem_id="diacritized", interactive=False)
+                     taatik_output = gr.Textbox(label="Transliteration (experimental)", lines=1,
+                                                elem_id="taatik", text_align="right", interactive=False)
+                     # diacritized_output = gr.HTML("<br>", label="ניקוד")
+                     # taatik_output = gr.HTML("<br>", label="תעתיק")
+
+                     audio = gr.Audio(label="Audio (Azure)", interactive=False,
+                                      autoplay=True)
+                     audio_button = gr.Button("Generate Audio")
+                     audio_button.click(get_audio, inputs=[diacritized_output], outputs=[audio])
+
+         btn.click(run_translate, inputs=[input_text, dialect],
+                   outputs=[translation_output, hidden_arabic], api_name="en2ar",
+                   js="function jump(x, y){document.getElementById('main').scrollIntoView(); return [x, y];}")
+
+         input_text.submit(run_translate, inputs=[input_text, dialect],
+                           outputs=[translation_output, hidden_arabic], scroll_to_output=True)
+         hidden_arabic.change(diacritize, inputs=[hidden_arabic], outputs=[diacritized_output])
+         diacritized_output.change(translit, inputs=[diacritized_output], outputs=[taatik_output])
+         # with gr.Row():
+         #     nearest_df = gr.DataFrame(headers=["ערבית", "עברית", "מאומת"], visible=False, wrap=True,
+         #                               elem_id="nearest", label="תוצאות קרובות מתוך קורפוס Levanti", height=300)
+
+         # hidden_arabic.change(update_df, inputs=[hidden_arabic], outputs=[nearest_df])
+
+     with gr.Tab("Diacritization and Transliteration", elem_id="tab2"):
+         with gr.Row():
+             with gr.Column():
+                 diac_text = gr.Textbox(label="Input", placeholder="Insert text in Arabic", lines=1,
+                                        info="For transliteration only, insert diacritized text",
+                                        elem_id="diac_input")
+                 gr.Examples(["خلينا ندور على مطعم تاني", "قَدِيْش حَقّ الْبَنْدُورَة؟"], diac_text,
+                             label="Examples", elem_id="diac_ex")
+                 btn2 = gr.Button("Send")
+
+             with gr.Column():
+                 diacritized_output2 = gr.Textbox(label="Diacritization", lines=1,
+                                                  elem_id="diacritized2")
+                 taatik_output2 = gr.Textbox(label="Transliteration", lines=1,
+                                             elem_id="taatik2")
+
+         # input_text.submit(run_translate, inputs=[input_text, dialect],
+         #                   outputs=[translation_output], scroll_to_output=True)
+         # hidden_arabic.change(diacritize, inputs=[hidden_arabic], outputs=[diacritized_output])
+         # diacritized_output.change(taatik, inputs=[diacritized_output], outputs=[taatik_output])
+         btn2.click(diacritize_if_not_already, inputs=[diac_text], outputs=[diacritized_output2])
+         diac_text.submit(diacritize_if_not_already, inputs=[diac_text], outputs=[diacritized_output2])
+         diacritized_output2.change(translit, inputs=[diacritized_output2], outputs=[taatik_output2])
+     with gr.Tab("About", elem_id="tab3"):
+         with gr.Row():
+             gr.HTML("<h2>About</h2>")
+             gr.Markdown(ABOUT, elem_id="about")
+
+
+ demo.launch(ssl_verify=False)
ar_en ADDED
@@ -0,0 +1 @@
+ Subproject commit b5626c21d9814e83302354362e60d813003f8b97
ar_en_ct2/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "add_source_bos": false,
+   "add_source_eos": false,
+   "bos_token": "<s>",
+   "decoder_start_token": "</s>",
+   "eos_token": "</s>",
+   "layer_norm_epsilon": null,
+   "multi_query_attention": false,
+   "unk_token": "<unk>"
+ }
ar_en_ct2/model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e12f837210b68c3e3f915b1de10bd762feee9e0f7c515f56f521f79d0b6dc5c
+ size 306547250
ar_en_ct2/shared_vocabulary.json ADDED
The diff for this file is too large to render.
 
colorize.py ADDED
@@ -0,0 +1,109 @@
+ import matplotlib.pyplot as plt
+ import numpy as np
+
+
+ def generate_diverging_colors(num_colors, palette='Set3'):  # courtesy of ChatGPT
+     # Generate a colormap with a specified number of colors
+     cmap = plt.cm.get_cmap(palette, num_colors)
+
+     # Get the RGB values of the colors in the colormap
+     colors_rgb = cmap(np.arange(num_colors))
+
+     # Convert the RGB values to hexadecimal color codes
+     colors_hex = [format(int(color[0]*255) << 16 | int(color[1]*255) << 8 | int(color[2]*255), '06x') for color in colors_rgb]
+
+     return colors_hex
+
+
+ def align_words(outputs, tokenizer, encoder_input_ids, decoder_input_ids,
+                 threshold=0.4, skip_first_src=True, skip_second_src=False,
+                 layer=2, head=6):
+
+     # collect, for each target token, the source positions whose cross-attention exceeds the threshold
+     alignment = []
+     for i, tok in enumerate(outputs.cross_attentions[layer][0][head]):
+         alignment.append([[i], (tok > threshold).nonzero().squeeze(-1).tolist()])
+
+     # for i in alignment:
+     #     src_tok = [tokenizer.decode(decoder_input_ids[0][x]) for x in i[0]]
+     #     trg_tok = [tokenizer.decode(encoder_input_ids[0][x]) for x in i[1]]
+     #     print(src_tok, "=>", trg_tok)
+
+     merged = []
+     for i in alignment:
+         token = tokenizer.convert_ids_to_tokens([decoder_input_ids[0][i[0]]])[0]
+         if token not in ["</s>", "<pad>", "<unk>", "<s>"]:
+             if merged:
+                 tomerge = False
+                 # check overlap with previous entry
+                 for x in i[1]:
+                     if x in merged[-1][1]:  # or tokenizer.convert_ids_to_tokens([encoder_input_ids[0][x]])[0][0] != "▁":
+                         tomerge = True
+                         break
+                 # if the first character is not a "▁", this subword continues the previous word
+                 if token[0] != "▁":
+                     tomerge = True
+                 if tomerge:
+                     merged[-1][0] += i[0]
+                     merged[-1][1] += i[1]
+                 else:
+                     merged.append(i)
+             else:
+                 merged.append(i)
+
+     # print("=====MERGED=====")
+     # for i in merged:
+     #     src_tok = [tokenizer.decode(decoder_input_ids[0][x]) for x in i[0]]
+     #     trg_tok = [tokenizer.decode(encoder_input_ids[0][x]) for x in i[1]]
+     #     print(src_tok, "=>", trg_tok)
+
+     colordict = {}
+     ncolors = 0
+     for i in merged:
+         src_tok = [f"src_{x}" for x in i[0]]
+         trg_tok = [f"trg_{x}" for x in i[1]]
+         all_tok = src_tok + trg_tok
+         # see if any tokens in this entry already have an associated color
+         newcolor = None
+         for t in all_tok:
+             if t in colordict:
+                 newcolor = colordict[t]
+                 break
+         if newcolor is None:  # 0 is a valid color id, so compare against None
+             newcolor = ncolors
+             ncolors += 1
+         for t in all_tok:
+             if t not in colordict:
+                 colordict[t] = newcolor
+
+     colors = generate_diverging_colors(ncolors, palette="Set2")
+     id_to_color = {i: c for i, c in enumerate(colors)}
+     for k, v in colordict.items():
+         colordict[k] = id_to_color[v]
+
+     tgthtml = []
+     for i, token in enumerate(decoder_input_ids[0]):
+         if f"src_{i}" in colordict:
+             label = f"src_{i}"
+             tgthtml.append(f"<span style='color: #{colordict[label]}'>{tokenizer.convert_ids_to_tokens([token])[0]}</span>")
+         else:
+             tgthtml.append(f"<span style='color: --color-text-body'>{tokenizer.convert_ids_to_tokens([token])[0]}</span>")
+     tgthtml = "".join(tgthtml)
+     tgthtml = tgthtml.replace("▁", " ")
+     tgthtml = f"<span style='font-size: 25px'>{tgthtml}</span>"
+
+     srchtml = []
+     for i, token in enumerate(encoder_input_ids[0]):
+         if (i == 0 and skip_first_src) or (i == 1 and skip_second_src):
+             continue
+
+         if f"trg_{i}" in colordict:
+             label = f"trg_{i}"
+             srchtml.append(f"<span style='color: #{colordict[label]}'>{tokenizer.convert_ids_to_tokens([token])[0]}</span>")
+         else:
+             srchtml.append(f"<span style='color: --color-text-body'>{tokenizer.convert_ids_to_tokens([token])[0]}</span>")
+     srchtml = "".join(srchtml)
+     srchtml = srchtml.replace("▁", " ")
+     srchtml = f"<span style='font-size: 25px'>{srchtml}</span>"
+     return srchtml, tgthtml
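
A note on the helper above: generate_diverging_colors returns six-digit hex strings without a leading '#', which align_words interpolates into inline "color: #..." spans. A minimal sketch of its output (exact values depend on matplotlib's colormap data):

    from colorize import generate_diverging_colors

    colors = generate_diverging_colors(3, palette="Set2")
    print(colors)  # e.g. ['66c2a5', 'fc8d62', '8da0cb']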
consts.py ADDED
@@ -0,0 +1,104 @@
+ CSS = """
+ @import url('https://fonts.googleapis.com/css2?family=Noto+Sans+Hebrew:wght@400;700&family=Noto+Naskh+Arabic:wght@400;700&display=swap');
+
+ #taatik textarea {
+     font-size: 25px;
+     font-family: 'Noto Sans Hebrew', 'Noto Naskh Arabic', 'SBL Hebrew', 'David CLM', 'FrankRuehl CLM', 'Narkisim', 'Arial', 'Arial Unicode MS', sans-serif;
+ }
+
+ @font-face {
+     font-family: 'Noto Sans Hebrew';
+     src: url('https://fonts.gstatic.com/s/notosanshebrew/v40/or3HQ7v33eiDlKj4557q0OGCZa662.woff2') format('woff2');
+     unicode-range: U+0590-05FF, U+200C-2010, U+20AA, U+25CC, U+FB1D-FB4F;
+ }
+
+ @font-face {
+     font-family: 'Noto Naskh Arabic';
+     src: url('https://fonts.gstatic.com/s/notonaskharabic/v30/RrQ5bpV-9Dd1b1OAGA6M9PkyDuVBePeKNaxcsss0Y7bwvc5Urqjc.woff2') format('woff2');
+     unicode-range: U+0600-06FF, U+0750-077F, U+0870-088E, U+0890-0891, U+0898-08E1, U+08E3-08FF, U+200C-200E, U+2010-2011, U+204F, U+2E41, U+FB50-FDFF, U+FE70-FE74, U+FE76-FEFC;
+ }
+
+ #liter textarea, #trans textarea { font-size: 25px; }
+ #grp { padding: 10px; }
+ #diac_input textarea { direction: rtl; font-size: 20px; }
+ #diacritized textarea, #diacritized2 textarea { direction: rtl; font-size: 25px; }
+ #taatik2 textarea { font-size: 25px; }
+ #input textarea { font-size: 20px; }
+ #check { border-style: none !important; }
+ #nearest { font-family: 'SBL Hebrew', 'David CLM', 'FrankRuehl CLM', 'Narkisim', 'Arial'; }
+
+ :root {
+     --button-secondary-background-base: #2563eb !important;
+     --button-secondary-background-hover: linear-gradient(to bottom right, #0692e8, #5859c2);
+     --button-secondary-background-focus: rgb(51 122 216 / 70%) !important;
+     --button-secondary-text-color-base: white !important;
+     --button-secondary-text-color-hover: white !important;
+     --button-secondary-text-color-focus: white !important;
+ }
+ .dark {
+     --button-secondary-background-base: #2563eb !important;
+     --button-secondary-background-focus: rgb(51 122 216 / 70%) !important;
+     --button-secondary-background-hover: linear-gradient(to bottom right, #0692e8, #5859c2);
+ }
+ .feather-music { stroke: #2563eb; }
+
+ .dataframe {
+     font-family: 'Arial', 'Helvetica', sans-serif !important;
+ }
+ .dataframe th, .dataframe td {
+     font-family: inherit !important;
+ }
+
+ .gradio-container .dataframe {
+     font-family: Arial, sans-serif !important;
+ }
+
+ /* Target the table header cells */
+ .table th .cell-wrap {
+     text-align: right !important;
+ }
+
+ /* Target the span inside the header cells */
+ .table th .cell-wrap span {
+     text-align: right !important;
+     display: block;
+     font-family: Arial, sans-serif !important;
+ }
+
+ /* Ensure the sort button doesn't interfere with alignment */
+ .table th .cell-wrap .sort-button {
+     float: left;
+ }
+
+ /* Target the table body cells */
+ .table td {
+     text-align: right !important;
+ }
+
+ /* Target the span inside the body cells */
+ .table td .cell-wrap span {
+     text-align: right !important;
+     display: block;
+     font-family: Arial, sans-serif !important;
+     font-size: 20px;
+ }
+ """
+
+ ABOUT = """
+ This tool was created by Guy Mor-Lan as part of the Levanti project. It translates text between English and various Levantine Arabic dialects using specialized AI models, and also supports automatic diacritization, conversion of diacritized text to transliteration, and audio generation (experimental features). For more information, access to the data and models, and the Hebrew version of the tool, see the [project page on Hugging Face](https://huggingface.co/datasets/guymorlan/levanti).
+ """
+
+ JS_FUNC = """
+ function refresh() {
+     const url = new URL(window.location);
+
+     if (url.searchParams.get('__theme') !== 'dark') {
+         url.searchParams.set('__theme', 'dark');
+         window.location.href = url.href;
+     }
+ }
+ """
diacritize.py ADDED
@@ -0,0 +1,42 @@
+ #%%
+ from transformers import RobertaForTokenClassification, AutoTokenizer
+
+ model = RobertaForTokenClassification.from_pretrained("guymorlan/levanti_arabic2diacritics")
+ tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_arabic2diacritics")
+
+ #%%
+ label2diacritic = {0: 'ّ',  # SHADDA
+                    1: 'َ',  # FATHA
+                    2: 'ِ',  # KASRA
+                    3: 'ُ',  # DAMMA
+                    4: 'ْ'}  # SUKUN
+
+
+ def arabic2diacritics(text, model, tokenizer):
+     # multi-label token classification: each character can receive several diacritics
+     tokens = tokenizer(text, return_tensors="pt")
+     preds = (model(**tokens).logits.sigmoid() > 0.5)[0][1:-1]  # remove preds for BOS and EOS
+     new_text = []
+     for p, c in zip(preds, text):
+         new_text.append(c)
+         for i in range(1, 5):
+             if p[i]:
+                 new_text.append(label2diacritic[i])
+         # check shadda last
+         if p[0]:
+             new_text.append(label2diacritic[0])
+
+     new_text = "".join(new_text)
+     return new_text
+
+
+ def diacritize(text):
+     return arabic2diacritics(text, model, tokenizer)
+
+
+ def diacritize_if_not_already(text):
+     if any(c in label2diacritic.values() for c in text):
+         return text
+     else:
+         return arabic2diacritics(text, model, tokenizer)
+
+ #%%
+ # text = "بديش اروح عالمدرسة بكرا"
+ # arabic2diacritics(text, model, tokenizer)
+ # %%
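
For illustration, a minimal sketch of how this module is driven elsewhere in the app (it loads the guymorlan/levanti_arabic2diacritics checkpoint at import time; the example string is arbitrary):

    from diacritize import diacritize, diacritize_if_not_already

    plain = "خلينا ندور على مطعم تاني"
    marked = diacritize(plain)                 # inserts predicted diacritics after each base character
    again = diacritize_if_not_already(marked)  # text that already carries a diacritic is returned as-is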
en_ar ADDED
@@ -0,0 +1 @@
+ Subproject commit 76d2a612d5c6b4cc8fe16bd7608e7a0809a96ba5
en_ar_ct2/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "add_source_bos": false,
+   "add_source_eos": false,
+   "bos_token": "<s>",
+   "decoder_start_token": "</s>",
+   "eos_token": "</s>",
+   "layer_norm_epsilon": null,
+   "multi_query_attention": false,
+   "unk_token": "<unk>"
+ }
en_ar_ct2/model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00ce25a723eef083222e4700514043b654c329c9202e11204e630484810b2235
+ size 306481586
en_ar_ct2/shared_vocabulary.json ADDED
The diff for this file is too large to render.
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ transformers==4.42.3
+ torch==1.13.1
+ sentencepiece==0.1.97
+ sacremoses==0.0.53
+ pandas==1.5.1
+ azure-cognitiveservices-speech==1.25.0
+ matplotlib==3.7.0
+ python-dotenv
+ gradio==4.37.2
+ ctranslate2==4.1.0
semsearch.py ADDED
@@ -0,0 +1,78 @@
+ import numpy as np
+ import torch
+ import pandas as pd
+ import translate
+ import gradio as gr
+
+ # NB: the corpus and its embeddings must be loaded for run_knn/update_df to work;
+ # they are commented out here and the corresponding UI in app.py is disabled
+ # data = pd.read_csv("./embedding_data.csv")
+ # embeddings = np.load("./embeddings.npy")
+
+ def normalize_vector(v):
+     norm = np.linalg.norm(v)
+     if norm == 0:
+         return v
+     return v / norm
+
+
+ def embed_one(model, tokenizer, text, normalize=True):
+     # mean-pool the encoder's last hidden state into a single sentence vector
+     tokens = tokenizer(text, return_tensors="pt", truncation=True)
+     with torch.no_grad():
+         embedding = model.model.encoder(**tokens).last_hidden_state.mean(axis=1)
+         embedding = embedding.detach().numpy()[0]
+
+     if normalize:
+         return normalize_vector(embedding)
+     else:
+         return embedding
+
+
+ def knn(query_embedding, embeddings, df, k=5, hebrew=True):
+     sims = np.dot(embeddings, query_embedding.T)
+     outs = np.argsort(sims, axis=0)[-k:][::-1]
+     select = outs.ravel()
+     if hebrew:
+         return df.iloc[select][["arabic", "hebrew", "validated"]]
+     else:
+         return df.iloc[select][["arabic", "english", "validated"]]
+
+
+ def run_knn(text, k=5):
+     print(text)
+     query_embedding = embed_one(translate.model_from_ar,
+                                 translate.tokenizer_from_ar, text)
+     return knn(query_embedding, embeddings, data, k=k, hebrew=True)
+
+
+ def style_dataframe(df):
+     return df.style.set_table_styles([
+         {'selector': 'thead', 'props': [('text-align', 'right')]},
+         {'selector': '.index_name', 'props': [('text-align', 'right')]},
+     ]).set_properties(**{
+         'text-align': 'right',
+     })
+
+
+ def update_df(hidden_arabic):
+     df = run_knn(hidden_arabic, 100)
+     # replace True/False in the validated column with checkmark and cross emoji
+     df["validated"] = df["validated"].apply(lambda x: "✅" if x else "❌")
+     # rename columns to their Hebrew display headers
+     df = df.rename(columns={"validated": "מאומת", "arabic": "ערבית", "hebrew": "עברית"})
+     styled_df = style_dataframe(df)
+     return gr.DataFrame(value=styled_df, visible=True)
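
For illustration, a toy run of the cosine-similarity kNN above with hypothetical arrays (the real app loads embedding_data.csv and embeddings.npy; note that importing semsearch also pulls in translate and its models):

    import numpy as np
    import pandas as pd
    from semsearch import knn

    df = pd.DataFrame({"arabic": ["أ", "ب"], "hebrew": ["א", "ב"], "validated": [True, False]})
    embs = np.array([[1.0, 0.0], [0.0, 1.0]])  # unit-norm rows, so the dot product is cosine similarity
    q = np.array([0.9, 0.1])
    print(knn(q, embs, df, k=1))               # nearest neighbour: the first row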
translate.py ADDED
@@ -0,0 +1,101 @@
+ import torch
+ from transformers import MarianMTModel, AutoTokenizer
+ import ctranslate2
+ from colorize import align_words
+ import logging
+
+ # Create a logger that appends to app.log
+ logger = logging.getLogger()
+ logger.setLevel(logging.INFO)
+ file_handler = logging.FileHandler('app.log', mode='a')  # 'a' mode appends to the file
+ file_handler.setLevel(logging.INFO)
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ file_handler.setFormatter(formatter)
+ logger.addHandler(file_handler)
+
+ # the HF models provide cross-attentions for word alignment (colorization);
+ # the CTranslate2 models do the actual decoding
+ model_to_ar = MarianMTModel.from_pretrained("./en_ar/", output_attentions=True)
+ model_from_ar = MarianMTModel.from_pretrained("./ar_en/", output_attentions=True)
+ model_to_ar_ct2 = ctranslate2.Translator("./en_ar_ct2/")
+ model_from_ar_ct2 = ctranslate2.Translator("./ar_en_ct2/")
+
+ tokenizer_to_ar = AutoTokenizer.from_pretrained("./en_ar/")
+ tokenizer_from_ar = AutoTokenizer.from_pretrained("./ar_en/")
+ print("Done loading models")
+
+ # both English and Hebrew UI labels map to the same one-letter dialect codes
+ dialect_map = {
+     "Palestinian": "P",
+     "Syrian": "S",
+     "Lebanese": "L",
+     "Egyptian": "E",
+     "פלסטיני": "P",
+     "סורי": "S",
+     "לבנוני": "L",
+     "מצרי": "E"
+ }
+
+
+ def translate(text, ct_model, hf_model, tokenizer, to_arabic=True,
+               threshold=None, layer=2, head=6):
+
+     logger.info(f"Translating: {text}")
+     inp_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(text))
+     out_tokens = ct_model.translate_batch([inp_tokens])[0].hypotheses[0]
+     out_string = tokenizer.convert_tokens_to_string(out_tokens)
+
+     encoder_input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(inp_tokens)).unsqueeze(0)
+     decoder_input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(["<pad>"] + out_tokens +
+                                                                      ['</s>'])).unsqueeze(0)
+
+     colorization_output = hf_model(input_ids=encoder_input_ids,
+                                    decoder_input_ids=decoder_input_ids)
+
+     # pick an attention threshold based on input length if none was given
+     if not threshold:
+         if len(inp_tokens) < 10:
+             threshold = 0.05
+         elif len(inp_tokens) < 20:
+             threshold = 0.10
+         else:
+             threshold = 0.05
+
+     srchtml, tgthtml = align_words(colorization_output,
+                                    tokenizer,
+                                    encoder_input_ids,
+                                    decoder_input_ids,
+                                    threshold,
+                                    skip_first_src=to_arabic,
+                                    skip_second_src=False,
+                                    layer=layer,
+                                    head=head)
+
+     html = f"<div style='direction: rtl'>{srchtml}<br><br>{tgthtml}</div>"
+
+     arabic = out_string if is_arabic(out_string) else text
+     return html, arabic
+
+
+ def is_arabic(text):
+     # return True if more than 50% of the non-space characters are Arabic
+     text = text.replace(" ", "")
+     if not text:
+         return False
+     arabic_chars = 0
+     for c in text:
+         if "\u0600" <= c <= "\u06FF":
+             arabic_chars += 1
+
+     return arabic_chars / len(text) > 0.5
+
+
+ def run_translate(text, dialect=None):
+     if not text:
+         return "", ""  # the app wires two outputs (translation HTML and hidden Arabic text)
+     if is_arabic(text):
+         return translate(text, model_from_ar_ct2, model_from_ar, tokenizer_from_ar,
+                          to_arabic=False, threshold=None, layer=2, head=7)
+     else:
+         if dialect in dialect_map:
+             dialect = dialect_map[dialect]
+
+         text = f"{dialect} {text}" if dialect else text
+         return translate(text, model_to_ar_ct2, model_to_ar, tokenizer_to_ar,
+                          to_arabic=True, threshold=None, layer=2, head=7)
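
For illustration, the expected call pattern for run_translate (a sketch; it assumes the local model folders from this commit are present):

    from translate import run_translate

    html, arabic = run_translate("خلينا ندور على مطعم تاني")  # Arabic input is detected and translated to English
    html2, arabic2 = run_translate("I called him two times", dialect="Palestinian")
    # the dialect is mapped to a one-letter code and prefixed to the source text ("P I called him ...")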
translit.py ADDED
@@ -0,0 +1,161 @@
+ #%%
+ from transformers import CanineForTokenClassification, AutoTokenizer
+ import re
+ import torch
+
+ # instantiate module logger
+ import logging
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+
+ model = CanineForTokenClassification.from_pretrained("guymorlan/levanti_diacritics2translit")
+ tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_diacritics2translit")
+
+ #%%
+
+ def diacritics2hebrew_vowels(text, model, tokenizer):
+     # character-level model: Arabic diacritics are replaced with Hebrew vowel signs
+     tokens = tokenizer(text, return_tensors="pt")
+     with torch.no_grad():
+         pred = model(**tokens)
+     pred = pred.logits.argmax(-1).tolist()
+
+     pred = pred[0][1:-1]  # remove CLS and SEP
+     output = []
+     for p, c in zip(pred, text):
+         if p != model.config.label2id["O"]:
+             output.append(model.config.id2label[p])
+         else:
+             output.append(c)
+     output = "".join(output)
+
+     # print("Done converting to Hebrew vowels")
+     logger.warning("Done converting to Hebrew vowels")
+     return output
+
+ #%%
+
+ arabic_to_english = {
+     "ا": "a", "أ": "a", "إ": "a", "ء": "a", "ئ": "a", "ؤ": "a",
+     "آ": "aa", "ى": "a", "ب": "b", "ت": "t", "ث": "th", "ج": "j",
+     "ح": "h", "خ": "kh", "د": "d", "ذ": "dh", "ر": "r", "ز": "z",
+     "س": "s", "ش": "sh", "ص": "s", "ض": "d", "ط": "t", "ظ": "z",
+     "ع": "a", "غ": "gh", "ف": "f", "ق": "q", "ك": "k", "ل": "l",
+     "م": "m", "ن": "n", "ه": "h", "و": "w", "ي": "y", "ة": "h",
+     "َ": "a", "ُ": "u", "ِ": "i",
+     "،": ",",
+     "ֹ": "o",   # holam
+     "ַ": "a",   # patah
+     "ִ": "i",   # hiriq
+     "ְ": "",    # shva
+     "ֻ": "u",   # kubutz
+     "ֵ": "e",   # tsere
+     "ّ": "SHADDA"  # shadda
+ }
+
+ arabic_to_hebrew = {
+     # regular letters
+     "ا": "א", "أ": "א", "إ": "א", "ء": "א", "ئ": "א", "ؤ": "א",
+     "آ": "אא", "ى": "א", "ب": "בּ", "ت": "ת", "ث": "ת'", "ج": "ג'",
+     "ح": "ח", "خ": "ח'", "د": "ד", "ذ": "ד'", "ر": "ר", "ز": "ז",
+     "س": "ס", "ش": "ש", "ص": "צ", "ض": "צ'", "ط": "ט", "ظ": "ט'",
+     "ع": "ע", "غ": "ע'", "ف": "פ", "ق": "ק", "ك": "כּ", "ل": "ל",
+     "م": "מ", "ن": "נ", "ه": "ה", "و": "ו", "ي": "י", "ة": "ה",
+     # special characters
+     "،": ",", "َ": "ַ", "ُ": "ֻ", "ِ": "ִ",
+     "؟": "?", "؛": ";", "ـ": "",
+     # shadda to \u0598
+     "ّ": "\u0598",
+ }
+
+ vowels = ["،", ",", "َ", "ַ", "ُ", "ֻ", "ِ", "ִ", "ֵ"]
+
+ final_letters = {
+     "ن": "ן", "م": "ם", "ص": "ץ", "ض": "ץ'", "ف": "ף",
+ }
+
+ def reorder_hebrew_nikkud(input_string):
+     # For two-character letter transliterations, move the nikkud back onto the
+     # first character rather than the apostrophe
+
+     # Define a dictionary for the nikkud signs
+     nikkud_signs = {
+         "ֹ": "o",   # holam
+         "ַ": "a",   # patah
+         "ִ": "i",   # hiriq
+         "ְ": "",    # shva
+         "ֻ": "u",   # kubutz
+         "ֵ": "e",   # tsere
+         "ّ": "SHADDA"  # shadda
+     }
+
+     # Regex pattern: Hebrew letter, then apostrophe, then one of the nikkud signs
+     pattern = r'([\u0590-\u05FF])(\')([' + ''.join(nikkud_signs.keys()) + '])'
+     replacement = r'\1\3\2'
+
+     result = re.sub(pattern, replacement, input_string)
+
+     return result
+
+ def reverse_holam_shadda_vav(input_string):
+     # For better readability, replace (holam, shadda, ו) with (shadda, ו, holam);
+     # instead of shadda we use the replacement \u0598
+     pattern = r'(\u05B9)(\u0598)(\u05D5)'
+     replacement = r'\2\3\1'
+     result = re.sub(pattern, replacement, input_string)
+
+     return result
+
+ def to_taatik(arabic):
+     taatik = []
+     for index, letter in enumerate(arabic):
+         if (
+             (index == len(arabic) - 1 or arabic[index + 1] in {" ", ".", "،"}) and
+             letter in final_letters
+         ):
+             taatik.append(final_letters[letter])
+         elif letter not in arabic_to_hebrew:
+             taatik.append(letter)
+         else:
+             taatik.append(arabic_to_hebrew[letter])
+     reversed_text = reverse_holam_shadda_vav("".join(taatik))
+     reordered = reorder_hebrew_nikkud(reversed_text)
+     # print("Done converting to taatik")
+     logger.warning("Done converting to taatik")
+     return reordered
+
+
+ def to_translit(arabic):
+     translit = []
+     for letter in arabic:
+         if letter not in arabic_to_english:
+             translit.append([letter, letter])
+         else:
+             if arabic_to_english[letter] == "SHADDA":
+                 # gemination: uppercase the preceding consonant (skipping a trailing vowel)
+                 if translit[-1][0] in vowels:
+                     translit[-2][1] = translit[-2][1].upper()
+                 else:
+                     translit[-1][1] = translit[-1][1].upper()
+             else:
+                 translit.append([letter, arabic_to_english[letter]])
+
+     return "".join([x[1] for x in translit])
+
+
+ # %%
+
+ def taatik(text):
+     return to_taatik(diacritics2hebrew_vowels(text, model, tokenizer))
+
+ def translit(text):
+     return to_translit(diacritics2hebrew_vowels(text, model, tokenizer))
+
+ # text = "لَازِم نِعْطِي رَشَّات وِقَائِيِّة لِلشَّجَر "
+ # heb_vowels = diacritics2hebrew_vowels(text, model, tokenizer)
+ # to_taatik(heb_vowels)
+ # to_translit(heb_vowels)
+ # %%
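
For illustration, the two public helpers side by side (a sketch; assumes the guymorlan/levanti_diacritics2translit checkpoint loads):

    from translit import taatik, translit

    text = "قَدِيْش حَقّ الْبَنْدُورَة؟"
    print(taatik(text))    # Hebrew-script transliteration with nikkud
    print(translit(text))  # Latin transliteration; a shadda uppercases the geminated consonant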
tts.py ADDED
@@ -0,0 +1,86 @@
+ #%%
+ import azure.cognitiveservices.speech as speechsdk
+ import re
+ import os
+ import hashlib
+ import random
+ from dotenv import load_dotenv
+
+ load_dotenv(".env")
+
+ # debug: report whether credentials are configured without echoing the key itself
+ print("SPEECH_KEY set:", bool(os.environ.get('SPEECH_KEY')))
+ print("SPEECH_REGION:", os.environ.get('SPEECH_REGION'))
+ speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'),
+                                        region=os.environ.get('SPEECH_REGION'))
+
+ def do_cleanup(dir='wavs', num_files=100):
+     # keep the audio cache from growing without bound
+     files = os.listdir(dir)
+     if len(files) > num_files:
+         for file in files[:len(files) - num_files]:
+             os.remove(f"{dir}/{file}")
+
+ def add_sukun(text):
+     # Define Arabic letters and sukun
+     arabic_letters = 'اأإآةبتثجحخدذرزسشصضطظعغفقكلمنهوي'
+     shadda = 'ّ'
+     arabic_letters += shadda
+     sukun = 'ْ'
+     punctuation = '.,;!?،؛؟'
+
+     def process_word(word):
+         # If the last character is punctuation, process the letter before it
+         if word[-1] in punctuation:
+             if len(word) > 1 and word[-2] in arabic_letters and word[-2] != sukun:
+                 return word[:-2] + word[-2] + sukun + word[-1]
+             return word
+         # If the last character is an Arabic letter and does not have a sukun, add one
+         elif word[-1] in arabic_letters and word[-1] != sukun:
+             return word + sukun
+         return word
+
+     # Use regex to split text into words and punctuation
+     words = re.findall(r'\S+|[.,;!?،؛؟]', text)
+     processed_text = ' '.join(process_word(word) for word in words)
+     return processed_text
+
+ def get_ssml(text, voice='de-DE-SeraphinaMultilingualNeural'):
+     return f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="ar-SA"><voice name="{voice}"><lang xml:lang="ar-SA">{text}</lang></voice></speak>'
+
+
+ def get_audio(input_text, voice='de-DE-FlorianMultilingualNeural', use_ssml=True):
+
+     input_text = add_sukun(input_text)
+     file_hash = hashlib.md5(input_text.encode()).hexdigest()
+
+     os.makedirs("wavs", exist_ok=True)  # ensure the cache directory exists
+     if os.path.exists(f"wavs/{file_hash}.wav"):
+         return f"wavs/{file_hash}.wav"
+
+     audio_config = speechsdk.audio.AudioOutputConfig(filename=f"wavs/{file_hash}.wav")
+     # speech_config.speech_synthesis_voice_name = voice
+     # speech_config.speech_synthesis_language = "ar-EG"
+     speech_config.set_speech_synthesis_output_format(
+         speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm
+     )
+
+     speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config,
+                                                      audio_config=audio_config)
+     if use_ssml:
+         # print("Using SSML")
+         ssml = get_ssml(input_text, voice=voice)
+         result = speech_synthesizer.speak_ssml_async(ssml).get()
+     else:
+         # print("Using text")
+         result = speech_synthesizer.speak_text_async(input_text).get()
+
+     if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
+         print("Speech synthesized for text [{}]".format(input_text))
+     elif result.reason == speechsdk.ResultReason.Canceled:
+         cancellation_details = result.cancellation_details
+         print("Speech synthesis canceled: {}".format(cancellation_details.reason))
+         if cancellation_details.reason == speechsdk.CancellationReason.Error:
+             print("Error details: {}".format(cancellation_details.error_details))
+
+     # on roughly every 50th call, clean up the wavs folder
+     if random.randint(1, 50) == 1:
+         do_cleanup()
+
+     return f"wavs/{file_hash}.wav"
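
For illustration, the expected call pattern for get_audio (a sketch; assumes SPEECH_KEY and SPEECH_REGION are set in .env):

    from tts import get_audio

    path = get_audio("قَدِيْش حَقّ الْبَنْدُورَة؟")   # synthesizes once, cached under wavs/<md5>.wav
    path2 = get_audio("قَدِيْش حَقّ الْبَنْدُورَة؟")  # same (sukun-normalized) text returns the cached file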
wavs/1cfdc7d62daa8ab925371ac17ea4e792.wav ADDED
Binary file (133 kB).
 
wavs/5fc7e1c64e032cdbfc6435dc9a6a32ce.wav ADDED
Binary file (226 kB).
 
wavs/8300935681825cf0e5c467f08fb31325.wav ADDED
Binary file (128 kB).