Ashaar

Running

App Files Files Community

Zaid committed on Jun 20, 2023

Commit

2fb81a4

•

1 Parent(s): 9f20162

first commit

Browse files

Files changed (10) hide show

.gitignore +4 -0
app.py +139 -0
extra/labels.txt +17 -0
extra/labels_ar.txt +17 -0
extra/meter_tokens.json +1 -0
extra/theme_tokens.json +1 -0
extra/theme_tokens.txt +0 -0
langs.py +43 -0
requirements.txt +1 -0
test.yml +52 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+deep-learning-models/*
+deep-learning-models.zip
+__MACOSX/*
+__pycache__/*

app.py ADDED Viewed

	@@ -0,0 +1,139 @@

+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+import gradio as gr
+from transformers import pipeline
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from Ashaar.utils import get_output_df, get_highlighted_patterns_html
+from Ashaar.bait_analysis import BaitAnalysis
+from langs import *
+import sys
+import json
+import argparse
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('--lang', type = str, default = 'ar', required=True)
+args = arg_parser.parse_args()
+lang = args.lang
+if lang == 'ar':
+    TITLE = TITLE_ar
+    DESCRIPTION = DESCRIPTION_ar
+    textbox_trg_text = textbox_trg_text_ar
+    textbox_inp_text = textbox_inp_text_ar
+    btn_trg_text = btn_trg_text_ar
+    btn_inp_text = btn_inp_text_ar
+    css = """ #textbox{ direction: RTL;}"""
+else:
+    TITLE = TITLE_en
+    DESCRIPTION = DESCRIPTION_en
+    textbox_trg_text = textbox_trg_text_en
+    textbox_inp_text = textbox_inp_text_en
+    btn_trg_text = btn_trg_text_en
+    btn_inp_text = btn_inp_text_en
+    css = ""
+gpt_tokenizer = AutoTokenizer.from_pretrained('arbml/ashaar_tokenizer')
+model = AutoModelForCausalLM.from_pretrained('arbml/Ashaar_model')
+theme_to_token = json.load(open("extra/theme_tokens.json", "r"))
+token_to_theme = {t:m for m,t in theme_to_token.items()}
+meter_to_token = json.load(open("extra/meter_tokens.json", "r"))
+token_to_meter = {t:m for m,t in meter_to_token.items()}
+analysis = BaitAnalysis()
+meter, theme, qafiyah = "", "", ""
+def analyze(poem):
+    global meter,theme,qafiyah
+    shatrs = poem.split("\n")
+    baits = [' # '.join(shatrs[2*i:2*i+2]) for i in range(len(shatrs)//2)]
+    output = analysis.analyze(baits,override_tashkeel=True)
+    meter = output['meter']
+    qafiyah = output['qafiyah'][0]
+    theme = output['theme'][-1]
+    df = get_output_df(output)
+    return get_highlighted_patterns_html(df)
+def generate(inputs, top_p = 3):
+    baits = inputs.split('\n')
+    print(baits)
+    poem = ' '.join(['<|bsep|> '+baits[i]+' <|vsep|> '+baits[i+1]+' </|bsep|>' for i in range(0, len(baits), 2)])
+    print(poem)
+    prompt = f"""
+    {meter_to_token[meter]} {qafiyah} {theme_to_token[theme]}
+    <|psep|>
+    {poem}
+    """.strip()
+    print(prompt)
+    encoded_input = gpt_tokenizer(prompt, return_tensors='pt')
+    output = model.generate(**encoded_input, max_length = 512, top_p = 3, do_sample=True)
+    result = ""
+    prev_token = ""
+    line_cnts = 0
+    for i, beam in enumerate(output[:, len(encoded_input.input_ids[0]):]):
+        if line_cnts >= 10:
+            break
+        for token in beam:
+            if line_cnts >= 10:
+                break
+            decoded = gpt_tokenizer.decode(token)
+            if 'meter' in decoded or 'theme' in decoded:
+                break
+            if decoded in ["<|vsep|>", "</|bsep|>"]:
+                result += "\n"
+                line_cnts+=1
+            elif decoded in ['<|bsep|>', '<|psep|>', '</|psep|>']:
+                pass
+            else:
+                result += decoded
+            prev_token = decoded
+        else:
+            break
+    # return theme+" "+ f"من بحر {meter} مع قافية بحر ({qafiyah})" + "\n" +result
+    return result
+examples = [
+    [
+"""القلب أعلم يا عذول بدائه
+وأحق منك بجفنه وبمائه"""
+    ],
+    [
+"""ألا ليت شعري هل أبيتن ليلة
+بجنب الغضى أزجي الغلاص النواجيا"""
+    ],
+]
+# Build the Gradio page: header, two side-by-side text boxes, two buttons and
+# an HTML area for the analysis result.
+with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
+    with gr.Row():
+        with gr.Column():
+            gr.HTML(TITLE)
+            gr.HTML(DESCRIPTION)
+    with gr.Row():
+        # Both boxes share elem_id "textbox", which the RTL CSS rule targets.
+        with gr.Column():
+            textbox_output = gr.Textbox(lines=10, label=textbox_trg_text, elem_id="textbox")
+        with gr.Column():
+            inputs = gr.Textbox(lines=10, label=textbox_inp_text, elem_id="textbox")
+    with gr.Row():
+        with gr.Column():
+            trg_btn = gr.Button(btn_trg_text)
+        with gr.Column():
+            inp_btn = gr.Button(btn_inp_text)
+    with gr.Row():
+        html_output = gr.HTML()
+    # The roles of the two columns are mirrored between languages: in English
+    # the first box (textbox_output) receives the user's poem and the second
+    # one the generated text; in Arabic it is the other way round.
+    if lang == 'en':
+        gr.Examples(examples, textbox_output)
+        inp_btn.click(generate, inputs = textbox_output, outputs=inputs)
+        trg_btn.click(analyze, inputs = textbox_output, outputs=html_output)
+    else:
+        gr.Examples(examples, inputs)
+        trg_btn.click(generate, inputs = inputs, outputs=textbox_output)
+        inp_btn.click(analyze, inputs = inputs, outputs=html_output)
+# Listen on all interfaces and also request a public share link.
+demo.launch(server_name = "0.0.0.0", share = True, debug = True)

extra/labels.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+saree
+kamel
+mutakareb
+mutadarak
+munsareh
+madeed
+mujtath
+ramal
+baseet
+khafeef
+taweel
+wafer
+hazaj
+rajaz
+mudhare
+muqtadheb
+prose

extra/labels_ar.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+السريع
+الكامل
+المتقارب
+المتدارك
+المنسرح
+المديد
+المجتث
+الرمل
+البسيط
+الخفيف
+الطويل
+الوافر
+الهزج
+الرجز
+المضارع
+المقتضب
+النثر

extra/meter_tokens.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"\u0627\u0644\u062e\u0641\u064a\u0641": "<|meter_0|>", "\u0627\u0644\u0645\u0636\u0627\u0631\u0639": "<|meter_1|>", "\u0627\u0644\u0645\u062c\u062a\u062b": "<|meter_2|>", "\u0627\u0644\u0631\u0645\u0644": "<|meter_3|>", "\u0627\u0644\u0628\u0633\u064a\u0637": "<|meter_4|>", "\u0627\u0644\u0645\u062a\u0642\u0627\u0631\u0628": "<|meter_5|>", "\u0627\u0644\u0648\u0627\u0641\u0631": "<|meter_6|>", "\u0627\u0644\u0645\u0642\u062a\u0636\u0628": "<|meter_7|>", "\u0627\u0644\u0645\u062f\u064a\u062f": "<|meter_8|>", "\u0627\u0644\u0646\u062b\u0631": "<|meter_9|>", "\u0627\u0644\u0647\u0632\u062c": "<|meter_10|>", "\u0627\u0644\u0645\u062a\u062f\u0627\u0631\u0643": "<|meter_11|>", "\u0627\u0644\u0645\u0646\u0633\u0631\u062d": "<|meter_12|>", "\u0627\u0644\u0637\u0648\u064a\u0644": "<|meter_13|>", "\u0627\u0644\u0643\u0627\u0645\u0644": "<|meter_14|>", "\u0627\u0644\u0631\u062c\u0632": "<|meter_15|>", "\u0627\u0644\u0633\u0631\u064a\u0639": "<|meter_16|>"}

extra/theme_tokens.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"\u0642\u0635\u064a\u062f\u0629 \u0642\u0635\u064a\u0631\u0647": "<|theme_0|>", "\u0642\u0635\u064a\u062f\u0629 \u0645\u062f\u062d": "<|theme_1|>", "\u0642\u0635\u064a\u062f\u0629 \u0648\u0637\u0646\u064a\u0647": "<|theme_2|>", "\u0642\u0635\u064a\u062f\u0629 \u0631\u0648\u0645\u0646\u0633\u064a\u0647": "<|theme_3|>", "\u0642\u0635\u064a\u062f\u0629 \u0647\u062c\u0627\u0621": "<|theme_4|>", "\u0642\u0635\u064a\u062f\u0629 \u0627\u0639\u062a\u0630\u0627\u0631": "<|theme_5|>", "\u0642\u0635\u064a\u062f\u0629 \u0633\u064a\u0627\u0633\u064a\u0629": "<|theme_6|>", "\u0642\u0635\u064a\u062f\u0629 \u0641\u0631\u0627\u0642": "<|theme_7|>", "\u0642\u0635\u064a\u062f\u0629 \u063a\u0632\u0644": "<|theme_8|>", "\u0642\u0635\u064a\u062f\u0629 \u0630\u0645": "<|theme_9|>", "\u0642\u0635\u064a\u062f\u0629 \u0631\u062b\u0627\u0621": "<|theme_10|>", "null": "<|theme_11|>", "\u0642\u0635\u064a\u062f\u0629 \u0634\u0648\u0642": "<|theme_12|>", "\u0642\u0635\u064a\u062f\u0629 \u0627\u0644\u0645\u0639\u0644\u0642\u0627\u062a": "<|theme_13|>", "\u0642\u0635\u064a\u062f\u0629 \u0627\u0644\u0627\u0646\u0627\u0634\u064a\u062f": "<|theme_14|>", "\u0642\u0635\u064a\u062f\u0629 \u062d\u0632\u064a\u0646\u0647": "<|theme_15|>", "\u0642\u0635\u064a\u062f\u0629 \u0639\u062a\u0627\u0628": "<|theme_16|>", "\u0642\u0635\u064a\u062f\u0629 \u0639\u0627\u0645\u0647": "<|theme_17|>", "\u0642\u0635\u064a\u062f\u0629 \u062f\u064a\u0646\u064a\u0629": "<|theme_18|>"}

extra/theme_tokens.txt ADDED Viewed

File without changes

langs.py ADDED Viewed

	@@ -0,0 +1,43 @@

+IMG = """<p align = 'center'>
+<img src='https://raw.githubusercontent.com/ARBML/Ashaar/master/images/ashaar_icon.png' width='150px' alt='logo for Ashaar'/>
+</p>
+"""
+TITLE_ar="""<h1 style="font-size: 30px;" align="center">أَشْعــَـار: تحليل وإنشاء الشعر العربي</h1>"""
+DESCRIPTION_ar = IMG
+DESCRIPTION_ar +=""" <p dir='rtl'>
+هذا البرنامج يتيح للمستخدم تحليل وإنشاء الشعر العربي.
+لإنشاء الشعر العربي تم تدريب نموج يقوم بإستخدام البحر والقافية والعاطفة لإنشاء أكمال للقصيدة بناء على هذه الشورط.
+بالإضافة إلى نموذج إنشاء الشعر يحتوي البرنامج على نماذج لتصنيف الحقبة الزمنية والعاطفة والبحر و كذلك تشكيل الشعر العربي بالإضافة إلى إكمال الشعر.
+قمنا بتوفير الشفرة البرمجية كلها على
+<a href ='https://github.com/ARBML/Ashaar'> GitHub</a>.
+</p>
+"""
+TITLE_en="""<h1 style="font-size: 30px;" align="center">Ashaar: Arabic Poetry Analysis and Generation</h1>"""
+DESCRIPTION_en = IMG
+DESCRIPTION_en +="""
+The demo provides a way to generate analysis for poetry and also complete the poetry.
+The generative model is a character-based conditional GPT-2 model. The pipeline contains many models for
+classification, diacritization and conditional generation. Check our <a src='https://github.com/ARBML/Ashaar'>GitHub</a> for more techincal details
+about this work. In the demo we have two basic pipelines. Analyze which predicts the meter, era, theme, diacritized text, qafiyah and, arudi style.
+The other module, Generate which takes the input text, meter, theme and qafiyah to generate the full poem.
+"""
+# Button captions and text-box labels per language.
+# NOTE(review): the English values look swapped relative to their inp/trg
+# names ("Generate" under btn_inp, "Input Poem" under textbox_trg). This
+# appears intentional: app.py uses the trg-labelled box as the poem input
+# when lang is 'en' - confirm before renaming.
+btn_trg_text_ar = "إنشاء"
+btn_inp_text_ar = "تحليل"
+btn_inp_text_en = "Generate"
+btn_trg_text_en = "Analyze"
+textbox_inp_text_ar = "القصيدة المدخلة"
+textbox_trg_text_ar = "القصيدة المنشئة"
+textbox_trg_text_en = "Input Poem"
+textbox_inp_text_en = "Generated Poem"

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ashaar @ git+https://github.com/arbml/Ashaar.git

test.yml ADDED Viewed

	@@ -0,0 +1,52 @@

+session_name: base
+data_directory: "data"
+data_type: "ashaar_proc"
+log_directory: "deep-learning-models/log_dir_ashaar"
+load_training_data: true
+load_test_data: false
+load_validation_data: true
+n_training_examples: null # null load all training examples, good for fast loading
+n_test_examples: null  # null load all test examples
+n_validation_examples: null # null load all validation examples
+test_file_name: "test.csv"
+is_data_preprocessed: false # The data file is organized as (original text | text | diacritics)
+data_separator: '|' # Required if the data already processed
+diacritics_separator: '*'  # Required if the data already processed
+text_encoder: ArabicEncoderWithStartSymbol
+text_cleaner: valid_arabic_cleaners # a white list that uses only Arabic letters, punctuations, and a space
+max_len: 600 # sentences larger than this size will not be used
+max_sen_len: null
+max_steps: 10000
+learning_rate: 0.001
+batch_size: 32
+adam_beta1: 0.9
+adam_beta2: 0.999
+use_decay: true
+weight_decay: 0.0
+embedding_dim: 256
+use_prenet: false
+prenet_sizes: [512, 256]
+cbhg_projections: [128, 256]
+cbhg_filters: 16
+cbhg_gru_units: 256
+post_cbhg_layers_units: [256, 256]
+post_cbhg_use_batch_norm: true
+use_mixed_precision: false
+optimizer_type: Adam
+device: cuda
+# LOGGING
+evaluate_frequency: 50000000
+max_eval_batches: 100
+evaluate_with_error_rates_frequency: 1000
+n_predicted_text_tensorboard: 10 # To be written to the tensorboard
+model_save_frequency: 5000
+train_plotting_frequency: 50000000 # No plotting for this model
+n_steps_avg_losses: [100, 500, 1_000, 5_000] # command line display of average loss values for the last n steps
+error_rates_n_batches: 10000 # if calculating error rate is slow, then you can specify the number of batches to be calculated
+test_model_path: null # load the last saved model
+train_resume_model_path: null # load last saved model