Duplicate from arbml/Ashaar
Browse files
Co-authored-by: Zaid Alyafeai <Zaid@users.noreply.huggingface.co>
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +35 -0
- .gitignore +7 -0
- README.md +14 -0
- app.py +151 -0
- extra/labels.txt +17 -0
- extra/labels_ar.txt +17 -0
- extra/meter_tokens.json +1 -0
- extra/theme_tokens.json +1 -0
- extra/theme_tokens.txt +0 -0
- langs.py +59 -0
- poetry_diacritizer/__init__.py +1 -0
- poetry_diacritizer/config/ashaar.yml +52 -0
- poetry_diacritizer/config/baseline.yml +47 -0
- poetry_diacritizer/config/cbhg.yml +52 -0
- poetry_diacritizer/config/cbhg2.yml +51 -0
- poetry_diacritizer/config/gpt-0.yml +46 -0
- poetry_diacritizer/config/gpt-1.yml +46 -0
- poetry_diacritizer/config/gpt-2.yml +46 -0
- poetry_diacritizer/config/gpt-3.yml +46 -0
- poetry_diacritizer/config/gpt-4.yml +46 -0
- poetry_diacritizer/config/gpt-5.yml +46 -0
- poetry_diacritizer/config/gpt-6.yml +46 -0
- poetry_diacritizer/config/gpt-7.yml +46 -0
- poetry_diacritizer/config/gpt-8.yml +46 -0
- poetry_diacritizer/config/gpt-9.yml +46 -0
- poetry_diacritizer/config/gpt-cls-0-tash-proc.yml +46 -0
- poetry_diacritizer/config/gpt-cls-0-test.yml +46 -0
- poetry_diacritizer/config/gpt-cls-0.yml +46 -0
- poetry_diacritizer/config/gpt-cls-1-tash-proc.yml +46 -0
- poetry_diacritizer/config/gpt-cls-1.yml +46 -0
- poetry_diacritizer/config/gpt-cls-2-tash-proc.yml +46 -0
- poetry_diacritizer/config/gpt-cls-2.yml +46 -0
- poetry_diacritizer/config/gpt-cls-3-tash-proc.yml +46 -0
- poetry_diacritizer/config/gpt-cls-3.yml +46 -0
- poetry_diacritizer/config/gpt-cls-4-tash-proc.yml +46 -0
- poetry_diacritizer/config/gpt-cls-4.yml +46 -0
- poetry_diacritizer/config/gpt-cls-5-tash-proc.yml +46 -0
- poetry_diacritizer/config/gpt-cls-5-test.yml +46 -0
- poetry_diacritizer/config/gpt-cls-5.yml +46 -0
- poetry_diacritizer/config/gpt-cls-6-tash-proc.yml +46 -0
- poetry_diacritizer/config/gpt-cls-6.yml +46 -0
- poetry_diacritizer/config/gpt-cls-7-tash-proc.yml +46 -0
- poetry_diacritizer/config/gpt-cls-7.yml +46 -0
- poetry_diacritizer/config/gpt-cls-8-tash-proc.yml +46 -0
- poetry_diacritizer/config/gpt-cls-8.yml +46 -0
- poetry_diacritizer/config/gpt-cls-9-tash-proc.yml +46 -0
- poetry_diacritizer/config/gpt-cls-9-test.yml +46 -0
- poetry_diacritizer/config/gpt-cls-9.yml +46 -0
- poetry_diacritizer/config/gpt-cls-tash-proc.yml +46 -0
- poetry_diacritizer/config/gpt-lstm-0-50K.yml +46 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
deep-learning-models/*
|
2 |
+
deep-learning-models.zip
|
3 |
+
__MACOSX/*
|
4 |
+
__pycache__/*
|
5 |
+
poetry_diacritizer/*
|
6 |
+
*.pyc
|
7 |
+
deep-learning-models.zip:Zone.Identifier
|
README.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Ashaar
|
3 |
+
emoji: 🧑🎤
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: blue
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.35.2
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: apache-2.0
|
11 |
+
duplicated_from: arbml/Ashaar
|
12 |
+
---
|
13 |
+
|
14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
|
3 |
+
import gradio as gr
|
4 |
+
from transformers import pipeline
|
5 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
6 |
+
from Ashaar.utils import get_output_df, get_highlighted_patterns_html
|
7 |
+
from Ashaar.bait_analysis import BaitAnalysis
|
8 |
+
from langs import *
|
9 |
+
import sys
|
10 |
+
import json
|
11 |
+
import argparse
|
12 |
+
|
13 |
+
arg_parser = argparse.ArgumentParser()
|
14 |
+
arg_parser.add_argument('--lang', type = str, default = 'ar')
|
15 |
+
args = arg_parser.parse_args()
|
16 |
+
lang = args.lang
|
17 |
+
|
18 |
+
if lang == 'ar':
|
19 |
+
TITLE = TITLE_ar
|
20 |
+
DESCRIPTION = DESCRIPTION_ar
|
21 |
+
textbox_trg_text = textbox_trg_text_ar
|
22 |
+
textbox_inp_text = textbox_inp_text_ar
|
23 |
+
btn_trg_text = btn_trg_text_ar
|
24 |
+
btn_inp_text = btn_inp_text_ar
|
25 |
+
css = """ #textbox{ direction: RTL;}"""
|
26 |
+
|
27 |
+
else:
|
28 |
+
TITLE = TITLE_en
|
29 |
+
DESCRIPTION = DESCRIPTION_en
|
30 |
+
textbox_trg_text = textbox_trg_text_en
|
31 |
+
textbox_inp_text = textbox_inp_text_en
|
32 |
+
btn_trg_text = btn_trg_text_en
|
33 |
+
btn_inp_text = btn_inp_text_en
|
34 |
+
css = ""
|
35 |
+
|
36 |
+
gpt_tokenizer = AutoTokenizer.from_pretrained('arbml/ashaar_tokenizer')
|
37 |
+
model = AutoModelForCausalLM.from_pretrained('arbml/Ashaar_model')
|
38 |
+
|
39 |
+
theme_to_token = json.load(open("extra/theme_tokens.json", "r"))
|
40 |
+
token_to_theme = {t:m for m,t in theme_to_token.items()}
|
41 |
+
meter_to_token = json.load(open("extra/meter_tokens.json", "r"))
|
42 |
+
token_to_meter = {t:m for m,t in meter_to_token.items()}
|
43 |
+
|
44 |
+
analysis = BaitAnalysis()
|
45 |
+
meter, theme, qafiyah = "", "", ""
|
46 |
+
|
47 |
+
def analyze(poem):
|
48 |
+
global meter,theme,qafiyah, generate_btn
|
49 |
+
shatrs = poem.split("\n")
|
50 |
+
baits = [' # '.join(shatrs[2*i:2*i+2]) for i in range(len(shatrs)//2)]
|
51 |
+
output = analysis.analyze(baits,override_tashkeel=True)
|
52 |
+
meter = output['meter']
|
53 |
+
qafiyah = output['qafiyah'][0]
|
54 |
+
theme = output['theme'][-1]
|
55 |
+
df = get_output_df(output)
|
56 |
+
return get_highlighted_patterns_html(df), gr.Button.update(interactive=True)
|
57 |
+
|
58 |
+
def generate(inputs, top_p = 3):
|
59 |
+
baits = inputs.split('\n')
|
60 |
+
if len(baits) % 2 !=0:
|
61 |
+
baits = baits[:-1]
|
62 |
+
poem = ' '.join(['<|bsep|> '+baits[i]+' <|vsep|> '+baits[i+1]+' </|bsep|>' for i in range(0, len(baits), 2)])
|
63 |
+
prompt = f"""
|
64 |
+
{meter_to_token[meter]} {qafiyah} {theme_to_token[theme]}
|
65 |
+
<|psep|>
|
66 |
+
{poem}
|
67 |
+
""".strip()
|
68 |
+
print(prompt)
|
69 |
+
encoded_input = gpt_tokenizer(prompt, return_tensors='pt')
|
70 |
+
output = model.generate(**encoded_input, max_length = 512, top_p = 3, do_sample=True)
|
71 |
+
|
72 |
+
result = ""
|
73 |
+
prev_token = ""
|
74 |
+
line_cnts = 0
|
75 |
+
for i, beam in enumerate(output[:, len(encoded_input.input_ids[0]):]):
|
76 |
+
if line_cnts >= 10:
|
77 |
+
break
|
78 |
+
for token in beam:
|
79 |
+
if line_cnts >= 10:
|
80 |
+
break
|
81 |
+
decoded = gpt_tokenizer.decode(token)
|
82 |
+
if 'meter' in decoded or 'theme' in decoded:
|
83 |
+
break
|
84 |
+
if decoded in ["<|vsep|>", "</|bsep|>"]:
|
85 |
+
result += "\n"
|
86 |
+
line_cnts+=1
|
87 |
+
elif decoded in ['<|bsep|>', '<|psep|>', '</|psep|>']:
|
88 |
+
pass
|
89 |
+
else:
|
90 |
+
result += decoded
|
91 |
+
prev_token = decoded
|
92 |
+
else:
|
93 |
+
break
|
94 |
+
# return theme+" "+ f"من بحر {meter} مع قافية بحر ({qafiyah})" + "\n" +result
|
95 |
+
return result, gr.Button.update(interactive=False)
|
96 |
+
|
97 |
+
examples = [
|
98 |
+
[
|
99 |
+
"""القلب أعلم يا عذول بدائه
|
100 |
+
وأحق منك بجفنه وبمائه"""
|
101 |
+
],
|
102 |
+
[
|
103 |
+
"""رمتِ الفؤادَ مليحة عذراءُ
|
104 |
+
بسهامِ لحظٍ ما لهنَّ دواءُ"""
|
105 |
+
],
|
106 |
+
[
|
107 |
+
"""أذَلَّ الحِرْصُ والطَّمَعُ الرِّقابَا
|
108 |
+
وقَد يَعفو الكَريمُ، إذا استَرَابَا"""
|
109 |
+
]
|
110 |
+
]
|
111 |
+
|
112 |
+
with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
|
113 |
+
with gr.Row():
|
114 |
+
with gr.Column():
|
115 |
+
gr.HTML(TITLE)
|
116 |
+
gr.HTML(DESCRIPTION)
|
117 |
+
|
118 |
+
with gr.Row():
|
119 |
+
with gr.Column():
|
120 |
+
textbox_output = gr.Textbox(lines=10, label=textbox_trg_text, elem_id="textbox")
|
121 |
+
with gr.Column():
|
122 |
+
inputs = gr.Textbox(lines=10, label=textbox_inp_text, elem_id="textbox")
|
123 |
+
|
124 |
+
|
125 |
+
with gr.Row():
|
126 |
+
with gr.Column():
|
127 |
+
if lang == 'ar':
|
128 |
+
trg_btn = gr.Button(btn_trg_text, interactive=False)
|
129 |
+
else:
|
130 |
+
trg_btn = gr.Button(btn_trg_text)
|
131 |
+
|
132 |
+
with gr.Column():
|
133 |
+
if lang == 'ar':
|
134 |
+
inp_btn = gr.Button(btn_inp_text)
|
135 |
+
else:
|
136 |
+
inp_btn = gr.Button(btn_inp_text, interactive = False)
|
137 |
+
|
138 |
+
with gr.Row():
|
139 |
+
html_output = gr.HTML()
|
140 |
+
|
141 |
+
if lang == 'en':
|
142 |
+
gr.Examples(examples, textbox_output)
|
143 |
+
inp_btn.click(generate, inputs = textbox_output, outputs=[inputs, inp_btn])
|
144 |
+
trg_btn.click(analyze, inputs = textbox_output, outputs=[html_output,inp_btn])
|
145 |
+
else:
|
146 |
+
gr.Examples(examples, inputs)
|
147 |
+
trg_btn.click(generate, inputs = inputs, outputs=[textbox_output, trg_btn])
|
148 |
+
inp_btn.click(analyze, inputs = inputs, outputs=[html_output,trg_btn] )
|
149 |
+
|
150 |
+
# demo.launch(server_name = '0.0.0.0', share=True)
|
151 |
+
demo.launch()
|
extra/labels.txt
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
saree
|
2 |
+
kamel
|
3 |
+
mutakareb
|
4 |
+
mutadarak
|
5 |
+
munsareh
|
6 |
+
madeed
|
7 |
+
mujtath
|
8 |
+
ramal
|
9 |
+
baseet
|
10 |
+
khafeef
|
11 |
+
taweel
|
12 |
+
wafer
|
13 |
+
hazaj
|
14 |
+
rajaz
|
15 |
+
mudhare
|
16 |
+
muqtadheb
|
17 |
+
prose
|
extra/labels_ar.txt
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
السريع
|
2 |
+
الكامل
|
3 |
+
المتقارب
|
4 |
+
المتدارك
|
5 |
+
المنسرح
|
6 |
+
المديد
|
7 |
+
المجتث
|
8 |
+
الرمل
|
9 |
+
البسيط
|
10 |
+
الخفيف
|
11 |
+
الطويل
|
12 |
+
الوافر
|
13 |
+
الهزج
|
14 |
+
الرجز
|
15 |
+
المضارع
|
16 |
+
المقتضب
|
17 |
+
النثر
|
extra/meter_tokens.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"\u0627\u0644\u062e\u0641\u064a\u0641": "<|meter_0|>", "\u0627\u0644\u0645\u0636\u0627\u0631\u0639": "<|meter_1|>", "\u0627\u0644\u0645\u062c\u062a\u062b": "<|meter_2|>", "\u0627\u0644\u0631\u0645\u0644": "<|meter_3|>", "\u0627\u0644\u0628\u0633\u064a\u0637": "<|meter_4|>", "\u0627\u0644\u0645\u062a\u0642\u0627\u0631\u0628": "<|meter_5|>", "\u0627\u0644\u0648\u0627\u0641\u0631": "<|meter_6|>", "\u0627\u0644\u0645\u0642\u062a\u0636\u0628": "<|meter_7|>", "\u0627\u0644\u0645\u062f\u064a\u062f": "<|meter_8|>", "\u0627\u0644\u0646\u062b\u0631": "<|meter_9|>", "\u0627\u0644\u0647\u0632\u062c": "<|meter_10|>", "\u0627\u0644\u0645\u062a\u062f\u0627\u0631\u0643": "<|meter_11|>", "\u0627\u0644\u0645\u0646\u0633\u0631\u062d": "<|meter_12|>", "\u0627\u0644\u0637\u0648\u064a\u0644": "<|meter_13|>", "\u0627\u0644\u0643\u0627\u0645\u0644": "<|meter_14|>", "\u0627\u0644\u0631\u062c\u0632": "<|meter_15|>", "\u0627\u0644\u0633\u0631\u064a\u0639": "<|meter_16|>"}
|
extra/theme_tokens.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"\u0642\u0635\u064a\u062f\u0629 \u0642\u0635\u064a\u0631\u0647": "<|theme_0|>", "\u0642\u0635\u064a\u062f\u0629 \u0645\u062f\u062d": "<|theme_1|>", "\u0642\u0635\u064a\u062f\u0629 \u0648\u0637\u0646\u064a\u0647": "<|theme_2|>", "\u0642\u0635\u064a\u062f\u0629 \u0631\u0648\u0645\u0646\u0633\u064a\u0647": "<|theme_3|>", "\u0642\u0635\u064a\u062f\u0629 \u0647\u062c\u0627\u0621": "<|theme_4|>", "\u0642\u0635\u064a\u062f\u0629 \u0627\u0639\u062a\u0630\u0627\u0631": "<|theme_5|>", "\u0642\u0635\u064a\u062f\u0629 \u0633\u064a\u0627\u0633\u064a\u0629": "<|theme_6|>", "\u0642\u0635\u064a\u062f\u0629 \u0641\u0631\u0627\u0642": "<|theme_7|>", "\u0642\u0635\u064a\u062f\u0629 \u063a\u0632\u0644": "<|theme_8|>", "\u0642\u0635\u064a\u062f\u0629 \u0630\u0645": "<|theme_9|>", "\u0642\u0635\u064a\u062f\u0629 \u0631\u062b\u0627\u0621": "<|theme_10|>", "null": "<|theme_11|>", "\u0642\u0635\u064a\u062f\u0629 \u0634\u0648\u0642": "<|theme_12|>", "\u0642\u0635\u064a\u062f\u0629 \u0627\u0644\u0645\u0639\u0644\u0642\u0627\u062a": "<|theme_13|>", "\u0642\u0635\u064a\u062f\u0629 \u0627\u0644\u0627\u0646\u0627\u0634\u064a\u062f": "<|theme_14|>", "\u0642\u0635\u064a\u062f\u0629 \u062d\u0632\u064a\u0646\u0647": "<|theme_15|>", "\u0642\u0635\u064a\u062f\u0629 \u0639\u062a\u0627\u0628": "<|theme_16|>", "\u0642\u0635\u064a\u062f\u0629 \u0639\u0627\u0645\u0647": "<|theme_17|>", "\u0642\u0635\u064a\u062f\u0629 \u062f\u064a\u0646\u064a\u0629": "<|theme_18|>"}
|
extra/theme_tokens.txt
ADDED
File without changes
|
langs.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
IMG = """<p align = 'center'>
|
2 |
+
<img src='https://raw.githubusercontent.com/ARBML/Ashaar/master/images/ashaar_icon.png' width='150px' alt='logo for Ashaar'/>
|
3 |
+
</p>
|
4 |
+
|
5 |
+
"""
|
6 |
+
TITLE_ar="""<h1 style="font-size: 30px;" align="center">أَشْعــَـار: تحليل وإنشاء الشعر العربي</h1>"""
|
7 |
+
DESCRIPTION_ar = IMG
|
8 |
+
|
9 |
+
DESCRIPTION_ar +=""" <p dir='rtl'>
|
10 |
+
هذا البرنامج يتيح للمستخدم تحليل وإنشاء الشعر العربي.
|
11 |
+
لإنشاء الشعر العربي تم تدريب نموذج يقوم بإستخدام البحر والقافية والعاطفة لإنشاء إكمال للقصيدة بناء على هذه الشروط.
|
12 |
+
بالإضافة إلى نموذج إنشاء الشعر يحتوي البرنامج على نماذج لتصنيف الحقبة الزمنية والعاطفة والبحر و كذلك تشكيل الشعر .
|
13 |
+
يقوم البرنامج بإستخدام هذه النماذج لإيجاد الخلل في القصيدة من خلال إضافة ألوان معينة تدل على اماكن الخلل.
|
14 |
+
لإستخدام البرنامج قم في البداية بكتابة قصيدة تحتوي على عدد زوجي من الأبيات و من ثم قم بالضغط على تحليل ، وبعد إنتهاء التحليل بالإمكان إنشاء إكمال للقصيدة.
|
15 |
+
عند الضغط على زر التحليل يتم إنشاء جدول التحليل الذي يشرح العديد من الأشياء :
|
16 |
+
</p>
|
17 |
+
"""
|
18 |
+
DESCRIPTION_ar+= """<div dir='RTL'>
|
19 |
+
<ul>
|
20 |
+
<li> المشكل : تشكيل كل شطر من القصيدة المدخلة</li>
|
21 |
+
<li>الكتابة العروضية: وتقوم هذه الكتابة على التعبير عن كل منطوق في اللغة وتبيانه حتى لو لم يكن يكتب إملائياً
|
22 |
+
</li>
|
23 |
+
<li>التفعيلة: تفعيلات القصيدة ، مثالاً : طَويلٌ لَهُ دُونَ البُحورِ فضائل فَعُوْلُنْ مَفَاْعِيْلُنْ فَعُوْلُنْ مَفَاْعِلُ
|
24 |
+
</li>
|
25 |
+
<li>النمط: يحدد حركة وسكون كل حرف في الكتابة العروضية. نستخدم الألوان التالية للرمز إلى خلل في الكتابة العروضية: الأحمر: حرف محذوف، الأزرق: حرف مضاف، الأصفر: حركة مقلوبة.</li>
|
26 |
+
</ul>
|
27 |
+
</div>
|
28 |
+
"""
|
29 |
+
DESCRIPTION_ar+= """<p dir='rtl'>
|
30 |
+
قمنا بتوفير الشفرة البرمجية كلها على
|
31 |
+
<a href ='https://github.com/ARBML/Ashaar'> GitHub</a>.
|
32 |
+
</p>
|
33 |
+
"""
|
34 |
+
|
35 |
+
TITLE_en="""<h1 style="font-size: 30px;" align="center">Ashaar: Arabic Poetry Analysis and Generation</h1>"""
|
36 |
+
DESCRIPTION_en = IMG
|
37 |
+
|
38 |
+
DESCRIPTION_en +="""
|
39 |
+
The demo provides a way to generate analysis for poetry and also complete the poetry.
|
40 |
+
The generative model is a character-based conditional GPT-2 model. The pipeline contains many models for
|
41 |
+
classification, diacritization and conditional generation. Check our <a href='https://github.com/ARBML/Ashaar'>GitHub</a> for more technical details
|
42 |
+
about this work. In the demo we have two basic pipelines. Analyze predicts the meter, era, theme, diacritized text, qafiyah, and arudi style.
|
43 |
+
The other module, Generate, takes the input text, meter, theme and qafiyah to generate the full poem.
|
44 |
+
"""
|
45 |
+
|
46 |
+
btn_trg_text_ar = "إنشاء"
|
47 |
+
btn_inp_text_ar = "تحليل"
|
48 |
+
|
49 |
+
btn_inp_text_en = "Generate"
|
50 |
+
btn_trg_text_en = "Analyze"
|
51 |
+
|
52 |
+
textbox_inp_text_ar = "القصيدة المدخلة"
|
53 |
+
textbox_trg_text_ar = "القصيدة المنشئة"
|
54 |
+
|
55 |
+
textbox_trg_text_en = "Input Poem"
|
56 |
+
textbox_inp_text_en = "Generated Poem"
|
57 |
+
|
58 |
+
|
59 |
+
|
poetry_diacritizer/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from poetry_diacritizer import predict
|
poetry_diacritizer/config/ashaar.yml
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
session_name: base
|
2 |
+
|
3 |
+
data_directory: "data"
|
4 |
+
data_type: "ashaar_proc"
|
5 |
+
log_directory: "log_dir_ashaar"
|
6 |
+
load_training_data: true
|
7 |
+
load_test_data: false
|
8 |
+
load_validation_data: true
|
9 |
+
n_training_examples: null # null load all training examples, good for fast loading
|
10 |
+
n_test_examples: null # null load all test examples
|
11 |
+
n_validation_examples: null # null load all validation examples
|
12 |
+
test_file_name: "test.csv"
|
13 |
+
is_data_preprocessed: false # The data file is organized as (original text | text | diacritics)
|
14 |
+
data_separator: '|' # Required if the data already processed
|
15 |
+
diacritics_separator: '*' # Required if the data already processed
|
16 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
17 |
+
text_cleaner: valid_arabic_cleaners # a white list that uses only Arabic letters, punctuations, and a space
|
18 |
+
max_len: 600 # sentences larger than this size will not be used
|
19 |
+
max_sen_len: null
|
20 |
+
|
21 |
+
max_steps: 10000
|
22 |
+
learning_rate: 0.001
|
23 |
+
batch_size: 32
|
24 |
+
adam_beta1: 0.9
|
25 |
+
adam_beta2: 0.999
|
26 |
+
use_decay: true
|
27 |
+
weight_decay: 0.0
|
28 |
+
embedding_dim: 256
|
29 |
+
use_prenet: false
|
30 |
+
prenet_sizes: [512, 256]
|
31 |
+
cbhg_projections: [128, 256]
|
32 |
+
cbhg_filters: 16
|
33 |
+
cbhg_gru_units: 256
|
34 |
+
post_cbhg_layers_units: [256, 256]
|
35 |
+
post_cbhg_use_batch_norm: true
|
36 |
+
|
37 |
+
use_mixed_precision: false
|
38 |
+
optimizer_type: Adam
|
39 |
+
device: cuda
|
40 |
+
|
41 |
+
# LOGGING
|
42 |
+
evaluate_frequency: 50000000
|
43 |
+
max_eval_batches: 100
|
44 |
+
evaluate_with_error_rates_frequency: 1000
|
45 |
+
n_predicted_text_tensorboard: 10 # To be written to the tensorboard
|
46 |
+
model_save_frequency: 1000
|
47 |
+
train_plotting_frequency: 50000000 # No plotting for this model
|
48 |
+
n_steps_avg_losses: [100, 500, 1_000, 5_000] # command line display of average loss values for the last n steps
|
49 |
+
error_rates_n_batches: 10000 # if calculating error rate is slow, then you can specify the number of batches to be calculated
|
50 |
+
|
51 |
+
test_model_path: null # load the last saved model
|
52 |
+
train_resume_model_path: null # load last saved model
|
poetry_diacritizer/config/baseline.yml
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
session_name: base
|
2 |
+
|
3 |
+
data_directory: "data"
|
4 |
+
data_type: "CA_MSA"
|
5 |
+
log_directory: "log_dir"
|
6 |
+
load_training_data: true
|
7 |
+
load_test_data: false
|
8 |
+
load_validation_data: true
|
9 |
+
n_training_examples: null # null load all training examples, good for fast loading
|
10 |
+
n_test_examples: null # null load all test examples
|
11 |
+
n_validation_examples: null # null load all validation examples
|
12 |
+
test_file_name: "test.csv"
|
13 |
+
is_data_preprocessed: false # The data file is organized as (original text | text | diacritics)
|
14 |
+
data_separator: '|' # Required if the data already processed
|
15 |
+
diacritics_separator: '*' # Required if the data already processed
|
16 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
17 |
+
text_cleaner: valid_arabic_cleaners # a white list that uses only Arabic letters, punctuations, and a space
|
18 |
+
max_len: 600 # sentences larger than this size will not be used
|
19 |
+
|
20 |
+
|
21 |
+
max_steps: 2_000_000
|
22 |
+
learning_rate: 0.001
|
23 |
+
batch_size: 64
|
24 |
+
adam_beta1: 0.9
|
25 |
+
adam_beta2: 0.999
|
26 |
+
use_decay: true
|
27 |
+
weight_decay: 0.0
|
28 |
+
embedding_dim: 512
|
29 |
+
n_layers: 3
|
30 |
+
layers_units: [256, 256, 256]
|
31 |
+
use_mixed_precision: false
|
32 |
+
optimizer_type: Adam
|
33 |
+
use_batch_norm: False
|
34 |
+
device: cuda
|
35 |
+
max_sen_len: 256
|
36 |
+
|
37 |
+
# LOGGING
|
38 |
+
evaluate_frequency: 5000
|
39 |
+
evaluate_with_error_rates_frequency: 5000
|
40 |
+
n_predicted_text_tensorboard: 10 # To be written to the tensorboard
|
41 |
+
model_save_frequency: 5000
|
42 |
+
train_plotting_frequency: 50000000 # No plotting for this model
|
43 |
+
n_steps_avg_losses: [100, 500, 1_000, 5_000] # command line display of average loss values for the last n steps
|
44 |
+
error_rates_n_batches: 10000 # if calculating error rate is slow, then you can specify the number of batches to be calculated
|
45 |
+
|
46 |
+
test_model_path: null # load the last saved model
|
47 |
+
train_resume_model_path: null # load last saved model
|
poetry_diacritizer/config/cbhg.yml
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
session_name: base
|
2 |
+
|
3 |
+
data_directory: "data"
|
4 |
+
data_type: "CA_MSA"
|
5 |
+
log_directory: "log_dir_cbhg"
|
6 |
+
load_training_data: true
|
7 |
+
load_test_data: false
|
8 |
+
load_validation_data: true
|
9 |
+
n_training_examples: null # null load all training examples, good for fast loading
|
10 |
+
n_test_examples: null # null load all test examples
|
11 |
+
n_validation_examples: null # null load all validation examples
|
12 |
+
test_file_name: "test.csv"
|
13 |
+
is_data_preprocessed: false # The data file is organized as (original text | text | diacritics)
|
14 |
+
data_separator: '|' # Required if the data already processed
|
15 |
+
diacritics_separator: '*' # Required if the data already processed
|
16 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
17 |
+
text_cleaner: valid_arabic_cleaners # a white list that uses only Arabic letters, punctuations, and a space
|
18 |
+
max_len: 600 # sentences larger than this size will not be used
|
19 |
+
max_sen_len: null
|
20 |
+
|
21 |
+
max_steps: 5000
|
22 |
+
learning_rate: 0.001
|
23 |
+
batch_size: 32
|
24 |
+
adam_beta1: 0.9
|
25 |
+
adam_beta2: 0.999
|
26 |
+
use_decay: true
|
27 |
+
weight_decay: 0.0
|
28 |
+
embedding_dim: 256
|
29 |
+
use_prenet: false
|
30 |
+
prenet_sizes: [512, 256]
|
31 |
+
cbhg_projections: [128, 256]
|
32 |
+
cbhg_filters: 16
|
33 |
+
cbhg_gru_units: 256
|
34 |
+
post_cbhg_layers_units: [256, 256]
|
35 |
+
post_cbhg_use_batch_norm: true
|
36 |
+
|
37 |
+
use_mixed_precision: false
|
38 |
+
optimizer_type: Adam
|
39 |
+
device: cuda
|
40 |
+
|
41 |
+
# LOGGING
|
42 |
+
evaluate_frequency: 50000000
|
43 |
+
max_eval_batches: 100
|
44 |
+
evaluate_with_error_rates_frequency: 1000
|
45 |
+
n_predicted_text_tensorboard: 10 # To be written to the tensorboard
|
46 |
+
model_save_frequency: 5000
|
47 |
+
train_plotting_frequency: 50000000 # No plotting for this model
|
48 |
+
n_steps_avg_losses: [100, 500, 1_000, 5_000] # command line display of average loss values for the last n steps
|
49 |
+
error_rates_n_batches: 10000 # if calculating error rate is slow, then you can specify the number of batches to be calculated
|
50 |
+
|
51 |
+
test_model_path: null # load the last saved model
|
52 |
+
train_resume_model_path: null # load last saved model
|
poetry_diacritizer/config/cbhg2.yml
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
session_name: base
|
2 |
+
|
3 |
+
data_directory: "ashaar"
|
4 |
+
data_type: "CA_MSA"
|
5 |
+
log_directory: "/content/drive/MyDrive/Research/Barmajan/Diacritization/log_ashaar_dir"
|
6 |
+
load_training_data: true
|
7 |
+
load_test_data: false
|
8 |
+
load_validation_data: true
|
9 |
+
n_training_examples: null # null load all training examples, good for fast loading
|
10 |
+
n_test_examples: null # null load all test examples
|
11 |
+
n_validation_examples: null # null load all validation examples
|
12 |
+
test_file_name: "test.csv"
|
13 |
+
is_data_preprocessed: false # The data file is organized as (original text | text | diacritics)
|
14 |
+
data_separator: '|' # Required if the data already processed
|
15 |
+
diacritics_separator: '*' # Required if the data already processed
|
16 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
17 |
+
text_cleaner: valid_arabic_cleaners # a white list that uses only Arabic letters, punctuations, and a space
|
18 |
+
max_len: 600 # sentences larger than this size will not be used
|
19 |
+
|
20 |
+
|
21 |
+
max_steps: 25_000
|
22 |
+
learning_rate: 0.001
|
23 |
+
batch_size: 32
|
24 |
+
adam_beta1: 0.9
|
25 |
+
adam_beta2: 0.999
|
26 |
+
use_decay: true
|
27 |
+
weight_decay: 0.0
|
28 |
+
embedding_dim: 256
|
29 |
+
use_prenet: false
|
30 |
+
prenet_sizes: [512, 256]
|
31 |
+
cbhg_projections: [128, 256]
|
32 |
+
cbhg_filters: 16
|
33 |
+
cbhg_gru_units: 256
|
34 |
+
post_cbhg_layers_units: [256, 256]
|
35 |
+
post_cbhg_use_batch_norm: true
|
36 |
+
|
37 |
+
use_mixed_precision: false
|
38 |
+
optimizer_type: Adam
|
39 |
+
device: cuda
|
40 |
+
|
41 |
+
# LOGGING
|
42 |
+
evaluate_frequency: 1000
|
43 |
+
evaluate_with_error_rates_frequency: 1000
|
44 |
+
n_predicted_text_tensorboard: 10 # To be written to the tensorboard
|
45 |
+
model_save_frequency: 1000
|
46 |
+
train_plotting_frequency: 50000000 # No plotting for this model
|
47 |
+
n_steps_avg_losses: [100, 500, 1_000, 5_000] # command line display of average loss values for the last n steps
|
48 |
+
error_rates_n_batches: 10000 # if calculating error rate is slow, then you can specify the number of batches to be calculated
|
49 |
+
|
50 |
+
test_model_path: null # load the last saved model
|
51 |
+
train_resume_model_path: "/content/drive/MyDrive/Research/Barmajan/Diacritization/log_cleaned_dir/CA_MSA.base.cbhg/models/20000-snapshot.pt" # load last saved model
|
poetry_diacritizer/config/gpt-0.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_0
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 0
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: true
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-1.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_1
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 1
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: true
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-2.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_2
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 2
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: true
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-3.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_3
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 3
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: true
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-4.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_4
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 4
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: true
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-5.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_5
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 5
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: true
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-6.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_6
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 6
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: true
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-7.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_7
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 7
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: true
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-8.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_8
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 8
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: true
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-9.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_9
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 9
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: true
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-0-tash-proc.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: tash_proc
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_0_tash_proc
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 0
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-0-test.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_0_test
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 0
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-0.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_0
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 0
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-1-tash-proc.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: tash_proc
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_1_tash_proc
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 1
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-1.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_1
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 1
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-2-tash-proc.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: tash_proc
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_2_tash_proc
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 2
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-2.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_2
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 2
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-3-tash-proc.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: tash_proc
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_3_tash_proc
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 3
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-3.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_3
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 3
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-4-tash-proc.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: tash_proc
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_4_tash_proc
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 4
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-4.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_4
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 4
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-5-tash-proc.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: tash_proc
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_5_tash_proc
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 5
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-5-test.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: logs/log_dir_cls_5_test
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 5
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-5.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_5
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 5
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-6-tash-proc.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: tash_proc
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_6_tash_proc
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 6
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-6.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_6
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 6
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-7-tash-proc.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: tash_proc
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_7_tash_proc
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 7
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-7.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_7
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 7
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-8-tash-proc.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: tash_proc
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_8_tash_proc
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 8
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-8.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_8
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 8
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-9-tash-proc.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: tash_proc
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_9_tash_proc
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 9
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-9-test.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: logs/log_dir_cls_9_test
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 9
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-9.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_9
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 9
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-cls-tash-proc.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: tash_proc
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_cls_0_test
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 5000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 0
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: false
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|
poetry_diacritizer/config/gpt-lstm-0-50K.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_beta1: 0.9
|
2 |
+
adam_beta2: 0.999
|
3 |
+
base_model_path: ashaar-from-scratch-with-spaces-no-tatweel-epochs-75
|
4 |
+
batch_size: 64
|
5 |
+
data_directory: data
|
6 |
+
data_separator: '|'
|
7 |
+
data_type: CA_MSA
|
8 |
+
device: cuda
|
9 |
+
diacritics_separator: '*'
|
10 |
+
error_rates_n_batches: 10000
|
11 |
+
evaluate_frequency: 50000000
|
12 |
+
evaluate_with_error_rates_frequency: 1000
|
13 |
+
freeze: true
|
14 |
+
is_data_preprocessed: false
|
15 |
+
learning_rate: 0.001
|
16 |
+
load_test_data: false
|
17 |
+
load_training_data: true
|
18 |
+
load_validation_data: true
|
19 |
+
log_directory: log_dir_lstm_0_50K
|
20 |
+
max_eval_batches: -1
|
21 |
+
max_len: 600
|
22 |
+
max_sen_len: 256
|
23 |
+
max_steps: 50000
|
24 |
+
model_save_frequency: 5000
|
25 |
+
n_layer: 0
|
26 |
+
n_predicted_text_tensorboard: 10
|
27 |
+
n_steps_avg_losses:
|
28 |
+
- 100
|
29 |
+
- 500
|
30 |
+
- 1000
|
31 |
+
- 5000
|
32 |
+
n_test_examples: null
|
33 |
+
n_training_examples: null
|
34 |
+
n_validation_examples: null
|
35 |
+
optimizer_type: Adam
|
36 |
+
session_name: base
|
37 |
+
test_file_name: test.csv
|
38 |
+
test_model_path: null
|
39 |
+
text_cleaner: valid_arabic_cleaners
|
40 |
+
text_encoder: ArabicEncoderWithStartSymbol
|
41 |
+
train_plotting_frequency: 50000000
|
42 |
+
train_resume_model_path: null
|
43 |
+
use_decay: true
|
44 |
+
use_lstm: true
|
45 |
+
use_mixed_precision: false
|
46 |
+
weight_decay: 0.0
|