dicta-il merge
Browse files- .gitattributes +1 -0
- .idea/.gitignore +8 -0
- README.md +591 -3
- config.json +92 -0
- model.safetensors +3 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +63 -0
- vocab.txt +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
vocab.txt filter=lfs diff=lfs merge=lfs -text
|
.idea/.gitignore
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default ignored files
|
2 |
+
/shelf/
|
3 |
+
/workspace.xml
|
4 |
+
# Editor-based HTTP Client requests
|
5 |
+
/httpRequests/
|
6 |
+
# Datasource local storage ignored files
|
7 |
+
/dataSources/
|
8 |
+
/dataSources.local.xml
|
README.md
CHANGED
@@ -1,3 +1,591 @@
|
|
1 |
-
---
|
2 |
-
license: cc-by-4.0
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: cc-by-4.0
|
3 |
+
language:
|
4 |
+
- he
|
5 |
+
inference: false
|
6 |
+
---
|
7 |
+
# DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew
|
8 |
+
|
9 |
+
State-of-the-art language model for parsing Hebrew, released [here](https://arxiv.org/abs/2403.06970).
|
10 |
+
|
11 |
+
This is the fine-tuned model for the joint parsing of the following tasks:
|
12 |
+
|
13 |
+
- Prefix Segmentation
|
14 |
+
- Morphological Disambiguation
|
15 |
+
- Lexicographical Analysis (Lemmatization)
|
16 |
+
- Syntactical Parsing (Dependency-Tree)
|
17 |
+
- Named-Entity Recognition
|
18 |
+
|
19 |
+
This model was initialized from dictabert-**large**-joint and tuned on the Hebrew UD Treebank and NEMO corpora, to align the predictions of the model to the tagging methodology in those corpora.
|
20 |
+
|
21 |
+
A live demo of the `dictabert-joint` model with instant visualization of the syntax tree can be found [here](https://huggingface.co/spaces/dicta-il/joint-demo).
|
22 |
+
|
23 |
+
For a faster model, you can use the equivalent bert-tiny model for this task [here](https://huggingface.co/dicta-il/dictabert-tiny-parse).
|
24 |
+
|
25 |
+
For the bert-base models for other tasks, see [here](https://huggingface.co/collections/dicta-il/dictabert-6588e7cc08f83845fc42a18b).
|
26 |
+
|
27 |
+
---
|
28 |
+
|
29 |
+
The model currently supports 3 types of output:
|
30 |
+
|
31 |
+
1. **JSON**: The model returns a JSON object for each sentence in the input, where for each sentence we have the sentence text, the NER entities, and the list of tokens. For each token we include the output from each of the tasks.
|
32 |
+
```python
|
33 |
+
model.predict(..., output_style='json')
|
34 |
+
```
|
35 |
+
|
36 |
+
1. **UD**: The model returns the full UD output for each sentence, according to the style of the Hebrew UD Treebank.
|
37 |
+
```python
|
38 |
+
model.predict(..., output_style='ud')
|
39 |
+
```
|
40 |
+
|
41 |
+
1. **UD, in the style of IAHLT**: The model returns the full UD output, with slight modifications to match the style of IAHLT. The differences are mostly in the granularity of some dependency relations, how the suffix of a word is broken up, and implicit definite articles. The actual tagging behavior doesn't change.
|
42 |
+
```python
|
43 |
+
model.predict(..., output_style='iahlt_ud')
|
44 |
+
```
|
45 |
+
|
46 |
+
---
|
47 |
+
|
48 |
+
If you only need the output for one of the tasks, you can tell the model to not initialize some of the heads, for example:
|
49 |
+
```python
|
50 |
+
model = AutoModel.from_pretrained('dicta-il/dictabert-large-parse', trust_remote_code=True, do_lex=False)
|
51 |
+
```
|
52 |
+
|
53 |
+
The available options are: `do_lex`, `do_syntax`, `do_ner`, `do_prefix`, `do_morph`.
|
54 |
+
|
55 |
+
---
|
56 |
+
|
57 |
+
Sample usage:
|
58 |
+
|
59 |
+
```python
|
60 |
+
from transformers import AutoModel, AutoTokenizer
|
61 |
+
|
62 |
+
tokenizer = AutoTokenizer.from_pretrained('dicta-il/dictabert-large-parse')
|
63 |
+
model = AutoModel.from_pretrained('dicta-il/dictabert-large-parse', trust_remote_code=True)
|
64 |
+
|
65 |
+
model.eval()
|
66 |
+
|
67 |
+
sentence = 'בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים'
|
68 |
+
print(model.predict([sentence], tokenizer, output_style='json')) # see below for other return formats
|
69 |
+
```
|
70 |
+
|
71 |
+
Output:
|
72 |
+
```json
|
73 |
+
[
|
74 |
+
{
|
75 |
+
"text": "בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים",
|
76 |
+
"tokens": [
|
77 |
+
{
|
78 |
+
"token": "בשנת",
|
79 |
+
"offsets": {
|
80 |
+
"start": 0,
|
81 |
+
"end": 4
|
82 |
+
},
|
83 |
+
"syntax": {
|
84 |
+
"word": "בשנת",
|
85 |
+
"dep_head_idx": 2,
|
86 |
+
"dep_func": "obl",
|
87 |
+
"dep_head": "השלים"
|
88 |
+
},
|
89 |
+
"seg": [
|
90 |
+
"ב",
|
91 |
+
"שנת"
|
92 |
+
],
|
93 |
+
"lex": "שנה",
|
94 |
+
"morph": {
|
95 |
+
"token": "בשנת",
|
96 |
+
"pos": "NOUN",
|
97 |
+
"feats": {
|
98 |
+
"Gender": "Fem",
|
99 |
+
"Number": "Sing"
|
100 |
+
},
|
101 |
+
"prefixes": [
|
102 |
+
"ADP"
|
103 |
+
],
|
104 |
+
"suffix": false
|
105 |
+
}
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"token": "1948",
|
109 |
+
"offsets": {
|
110 |
+
"start": 5,
|
111 |
+
"end": 9
|
112 |
+
},
|
113 |
+
"syntax": {
|
114 |
+
"word": "1948",
|
115 |
+
"dep_head_idx": 0,
|
116 |
+
"dep_func": "compound:smixut",
|
117 |
+
"dep_head": "בשנת"
|
118 |
+
},
|
119 |
+
"seg": [
|
120 |
+
"1948"
|
121 |
+
],
|
122 |
+
"lex": "1948",
|
123 |
+
"morph": {
|
124 |
+
"token": "1948",
|
125 |
+
"pos": "NUM",
|
126 |
+
"feats": {},
|
127 |
+
"prefixes": [],
|
128 |
+
"suffix": false
|
129 |
+
}
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"token": "השלים",
|
133 |
+
"offsets": {
|
134 |
+
"start": 10,
|
135 |
+
"end": 15
|
136 |
+
},
|
137 |
+
"syntax": {
|
138 |
+
"word": "השלים",
|
139 |
+
"dep_head_idx": -1,
|
140 |
+
"dep_func": "root",
|
141 |
+
"dep_head": "הומוריסטיים"
|
142 |
+
},
|
143 |
+
"seg": [
|
144 |
+
"השלים"
|
145 |
+
],
|
146 |
+
"lex": "השלים",
|
147 |
+
"morph": {
|
148 |
+
"token": "השלים",
|
149 |
+
"pos": "VERB",
|
150 |
+
"feats": {
|
151 |
+
"Gender": "Masc",
|
152 |
+
"Number": "Sing",
|
153 |
+
"Person": "3",
|
154 |
+
"Tense": "Past"
|
155 |
+
},
|
156 |
+
"prefixes": [],
|
157 |
+
"suffix": false
|
158 |
+
}
|
159 |
+
},
|
160 |
+
{
|
161 |
+
"token": "אפרים",
|
162 |
+
"offsets": {
|
163 |
+
"start": 16,
|
164 |
+
"end": 21
|
165 |
+
},
|
166 |
+
"syntax": {
|
167 |
+
"word": "אפרים",
|
168 |
+
"dep_head_idx": 2,
|
169 |
+
"dep_func": "nsubj",
|
170 |
+
"dep_head": "השלים"
|
171 |
+
},
|
172 |
+
"seg": [
|
173 |
+
"אפרים"
|
174 |
+
],
|
175 |
+
"lex": "אפרים",
|
176 |
+
"morph": {
|
177 |
+
"token": "אפרים",
|
178 |
+
"pos": "PROPN",
|
179 |
+
"feats": {},
|
180 |
+
"prefixes": [],
|
181 |
+
"suffix": false
|
182 |
+
}
|
183 |
+
},
|
184 |
+
{
|
185 |
+
"token": "קישון",
|
186 |
+
"offsets": {
|
187 |
+
"start": 22,
|
188 |
+
"end": 27
|
189 |
+
},
|
190 |
+
"syntax": {
|
191 |
+
"word": "קישון",
|
192 |
+
"dep_head_idx": 3,
|
193 |
+
"dep_func": "flat:name",
|
194 |
+
"dep_head": "אפרים"
|
195 |
+
},
|
196 |
+
"seg": [
|
197 |
+
"קישון"
|
198 |
+
],
|
199 |
+
"lex": "קישון",
|
200 |
+
"morph": {
|
201 |
+
"token": "קישון",
|
202 |
+
"pos": "PROPN",
|
203 |
+
"feats": {},
|
204 |
+
"prefixes": [],
|
205 |
+
"suffix": false
|
206 |
+
}
|
207 |
+
},
|
208 |
+
{
|
209 |
+
"token": "את",
|
210 |
+
"offsets": {
|
211 |
+
"start": 28,
|
212 |
+
"end": 30
|
213 |
+
},
|
214 |
+
"syntax": {
|
215 |
+
"word": "את",
|
216 |
+
"dep_head_idx": 6,
|
217 |
+
"dep_func": "case:acc",
|
218 |
+
"dep_head": "לימודיו"
|
219 |
+
},
|
220 |
+
"seg": [
|
221 |
+
"את"
|
222 |
+
],
|
223 |
+
"lex": "את",
|
224 |
+
"morph": {
|
225 |
+
"token": "את",
|
226 |
+
"pos": "ADP",
|
227 |
+
"feats": {},
|
228 |
+
"prefixes": [],
|
229 |
+
"suffix": false
|
230 |
+
}
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"token": "לימודיו",
|
234 |
+
"offsets": {
|
235 |
+
"start": 31,
|
236 |
+
"end": 38
|
237 |
+
},
|
238 |
+
"syntax": {
|
239 |
+
"word": "לימודיו",
|
240 |
+
"dep_head_idx": 2,
|
241 |
+
"dep_func": "obj",
|
242 |
+
"dep_head": "השלים"
|
243 |
+
},
|
244 |
+
"seg": [
|
245 |
+
"לימודיו"
|
246 |
+
],
|
247 |
+
"lex": "לימוד",
|
248 |
+
"morph": {
|
249 |
+
"token": "לימודיו",
|
250 |
+
"pos": "NOUN",
|
251 |
+
"feats": {
|
252 |
+
"Gender": "Masc",
|
253 |
+
"Number": "Plur"
|
254 |
+
},
|
255 |
+
"prefixes": [],
|
256 |
+
"suffix": "ADP_PRON",
|
257 |
+
"suffix_feats": {
|
258 |
+
"Gender": "Masc",
|
259 |
+
"Number": "Sing",
|
260 |
+
"Person": "3"
|
261 |
+
}
|
262 |
+
}
|
263 |
+
},
|
264 |
+
{
|
265 |
+
"token": "בפיסול",
|
266 |
+
"offsets": {
|
267 |
+
"start": 39,
|
268 |
+
"end": 45
|
269 |
+
},
|
270 |
+
"syntax": {
|
271 |
+
"word": "בפיסול",
|
272 |
+
"dep_head_idx": 6,
|
273 |
+
"dep_func": "nmod",
|
274 |
+
"dep_head": "לימודיו"
|
275 |
+
},
|
276 |
+
"seg": [
|
277 |
+
"ב",
|
278 |
+
"פיסול"
|
279 |
+
],
|
280 |
+
"lex": "פיסול",
|
281 |
+
"morph": {
|
282 |
+
"token": "בפיסול",
|
283 |
+
"pos": "NOUN",
|
284 |
+
"feats": {
|
285 |
+
"Gender": "Masc",
|
286 |
+
"Number": "Sing"
|
287 |
+
},
|
288 |
+
"prefixes": [
|
289 |
+
"ADP"
|
290 |
+
],
|
291 |
+
"suffix": false
|
292 |
+
}
|
293 |
+
},
|
294 |
+
{
|
295 |
+
"token": "מתכת",
|
296 |
+
"offsets": {
|
297 |
+
"start": 46,
|
298 |
+
"end": 50
|
299 |
+
},
|
300 |
+
"syntax": {
|
301 |
+
"word": "מתכת",
|
302 |
+
"dep_head_idx": 7,
|
303 |
+
"dep_func": "compound:smixut",
|
304 |
+
"dep_head": "בפיסול"
|
305 |
+
},
|
306 |
+
"seg": [
|
307 |
+
"מתכת"
|
308 |
+
],
|
309 |
+
"lex": "מתכת",
|
310 |
+
"morph": {
|
311 |
+
"token": "מתכת",
|
312 |
+
"pos": "NOUN",
|
313 |
+
"feats": {
|
314 |
+
"Gender": "Fem",
|
315 |
+
"Number": "Sing"
|
316 |
+
},
|
317 |
+
"prefixes": [],
|
318 |
+
"suffix": false
|
319 |
+
}
|
320 |
+
},
|
321 |
+
{
|
322 |
+
"token": "ובתולדות",
|
323 |
+
"offsets": {
|
324 |
+
"start": 51,
|
325 |
+
"end": 59
|
326 |
+
},
|
327 |
+
"syntax": {
|
328 |
+
"word": "ובתולדות",
|
329 |
+
"dep_head_idx": 7,
|
330 |
+
"dep_func": "conj",
|
331 |
+
"dep_head": "בפיסול"
|
332 |
+
},
|
333 |
+
"seg": [
|
334 |
+
"וב",
|
335 |
+
"תולדות"
|
336 |
+
],
|
337 |
+
"lex": "תולדה",
|
338 |
+
"morph": {
|
339 |
+
"token": "ובתולדות",
|
340 |
+
"pos": "NOUN",
|
341 |
+
"feats": {
|
342 |
+
"Gender": "Fem",
|
343 |
+
"Number": "Plur"
|
344 |
+
},
|
345 |
+
"prefixes": [
|
346 |
+
"CCONJ",
|
347 |
+
"ADP"
|
348 |
+
],
|
349 |
+
"suffix": false
|
350 |
+
}
|
351 |
+
},
|
352 |
+
{
|
353 |
+
"token": "האמנות",
|
354 |
+
"offsets": {
|
355 |
+
"start": 60,
|
356 |
+
"end": 66
|
357 |
+
},
|
358 |
+
"syntax": {
|
359 |
+
"word": "האמנות",
|
360 |
+
"dep_head_idx": 9,
|
361 |
+
"dep_func": "compound:smixut",
|
362 |
+
"dep_head": "ובתולדות"
|
363 |
+
},
|
364 |
+
"seg": [
|
365 |
+
"ה",
|
366 |
+
"אמנות"
|
367 |
+
],
|
368 |
+
"lex": "אומנות",
|
369 |
+
"morph": {
|
370 |
+
"token": "האמנות",
|
371 |
+
"pos": "NOUN",
|
372 |
+
"feats": {
|
373 |
+
"Gender": "Fem",
|
374 |
+
"Number": "Sing"
|
375 |
+
},
|
376 |
+
"prefixes": [
|
377 |
+
"DET"
|
378 |
+
],
|
379 |
+
"suffix": false
|
380 |
+
}
|
381 |
+
},
|
382 |
+
{
|
383 |
+
"token": "והחל",
|
384 |
+
"offsets": {
|
385 |
+
"start": 67,
|
386 |
+
"end": 71
|
387 |
+
},
|
388 |
+
"syntax": {
|
389 |
+
"word": "והחל",
|
390 |
+
"dep_head_idx": 2,
|
391 |
+
"dep_func": "conj",
|
392 |
+
"dep_head": "השלים"
|
393 |
+
},
|
394 |
+
"seg": [
|
395 |
+
"ו",
|
396 |
+
"החל"
|
397 |
+
],
|
398 |
+
"lex": "החל",
|
399 |
+
"morph": {
|
400 |
+
"token": "והחל",
|
401 |
+
"pos": "VERB",
|
402 |
+
"feats": {
|
403 |
+
"Gender": "Masc",
|
404 |
+
"Number": "Sing",
|
405 |
+
"Person": "3",
|
406 |
+
"Tense": "Past"
|
407 |
+
},
|
408 |
+
"prefixes": [
|
409 |
+
"CCONJ"
|
410 |
+
],
|
411 |
+
"suffix": false
|
412 |
+
}
|
413 |
+
},
|
414 |
+
{
|
415 |
+
"token": "לפרסם",
|
416 |
+
"offsets": {
|
417 |
+
"start": 72,
|
418 |
+
"end": 77
|
419 |
+
},
|
420 |
+
"syntax": {
|
421 |
+
"word": "לפרסם",
|
422 |
+
"dep_head_idx": 11,
|
423 |
+
"dep_func": "xcomp",
|
424 |
+
"dep_head": "והחל"
|
425 |
+
},
|
426 |
+
"seg": [
|
427 |
+
"לפרסם"
|
428 |
+
],
|
429 |
+
"lex": "פרסם",
|
430 |
+
"morph": {
|
431 |
+
"token": "לפרסם",
|
432 |
+
"pos": "VERB",
|
433 |
+
"feats": {},
|
434 |
+
"prefixes": [],
|
435 |
+
"suffix": false
|
436 |
+
}
|
437 |
+
},
|
438 |
+
{
|
439 |
+
"token": "מאמרים",
|
440 |
+
"offsets": {
|
441 |
+
"start": 78,
|
442 |
+
"end": 84
|
443 |
+
},
|
444 |
+
"syntax": {
|
445 |
+
"word": "מאמרים",
|
446 |
+
"dep_head_idx": 12,
|
447 |
+
"dep_func": "obj",
|
448 |
+
"dep_head": "לפרסם"
|
449 |
+
},
|
450 |
+
"seg": [
|
451 |
+
"מאמרים"
|
452 |
+
],
|
453 |
+
"lex": "מאמר",
|
454 |
+
"morph": {
|
455 |
+
"token": "מאמרים",
|
456 |
+
"pos": "NOUN",
|
457 |
+
"feats": {
|
458 |
+
"Gender": "Masc",
|
459 |
+
"Number": "Plur"
|
460 |
+
},
|
461 |
+
"prefixes": [],
|
462 |
+
"suffix": false
|
463 |
+
}
|
464 |
+
},
|
465 |
+
{
|
466 |
+
"token": "הומוריסטיים",
|
467 |
+
"offsets": {
|
468 |
+
"start": 85,
|
469 |
+
"end": 96
|
470 |
+
},
|
471 |
+
"syntax": {
|
472 |
+
"word": "הומוריסטיים",
|
473 |
+
"dep_head_idx": 13,
|
474 |
+
"dep_func": "amod",
|
475 |
+
"dep_head": "מאמרים"
|
476 |
+
},
|
477 |
+
"seg": [
|
478 |
+
"הומוריסטיים"
|
479 |
+
],
|
480 |
+
"lex": "הומוריסטי",
|
481 |
+
"morph": {
|
482 |
+
"token": "הומוריסטיים",
|
483 |
+
"pos": "ADJ",
|
484 |
+
"feats": {
|
485 |
+
"Gender": "Masc",
|
486 |
+
"Number": "Plur"
|
487 |
+
},
|
488 |
+
"prefixes": [],
|
489 |
+
"suffix": false
|
490 |
+
}
|
491 |
+
}
|
492 |
+
],
|
493 |
+
"root_idx": 2,
|
494 |
+
"ner_entities": [
|
495 |
+
{
|
496 |
+
"phrase": "1948",
|
497 |
+
"label": "TIMEX",
|
498 |
+
"start": 5,
|
499 |
+
"end": 9,
|
500 |
+
"token_start": 1,
|
501 |
+
"token_end": 1
|
502 |
+
},
|
503 |
+
{
|
504 |
+
"phrase": "אפרים קישון",
|
505 |
+
"label": "PER",
|
506 |
+
"start": 16,
|
507 |
+
"end": 27,
|
508 |
+
"token_start": 3,
|
509 |
+
"token_end": 4
|
510 |
+
}
|
511 |
+
]
|
512 |
+
}
|
513 |
+
]
|
514 |
+
```
|
515 |
+
|
516 |
+
You can also choose to get your response in UD format:
|
517 |
+
|
518 |
+
```python
|
519 |
+
sentence = 'בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים'
|
520 |
+
print(model.predict([sentence], tokenizer, output_style='ud'))
|
521 |
+
```
|
522 |
+
|
523 |
+
Results:
|
524 |
+
```json
|
525 |
+
[
|
526 |
+
[
|
527 |
+
"# sent_id = 1",
|
528 |
+
"# text = בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים",
|
529 |
+
"1-2\tבשנת\t_\t_\t_\t_\t_\t_\t_\t_",
|
530 |
+
"1\tב\tב\tADP\tADP\t_\t2\tcase\t_\t_",
|
531 |
+
"2\tשנת\tשנה\tNOUN\tNOUN\tGender=Fem|Number=Sing\t4\tobl\t_\t_",
|
532 |
+
"3\t1948\t1948\tNUM\tNUM\t\t2\tcompound:smixut\t_\t_",
|
533 |
+
"4\tהשלים\tהשלים\tVERB\tVERB\tGender=Masc|Number=Sing|Person=3|Tense=Past\t0\troot\t_\t_",
|
534 |
+
"5\tאפרים\tאפרים\tPROPN\tPROPN\t\t4\tnsubj\t_\t_",
|
535 |
+
"6\tקישון\tקישון\tPROPN\tPROPN\t\t5\tflat:name\t_\t_",
|
536 |
+
"7\tאת\tאת\tADP\tADP\t\t8\tcase:acc\t_\t_",
|
537 |
+
"8-10\tלימודיו\t_\t_\t_\t_\t_\t_\t_\t_",
|
538 |
+
"8\tלימוד_\tלימוד\tNOUN\tNOUN\tGender=Masc|Number=Plur\t4\tobj\t_\t_",
|
539 |
+
"9\t_של_\tשל\tADP\tADP\t_\t10\tcase\t_\t_",
|
540 |
+
"10\t_הוא\tהוא\tPRON\tPRON\tGender=Masc|Number=Sing|Person=3\t8\tnmod:poss\t_\t_",
|
541 |
+
"11-12\tבפיסול\t_\t_\t_\t_\t_\t_\t_\t_",
|
542 |
+
"11\tב\tב\tADP\tADP\t_\t12\tcase\t_\t_",
|
543 |
+
"12\tפיסול\tפיסול\tNOUN\tNOUN\tGender=Masc|Number=Sing\t8\tnmod\t_\t_",
|
544 |
+
"13\tמתכת\tמתכת\tNOUN\tNOUN\tGender=Fem|Number=Sing\t12\tcompound:smixut\t_\t_",
|
545 |
+
"14-16\tובתולדות\t_\t_\t_\t_\t_\t_\t_\t_",
|
546 |
+
"14\tו\tו\tCCONJ\tCCONJ\t_\t16\tcc\t_\t_",
|
547 |
+
"15\tב\tב\tADP\tADP\t_\t16\tcase\t_\t_",
|
548 |
+
"16\tתולדות\tתולדה\tNOUN\tNOUN\tGender=Fem|Number=Plur\t12\tconj\t_\t_",
|
549 |
+
"17-18\tהאמנות\t_\t_\t_\t_\t_\t_\t_\t_",
|
550 |
+
"17\tה\tה\tDET\tDET\t_\t18\tdet\t_\t_",
|
551 |
+
"18\tאמנות\tאומנות\tNOUN\tNOUN\tGender=Fem|Number=Sing\t16\tcompound:smixut\t_\t_",
|
552 |
+
"19-20\tוהחל\t_\t_\t_\t_\t_\t_\t_\t_",
|
553 |
+
"19\tו\tו\tCCONJ\tCCONJ\t_\t20\tcc\t_\t_",
|
554 |
+
"20\tהחל\tהחל\tVERB\tVERB\tGender=Masc|Number=Sing|Person=3|Tense=Past\t4\tconj\t_\t_",
|
555 |
+
"21\tלפרסם\tפרסם\tVERB\tVERB\t\t20\txcomp\t_\t_",
|
556 |
+
"22\tמאמרים\tמאמר\tNOUN\tNOUN\tGender=Masc|Number=Plur\t21\tobj\t_\t_",
|
557 |
+
"23\tהומוריסטיים\tהומוריסטי\tADJ\tADJ\tGender=Masc|Number=Plur\t22\tamod\t_\t_"
|
558 |
+
]
|
559 |
+
]
|
560 |
+
```
|
561 |
+
|
562 |
+
## Citation
|
563 |
+
|
564 |
+
If you use DictaBERT-large-parse in your research, please cite ```MRL Parsing Without Tears: The Case of Hebrew```.
|
565 |
+
|
566 |
+
**BibTeX:**
|
567 |
+
|
568 |
+
```bibtex
|
569 |
+
@misc{shmidman2024mrl,
|
570 |
+
title={MRL Parsing Without Tears: The Case of Hebrew},
|
571 |
+
author={Shaltiel Shmidman and Avi Shmidman and Moshe Koppel and Reut Tsarfaty},
|
572 |
+
year={2024},
|
573 |
+
eprint={2403.06970},
|
574 |
+
archivePrefix={arXiv},
|
575 |
+
primaryClass={cs.CL}
|
576 |
+
}
|
577 |
+
```
|
578 |
+
|
579 |
+
|
580 |
+
## License
|
581 |
+
|
582 |
+
Shield: [![CC BY 4.0][cc-by-shield]][cc-by]
|
583 |
+
|
584 |
+
This work is licensed under a
|
585 |
+
[Creative Commons Attribution 4.0 International License][cc-by].
|
586 |
+
|
587 |
+
[![CC BY 4.0][cc-by-image]][cc-by]
|
588 |
+
|
589 |
+
[cc-by]: http://creativecommons.org/licenses/by/4.0/
|
590 |
+
[cc-by-image]: https://i.creativecommons.org/l/by/4.0/88x31.png
|
591 |
+
[cc-by-shield]: https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg
|
config.json
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"BertForJointParsing"
|
4 |
+
],
|
5 |
+
"auto_map": {
|
6 |
+
"AutoModel": "dicta-il/dictabert-joint--BertForJointParsing.BertForJointParsing"
|
7 |
+
},
|
8 |
+
"attention_probs_dropout_prob": 0.1,
|
9 |
+
"classifier_dropout": null,
|
10 |
+
"do_lex": true,
|
11 |
+
"do_morph": true,
|
12 |
+
"do_ner": true,
|
13 |
+
"do_prefix": true,
|
14 |
+
"do_syntax": true,
|
15 |
+
"hidden_act": "gelu",
|
16 |
+
"hidden_dropout_prob": 0.1,
|
17 |
+
"hidden_size": 1024,
|
18 |
+
"id2label": {
|
19 |
+
"0": "B-ANG",
|
20 |
+
"1": "B-DUC",
|
21 |
+
"2": "B-EVE",
|
22 |
+
"3": "B-FAC",
|
23 |
+
"4": "B-GPE",
|
24 |
+
"5": "B-LOC",
|
25 |
+
"6": "B-ORG",
|
26 |
+
"7": "B-PER",
|
27 |
+
"8": "B-WOA",
|
28 |
+
"9": "B-INFORMAL",
|
29 |
+
"10": "B-MISC",
|
30 |
+
"11": "B-TIMEX",
|
31 |
+
"12": "B-TTL",
|
32 |
+
"13": "I-DUC",
|
33 |
+
"14": "I-EVE",
|
34 |
+
"15": "I-FAC",
|
35 |
+
"16": "I-GPE",
|
36 |
+
"17": "I-LOC",
|
37 |
+
"18": "I-ORG",
|
38 |
+
"19": "I-PER",
|
39 |
+
"20": "I-WOA",
|
40 |
+
"21": "I-ANG",
|
41 |
+
"22": "I-INFORMAL",
|
42 |
+
"23": "I-MISC",
|
43 |
+
"24": "I-TIMEX",
|
44 |
+
"25": "I-TTL",
|
45 |
+
"26": "O"
|
46 |
+
},
|
47 |
+
"initializer_range": 0.02,
|
48 |
+
"intermediate_size": 4096,
|
49 |
+
"label2id": {
|
50 |
+
"B-ANG": 0,
|
51 |
+
"B-DUC": 1,
|
52 |
+
"B-EVE": 2,
|
53 |
+
"B-FAC": 3,
|
54 |
+
"B-GPE": 4,
|
55 |
+
"B-INFORMAL": 9,
|
56 |
+
"B-LOC": 5,
|
57 |
+
"B-MISC": 10,
|
58 |
+
"B-ORG": 6,
|
59 |
+
"B-PER": 7,
|
60 |
+
"B-TIMEX": 11,
|
61 |
+
"B-TTL": 12,
|
62 |
+
"B-WOA": 8,
|
63 |
+
"I-ANG": 21,
|
64 |
+
"I-DUC": 13,
|
65 |
+
"I-EVE": 14,
|
66 |
+
"I-FAC": 15,
|
67 |
+
"I-GPE": 16,
|
68 |
+
"I-INFORMAL": 22,
|
69 |
+
"I-LOC": 17,
|
70 |
+
"I-MISC": 23,
|
71 |
+
"I-ORG": 18,
|
72 |
+
"I-PER": 19,
|
73 |
+
"I-TIMEX": 24,
|
74 |
+
"I-TTL": 25,
|
75 |
+
"I-WOA": 20,
|
76 |
+
"O": 26
|
77 |
+
},
|
78 |
+
"layer_norm_eps": 1e-12,
|
79 |
+
"max_position_embeddings": 512,
|
80 |
+
"model_type": "bert",
|
81 |
+
"newmodern": true,
|
82 |
+
"num_attention_heads": 16,
|
83 |
+
"num_hidden_layers": 24,
|
84 |
+
"pad_token_id": 0,
|
85 |
+
"position_embedding_type": "absolute",
|
86 |
+
"syntax_head_size": 128,
|
87 |
+
"torch_dtype": "float32",
|
88 |
+
"transformers_version": "4.36.2",
|
89 |
+
"type_vocab_size": 2,
|
90 |
+
"use_cache": true,
|
91 |
+
"vocab_size": 128000
|
92 |
+
}
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d25a305e90a34d3f74d8651651af2575be03fcee631a239a35fb538a13ed8158
|
3 |
+
size 1750709384
|
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": {
|
3 |
+
"content": "[CLS]",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"mask_token": {
|
10 |
+
"content": "[MASK]",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "[PAD]",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"sep_token": {
|
24 |
+
"content": "[SEP]",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": false,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
},
|
30 |
+
"unk_token": {
|
31 |
+
"content": "[UNK]",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": false,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
}
|
37 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "[UNK]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"content": "[CLS]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"2": {
|
20 |
+
"content": "[SEP]",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"3": {
|
28 |
+
"content": "[PAD]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"4": {
|
36 |
+
"content": "[MASK]",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
},
|
43 |
+
"5": {
|
44 |
+
"content": "[BLANK]",
|
45 |
+
"lstrip": false,
|
46 |
+
"normalized": false,
|
47 |
+
"rstrip": false,
|
48 |
+
"single_word": false,
|
49 |
+
"special": true
|
50 |
+
}
|
51 |
+
},
|
52 |
+
"clean_up_tokenization_spaces": true,
|
53 |
+
"cls_token": "[CLS]",
|
54 |
+
"do_lower_case": true,
|
55 |
+
"mask_token": "[MASK]",
|
56 |
+
"model_max_length": 512,
|
57 |
+
"pad_token": "[PAD]",
|
58 |
+
"sep_token": "[SEP]",
|
59 |
+
"strip_accents": null,
|
60 |
+
"tokenize_chinese_chars": true,
|
61 |
+
"tokenizer_class": "BertTokenizer",
|
62 |
+
"unk_token": "[UNK]"
|
63 |
+
}
|
vocab.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0fb90bfa35244d26f0065d1fcd0b5becc3da3d44d616a7e2aacaf6320b9fa2d0
|
3 |
+
size 1500244
|