Shaltiel committed on
Commit
20254bf
verified
1 Parent(s): 6c3937d

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +438 -0
README.md CHANGED
@@ -1,3 +1,441 @@
1
  ---
2
  license: cc-by-4.0
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: cc-by-4.0
3
+ language:
4
+ - he
5
+ inference: false
6
  ---
7
+ # DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew
8
+
9
+ State-of-the-art language model for Hebrew, released [here](https://arxiv.org/abs/2308.16687).
10
+
11
+ This is the fine-tuned BERT-tiny model for the joint parsing of the following tasks:
12
+
13
+ - Prefix Segmentation
14
+ - Morphological Disambiguation
15
+ - Lexicographical Analysis (Lemmatization)
16
+ - Syntactical Parsing (Dependency-Tree)
17
+ - Named-Entity Recognition
18
+
19
+ For the bert-base models for other tasks, see [here](https://huggingface.co/collections/dicta-il/dictabert-6588e7cc08f83845fc42a18b).
20
+
21
+ Sample usage:
22
+
23
+ ```python
24
+ from transformers import AutoModel, AutoTokenizer
25
+
26
+ tokenizer = AutoTokenizer.from_pretrained('dicta-il/dictabert-tiny-joint')
27
+ model = AutoModel.from_pretrained('dicta-il/dictabert-tiny-joint', trust_remote_code=True)
28
+
29
+ model.eval()
30
+
31
+ sentence = 'בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים'
32
+ print(model.predict([sentence], tokenizer))
33
+ ```
34
+
35
+ Output:
36
+ ```json
37
+ [
38
+ {
39
+ "text": "בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים",
40
+ "tokens": [
41
+ {
42
+ "token": "בשנת",
43
+ "syntax": {
44
+ "word": "בשנת",
45
+ "dep_head_idx": 2,
46
+ "dep_func": "obl",
47
+ "dep_head": "השלים"
48
+ },
49
+ "seg": [
50
+ "ב",
51
+ "שנת"
52
+ ],
53
+ "lex": "שנה",
54
+ "morph": {
55
+ "token": "בשנת",
56
+ "pos": "NOUN",
57
+ "feats": {
58
+ "Gender": "Fem",
59
+ "Number": "Sing"
60
+ },
61
+ "prefixes": [
62
+ "ADP"
63
+ ],
64
+ "suffix": false
65
+ }
66
+ },
67
+ {
68
+ "token": "1948",
69
+ "syntax": {
70
+ "word": "1948",
71
+ "dep_head_idx": 0,
72
+ "dep_func": "compound",
73
+ "dep_head": "בשנת"
74
+ },
75
+ "seg": [
76
+ "1948"
77
+ ],
78
+ "lex": "1948",
79
+ "morph": {
80
+ "token": "1948",
81
+ "pos": "NUM",
82
+ "feats": {},
83
+ "prefixes": [],
84
+ "suffix": false
85
+ }
86
+ },
87
+ {
88
+ "token": "השלים",
89
+ "syntax": {
90
+ "word": "השלים",
91
+ "dep_head_idx": -1,
92
+ "dep_func": "root",
93
+ "dep_head": "הומוריסטיים"
94
+ },
95
+ "seg": [
96
+ "השלים"
97
+ ],
98
+ "lex": "השלים",
99
+ "morph": {
100
+ "token": "השלים",
101
+ "pos": "VERB",
102
+ "feats": {
103
+ "Gender": "Masc",
104
+ "Number": "Sing",
105
+ "Person": "3",
106
+ "Tense": "Past"
107
+ },
108
+ "prefixes": [],
109
+ "suffix": false
110
+ }
111
+ },
112
+ {
113
+ "token": "אפרים",
114
+ "syntax": {
115
+ "word": "אפרים",
116
+ "dep_head_idx": 2,
117
+ "dep_func": "nsubj",
118
+ "dep_head": "השלים"
119
+ },
120
+ "seg": [
121
+ "אפרים"
122
+ ],
123
+ "lex": "אפרים",
124
+ "morph": {
125
+ "token": "אפרים",
126
+ "pos": "PROPN",
127
+ "feats": {},
128
+ "prefixes": [],
129
+ "suffix": false
130
+ }
131
+ },
132
+ {
133
+ "token": "קישון",
134
+ "syntax": {
135
+ "word": "קישון",
136
+ "dep_head_idx": 3,
137
+ "dep_func": "flat",
138
+ "dep_head": "אפרים"
139
+ },
140
+ "seg": [
141
+ "קישון"
142
+ ],
143
+ "lex": "קישון",
144
+ "morph": {
145
+ "token": "קישון",
146
+ "pos": "PROPN",
147
+ "feats": {},
148
+ "prefixes": [],
149
+ "suffix": false
150
+ }
151
+ },
152
+ {
153
+ "token": "את",
154
+ "syntax": {
155
+ "word": "את",
156
+ "dep_head_idx": 6,
157
+ "dep_func": "case",
158
+ "dep_head": "לימודיו"
159
+ },
160
+ "seg": [
161
+ "את"
162
+ ],
163
+ "lex": "את",
164
+ "morph": {
165
+ "token": "את",
166
+ "pos": "ADP",
167
+ "feats": {},
168
+ "prefixes": [],
169
+ "suffix": false
170
+ }
171
+ },
172
+ {
173
+ "token": "לימודיו",
174
+ "syntax": {
175
+ "word": "לימודיו",
176
+ "dep_head_idx": 2,
177
+ "dep_func": "obj",
178
+ "dep_head": "השלים"
179
+ },
180
+ "seg": [
181
+ "לימודיו"
182
+ ],
183
+ "lex": "לימוד",
184
+ "morph": {
185
+ "token": "לימודיו",
186
+ "pos": "NOUN",
187
+ "feats": {
188
+ "Gender": "Masc",
189
+ "Number": "Plur"
190
+ },
191
+ "prefixes": [],
192
+ "suffix": "PRON",
193
+ "suffix_feats": {
194
+ "Gender": "Masc",
195
+ "Number": "Sing",
196
+ "Person": "3"
197
+ }
198
+ }
199
+ },
200
+ {
201
+ "token": "בפיסול",
202
+ "syntax": {
203
+ "word": "בפיסול",
204
+ "dep_head_idx": 6,
205
+ "dep_func": "nmod",
206
+ "dep_head": "לימודיו"
207
+ },
208
+ "seg": [
209
+ "ב",
210
+ "פיסול"
211
+ ],
212
+ "lex": "פיסול",
213
+ "morph": {
214
+ "token": "בפיסול",
215
+ "pos": "NOUN",
216
+ "feats": {
217
+ "Gender": "Masc",
218
+ "Number": "Sing"
219
+ },
220
+ "prefixes": [
221
+ "ADP"
222
+ ],
223
+ "suffix": false
224
+ }
225
+ },
226
+ {
227
+ "token": "מתכת",
228
+ "syntax": {
229
+ "word": "מתכת",
230
+ "dep_head_idx": 7,
231
+ "dep_func": "compound",
232
+ "dep_head": "בפיסול"
233
+ },
234
+ "seg": [
235
+ "מתכת"
236
+ ],
237
+ "lex": "מתכת",
238
+ "morph": {
239
+ "token": "מתכת",
240
+ "pos": "NOUN",
241
+ "feats": {
242
+ "Gender": "Fem",
243
+ "Number": "Sing"
244
+ },
245
+ "prefixes": [],
246
+ "suffix": false
247
+ }
248
+ },
249
+ {
250
+ "token": "ובתולדות",
251
+ "syntax": {
252
+ "word": "ובתולדות",
253
+ "dep_head_idx": 7,
254
+ "dep_func": "conj",
255
+ "dep_head": "בפיסול"
256
+ },
257
+ "seg": [
258
+ "וב",
259
+ "תולדות"
260
+ ],
261
+ "lex": "תולדה",
262
+ "morph": {
263
+ "token": "ובתולדות",
264
+ "pos": "NOUN",
265
+ "feats": {
266
+ "Gender": "Fem",
267
+ "Number": "Plur"
268
+ },
269
+ "prefixes": [
270
+ "CCONJ",
271
+ "ADP"
272
+ ],
273
+ "suffix": false
274
+ }
275
+ },
276
+ {
277
+ "token": "האמנות",
278
+ "syntax": {
279
+ "word": "האמנות",
280
+ "dep_head_idx": 9,
281
+ "dep_func": "compound",
282
+ "dep_head": "ובתולדות"
283
+ },
284
+ "seg": [
285
+ "ה",
286
+ "אמנות"
287
+ ],
288
+ "lex": "אומנות",
289
+ "morph": {
290
+ "token": "האמנות",
291
+ "pos": "NOUN",
292
+ "feats": {
293
+ "Gender": "Fem",
294
+ "Number": "Sing"
295
+ },
296
+ "prefixes": [
297
+ "DET"
298
+ ],
299
+ "suffix": false
300
+ }
301
+ },
302
+ {
303
+ "token": "והחל",
304
+ "syntax": {
305
+ "word": "והחל",
306
+ "dep_head_idx": 2,
307
+ "dep_func": "conj",
308
+ "dep_head": "השלים"
309
+ },
310
+ "seg": [
311
+ "ו",
312
+ "החל"
313
+ ],
314
+ "lex": "החל",
315
+ "morph": {
316
+ "token": "והחל",
317
+ "pos": "VERB",
318
+ "feats": {
319
+ "Gender": "Masc",
320
+ "Number": "Sing",
321
+ "Person": "3",
322
+ "Tense": "Past"
323
+ },
324
+ "prefixes": [
325
+ "CCONJ"
326
+ ],
327
+ "suffix": false
328
+ }
329
+ },
330
+ {
331
+ "token": "לפרסם",
332
+ "syntax": {
333
+ "word": "לפרסם",
334
+ "dep_head_idx": 11,
335
+ "dep_func": "xcomp",
336
+ "dep_head": "והחל"
337
+ },
338
+ "seg": [
339
+ "לפרסם"
340
+ ],
341
+ "lex": "פרסם",
342
+ "morph": {
343
+ "token": "לפרסם",
344
+ "pos": "VERB",
345
+ "feats": {},
346
+ "prefixes": [],
347
+ "suffix": false
348
+ }
349
+ },
350
+ {
351
+ "token": "מאמרים",
352
+ "syntax": {
353
+ "word": "מאמרים",
354
+ "dep_head_idx": 12,
355
+ "dep_func": "obj",
356
+ "dep_head": "לפרסם"
357
+ },
358
+ "seg": [
359
+ "מאמרים"
360
+ ],
361
+ "lex": "מאמר",
362
+ "morph": {
363
+ "token": "מאמרים",
364
+ "pos": "NOUN",
365
+ "feats": {
366
+ "Gender": "Masc",
367
+ "Number": "Plur"
368
+ },
369
+ "prefixes": [],
370
+ "suffix": false
371
+ }
372
+ },
373
+ {
374
+ "token": "הומוריסטיים",
375
+ "syntax": {
376
+ "word": "הומוריסטיים",
377
+ "dep_head_idx": 13,
378
+ "dep_func": "amod",
379
+ "dep_head": "מאמרים"
380
+ },
381
+ "seg": [
382
+ "הומוריסטיים"
383
+ ],
384
+ "lex": "הומוריסטי",
385
+ "morph": {
386
+ "token": "הומוריסטיים",
387
+ "pos": "ADJ",
388
+ "feats": {
389
+ "Gender": "Masc",
390
+ "Number": "Plur"
391
+ },
392
+ "prefixes": [],
393
+ "suffix": false
394
+ }
395
+ }
396
+ ],
397
+ "root_idx": 2,
398
+ "ner_entities": [
399
+ {
400
+ "phrase": "1948",
401
+ "label": "TIMEX"
402
+ },
403
+ {
404
+ "phrase": "אפרים קישון",
405
+ "label": "PER"
406
+ }
407
+ ]
408
+ }
409
+ ]
410
+ ```
411
+
412
+
413
+ ## Citation
414
+
415
+ If you use DictaBERT in your research, please cite ```DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew```
416
+
417
+ **BibTeX:**
418
+
419
+ ```bibtex
420
+ @misc{shmidman2023dictabert,
421
+ title={DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew},
422
+ author={Shaltiel Shmidman and Avi Shmidman and Moshe Koppel},
423
+ year={2023},
424
+ eprint={2308.16687},
425
+ archivePrefix={arXiv},
426
+ primaryClass={cs.CL}
427
+ }
428
+ ```
429
+
430
+ ## License
431
+
432
+ Shield: [![CC BY 4.0][cc-by-shield]][cc-by]
433
+
434
+ This work is licensed under a
435
+ [Creative Commons Attribution 4.0 International License][cc-by].
436
+
437
+ [![CC BY 4.0][cc-by-image]][cc-by]
438
+
439
+ [cc-by]: http://creativecommons.org/licenses/by/4.0/
440
+ [cc-by-image]: https://i.creativecommons.org/l/by/4.0/88x31.png
441
+ [cc-by-shield]: https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg