File size: 5,145 Bytes
3fc19a0 b34be6b b16d907 e3b12e0 3fc19a0 b34be6b 3fc19a0 b34be6b fc1825b b34be6b 7fc904d b34be6b 0d87a37 b34be6b 65a05e2 b34be6b 6d59b1c b34be6b b16d907 b34be6b 4458cee b34be6b 4458cee b34be6b 4458cee b34be6b b87b465 b34be6b b87b465 b34be6b 441194f b87b465 e8b81f2 b87b465 e8b81f2 b87b465 e8b81f2 b3e3ab9 b34be6b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
---
tags:
- BERT
- Text Classification
- relation
language:
- ar
- en
license: mit
datasets:
- ACE2005
---
# Arabic Relation Extraction Model
- [Github repo](https://github.com/edchengg/GigaBERT)
- Relation Extraction model based on [GigaBERTv4](https://huggingface.co/lanwuwei/GigaBERT-v4-Arabic-and-English).
- Model detail: mark two entities in the sentence with special markers (e.g., ```XXXX <PER> entity1 </PER> XXXXXXX <ORG> entity2 </ORG> XXXXX```). Then we use the BERT [CLS] representation to make a prediction.
- ACE2005 Training data: Arabic
- [Relation tags](https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/arabic-relations-guidelines-v6.5.pdf) including: Physical, Part-whole, Personal-Social, ORG-Affiliation, Agent-Artifact, Gen-Affiliation
## Hyperparameters
- learning_rate=2e-5
- num_train_epochs=10
- weight_decay=0.01
## How to use
Workflow of a relation extraction model:
1. Input --> NER model --> Entities
2. Input sentence + Entity 1 + Entity 2 --> Relation Classification Model --> Relation Type
```python
>>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, AuotoModelForSequenceClassification
>>> ner_model = AutoModelForTokenClassification.from_pretrained("ychenNLP/arabic-ner-ace")
>>> ner_tokenizer = AutoTokenizer.from_pretrained("ychenNLP/arabic-ner-ace")
>>> ner_pip = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, grouped_entities=True)
>>> re_model = AutoModelForSequenceClassification.from_pretrained("ychenNLP/arabic-relation-extraction")
>>> re_tokenizer = AutoTokenizer.from_pretrained("ychenNLP/arabic-relation-extraction")
>>> re_pip = pipeline("text-classification", model=re_model, tokenizer=re_tokenizer)
def process_ner_output(entity_mention, inputs):
    """Build one marked-up relation-classification input per entity pair.

    Every unordered pair of entities (in their original order) yields a copy
    of ``inputs`` in which both entities are wrapped in type markers, e.g.
    ``<PER>Biden</PER> visited <GPE>Madrid</GPE>``.

    Args:
        entity_mention: List of entity dicts. Each dict must provide
            ``'entity_group'`` (used as the marker name) and ``'start'`` /
            ``'end'`` (character offsets into ``inputs``; ``end`` is exclusive).
        inputs: The raw sentence the offsets refer to.

    Returns:
        A list of dicts with keys ``"re_input"`` (the marked-up sentence),
        ``"arg1"`` and ``"arg2"`` (the two entity dicts) and ``"input"``
        (the unmodified sentence). Empty when fewer than two entities exist.
    """
    re_input = []
    for idx1, ent_1 in enumerate(entity_mention):
        for ent_2 in entity_mention[idx1 + 1:]:
            ent_1_type = ent_1['entity_group']
            ent_2_type = ent_2['entity_group']
            # (offset, priority, tag). Priority 0 = closing tag, 1 = opening
            # tag, so that when one entity ends exactly where the next one
            # starts we emit "</A><B>" instead of dropping one of the markers.
            markers = [
                (ent_1['start'], 1, "<{}>".format(ent_1_type)),
                (ent_1['end'], 0, "</{}>".format(ent_1_type)),
                (ent_2['start'], 1, "<{}>".format(ent_2_type)),
                (ent_2['end'], 0, "</{}>".format(ent_2_type)),
            ]
            # Stable sort on (offset, priority) keeps entity-1 markers ahead
            # of entity-2 markers on full ties.
            markers.sort(key=lambda marker: (marker[0], marker[1]))

            # Splice the tags in by slicing rather than per-character checks:
            # this also handles an entity that ends at the very end of the
            # sentence (end == len(inputs)), whose closing tag would otherwise
            # never be written.
            pieces = []
            cursor = 0
            for position, _, tag in markers:
                pieces.append(inputs[cursor:position])
                pieces.append(tag)
                cursor = position
            pieces.append(inputs[cursor:])

            re_input.append({
                "re_input": "".join(pieces),
                "arg1": ent_1,
                "arg2": ent_2,
                "input": inputs,
            })
    return re_input
def post_process_re_output(re_output, text_input, ner_output, re_inputs=None):
    """Combine NER results and relation predictions into one result dict.

    Args:
        re_output: List of relation predictions, one per entity pair; each
            is a dict with at least a ``"label"`` key. The label ``'O'``
            means "no relation" and such pairs are dropped.
        text_input: The original sentence.
        ner_output: The entity list produced by the NER step.
        re_inputs: The entity-pair dicts that ``re_output`` was predicted
            from (same order and length). When omitted, the module-level
            ``re_input`` variable is used, which keeps existing three-argument
            calls working.

    Returns:
        A dict with keys ``"input"`` (the sentence), ``"entity"`` (the NER
        output) and ``"relation"`` (a list of pair dicts without the
        ``"re_input"`` key and with the prediction stored under
        ``"relation_type"``).
    """
    if re_inputs is None:
        # Backward-compatible fallback to the script-level variable.
        re_inputs = re_input

    final_output = []
    for idx, out in enumerate(re_output):
        if out["label"] != 'O':
            # Build a copy instead of popping from the caller's dict, so the
            # pair list can still be fed to the relation model again later.
            tmp = {key: value for key, value in re_inputs[idx].items() if key != 're_input'}
            tmp['relation_type'] = out
            final_output.append(tmp)

    template = {"input": text_input,
                "entity": ner_output,
                "relation": final_output}
    return template
text_input = """ويتزامن ذلك مع اجتماع بايدن مع قادة الدول الأعضاء في الناتو في قمة موسعة في العاصمة الإسبانية، مدريد."""

# Step 1: run NER over the sentence to obtain the entity mentions.
ner_output = ner_pip(text_input)

# Step 2: create one marked-up classifier input for every pair of entities.
re_input = process_ner_output(ner_output, text_input)

# Step 3: classify each pair, keeping the first prediction returned for it.
re_output = [re_pip(pair["re_input"])[0] for pair in re_input]

# Step 4: merge the NER and relation predictions into a single result.
re_ner_output = post_process_re_output(re_output, text_input, ner_output)

print("Sentence: ", re_ner_output["input"])

print('====Entity====')
for entity in re_ner_output["entity"]:
    print(f'{entity["word"]}--{entity["entity_group"]}')

print('====Relation====')
for relation in re_ner_output["relation"]:
    first_word = relation['arg1']['word']
    second_word = relation['arg2']['word']
    label = relation['relation_type']['label']
    print(f'{first_word}--{second_word}:{label}')
Sentence: ويتزامن ذلك مع اجتماع بايدن مع قادة الدول الأعضاء في الناتو في قمة موسعة في العاصمة الإسبانية، مدريد.
====Entity====
بايدن--PER
قادة--PER
الدول--GPE
الناتو--ORG
العاصمة--GPE
الاسبانية--GPE
مدريد--GPE
====Relation====
قادة--الدول:ORG-AFF
الدول--الناتو:ORG-AFF
العاصمة--الاسبانية:PART-WHOLE
```
### BibTeX entry and citation info
```bibtex
@inproceedings{lan2020gigabert,
author = {Lan, Wuwei and Chen, Yang and Xu, Wei and Ritter, Alan},
title = {Giga{BERT}: Zero-shot Transfer Learning from {E}nglish to {A}rabic},
booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
year = {2020}
}
```
|