---
tags:
- BERT
- Text Classification
- relation
language:
- ar
- en
license: mit
datasets:
- ACE2005
---

# Arabic Relation Extraction Model
- [Github repo](https://github.com/edchengg/GigaBERT)
- Relation Extraction model based on [GigaBERTv4](https://huggingface.co/lanwuwei/GigaBERT-v4-Arabic-and-English).
- Model details: the two entity mentions in a sentence are wrapped with typed markers (e.g., `XXXX <PER> entity1 </PER> XXXXXXX <ORG> entity2 </ORG> XXXXX`), and the BERT [CLS] representation of the marked sentence is used to predict the relation type (see the sketch after this list).
- Training data: ACE2005 (Arabic)
- [Relation tags](https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/arabic-relations-guidelines-v6.5.pdf) include: Physical, Part-Whole, Personal-Social, ORG-Affiliation, Agent-Artifact, Gen-Affiliation
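
For illustration, a marked input (shown in English for readability; the tags follow the ACE entity types produced by the NER step) looks like this:

```python
# Illustration only: the classifier sees the sentence with both entity
# mentions wrapped in typed markers, not the raw entity pair.
sentence = "Biden met NATO leaders in Madrid."
marked = "<PER> Biden </PER> met <ORG> NATO </ORG> leaders in Madrid."
# The [CLS] representation of `marked` is then classified into one of the
# relation types above, or 'O' when the pair holds no relation.
```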
## Hyperparameters
- learning_rate=2e-5
- num_train_epochs=10
- weight_decay=0.01
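
As a reference, here is a minimal fine-tuning sketch showing how these hyperparameters map onto the Hugging Face `Trainer` API. This is not the authors' released training script, and `train_dataset` is a placeholder, since ACE2005 is LDC-licensed and must be prepared separately:

```python
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

tokenizer = AutoTokenizer.from_pretrained("lanwuwei/GigaBERT-v4-Arabic-and-English")
model = AutoModelForSequenceClassification.from_pretrained(
    "lanwuwei/GigaBERT-v4-Arabic-and-English",
    num_labels=7,  # assumption: 6 ACE relation types + 'O' for no relation
)

args = TrainingArguments(
    output_dir="arabic-relation-extraction",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
)

# `train_dataset` is a placeholder: tokenized, marker-augmented ACE2005
# sentences paired with relation labels.
trainer = Trainer(model=model, args=args, train_dataset=train_dataset)
trainer.train()
```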


## How to use
The relation extraction workflow has two steps:
1. Input --> NER model --> Entities
2. Input sentence + Entity 1 + Entity 2 --> Relation Classification Model --> Relation Type

```python
from transformers import (pipeline, AutoModelForTokenClassification,
                          AutoModelForSequenceClassification, AutoTokenizer)

# Step 1: NER model to extract entity mentions.
ner_model = AutoModelForTokenClassification.from_pretrained("ychenNLP/arabic-ner-ace")
ner_tokenizer = AutoTokenizer.from_pretrained("ychenNLP/arabic-ner-ace")
ner_pip = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, grouped_entities=True)

# Step 2: relation classifier over marked entity pairs.
re_model = AutoModelForSequenceClassification.from_pretrained("ychenNLP/arabic-relation-extraction")
re_tokenizer = AutoTokenizer.from_pretrained("ychenNLP/arabic-relation-extraction")
re_pip = pipeline("text-classification", model=re_model, tokenizer=re_tokenizer)

def process_ner_output(entity_mention, inputs):
    # For every unordered pair of entity mentions, wrap both spans with
    # <TYPE> ... </TYPE> markers and keep the marked sentence for the classifier.
    re_input = []
    for idx1 in range(len(entity_mention) - 1):
        for idx2 in range(idx1 + 1, len(entity_mention)):
            ent_1 = entity_mention[idx1]
            ent_2 = entity_mention[idx2]

            ent_1_type = ent_1['entity_group']
            ent_2_type = ent_2['entity_group']
            ent_1_s = ent_1['start']
            ent_1_e = ent_1['end']
            ent_2_s = ent_2['start']
            ent_2_e = ent_2['end']
            new_re_input = ""
            for c_idx, c in enumerate(inputs):
                if c_idx == ent_1_s:
                    new_re_input += "<{}>".format(ent_1_type)
                elif c_idx == ent_1_e:
                    new_re_input += "</{}>".format(ent_1_type)
                elif c_idx == ent_2_s:
                    new_re_input += "<{}>".format(ent_2_type)
                elif c_idx == ent_2_e:
                    new_re_input += "</{}>".format(ent_2_type)
                new_re_input += c
            re_input.append({"re_input": new_re_input, "arg1": ent_1, "arg2": ent_2, "input": inputs})
    return re_input
    
def post_process_re_output(re_output, re_input, text_input, ner_output):
    # Keep only the entity pairs predicted to hold a relation (label != 'O')
    # and attach the predicted relation type to each.
    final_output = []
    for idx, out in enumerate(re_output):
        if out["label"] != 'O':
            tmp = re_input[idx]
            tmp['relation_type'] = out
            tmp.pop('re_input', None)
            final_output.append(tmp)

    template = {"input": text_input,
                "entity": ner_output,
                "relation": final_output}

    return template

text_input = """ويتزامن ذلك مع اجتماع بايدن مع قادة الدول الأعضاء في الناتو في قمة موسعة في العاصمة الإسبانية، مدريد."""

ner_output = ner_pip(text_input) # run NER over the input sentence

re_input = process_ner_output(ner_output, text_input) # build one marked sentence per entity pair

re_output = []
for idx in range(len(re_input)):
    tmp_re_output = re_pip(re_input[idx]["re_input"]) # for each pair of entity, predict relation
    re_output.append(tmp_re_output[0])

re_ner_output = post_process_re_output(re_output, re_input, text_input, ner_output) # post-process NER and relation predictions
print("Sentence: ",re_ner_output["input"])
print('====Entity====')
for ent in re_ner_output["entity"]:
  print('{}--{}'.format(ent["word"], ent["entity_group"]))
print('====Relation====')
for rel in re_ner_output["relation"]:
  print('{}--{}:{}'.format(rel['arg1']['word'], rel['arg2']['word'], rel['relation_type']['label']))
```

Output:

```
Sentence:  ويتزامن ذلك مع اجتماع بايدن مع قادة الدول الأعضاء في الناتو في قمة موسعة في العاصمة الإسبانية، مدريد.
====Entity====
بايدن--PER
قادة--PER
الدول--GPE
الناتو--ORG
العاصمة--GPE
الاسبانية--GPE
مدريد--GPE
====Relation====
قادة--الدول:ORG-AFF
الدول--الناتو:ORG-AFF
العاصمة--الاسبانية:PART-WHOLE
```
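
As an efficiency note, `transformers` pipelines also accept a list of inputs, so the per-pair loop above can usually be collapsed into a single batched call. A minimal sketch, assuming the standard text-classification pipeline output (one `{'label', 'score'}` dict per input):

```python
# Batched alternative to the per-pair loop above (assumed pipeline behavior).
texts = [pair["re_input"] for pair in re_input]
re_output = re_pip(texts, batch_size=8)
```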

### BibTeX entry and citation info

```bibtex
@inproceedings{lan2020gigabert,
  author    = {Lan, Wuwei and Chen, Yang and Xu, Wei and Ritter, Alan},
  title     = {Giga{BERT}: Zero-shot Transfer Learning from {E}nglish to {A}rabic},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  year      = {2020}
}
```