File size: 4,231 Bytes
3fc19a0 b34be6b b16d907 b34be6b 3fc19a0 b34be6b 3fc19a0 b34be6b fc1825b b34be6b 0d87a37 b34be6b 65a05e2 b34be6b 6d59b1c b34be6b b16d907 b34be6b 4458cee b34be6b 4458cee b34be6b 4458cee b34be6b 13f9bc5 b34be6b 13f9bc5 b34be6b 13f9bc5 b34be6b 13f9bc5 b34be6b b3e3ab9 b34be6b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
---
tags:
- BERT
- Text Classification
- relation
language: Arabic
license: mit
datasets:
- ACE2005
---
# Arabic Relation Extraction Model
- [Github repo](https://github.com/edchengg/GigaBERT)
- Relation Extraction model based on [GigaBERTv4](https://huggingface.co/lanwuwei/GigaBERT-v4-Arabic-and-English).
- ACE2005 Training data: Arabic
- [Relation tags](https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/arabic-relations-guidelines-v6.5.pdf) including: Physical, Part-whole, Personal-Social, ORG-Affiliation, Agent-Artifact, Gen-Affiliation
## Hyperparameters
- learning_rate=2e-5
- num_train_epochs=10
- weight_decay=0.01
## ACE2005 Evaluation results (F1)
| Language | Arabic |
|:----:|:-----------:|
| | 89.4 |
## How to use
Workflow of a relation extraction model:
1. Input --> NER model --> Entities
2. Input sentence + Entity 1 + Entity 2 --> Relation Classification Model --> Relation Type
```python
>>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, AuotoModelForSequenceClassification
>>> ner_model = AutoModelForTokenClassification.from_pretrained("ychenNLP/arabic-ner-ace")
>>> ner_tokenizer = AutoTokenizer.from_pretrained("ychenNLP/arabic-ner-ace")
>>> ner_pip = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, grouped_entities=True)
>>> re_model = AutoModelForSequenceClassification.from_pretrained("ychenNLP/arabic-relation-extraction")
>>> re_tokenizer = AutoTokenizer.from_pretrained("ychenNLP/arabic-relation-extraction")
>>> re_pip = pipeline("text-classification", model=re_model, tokenizer=re_tokenizer)
def process_ner_output(entity_mention, inputs):
re_input = []
for idx1 in range(len(entity_mention) - 1):
for idx2 in range(idx1 + 1, len(entity_mention)):
ent_1 = entity_mention[idx1]
ent_2 = entity_mention[idx2]
ent_1_type = ent_1['entity_group']
ent_2_type = ent_2['entity_group']
ent_1_s = ent_1['start']
ent_1_e = ent_1['end']
ent_2_s = ent_2['start']
ent_2_e = ent_2['end']
new_re_input = ""
for c_idx, c in enumerate(inputs):
if c_idx == ent_1_s:
new_re_input += "<{}>".format(ent_1_type)
elif c_idx == ent_1_e:
new_re_input += "</{}>".format(ent_1_type)
elif c_idx == ent_2_s:
new_re_input += "<{}>".format(ent_2_type)
elif c_idx == ent_2_e:
new_re_input += "</{}>".format(ent_2_type)
new_re_input += c
re_input.append({"re_input": new_re_input, "arg1": ent_1, "arg2": ent_2, "input": inputs})
return re_input
def post_process_re_output(re_output, re_input, ner_output):
final_output = []
for idx, out in enumerate(re_output):
if out["label"] != 'O':
tmp = re_input[idx]
tmp['relation_type'] = out
tmp.pop('re_input', None)
final_output.append(tmp)
template = {"input": re_input["input"],
"entity": ner_output,
"relation": final_output}
return template
>>> input = "Hugging face is a French company in New york."
>>> output = ner_pip(input) # inference NER tags
>>> re_input = process_ner_output(output, input) # prepare a pair of entity and predict relation type
>>> re_output = []
>>> for idx in range(len(re_input)):
>>> tmp_re_output = re_pip(re_input[idx]["re_input"]) # for each pair of entity, predict relation
>>> re_output.append(tmp_re_output)
>>> re_ner_output = post_process_re_output(re_output) # post process NER and relation predictions
>>> print("Sentence: ",re_ner_output["input"])
>>> print("Entity: ", re_ner_output["entity"])
>>> print("Relation: ", re_ner_output["relation"])
```
### BibTeX entry and citation info
```bibtex
@inproceedings{lan2020gigabert,
author = {Lan, Wuwei and Chen, Yang and Xu, Wei and Ritter, Alan},
title = {Giga{BERT}: Zero-shot Transfer Learning from {E}nglish to {A}rabic},
booktitle = {Proceedings of The 2020 Conference on Empirical Methods on Natural Language Processing (EMNLP)},
year = {2020}
}
```
|