lothritz committed
Commit 6a9dca5 · 1 parent: 69a0c5a

Update README.md

Files changed (1): README.md (+19 −11)
README.md CHANGED
@@ -7,16 +7,24 @@ In addition, we partially translated 6.1 million sentences from the German Wikip
 If you would like to use our model, please cite our paper:
 
 ```
-@InProceedings{lothritz-EtAl:2022:LREC,
-  author = {Lothritz, Cedric and Lebichot, Bertrand and Allix, Kevin and Veiber, Lisa and BISSYANDE, TEGAWENDE and Klein, Jacques and Boytsov, Andrey and Lefebvre, Clément and Goujon, Anne},
-  title = {LuxemBERT: Simple and Practical Data Augmentation in Language Model Pre-Training for Luxembourgish},
-  booktitle = {Proceedings of the Language Resources and Evaluation Conference},
-  month = {June},
-  year = {2022},
-  address = {Marseille, France},
-  publisher = {European Language Resources Association},
-  pages = {5080--5089},
-  abstract = {Pre-trained Language Models such as BERT have become ubiquitous in NLP where they have achieved state-of-the-art performance in most NLP tasks. While these models are readily available for English and other widely spoken languages, they remain scarce for low-resource languages such as Luxembourgish. In this paper, we present LuxemBERT, a BERT model for the Luxembourgish language that we create using the following approach: we augment the pre-training dataset by considering text data from a closely related language that we partially translate using a simple and straightforward method. We are then able to produce the LuxemBERT model, which we show to be effective for various NLP tasks: it outperforms a simple baseline built with the available Luxembourgish text data as well the multilingual mBERT model, which is currently the only option for transformer-based language models in Luxembourgish. Furthermore, we present datasets for various downstream NLP tasks that we created for this study and will make available to researchers on request.},
-  url = {https://aclanthology.org/2022.lrec-1.543}
+@inproceedings{lothritz-etal-2022-luxembert,
+  title = "{L}uxem{BERT}: Simple and Practical Data Augmentation in Language Model Pre-Training for {L}uxembourgish",
+  author = "Lothritz, Cedric and
+    Lebichot, Bertrand and
+    Allix, Kevin and
+    Veiber, Lisa and
+    Bissyande, Tegawende and
+    Klein, Jacques and
+    Boytsov, Andrey and
+    Lefebvre, Cl{\'e}ment and
+    Goujon, Anne",
+  booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
+  month = jun,
+  year = "2022",
+  address = "Marseille, France",
+  publisher = "European Language Resources Association",
+  url = "https://aclanthology.org/2022.lrec-1.543",
+  pages = "5080--5089",
+  abstract = "Pre-trained Language Models such as BERT have become ubiquitous in NLP where they have achieved state-of-the-art performance in most NLP tasks. While these models are readily available for English and other widely spoken languages, they remain scarce for low-resource languages such as Luxembourgish. In this paper, we present LuxemBERT, a BERT model for the Luxembourgish language that we create using the following approach: we augment the pre-training dataset by considering text data from a closely related language that we partially translate using a simple and straightforward method. We are then able to produce the LuxemBERT model, which we show to be effective for various NLP tasks: it outperforms a simple baseline built with the available Luxembourgish text data as well the multilingual mBERT model, which is currently the only option for transformer-based language models in Luxembourgish. Furthermore, we present datasets for various downstream NLP tasks that we created for this study and will make available to researchers on request.",
 }
 ```