Update README.md
README.md CHANGED

@@ -34,11 +34,11 @@ You can train a better model if you have access to adequate compute (can finetun
 from datasets import load_dataset
 from ragatouille import RAGTrainer
 sample_size = 100000
-ds = load_dataset('akhooli/arabic-triplets-1m-curated-sims-len',
+ds = load_dataset('akhooli/arabic-triplets-1m-curated-sims-len', split="train", trust_remote_code=True, streaming=True)

-# some data processing not in this script
+# some data processing not in this script (data filtered based on similarity scores) and 100K selected at random
 sds = ds.shuffle(seed=42, buffer_size=10_000)
-dsf = sds
+dsf = sds
 triplets = []
 for item in iter(dsf):
     triplets.append((item["query"], item["positive"], item["negative"]))
@@ -46,11 +46,11 @@ trainer = RAGTrainer(model_name="Arabic-ColBERT-100k", pretrained_model_name="au
 trainer.prepare_training_data(raw_data=triplets, mine_hard_negatives=False)

 trainer.train(batch_size=32,
-              nbits=
-              maxsteps=
+              nbits=4, # How many bits will the trained model use when compressing indexes
+              maxsteps=3125, # Maximum steps hard stop
               use_ib_negatives=True, # Use in-batch negative to calculate loss
               dim=128, # How many dimensions per embedding. 128 is the default and works well.
-              learning_rate=
+              learning_rate=1e-5, # Learning rate, small values ([3e-6,3e-5] work best if the base model is BERT-like, 5e-6 is often the sweet spot)
               doc_maxlen=256, # Maximum document length. Because of how ColBERT works, smaller chunks (128-256) work very well.
               use_relu=False, # Disable ReLU -- doesn't improve performance
               warmup_steps="auto", # Defaults to 10%
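For context, the changed lines assemble into the sketch below. This is not the exact README content: the `pretrained_model_name` is truncated in the hunk header above (it ends at "au"), so `PRETRAINED_BASE` here is a placeholder, and the `trainer.train(...)` call is closed at `warmup_steps` even though the hunk may cut off further arguments. Note that `maxsteps=3125` lines up with roughly one pass over the 100K sampled triplets at `batch_size=32` (100000 / 32 = 3125).

```python
from datasets import load_dataset
from ragatouille import RAGTrainer

sample_size = 100000
PRETRAINED_BASE = "..."  # placeholder: the base model name is truncated in the diff above

# Stream the curated Arabic triplets dataset from the Hugging Face Hub
ds = load_dataset('akhooli/arabic-triplets-1m-curated-sims-len',
                  split="train", trust_remote_code=True, streaming=True)

# Some data processing is not shown here: rows were filtered on similarity
# scores and 100K triplets were selected at random before this point.
sds = ds.shuffle(seed=42, buffer_size=10_000)
dsf = sds

# Collect (query, positive, negative) tuples in the format RAGTrainer expects
triplets = []
for item in iter(dsf):
    triplets.append((item["query"], item["positive"], item["negative"]))

trainer = RAGTrainer(model_name="Arabic-ColBERT-100k",
                     pretrained_model_name=PRETRAINED_BASE)
trainer.prepare_training_data(raw_data=triplets, mine_hard_negatives=False)

trainer.train(batch_size=32,
              nbits=4,                # bits used when compressing indexes
              maxsteps=3125,          # hard stop; ~1 epoch over 100K triplets at batch 32
              use_ib_negatives=True,  # use in-batch negatives to calculate the loss
              dim=128,                # embedding dimension; 128 is the default and works well
              learning_rate=1e-5,     # small values work best for BERT-like base models
              doc_maxlen=256,         # ColBERT works well with 128-256 token chunks
              use_relu=False,         # disabling ReLU; it doesn't improve performance
              warmup_steps="auto")    # defaults to 10% of steps
```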