akhooli committed
Commit 0063aa3
1 Parent(s): cb1fd98

Update README.md

Files changed (1)
  1. README.md +6 -6
README.md CHANGED
@@ -34,11 +34,11 @@ You can train a better model if you have access to adequate compute (can finetun
 from datasets import load_dataset
 from ragatouille import RAGTrainer
 sample_size = 100000
-ds = load_dataset('akhooli/arabic-triplets-1m-curated-sims-len', 'arabic', split="train", trust_remote_code=True, streaming=True)
+ds = load_dataset('akhooli/arabic-triplets-1m-curated-sims-len', split="train", trust_remote_code=True, streaming=True)

-# some data processing not in this script
+# some data processing not in this script (data filtered based on similarity scores) and 100K selected at random
 sds = ds.shuffle(seed=42, buffer_size=10_000)
-dsf = sds.take(sample_size)
+dsf = sds
 triplets = []
 for item in iter(dsf):
     triplets.append((item["query"], item["positive"], item["negative"]))
@@ -46,11 +46,11 @@ trainer = RAGTrainer(model_name="Arabic-ColBERT-100k", pretrained_model_name="au
 trainer.prepare_training_data(raw_data=triplets, mine_hard_negatives=False)

 trainer.train(batch_size=32,
-              nbits=2, # How many bits will the trained model use when compressing indexes
-              maxsteps=100000, # Maximum steps hard stop
+              nbits=4, # How many bits will the trained model use when compressing indexes
+              maxsteps=3125, # Maximum steps hard stop
               use_ib_negatives=True, # Use in-batch negative to calculate loss
               dim=128, # How many dimensions per embedding. 128 is the default and works well.
-              learning_rate=5e-6, # Learning rate, small values ([3e-6,3e-5] work best if the base model is BERT-like, 5e-6 is often the sweet spot)
+              learning_rate=1e-5, # Learning rate, small values ([3e-6,3e-5] work best if the base model is BERT-like, 5e-6 is often the sweet spot)
               doc_maxlen=256, # Maximum document length. Because of how ColBERT works, smaller chunks (128-256) work very well.
               use_relu=False, # Disable ReLU -- doesn't improve performance
               warmup_steps="auto", # Defaults to 10%
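
For readability, this is how the snippet reads after the commit, consolidated into a single runnable sketch. The `trainer = RAGTrainer(...)` call is truncated in the hunk header above, so the `pretrained_model_name` and `language_code` below are placeholders rather than the values the README actually uses, and the `trainer.train(...)` call is closed after `warmup_steps` even though the diff may not show all of its arguments. Note that the new `maxsteps=3125` at `batch_size=32` works out to 3125 × 32 = 100,000 examples, roughly one pass over the 100K sample.

```python
from datasets import load_dataset
from ragatouille import RAGTrainer

sample_size = 100000
ds = load_dataset('akhooli/arabic-triplets-1m-curated-sims-len', split="train",
                  trust_remote_code=True, streaming=True)

# some data processing not in this script (data filtered based on similarity scores)
# and 100K selected at random
sds = ds.shuffle(seed=42, buffer_size=10_000)
dsf = sds
triplets = []
for item in iter(dsf):
    triplets.append((item["query"], item["positive"], item["negative"]))

# Placeholder base model: the actual pretrained_model_name is truncated in the diff above,
# so substitute the Arabic BERT checkpoint the README really uses.
trainer = RAGTrainer(model_name="Arabic-ColBERT-100k",
                     pretrained_model_name="<arabic-bert-base-checkpoint>",
                     language_code="ar")

trainer.prepare_training_data(raw_data=triplets, mine_hard_negatives=False)

trainer.train(batch_size=32,
              nbits=4,                 # bits used when compressing indexes
              maxsteps=3125,           # hard stop: 3125 * 32 = 100K examples, ~one pass
              use_ib_negatives=True,   # use in-batch negatives in the loss
              dim=128,                 # embedding dimension; 128 is the default
              learning_rate=1e-5,      # small values ([3e-6, 3e-5]) suit BERT-like bases
              doc_maxlen=256,          # maximum document length in tokens
              use_relu=False,          # disable ReLU; doesn't improve performance
              warmup_steps="auto")     # defaults to 10% of total steps
```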
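
Once trained this way, the checkpoint can be loaded back through RAGatouille for indexing and search. A minimal sketch, assuming the finished model is published on the Hub as akhooli/Arabic-ColBERT-100k and using illustrative Arabic passages:

```python
from ragatouille import RAGPretrainedModel

# Assumed Hub path for the finished checkpoint; adjust if it is published elsewhere.
RAG = RAGPretrainedModel.from_pretrained("akhooli/Arabic-ColBERT-100k")

# Illustrative documents; in practice these would be your Arabic corpus.
documents = [
    "يعتمد نموذج ColBERT على التفاعل المتأخر بين تمثيلات الاستعلام والوثيقة.",
    "تسهّل مكتبة RAGatouille تدريب نماذج ColBERT وفهرسة الوثائق والبحث فيها.",
]

# Build an index; chunk length kept in line with doc_maxlen=256 used at training time.
RAG.index(collection=documents, index_name="arabic_demo",
          max_document_length=256, split_documents=True)

# Retrieve the top-k passages for an Arabic query.
results = RAG.search(query="كيف يعمل نموذج ColBERT؟", k=2)
print(results)
```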