winglian committed on
Commit
1c33eb8
1 Parent(s): a798ba1

new hf_use_auth_token setting so login to hf isn't required

Browse files
README.md CHANGED
@@ -207,6 +207,9 @@ datasets:
207
  dataset_prepared_path: data/last_run_prepared
208
  # push prepared dataset to hub
209
  push_dataset_to_hub: # repo path
 
 
 
210
  # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc
211
  val_set_size: 0.04
212
  # Num shards for whole dataset
 
207
  dataset_prepared_path: data/last_run_prepared
208
  # push prepared dataset to hub
209
  push_dataset_to_hub: # repo path
210
+ # Whether to pass `use_auth_token` to Hugging Face when loading datasets. Useful for fetching private datasets.
211
+ # Must be set to true whenever `push_dataset_to_hub` is set.
212
+ hf_use_auth_token: # boolean
213
  # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc
214
  val_set_size: 0.04
215
  # Num shards for whole dataset
src/axolotl/utils/data.py CHANGED
@@ -61,10 +61,11 @@ def load_tokenized_prepared_datasets(
61
  else Path(default_dataset_prepared_path) / ds_hash
62
  )
63
  dataset = None
 
64
  try:
65
  if cfg.push_dataset_to_hub:
66
  dataset = load_dataset(
67
- f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True
68
  )
69
  dataset = dataset["train"]
70
  except:
@@ -84,7 +85,7 @@ def load_tokenized_prepared_datasets(
84
  ds: Union[Dataset, DatasetDict] = None
85
  ds_from_hub = False
86
  try:
87
- load_dataset(d.path, streaming=True, use_auth_token=True)
88
  ds_from_hub = True
89
  except FileNotFoundError:
90
  pass
@@ -100,10 +101,10 @@ def load_tokenized_prepared_datasets(
100
  d.path,
101
  streaming=False,
102
  data_files=d.data_files,
103
- use_auth_token=True,
104
  )
105
  else:
106
- ds: Dataset = load_dataset(d.path, streaming=False, use_auth_token=True)
107
  else:
108
  fp = hf_hub_download(
109
  repo_id=d.path, repo_type="dataset", filename=d.data_files
@@ -274,13 +275,14 @@ def load_prepare_datasets(
274
  )
275
 
276
  dataset = None
 
277
  try:
278
  if cfg.push_dataset_to_hub:
279
  logging.info(
280
  f"Checking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
281
  )
282
  dataset = load_dataset(
283
- f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True
284
  )
285
  dataset = dataset["train"]
286
  except:
 
61
  else Path(default_dataset_prepared_path) / ds_hash
62
  )
63
  dataset = None
64
+ use_auth_token = cfg.hf_use_auth_token
65
  try:
66
  if cfg.push_dataset_to_hub:
67
  dataset = load_dataset(
68
+ f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=use_auth_token
69
  )
70
  dataset = dataset["train"]
71
  except:
 
85
  ds: Union[Dataset, DatasetDict] = None
86
  ds_from_hub = False
87
  try:
88
+ load_dataset(d.path, streaming=True, use_auth_token=use_auth_token)
89
  ds_from_hub = True
90
  except FileNotFoundError:
91
  pass
 
101
  d.path,
102
  streaming=False,
103
  data_files=d.data_files,
104
+ use_auth_token=use_auth_token,
105
  )
106
  else:
107
+ ds: Dataset = load_dataset(d.path, streaming=False, use_auth_token=use_auth_token)
108
  else:
109
  fp = hf_hub_download(
110
  repo_id=d.path, repo_type="dataset", filename=d.data_files
 
275
  )
276
 
277
  dataset = None
278
+ use_auth_token = cfg.hf_use_auth_token
279
  try:
280
  if cfg.push_dataset_to_hub:
281
  logging.info(
282
  f"Checking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
283
  )
284
  dataset = load_dataset(
285
+ f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=use_auth_token
286
  )
287
  dataset = dataset["train"]
288
  except:
src/axolotl/utils/validation.py CHANGED
@@ -37,6 +37,9 @@ def validate_config(cfg):
37
  "`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model."
38
  )
39
 
 
 
 
40
  # TODO
41
  # MPT 7b
42
  # https://github.com/facebookresearch/bitsandbytes/issues/25
 
37
  "`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model."
38
  )
39
 
40
+ if cfg.push_dataset_to_hub and cfg.hf_use_auth_token is not True:
41
+ raise ValueError("Require cfg.hf_use_auth_token to be True for push_dataset_to_hub")
42
+
43
  # TODO
44
  # MPT 7b
45
  # https://github.com/facebookresearch/bitsandbytes/issues/25
tests/test_validation.py CHANGED
@@ -93,3 +93,29 @@ class ValidationTest(unittest.TestCase):
93
 
94
  with pytest.raises(ValueError, match=r".*4bit.*"):
95
  validate_config(cfg)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  with pytest.raises(ValueError, match=r".*4bit.*"):
95
  validate_config(cfg)
96
+
97
+ def test_hf_use_auth_token(self):
98
+ base_cfg = DictDefault(
99
+ {
100
+ "push_dataset_to_hub": None,
101
+ "hf_use_auth_token": None,
102
+ }
103
+ )
104
+
105
+ cfg = base_cfg | DictDefault(
106
+ {
107
+ "push_dataset_to_hub": "namespace/repo",
108
+ }
109
+ )
110
+
111
+ with pytest.raises(ValueError, match=r".*hf_use_auth_token.*"):
112
+ validate_config(cfg)
113
+
114
+ cfg = base_cfg | DictDefault(
115
+ {
116
+ "push_dataset_to_hub": "namespace/repo",
117
+ "hf_use_auth_token": True,
118
+ }
119
+ )
120
+ validate_config(cfg)
121
+