jpfearnworks committed on
Commit f1fbf66
2 Parent(s): 370d057 00323f0

Merge branch 'main' of https://github.com/OpenAccess-AI-Collective/axolotl into qlora-openllama-3b-example

.github/workflows/base.yml CHANGED
@@ -1,4 +1,4 @@
-name: ci-cd
+name: ci-cd-base
 
 on:
   push:
docker/Dockerfile-base CHANGED
@@ -62,6 +62,7 @@ RUN git clone https://github.com/microsoft/DeepSpeed.git && \
 FROM base-builder AS bnb-builder
 
 WORKDIR /workspace
+ENV CUDA_VERSION_BNB=$CUDA_VERSION_BNB
 
 RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \
     cd bitsandbytes && \
@@ -70,6 +71,8 @@ RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \
 
 FROM base-builder
 
+ENV CUDA_VERSION_BNB=$CUDA_VERSION_BNB
+
 # recompile apex
 RUN python3 -m pip uninstall -y apex
 RUN git clone https://github.com/NVIDIA/apex
scripts/finetune.py CHANGED
@@ -178,6 +178,15 @@ def train(
         tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
     )
 
+    if cfg.debug or "debug" in kwargs:
+        logging.info("check_dataset_labels...")
+        check_dataset_labels(
+            train_dataset.select(
+                [random.randrange(0, len(train_dataset) - 1) for i in range(5)]
+            ),
+            tokenizer,
+        )
+
     if prepare_ds_only:
         logging.info("Finished preparing dataset. Exiting...")
         return
@@ -213,15 +222,6 @@ def train(
         model.save_pretrained(cfg.output_dir)
         return
 
-    if cfg.debug:
-        logging.info("check_dataset_labels...")
-        check_dataset_labels(
-            train_dataset.select(
-                [random.randrange(0, len(train_dataset) - 1) for i in range(5)]
-            ),
-            tokenizer,
-        )
-
     trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer)
 
     model.config.use_cache = False
src/axolotl/prompt_tokenizers.py CHANGED
@@ -268,6 +268,9 @@ class AlpacaReflectionPTStrategy(ReflectionPromptTokenizingStrategy):
 
 
 class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
+    def get_conversation_thread(self, prompt):
+        return prompt["conversations"]
+
     def tokenize_prompt(self, prompt):
         result = {
             "input_ids": [],
@@ -279,7 +282,7 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
         assistant_token = self._get_assistant_token()
         try:
             for i, part in enumerate(
-                self.prompter.build_prompt(prompt["conversations"])
+                self.prompter.build_prompt(self.get_conversation_thread(prompt))
             ):
                 if isinstance(part, tuple):
                     if part[0] == "USER:":
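The new get_conversation_thread hook makes the conversation lookup overridable: tokenize_prompt no longer hard-codes prompt["conversations"], so a subclass can point it at a differently shaped dataset. A minimal sketch of such an override; the subclass name and the "messages" key are assumptions for illustration, not part of this commit.

from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy

class CustomShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
    # Hypothetical subclass: reuses tokenize_prompt unchanged and only swaps
    # where the conversation turns are read from.
    def get_conversation_thread(self, prompt):
        # Assumption: this dataset keeps its turns under a "messages" key.
        return prompt["messages"]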