Revert "run PR e2e docker CI tests in Modal" (#1220) [skip ci]
Files changed:
- .github/workflows/tests.yml  (+30 -21)
- cicd/Dockerfile.jinja  (+0 -38)
- cicd/tests.py  (+0 -69)
- docker/{Dockerfile-modal → Dockerfile-tests}  (+11 -8)
- requirements.txt  (+0 -1)
- src/axolotl/utils/models.py  (+2 -5)
.github/workflows/tests.yml
CHANGED

@@ -58,15 +58,10 @@ jobs:
   docker-e2e-tests:
     if: github.repository_owner == 'OpenAccess-AI-Collective'
     # this job needs to be run on self-hosted GPU runners...
-    runs-on:
+    runs-on: [self-hosted, gpu, docker]
     timeout-minutes: 30
     needs: [pre-commit, pytest]
 
-    env:
-      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
-      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
-      MODAL_ENVIRONMENT: axolotl-ci-cd
-
     strategy:
       fail-fast: false
       matrix:
@@ -75,29 +70,43 @@ jobs:
             cuda_version: 11.8.0
             python_version: "3.10"
             pytorch: 2.0.1
-            num_gpus: 1
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.10"
             pytorch: 2.1.2
-            num_gpus: 1
     steps:
       - name: Checkout
         uses: actions/checkout@v4
-      - name:
-        with:
-      - name:
-        run: |
-      - name: Update env vars
-        run: |
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-      - name: Run training job on Modal
-        run: |
+      - name: Docker metadata
+        id: metadata
+        uses: docker/metadata-action@v5
+        with:
+          images: winglian/axolotl-tests
+      - name: Build Docker image
+        run: |
+          # Set up build arguments
+          BASE_TAG="main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}"
+          CUDA="${{ matrix.cuda }}"
+          PYTORCH_VERSION="${{ matrix.pytorch }}"
+          # Build the Docker image
+          docker build . \
+            --file ./docker/Dockerfile-tests \
+            --build-arg BASE_TAG=$BASE_TAG \
+            --build-arg CUDA=$CUDA \
+            --build-arg GITHUB_REF=$GITHUB_REF \
+            --build-arg PYTORCH_VERSION=$PYTORCH_VERSION \
+            --tag ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} \
+            --no-cache
+      - name: Unit Tests w docker image
+        run: |
+          docker run --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
+      - name: GPU Unit Tests w docker image
+        run: |
+          docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/
+      - name: GPU Unit Tests monkeypatched w docker image
+        run: |
+          docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest /workspace/axolotl/tests/e2e/patched/
+      - name: Prune image from docker
+        if: github.ref != 'refs/heads/main'
+        run: |
+          docker rmi -f ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
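For reference, the restored pipeline can be approximated outside CI. This is a sketch for one matrix entry (python 3.10, cuda 121, pytorch 2.1.2); the local tag name is illustrative, not something the workflow produces:

# Approximate the restored e2e job locally for one matrix entry.
GITHUB_REF=${GITHUB_REF:-refs/heads/main}
BASE_TAG="main-base-py3.10-cu121-2.1.2"
TAG="axolotl-tests:local"   # illustrative tag

docker build . \
  --file ./docker/Dockerfile-tests \
  --build-arg BASE_TAG=$BASE_TAG \
  --build-arg CUDA=121 \
  --build-arg GITHUB_REF=$GITHUB_REF \
  --build-arg PYTORCH_VERSION=2.1.2 \
  --tag $TAG

# CPU unit tests first, then the two GPU e2e suites, mirroring the workflow order.
docker run --rm $TAG pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm $TAG \
  pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/
docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm $TAG \
  pytest /workspace/axolotl/tests/e2e/patched/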
cicd/Dockerfile.jinja
DELETED

@@ -1,38 +0,0 @@
-FROM winglian/axolotl-base:{{ BASE_TAG }}
-
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
-ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
-ENV CUDA="{{ CUDA }}"
-ENV BNB_CUDA_VERSION="{{ CUDA }}"
-ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
-ENV GITHUB_REF="{{ GITHUB_REF }}"
-ENV GITHUB_SHA="{{ GITHUB_SHA }}"
-
-RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
-
-WORKDIR /workspace
-
-RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
-
-WORKDIR /workspace/axolotl
-
-RUN git fetch origin +$GITHUB_REF && \
-    git checkout FETCH_HEAD
-
-# If AXOLOTL_EXTRAS is set, append it in brackets
-RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \
-    else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm]; \
-    fi
-
-# So we can test the Docker image
-RUN pip install pytest
-
-# fix so that git fetch/pull from remote works
-RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch
-
-# helper for huggingface-login cli
-RUN git config --global credential.helper store
cicd/tests.py
DELETED

@@ -1,69 +0,0 @@
-"""
-modal application to run axolotl gpu tests in Modal
-"""
-import os
-import pathlib
-import tempfile
-
-import jinja2
-import modal
-from jinja2 import select_autoescape
-from modal import Image, Stub
-
-cicd_path = pathlib.Path(__file__).parent.resolve()
-
-template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
-template_env = jinja2.Environment(
-    loader=template_loader, autoescape=select_autoescape()
-)
-df_template = template_env.get_template("Dockerfile.jinja")
-
-df_args = {
-    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.0.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.10-cu118-2.0.1"),
-    "CUDA": os.environ.get("CUDA", "118"),
-    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
-    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-}
-
-dockerfile_contents = df_template.render(**df_args)
-
-temp_dir = tempfile.mkdtemp()
-with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
-    f.write(dockerfile_contents)
-
-cicd_image = Image.from_dockerfile(
-    pathlib.Path(temp_dir) / "Dockerfile",
-    force_build=True,
-    gpu="A10G",
-).env(df_args)
-
-stub = Stub("Axolotl CI/CD", secrets=[])
-
-
-N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_CONFIG = modal.gpu.A10G(count=N_GPUS)
-
-
-def run_cmd(cmd: str, run_folder: str):
-    import subprocess  # nosec
-
-    # Propagate errors from subprocess.
-    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
-        exit(exit_code)  # pylint: disable=consider-using-sys-exit
-
-
-@stub.function(
-    image=cicd_image,
-    gpu=GPU_CONFIG,
-    timeout=60 * 30,
-)
-def cicd_pytest():
-    cmd = "pytest /workspace/axolotl/tests/e2e/patched/"
-    run_cmd(cmd, "/workspace/axolotl")
-
-
-@stub.local_entrypoint()
-def main():
-    cicd_pytest.remote()
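Before the revert, none of this ran on a local docker daemon: the workflow exported the Modal credentials shown in the removed env block and let this script build the Jinja-rendered image and run pytest remotely. The exact invocation is not captured in the diff, so the following is a reconstruction based on the removed env block and the script's local_entrypoint:

# Reconstruction; the exact command used by the removed workflow is an assumption.
export MODAL_TOKEN_ID=...        # from repo secrets
export MODAL_TOKEN_SECRET=...    # from repo secrets
export MODAL_ENVIRONMENT=axolotl-ci-cd
export N_GPUS=1                  # read by cicd/tests.py via os.environ
modal run cicd/tests.py          # builds cicd_image and runs cicd_pytest remotely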
docker/{Dockerfile-modal → Dockerfile-tests}
RENAMED

@@ -1,11 +1,14 @@
-
-
-
-
-
-
-ENV
-
+ARG BASE_TAG=main-base
+FROM winglian/axolotl-base:$BASE_TAG
+
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
+ARG AXOLOTL_EXTRAS=""
+ARG CUDA="118"
+ENV BNB_CUDA_VERSION=$CUDA
+ARG PYTORCH_VERSION="2.0.1"
+ARG GITHUB_REF="main"
+
+ENV PYTORCH_VERSION=$PYTORCH_VERSION
 
 RUN apt-get update && \
     apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
requirements.txt
CHANGED

@@ -1,5 +1,4 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
-jinja2
 packaging==23.2
 peft==0.7.1
 transformers==4.37.0
src/axolotl/utils/models.py
CHANGED

@@ -645,10 +645,7 @@ def load_model(
     if not cfg.fsdp:
         # FSDP doesn't like mixed Float and BFloat16
         for name, module in model.named_modules():
-            if (
-                any(m in name for m in ["norm", "gate"])
-                or "LayerNorm" in module.__class__.__name__
-            ):
+            if any(m in name for m in ["norm", "gate"]):
                 module.to(torch.float32)
     if model_config.model_type == "btlm":
         # don't upcast lm_head for btlm
@@ -687,7 +684,7 @@ def load_model(
     if needs_fa2_dtype or cfg.flash_attention:
         LOG.info("converting modules to %s for flash attention", cfg.torch_dtype)
         for name, module in model.named_modules():
-            if "norm" in name
+            if "norm" in name:
                 module.to(cfg.torch_dtype)
             if any(m in name for m in embedding_modules):
                 if hasattr(module, "weight"):