meg-huggingface
committed on
Commit
•
d023e59
1
Parent(s):
41f48cc
Adding AIEnergyStar docker deets
Browse files- Dockerfile +46 -11
- check_h100.py +15 -0
- entrypoint.sh +7 -0
Dockerfile
CHANGED
@@ -1,16 +1,51 @@
|
|
1 |
-
|
2 |
-
# you will also find guides on how best to write your Dockerfile
|
3 |
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
USER user
|
8 |
-
ENV PATH="/home/user/.local/bin:$PATH"
|
9 |
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
|
|
|
2 |
|
3 |
+
ARG PYTORCH_VERSION=2.4.0
|
4 |
+
ARG PYTHON_VERSION=3.9
|
5 |
+
ARG CUDA_VERSION=12.1
|
6 |
+
ARG MAMBA_VERSION=24.3.0-0
|
7 |
+
ARG CUDA_CHANNEL=nvidia
|
8 |
+
ARG INSTALL_CHANNEL=pytorch
|
9 |
+
# Automatically set by buildx
|
10 |
+
ARG TARGETPLATFORM
|
11 |
|
12 |
+
ENV PATH=/opt/conda/bin:$PATH
|
|
|
|
|
13 |
|
14 |
+
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
15 |
+
build-essential \
|
16 |
+
ca-certificates \
|
17 |
+
ccache \
|
18 |
+
curl \
|
19 |
+
git && \
|
20 |
+
rm -rf /var/lib/apt/lists/*
|
21 |
|
22 |
+
# Install conda
|
23 |
+
# translating Docker's TARGETPLATFORM into mamba arches
|
24 |
+
RUN case ${TARGETPLATFORM} in \
|
25 |
+
"linux/arm64") MAMBA_ARCH=aarch64 ;; \
|
26 |
+
*) MAMBA_ARCH=x86_64 ;; \
|
27 |
+
esac && \
|
28 |
+
curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
|
29 |
+
RUN chmod +x ~/mambaforge.sh && \
|
30 |
+
bash ~/mambaforge.sh -b -p /opt/conda && \
|
31 |
+
rm ~/mambaforge.sh
|
32 |
|
33 |
+
# Install pytorch
|
34 |
+
# On arm64 we exit with an error code
|
35 |
+
RUN case ${TARGETPLATFORM} in \
|
36 |
+
"linux/arm64") exit 1 ;; \
|
37 |
+
*) /opt/conda/bin/conda update -y conda && \
|
38 |
+
/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
|
39 |
+
esac && \
|
40 |
+
/opt/conda/bin/conda clean -ya
|
41 |
+
|
42 |
+
COPY requirements.txt requirements.txt
|
43 |
+
RUN pip install -r requirements.txt
|
44 |
+
|
45 |
+
RUN git clone -b energy_star_dev https://github.com/huggingface/optimum-benchmark.git /optimum-benchmark && cd optimum-benchmark && pip install -e .
|
46 |
+
|
47 |
+
COPY ./check_h100.py /check_h100.py
|
48 |
+
COPY ./entrypoint.sh /entrypoint.sh
|
49 |
+
RUN chmod +x /entrypoint.sh
|
50 |
+
|
51 |
+
ENTRYPOINT ["/entrypoint.sh"]
|
check_h100.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from optimum_benchmark.system_utils import get_gpus

if __name__ == "__main__":
    # Ask optimum-benchmark for the detected GPU device name(s).
    detected = get_gpus()

    # When several devices are reported as a list, inspect device 0 only.
    if isinstance(detected, list):
        detected = detected[0]

    # Refuse to run on anything that is not an NVIDIA H100.
    if "NVIDIA H100" not in detected:
        raise RuntimeError(f"This Docker container should be executed on NVIDIA H100 GPUs only, detected {detected}.")
    print("At least one NVIDIA H100 GPU has been detected, launching the benchmark...")
|
entrypoint.sh
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash

# Container entrypoint.
# NOTE(review): the Dockerfile in this commit does not COPY /launch_backend.py
# into the image — confirm it is added elsewhere, otherwise this line fails at
# runtime with "No such file or directory".
python /launch_backend.py
# Disabled original flow: gate on the H100 check, then forward the container's
# arguments to optimum-benchmark.
# NOTE(review): if re-enabled, quote the forwarded args as "$@" to preserve
# arguments containing spaces.
#python /check_h100.py
#if [[ $? = 0 ]]; then
# optimum-benchmark --config-dir /optimum-benchmark/examples/energy_star/ $@
#fi
|