meg-huggingface
committed on
Commit
•
d023e59
1
Parent(s):
41f48cc
Adding AIEnergyStar docker deets
Browse files- Dockerfile +46 -11
- check_h100.py +15 -0
- entrypoint.sh +7 -0
Dockerfile
CHANGED
@@ -1,16 +1,51 @@
|
|
1 |
-
|
2 |
-
# you will also find guides on how best to write your Dockerfile
|
3 |
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
USER user
|
8 |
-
ENV PATH="/home/user/.local/bin:$PATH"
|
9 |
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
|
|
|
2 |
|
3 |
+
ARG PYTORCH_VERSION=2.4.0
|
4 |
+
ARG PYTHON_VERSION=3.9
|
5 |
+
ARG CUDA_VERSION=12.1
|
6 |
+
ARG MAMBA_VERSION=24.3.0-0
|
7 |
+
ARG CUDA_CHANNEL=nvidia
|
8 |
+
ARG INSTALL_CHANNEL=pytorch
|
9 |
+
# Automatically set by buildx
|
10 |
+
ARG TARGETPLATFORM
|
11 |
|
12 |
+
ENV PATH=/opt/conda/bin:$PATH
|
|
|
|
|
13 |
|
14 |
+
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
15 |
+
build-essential \
|
16 |
+
ca-certificates \
|
17 |
+
ccache \
|
18 |
+
curl \
|
19 |
+
git && \
|
20 |
+
rm -rf /var/lib/apt/lists/*
|
21 |
|
22 |
+
# Install conda
|
23 |
+
# translating Docker's TARGETPLATFORM into mamba arches
|
24 |
+
RUN case ${TARGETPLATFORM} in \
|
25 |
+
"linux/arm64") MAMBA_ARCH=aarch64 ;; \
|
26 |
+
*) MAMBA_ARCH=x86_64 ;; \
|
27 |
+
esac && \
|
28 |
+
curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
|
29 |
+
RUN chmod +x ~/mambaforge.sh && \
|
30 |
+
bash ~/mambaforge.sh -b -p /opt/conda && \
|
31 |
+
rm ~/mambaforge.sh
|
32 |
|
33 |
+
# Install pytorch
|
34 |
+
# On arm64 we exit with an error code
|
35 |
+
RUN case ${TARGETPLATFORM} in \
|
36 |
+
"linux/arm64") exit 1 ;; \
|
37 |
+
*) /opt/conda/bin/conda update -y conda && \
|
38 |
+
/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
|
39 |
+
esac && \
|
40 |
+
/opt/conda/bin/conda clean -ya
|
41 |
+
|
42 |
+
COPY requirements.txt requirements.txt
|
43 |
+
RUN pip install -r requirements.txt
|
44 |
+
|
45 |
+
RUN git clone -b energy_star_dev https://github.com/huggingface/optimum-benchmark.git /optimum-benchmark && cd optimum-benchmark && pip install -e .
|
46 |
+
|
47 |
+
COPY ./check_h100.py /check_h100.py
|
48 |
+
COPY ./entrypoint.sh /entrypoint.sh
|
49 |
+
RUN chmod +x /entrypoint.sh
|
50 |
+
|
51 |
+
ENTRYPOINT ["/entrypoint.sh"]
|
check_h100.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from optimum_benchmark.system_utils import get_gpus

if __name__ == "__main__":
    # Ask optimum-benchmark for the detected GPU device name(s).
    detected = get_gpus()

    # When several devices are reported as a list, inspect device 0 only.
    if isinstance(detected, list):
        detected = detected[0]

    # Refuse to run on anything that is not an NVIDIA H100.
    if "NVIDIA H100" not in detected:
        raise RuntimeError(f"This Docker container should be executed on NVIDIA H100 GPUs only, detected {detected}.")
    print("At least one NVIDIA H100 GPU has been detected, launching the benchmark...")
|
entrypoint.sh
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash

# Container entrypoint.
# NOTE(review): the Dockerfile in this commit does not COPY /launch_backend.py
# into the image — confirm it is added elsewhere, otherwise this line fails at
# runtime with "No such file or directory".
python /launch_backend.py
# Disabled original flow: gate on the H100 check, then forward the container's
# arguments to optimum-benchmark.
# NOTE(review): if re-enabled, quote the forwarded args as "$@" to preserve
# arguments containing spaces.
#python /check_h100.py
#if [[ $? = 0 ]]; then
# optimum-benchmark --config-dir /optimum-benchmark/examples/energy_star/ $@
#fi
|