# syntax=docker/dockerfile:1
# Self-Lengthen training + serving image: FastChat (controller, OpenAI API
# server, vLLM worker) plus a Gradio web interface, on CUDA 12.1 / torch 2.4.
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04

# System dependencies. netcat-openbsd is required at runtime by
# wait_for_service in /app/start.sh; install it here instead of in a
# separate late layer with a USER root round-trip.
RUN apt-get update && apt-get install -y --no-install-recommends \
        g++ \
        gcc \
        git \
        git-lfs \
        netcat-openbsd \
        ninja-build \
        python-is-python3 \
        python3.10 \
        python3-pip \
        wget \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Non-root runtime user (UID 1000 for host-volume compatibility).
RUN useradd -m -u 1000 user

# Build prerequisites and torch first: flash-attn compiles against the
# installed torch, so it must already be present (hence --no-build-isolation
# below).
RUN pip3 install --no-cache-dir \
        numpy \
        packaging \
        setuptools \
        torch==2.4.0 \
        wheel

# CUDA toolchain paths needed by the flash-attn / vllm native builds.
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=${CUDA_HOME}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}

# Self-Lengthen sources (flattened into /app; provides /app/qwen/run.sh).
# NOTE(review): clone is unpinned — consider pinning a commit for
# reproducible builds.
RUN git clone https://github.com/QwenLM/Self-Lengthen.git && \
    mv Self-Lengthen/* . && \
    rm -rf Self-Lengthen

COPY requirements.txt .

# Python dependencies. Version specifiers containing '>' MUST be quoted:
# unquoted, the shell parses `cn2an>=0.5.22` as a redirection to a file
# named '=0.5.22' and the constraint is silently dropped.
RUN pip3 install --no-cache-dir \
        accelerate \
        "cn2an>=0.5.22" \
        datasets \
        gradio \
        "langdetect>=1.0.9" \
        openai \
        peft \
        protobuf \
        scipy \
        sentencepiece \
        tiktoken \
        tqdm \
        transformers==4.43.2 \
    && pip3 install --no-cache-dir flash-attn --no-build-isolation \
    && pip3 install --no-cache-dir vllm==0.5.5 vllm-flash-attn

# FastChat, self-lengthen branch (patched fork used by this project).
RUN git clone -b self-lengthen https://github.com/quanshr/FastChat.git && \
    pip3 install "./FastChat[model_worker,webui]"

# LLaMA Factory (used by the training pipeline in run.sh).
RUN pip3 install --no-cache-dir llamafactory

# Startup script: brings up the FastChat service stack, runs the
# Self-Lengthen training loop, then serves the web interface.
COPY --chmod=755 <<'EOF' /app/start.sh
#!/bin/bash
set -euo pipefail

# Block until host:port accepts TCP connections (30 tries x 2s = 60s max).
wait_for_service() {
    local host="$1"
    local port="$2"
    local retries=30
    while ! nc -z "$host" "$port" > /dev/null 2>&1; do
        retries=$((retries-1))
        if [ "$retries" -eq 0 ]; then
            echo "Service $host:$port is not available after maximum retries"
            exit 1
        fi
        echo "Waiting for service $host:$port..."
        sleep 2
    done
}

# Download the base model on first start only.
if [ ! -d "$MODEL_PATH" ]; then
    echo "Downloading model..."
    mkdir -p "$MODEL_PATH"
    git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct "$MODEL_PATH"
fi

# FastChat controller.
python -m fastchat.serve.controller \
    --host 0.0.0.0 \
    --port 21001 > controller.log 2>&1 &
wait_for_service localhost 21001

# OpenAI-compatible API server.
python -m fastchat.serve.openai_api_server \
    --controller-address http://localhost:21001 \
    --host 0.0.0.0 \
    --port 8000 > api_server.log 2>&1 &
wait_for_service localhost 8000

# vLLM model worker.
python -m fastchat.serve.vllm_worker \
    --model-names Qwen/Qwen2-7B-Instruct \
    --model-path "$MODEL_PATH" \
    --controller-address http://localhost:21001 \
    --host localhost \
    --port 8080 \
    --worker-address http://localhost:8080 > worker.log 2>&1 &
wait_for_service localhost 8080

# Self-Lengthen training loop.
cd /app/qwen
bash run.sh --base_model="$MODEL_PATH" --instruct_count="$INSTRUCT_COUNT" --max_iter="$MAX_ITER"

# Web interface lives at /app/app.py, not /app/qwen — change back before
# launching. exec makes it PID 1 so it receives SIGTERM from `docker stop`.
cd /app
exec python app.py
EOF

# Create writable dirs and hand the whole tree to the runtime user
# (single chown, after all root-owned files exist).
RUN mkdir -p models results && \
    chown -R user:user /app

USER user

# Per-user git-lfs hooks (needed for the HF model clone in start.sh).
RUN git lfs install

# Runtime defaults; all overridable with `docker run -e`.
ENV CUDA_VISIBLE_DEVICES=0 \
    WORLD_SIZE=1 \
    RANK=0 \
    MASTER_ADDR=localhost \
    MASTER_PORT=29500 \
    MODEL_PATH=/app/models/base_model \
    INSTRUCT_COUNT=5000 \
    MAX_ITER=3

# Simple web interface entry point.
COPY --chown=user:user app.py .

# 7860: gradio UI, 8000: OpenAI API, 21001: controller, 8080: vLLM worker.
EXPOSE 7860 8000 21001 8080

ENTRYPOINT ["/app/start.sh"]