---
# Workflow overview:
#   1. start-runner : launches an EC2 instance and registers it as a
#                     self-hosted runner.
#   2. deploy       : on that runner, runs the docker-compose services,
#                     then builds, pushes and re-pulls the image via ECR.
#   3. sync-to-hub  : force-pushes the repository to a Hugging Face Space.
#   4. stop-runner  : always runs, to stop the EC2 runner started in step 1.
name: Deploy PyTorch Training with Hugging Face Sync

on:
  push:
    branches:
      - master
      - main
  workflow_dispatch:

jobs:
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    # Exposed so later jobs can target the runner and stop the instance.
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-044b0717aadbc9dfa
          ec2-instance-type: t2.xlarge
          subnet-id: subnet-024811dee81325f1c
          security-group-id: sg-0646c2a337a355a31

  deploy:
    name: Deploy PyTorch Training Pipeline
    needs: start-runner
    # Runs on the runner registered by the start-runner job.
    runs-on: ${{ needs.start-runner.outputs.label }}
    outputs:
      ecr-registry: ${{ steps.login-ecr.outputs.registry }}
      image-tag: ${{ github.sha }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      # NOTE(review): no build step in this workflow passes
      # --cache-from/--cache-to for this path, so the cache looks unused
      # -- confirm whether it is still needed.
      - name: Cache Docker layers
        uses: actions/cache@v3
        with:
          path: /tmp/.buildx-cache
          key: ${{ runner.os }}-docker-${{ github.sha }}
          restore-keys: |
            ${{ runner.os }}-docker-

      - name: Log in to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      # Secrets are handed to the script through environment variables
      # rather than being expanded into the script text itself.
      - name: Create .env file
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
        run: |
          echo "::add-mask::$AWS_ACCESS_KEY_ID"
          echo "::add-mask::$AWS_SECRET_ACCESS_KEY"
          echo "HYDRA_FULL_ERROR=1" >> .env
          echo "AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID" >> .env
          echo "AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" >> .env
          echo "AWS_REGION=$AWS_REGION" >> .env

      # Builds and starts every compose service, follows the logs of the
      # train and eval services, then tears the stack down.
      - name: Run Docker Compose for all services
        run: |
          docker-compose --env-file .env build --no-cache
          docker-compose --env-file .env up -d
          docker-compose logs --follow train eval
          docker-compose down --remove-orphans

      # Pushes the image under both the commit SHA and the latest tag.
      - name: Build, tag, and push Docker image to Amazon ECR
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          REPOSITORY: soutrik71/mnist
          IMAGE_TAG: ${{ github.sha }}
        run: |
          docker build -t "$REGISTRY/$REPOSITORY:$IMAGE_TAG" .
          docker push "$REGISTRY/$REPOSITORY:$IMAGE_TAG"
          docker tag "$REGISTRY/$REPOSITORY:$IMAGE_TAG" "$REGISTRY/$REPOSITORY:latest"
          docker push "$REGISTRY/$REPOSITORY:latest"

      - name: Pull Docker image from ECR and verify
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          REPOSITORY: soutrik71/mnist
          IMAGE_TAG: ${{ github.sha }}
        run: |
          docker pull "$REGISTRY/$REPOSITORY:$IMAGE_TAG"
          docker images | grep "$REGISTRY/$REPOSITORY"

      # always(): still prune images/containers/volumes when an earlier
      # step in this job has failed.
      - name: Clean up environment
        if: ${{ always() }}
        run: |
          docker system prune -af --volumes

  sync-to-hub:
    name: Sync to Hugging Face Hub
    needs: deploy
    runs-on: ubuntu-latest
    steps:
      # Full history plus LFS objects are fetched for the push to the Space.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
          lfs: true

      # -y keeps apt-get from stopping at a confirmation prompt.
      - name: Install Git LFS
        run: |
          curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
          sudo apt-get install -y git-lfs
          git lfs install

      - name: Configure Git identity
        run: |
          git config --global user.name "soutrik"
          git config --global user.email "soutrik.chowdhury@ab-inbev.com"

      - name: Add remote
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          USER: soutrik
          SPACE: gradio_demo_MNIST_Classifier
        run: |
          git remote add space "https://$USER:$HF_TOKEN@huggingface.co/spaces/$USER/$SPACE"

      - name: Ensure LFS objects are present
        run: git lfs checkout

      # Writes the README front matter through a quoted heredoc, so the
      # body is taken literally, and commits it only if it changed.
      - name: Add README.md
        run: |
          cat <<'EOF' > README.md
          ---
          title: My Gradio App MNIST Classifier
          emoji: 🚀
          colorFrom: blue
          colorTo: green
          sdk: gradio
          sdk_version: "5.7.1"
          app_file: app.py
          pinned: false
          ---
          EOF
          git add README.md
          git commit -m "Add README.md" || echo "Skip commit if no changes"

      # HEAD:main pushes the checked-out commit to the Space's main branch
      # regardless of whether the trigger branch was master or main.
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          USER: soutrik
          SPACE: gradio_demo_MNIST_Classifier
        run: |
          git push --force "https://$USER:$HF_TOKEN@huggingface.co/spaces/$USER/$SPACE" HEAD:main

  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - deploy
      - sync-to-hub
    runs-on: ubuntu-latest
    # always(): run even when an upstream job failed or was cancelled.
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}

      # Informational only: the `|| echo` fallback means this step does not
      # fail the job when the state is not (yet) "terminated".
      - name: Validate EC2 termination
        env:
          INSTANCE_ID: ${{ needs.start-runner.outputs.ec2-instance-id }}
        run: |
          aws ec2 describe-instances --instance-ids "$INSTANCE_ID" \
            --query "Reservations[].Instances[].State.Name" --output text | grep "terminated" || echo "Runner not terminated."