# Workflow overview:
#   1. start-runner: launches an on-demand self-hosted EC2 runner.
#   2. deploy:       on that runner, builds and runs the Docker Compose
#                    services, then builds, pushes and verifies the image
#                    in Amazon ECR.
#   3. stop-runner:  always runs afterwards to stop the EC2 runner.
name: Deploy PyTorch Training with all advanced features like self-hosted EC2 runner, Docker Buildx, Amazon ECR, Hugging Face Spaces

on:
  push:
    branches:
      - master
  workflow_dispatch:

jobs:
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    # Exposed so the later jobs can target and stop this exact runner.
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-044b0717aadbc9dfa
          ec2-instance-type: t2.xlarge
          subnet-id: subnet-024811dee81325f1c
          security-group-id: sg-0646c2a337a355a31

  deploy:
    name: Deploy PyTorch Training Pipeline
    needs: start-runner
    # Runs on the EC2 runner registered by the start-runner job.
    runs-on: ${{ needs.start-runner.outputs.label }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      # NOTE(review): none of the build commands below pass
      # --cache-from/--cache-to for /tmp/.buildx-cache, so this cache does
      # not appear to be consumed. Confirm whether it should be wired into
      # the build or removed.
      - name: Cache Docker layers
        uses: actions/cache@v3
        with:
          path: /tmp/.buildx-cache
          key: ${{ runner.os }}-docker-${{ github.sha }}
          restore-keys: |
            ${{ runner.os }}-docker-

      - name: Log in to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      - name: Create .env file
        # Secrets reach the script through the step environment instead of
        # being expanded into the script text, so the shell never has to
        # parse their contents.
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
        run: |
          # Register the masks before the values are used anywhere.
          echo "::add-mask::$AWS_ACCESS_KEY_ID"
          echo "::add-mask::$AWS_SECRET_ACCESS_KEY"
          {
            echo "AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID"
            echo "AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY"
            echo "AWS_REGION=$AWS_REGION"
          } >> .env

      - name: Run Docker Compose for all services
        run: |
          docker-compose build --no-cache
          docker-compose up -d
          docker-compose logs --follow train eval
          docker-compose down --remove-orphans

      - name: Build, tag, and push Docker image to Amazon ECR
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          REPOSITORY: soutrik71/mnist
          IMAGE_TAG: ${{ github.sha }}
        run: |
          docker build -t "$REGISTRY/$REPOSITORY:$IMAGE_TAG" .
          docker push "$REGISTRY/$REPOSITORY:$IMAGE_TAG"
          docker tag "$REGISTRY/$REPOSITORY:$IMAGE_TAG" "$REGISTRY/$REPOSITORY:latest"
          docker push "$REGISTRY/$REPOSITORY:latest"

      - name: Pull Docker image from ECR and verify
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          REPOSITORY: soutrik71/mnist
          IMAGE_TAG: ${{ github.sha }}
        run: |
          docker pull "$REGISTRY/$REPOSITORY:$IMAGE_TAG"
          docker images | grep "$REGISTRY/$REPOSITORY"

      - name: Clean up environment
        # Run even when an earlier step failed so the .env file holding
        # credentials and the Docker artifacts are not left on the runner.
        if: ${{ always() }}
        run: |
          rm -f .env
          docker system prune -af --volumes

  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - deploy
    runs-on: ubuntu-latest
    # Must run even if the previous jobs failed, otherwise the EC2 runner
    # would be left running.
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}

      - name: Validate EC2 termination
        # Informational only: the `|| echo` fallback keeps this step green
        # when the reported state does not contain "terminated".
        env:
          INSTANCE_ID: ${{ needs.start-runner.outputs.ec2-instance-id }}
        run: |
          aws ec2 describe-instances --instance-ids "$INSTANCE_ID" \
            --query "Reservations[].Instances[].State.Name" --output text \
            | grep "terminated" || echo "Runner not terminated."