soutrik
committed on
Commit
·
c3d82b0
0
Parent(s):
orphan branch
Browse files- .gitattributes +4 -0
- .github/workflows/deployment.yaml +113 -0
- .github/workflows/deployment_advanced.yaml +130 -0
- .github/workflows/do-the-job.yml +66 -0
- .github/workflows/hf_deploy.yaml +79 -0
- .github/workflows/main_cd.yml +131 -0
- .gitignore +128 -0
- .gradio/certificate.pem +31 -0
- .project-root +0 -0
- Dockerfile +54 -0
- README.md +286 -0
- app.py +95 -0
- docker-compose.yaml +63 -0
- main.py +3 -0
- poetry.lock +0 -0
- pyproject.toml +91 -0
- requirements.txt +28 -0
- src/__init__.py +0 -0
- src/dataloader.py +122 -0
- src/model.py +86 -0
- src/test.py +115 -0
- src/train.py +105 -0
- src/utils/aws_s3_services.py +88 -0
.gitattributes
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
2 |
+
checkpoints/best_model.ckpt filter=lfs diff=lfs merge=lfs -text
|
3 |
+
checkpoints/last.ckpt filter=lfs diff=lfs merge=lfs -text
|
4 |
+
checkpoints/*.ckpt filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/deployment.yaml
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Deploy PyTorch Training with EC2 Runner and Docker Compose
|
2 |
+
|
3 |
+
on:
|
4 |
+
push:
|
5 |
+
branches:
|
6 |
+
- master
|
7 |
+
|
8 |
+
workflow_dispatch:
|
9 |
+
|
10 |
+
jobs:
|
11 |
+
start-runner:
|
12 |
+
name: Start self-hosted EC2 runner
|
13 |
+
runs-on: ubuntu-latest
|
14 |
+
outputs:
|
15 |
+
label: ${{ steps.start-ec2-runner.outputs.label }}
|
16 |
+
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
|
17 |
+
steps:
|
18 |
+
- name: Configure AWS credentials
|
19 |
+
uses: aws-actions/configure-aws-credentials@v4
|
20 |
+
with:
|
21 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
22 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
23 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
24 |
+
|
25 |
+
- name: Start EC2 runner
|
26 |
+
id: start-ec2-runner
|
27 |
+
uses: machulav/ec2-github-runner@v2
|
28 |
+
with:
|
29 |
+
mode: start
|
30 |
+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
|
31 |
+
ec2-image-id: ami-044b0717aadbc9dfa
|
32 |
+
ec2-instance-type: t2.xlarge
|
33 |
+
subnet-id: subnet-024811dee81325f1c
|
34 |
+
security-group-id: sg-0646c2a337a355a31
|
35 |
+
|
36 |
+
deploy:
|
37 |
+
name: Deploy PyTorch Training Pipeline
|
38 |
+
needs: start-runner
|
39 |
+
runs-on: ${{ needs.start-runner.outputs.label }}
|
40 |
+
steps:
|
41 |
+
- name: Checkout repository
|
42 |
+
uses: actions/checkout@v4
|
43 |
+
|
44 |
+
- name: Set up Docker Buildx
|
45 |
+
uses: docker/setup-buildx-action@v3
|
46 |
+
|
47 |
+
- name: Configure AWS credentials
|
48 |
+
uses: aws-actions/configure-aws-credentials@v4
|
49 |
+
with:
|
50 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
51 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
52 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
53 |
+
|
54 |
+
- name: Log in to Amazon ECR
|
55 |
+
id: login-ecr
|
56 |
+
uses: aws-actions/amazon-ecr-login@v2
|
57 |
+
|
58 |
+
- name: Create .env file
|
59 |
+
run: |
|
60 |
+
echo "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}" >> .env
|
61 |
+
echo "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >> .env
|
62 |
+
echo "AWS_REGION=${{ secrets.AWS_REGION }}" >> .env
|
63 |
+
|
64 |
+
- name: Run Docker Compose for train and eval service
|
65 |
+
run: |
|
66 |
+
docker-compose stop
|
67 |
+
docker-compose up --build
|
68 |
+
docker-compose logs --follow
|
69 |
+
docker-compose down --remove-orphans
|
70 |
+
|
71 |
+
|
72 |
+
- name: Build, tag, and push Docker image to Amazon ECR
|
73 |
+
env:
|
74 |
+
REGISTRY: ${{ steps.login-ecr.outputs.registry }}
|
75 |
+
REPOSITORY: soutrik71/mnist
|
76 |
+
IMAGE_TAG: ${{ github.sha }}
|
77 |
+
run: |
|
78 |
+
docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
|
79 |
+
docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
|
80 |
+
docker tag $REGISTRY/$REPOSITORY:$IMAGE_TAG $REGISTRY/$REPOSITORY:latest
|
81 |
+
docker push $REGISTRY/$REPOSITORY:latest
|
82 |
+
|
83 |
+
- name: Pull Docker image from ECR and verify
|
84 |
+
env:
|
85 |
+
REGISTRY: ${{ steps.login-ecr.outputs.registry }}
|
86 |
+
REPOSITORY: soutrik71/mnist
|
87 |
+
IMAGE_TAG: ${{ github.sha }}
|
88 |
+
run: |
|
89 |
+
docker pull $REGISTRY/$REPOSITORY:$IMAGE_TAG
|
90 |
+
docker images | grep "$REGISTRY/$REPOSITORY"
|
91 |
+
|
92 |
+
stop-runner:
|
93 |
+
name: Stop self-hosted EC2 runner
|
94 |
+
needs:
|
95 |
+
- start-runner
|
96 |
+
- deploy
|
97 |
+
runs-on: ubuntu-latest
|
98 |
+
if: ${{ always() }}
|
99 |
+
steps:
|
100 |
+
- name: Configure AWS credentials
|
101 |
+
uses: aws-actions/configure-aws-credentials@v4
|
102 |
+
with:
|
103 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
104 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
105 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
106 |
+
|
107 |
+
- name: Stop EC2 runner
|
108 |
+
uses: machulav/ec2-github-runner@v2
|
109 |
+
with:
|
110 |
+
mode: stop
|
111 |
+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
|
112 |
+
label: ${{ needs.start-runner.outputs.label }}
|
113 |
+
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
|
.github/workflows/deployment_advanced.yaml
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Deploy PyTorch Training with EC2 Runner and Docker Compose with Advanced Deployment
|
2 |
+
|
3 |
+
on:
|
4 |
+
push:
|
5 |
+
branches:
|
6 |
+
- master
|
7 |
+
|
8 |
+
jobs:
|
9 |
+
start-runner:
|
10 |
+
name: Start self-hosted EC2 runner
|
11 |
+
runs-on: ubuntu-latest
|
12 |
+
outputs:
|
13 |
+
label: ${{ steps.start-ec2-runner.outputs.label }}
|
14 |
+
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
|
15 |
+
steps:
|
16 |
+
- name: Configure AWS credentials
|
17 |
+
uses: aws-actions/configure-aws-credentials@v4
|
18 |
+
with:
|
19 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
20 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
21 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
22 |
+
|
23 |
+
- name: Start EC2 runner
|
24 |
+
id: start-ec2-runner
|
25 |
+
uses: machulav/ec2-github-runner@v2
|
26 |
+
with:
|
27 |
+
mode: start
|
28 |
+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
|
29 |
+
ec2-image-id: ami-044b0717aadbc9dfa
|
30 |
+
ec2-instance-type: t2.xlarge
|
31 |
+
subnet-id: subnet-024811dee81325f1c
|
32 |
+
security-group-id: sg-0646c2a337a355a31
|
33 |
+
|
34 |
+
deploy:
|
35 |
+
name: Deploy PyTorch Training Pipeline
|
36 |
+
needs: start-runner
|
37 |
+
runs-on: ${{ needs.start-runner.outputs.label }}
|
38 |
+
steps:
|
39 |
+
- name: Checkout repository
|
40 |
+
uses: actions/checkout@v4
|
41 |
+
|
42 |
+
- name: Set up Docker Buildx
|
43 |
+
uses: docker/setup-buildx-action@v3
|
44 |
+
|
45 |
+
- name: Configure AWS credentials
|
46 |
+
uses: aws-actions/configure-aws-credentials@v4
|
47 |
+
with:
|
48 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
49 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
50 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
51 |
+
|
52 |
+
- name: Cache Docker layers
|
53 |
+
uses: actions/cache@v3
|
54 |
+
with:
|
55 |
+
path: /tmp/.buildx-cache
|
56 |
+
key: ${{ runner.os }}-docker-${{ github.sha }}
|
57 |
+
restore-keys: |
|
58 |
+
${{ runner.os }}-docker-
|
59 |
+
|
60 |
+
- name: Log in to Amazon ECR
|
61 |
+
id: login-ecr
|
62 |
+
uses: aws-actions/amazon-ecr-login@v2
|
63 |
+
|
64 |
+
- name: Create .env file
|
65 |
+
run: |
|
66 |
+
echo "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}" >> .env
|
67 |
+
echo "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >> .env
|
68 |
+
echo "AWS_REGION=${{ secrets.AWS_REGION }}" >> .env
|
69 |
+
echo "::add-mask::${{ secrets.AWS_ACCESS_KEY_ID }}"
|
70 |
+
echo "::add-mask::${{ secrets.AWS_SECRET_ACCESS_KEY }}"
|
71 |
+
|
72 |
+
- name: Run Docker Compose for all services
|
73 |
+
run: |
|
74 |
+
docker-compose build --no-cache
|
75 |
+
docker-compose up -d
|
76 |
+
docker-compose logs --follow train eval
|
77 |
+
docker-compose down --remove-orphans
|
78 |
+
|
79 |
+
- name: Build, tag, and push Docker image to Amazon ECR
|
80 |
+
env:
|
81 |
+
REGISTRY: ${{ steps.login-ecr.outputs.registry }}
|
82 |
+
REPOSITORY: soutrik71/mnist
|
83 |
+
IMAGE_TAG: ${{ github.sha }}
|
84 |
+
run: |
|
85 |
+
docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
|
86 |
+
docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
|
87 |
+
docker tag $REGISTRY/$REPOSITORY:$IMAGE_TAG $REGISTRY/$REPOSITORY:latest
|
88 |
+
docker push $REGISTRY/$REPOSITORY:latest
|
89 |
+
|
90 |
+
- name: Pull Docker image from ECR and verify
|
91 |
+
env:
|
92 |
+
REGISTRY: ${{ steps.login-ecr.outputs.registry }}
|
93 |
+
REPOSITORY: soutrik71/mnist
|
94 |
+
IMAGE_TAG: ${{ github.sha }}
|
95 |
+
run: |
|
96 |
+
docker pull $REGISTRY/$REPOSITORY:$IMAGE_TAG
|
97 |
+
docker images | grep "$REGISTRY/$REPOSITORY"
|
98 |
+
|
99 |
+
- name: Clean up environment
|
100 |
+
run: |
|
101 |
+
rm -f .env
|
102 |
+
docker system prune -af --volumes
|
103 |
+
|
104 |
+
stop-runner:
|
105 |
+
name: Stop self-hosted EC2 runner
|
106 |
+
needs:
|
107 |
+
- start-runner
|
108 |
+
- deploy
|
109 |
+
runs-on: ubuntu-latest
|
110 |
+
if: ${{ always() }}
|
111 |
+
steps:
|
112 |
+
- name: Configure AWS credentials
|
113 |
+
uses: aws-actions/configure-aws-credentials@v4
|
114 |
+
with:
|
115 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
116 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
117 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
118 |
+
|
119 |
+
- name: Stop EC2 runner
|
120 |
+
uses: machulav/ec2-github-runner@v2
|
121 |
+
with:
|
122 |
+
mode: stop
|
123 |
+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
|
124 |
+
label: ${{ needs.start-runner.outputs.label }}
|
125 |
+
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
|
126 |
+
|
127 |
+
- name: Validate EC2 termination
|
128 |
+
run: |
|
129 |
+
aws ec2 describe-instances --instance-ids ${{ needs.start-runner.outputs.ec2-instance-id }} \
|
130 |
+
--query "Reservations[].Instances[].State.Name" --output text | grep "terminated" || echo "Runner not terminated."
|
.github/workflows/do-the-job.yml
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: do-the-job
|
2 |
+
on:
|
3 |
+
push:
|
4 |
+
branches:
|
5 |
+
- master
|
6 |
+
|
7 |
+
workflow_dispatch:
|
8 |
+
|
9 |
+
jobs:
|
10 |
+
start-runner:
|
11 |
+
name: Start self-hosted EC2 runner
|
12 |
+
runs-on: ubuntu-latest
|
13 |
+
outputs:
|
14 |
+
label: ${{ steps.start-ec2-runner.outputs.label }}
|
15 |
+
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
|
16 |
+
steps:
|
17 |
+
- name: Configure AWS credentials
|
18 |
+
uses: aws-actions/configure-aws-credentials@v4
|
19 |
+
with:
|
20 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
21 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
22 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
23 |
+
- name: Start EC2 runner
|
24 |
+
id: start-ec2-runner
|
25 |
+
uses: machulav/ec2-github-runner@v2
|
26 |
+
with:
|
27 |
+
mode: start
|
28 |
+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
|
29 |
+
ec2-image-id: ami-044b0717aadbc9dfa
|
30 |
+
ec2-instance-type: t2.xlarge
|
31 |
+
subnet-id: subnet-024811dee81325f1c
|
32 |
+
security-group-id: sg-0646c2a337a355a31
|
33 |
+
|
34 |
+
do-the-job:
|
35 |
+
name: Do the job on the runner
|
36 |
+
needs: start-runner
|
37 |
+
runs-on: ${{ needs.start-runner.outputs.label }}
|
38 |
+
steps:
|
39 |
+
- name: Clone the repository
|
40 |
+
uses: actions/checkout@v4
|
41 |
+
with:
|
42 |
+
fetch-depth: 0
|
43 |
+
- name: Run custom command
|
44 |
+
run: echo 'Hello World!'
|
45 |
+
|
46 |
+
stop-runner:
|
47 |
+
name: Stop self-hosted EC2 runner
|
48 |
+
needs:
|
49 |
+
- start-runner
|
50 |
+
- do-the-job
|
51 |
+
runs-on: ubuntu-latest
|
52 |
+
if: ${{ always() }}
|
53 |
+
steps:
|
54 |
+
- name: Configure AWS credentials
|
55 |
+
uses: aws-actions/configure-aws-credentials@v4
|
56 |
+
with:
|
57 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
58 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
59 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
60 |
+
- name: Stop EC2 runner
|
61 |
+
uses: machulav/ec2-github-runner@v2
|
62 |
+
with:
|
63 |
+
mode: stop
|
64 |
+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
|
65 |
+
label: ${{ needs.start-runner.outputs.label }}
|
66 |
+
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
|
.github/workflows/hf_deploy.yaml
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Sync to Hugging Face Hub for Gradio App MNIST Classifier # this is not working due to lfs issue
|
2 |
+
|
3 |
+
on:
|
4 |
+
push:
|
5 |
+
branches:
|
6 |
+
- master
|
7 |
+
|
8 |
+
jobs:
|
9 |
+
sync-to-hub:
|
10 |
+
runs-on: ubuntu-latest
|
11 |
+
steps:
|
12 |
+
- uses: actions/checkout@v4
|
13 |
+
with:
|
14 |
+
fetch-depth: 0
|
15 |
+
lfs: true
|
16 |
+
|
17 |
+
- name: Install Git LFS
|
18 |
+
run: |
|
19 |
+
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
|
20 |
+
sudo apt-get install git-lfs
|
21 |
+
git lfs install
|
22 |
+
|
23 |
+
- name: Configure Git identity
|
24 |
+
run: |
|
25 |
+
git config --global user.name "soutrik"
|
26 |
+
git config --global user.email "soutrik.chowdhury@ab-inbev.com"
|
27 |
+
|
28 |
+
- name: Add remote
|
29 |
+
run: |
|
30 |
+
git remote add space https://$USER:$HF_TOKEN@huggingface.co/spaces/$USER/$SPACE
|
31 |
+
env:
|
32 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
33 |
+
USER: soutrik
|
34 |
+
SPACE: gradio_demo_MNIST_Classifier
|
35 |
+
|
36 |
+
# # Track individual files with LFS
|
37 |
+
# - name: Track last.ckpt with Git LFS
|
38 |
+
# run: |
|
39 |
+
# git lfs track "checkpoints/last.ckpt"
|
40 |
+
# git add .gitattributes
|
41 |
+
# git commit -m "Track last.ckpt with Git LFS" || echo "Skip commit if no changes"
|
42 |
+
|
43 |
+
# - name: Track best_model.ckpt with Git LFS
|
44 |
+
# run: |
|
45 |
+
# git lfs track "checkpoints/best_model.ckpt"
|
46 |
+
# git add .gitattributes
|
47 |
+
# git commit -m "Track best_model.ckpt with Git LFS" || echo "Skip commit if no changes"
|
48 |
+
|
49 |
+
# Ensure LFS objects are checked out
|
50 |
+
- name: Ensure LFS objects are present
|
51 |
+
run: git lfs checkout
|
52 |
+
|
53 |
+
- name: Add README.md
|
54 |
+
run: |
|
55 |
+
cat <<EOF > README.md
|
56 |
+
---
|
57 |
+
title: My Gradio App MNIST Classifier
|
58 |
+
emoji: 🚀
|
59 |
+
colorFrom: blue
|
60 |
+
colorTo: green
|
61 |
+
sdk: gradio
|
62 |
+
sdk_version: "5.7.1"
|
63 |
+
app_file: app.py
|
64 |
+
pinned: false
|
65 |
+
---
|
66 |
+
EOF
|
67 |
+
git add README.md
|
68 |
+
git commit -m "Add README.md" || echo "Skip commit if no changes"
|
69 |
+
|
70 |
+
- name: Push to hub
|
71 |
+
run: |
|
72 |
+
git push --force https://$USER:$HF_TOKEN@huggingface.co/spaces/$USER/$SPACE HEAD:main  # workflow runs on master, so push the checked-out HEAD to the Space's main branch
|
73 |
+
env:
|
74 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
75 |
+
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
76 |
+
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
77 |
+
AWS_REGION: ${{ secrets.AWS_REGION }}
|
78 |
+
USER: soutrik
|
79 |
+
SPACE: gradio_demo_MNIST_Classifier
|
.github/workflows/main_cd.yml
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Deploy PyTorch Training with all advanced features like self-hosted EC2 runner, Docker Buildx, Amazon ECR, Hugging Face Spaces
|
2 |
+
|
3 |
+
on:
|
4 |
+
push:
|
5 |
+
branches:
|
6 |
+
- master
|
7 |
+
workflow_dispatch:
|
8 |
+
|
9 |
+
jobs:
|
10 |
+
start-runner:
|
11 |
+
name: Start self-hosted EC2 runner
|
12 |
+
runs-on: ubuntu-latest
|
13 |
+
outputs:
|
14 |
+
label: ${{ steps.start-ec2-runner.outputs.label }}
|
15 |
+
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
|
16 |
+
steps:
|
17 |
+
- name: Configure AWS credentials
|
18 |
+
uses: aws-actions/configure-aws-credentials@v4
|
19 |
+
with:
|
20 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
21 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
22 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
23 |
+
|
24 |
+
- name: Start EC2 runner
|
25 |
+
id: start-ec2-runner
|
26 |
+
uses: machulav/ec2-github-runner@v2
|
27 |
+
with:
|
28 |
+
mode: start
|
29 |
+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
|
30 |
+
ec2-image-id: ami-044b0717aadbc9dfa
|
31 |
+
ec2-instance-type: t2.xlarge
|
32 |
+
subnet-id: subnet-024811dee81325f1c
|
33 |
+
security-group-id: sg-0646c2a337a355a31
|
34 |
+
|
35 |
+
deploy:
|
36 |
+
name: Deploy PyTorch Training Pipeline
|
37 |
+
needs: start-runner
|
38 |
+
runs-on: ${{ needs.start-runner.outputs.label }}
|
39 |
+
steps:
|
40 |
+
- name: Checkout repository
|
41 |
+
uses: actions/checkout@v4
|
42 |
+
|
43 |
+
- name: Set up Docker Buildx
|
44 |
+
uses: docker/setup-buildx-action@v3
|
45 |
+
|
46 |
+
- name: Configure AWS credentials
|
47 |
+
uses: aws-actions/configure-aws-credentials@v4
|
48 |
+
with:
|
49 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
50 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
51 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
52 |
+
|
53 |
+
- name: Cache Docker layers
|
54 |
+
uses: actions/cache@v3
|
55 |
+
with:
|
56 |
+
path: /tmp/.buildx-cache
|
57 |
+
key: ${{ runner.os }}-docker-${{ github.sha }}
|
58 |
+
restore-keys: |
|
59 |
+
${{ runner.os }}-docker-
|
60 |
+
|
61 |
+
- name: Log in to Amazon ECR
|
62 |
+
id: login-ecr
|
63 |
+
uses: aws-actions/amazon-ecr-login@v2
|
64 |
+
|
65 |
+
- name: Create .env file
|
66 |
+
run: |
|
67 |
+
echo "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}" >> .env
|
68 |
+
echo "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >> .env
|
69 |
+
echo "AWS_REGION=${{ secrets.AWS_REGION }}" >> .env
|
70 |
+
echo "::add-mask::${{ secrets.AWS_ACCESS_KEY_ID }}"
|
71 |
+
echo "::add-mask::${{ secrets.AWS_SECRET_ACCESS_KEY }}"
|
72 |
+
|
73 |
+
- name: Run Docker Compose for all services
|
74 |
+
run: |
|
75 |
+
docker-compose build --no-cache
|
76 |
+
docker-compose up -d
|
77 |
+
docker-compose logs --follow train eval
|
78 |
+
docker-compose down --remove-orphans
|
79 |
+
|
80 |
+
- name: Build, tag, and push Docker image to Amazon ECR
|
81 |
+
env:
|
82 |
+
REGISTRY: ${{ steps.login-ecr.outputs.registry }}
|
83 |
+
REPOSITORY: soutrik71/mnist
|
84 |
+
IMAGE_TAG: ${{ github.sha }}
|
85 |
+
run: |
|
86 |
+
docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
|
87 |
+
docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
|
88 |
+
docker tag $REGISTRY/$REPOSITORY:$IMAGE_TAG $REGISTRY/$REPOSITORY:latest
|
89 |
+
docker push $REGISTRY/$REPOSITORY:latest
|
90 |
+
|
91 |
+
- name: Pull Docker image from ECR and verify
|
92 |
+
env:
|
93 |
+
REGISTRY: ${{ steps.login-ecr.outputs.registry }}
|
94 |
+
REPOSITORY: soutrik71/mnist
|
95 |
+
IMAGE_TAG: ${{ github.sha }}
|
96 |
+
run: |
|
97 |
+
docker pull $REGISTRY/$REPOSITORY:$IMAGE_TAG
|
98 |
+
docker images | grep "$REGISTRY/$REPOSITORY"
|
99 |
+
|
100 |
+
- name: Clean up environment
|
101 |
+
run: |
|
102 |
+
rm -f .env
|
103 |
+
docker system prune -af --volumes
|
104 |
+
|
105 |
+
stop-runner:
|
106 |
+
name: Stop self-hosted EC2 runner
|
107 |
+
needs:
|
108 |
+
- start-runner
|
109 |
+
- deploy
|
110 |
+
runs-on: ubuntu-latest
|
111 |
+
if: ${{ always() }}
|
112 |
+
steps:
|
113 |
+
- name: Configure AWS credentials
|
114 |
+
uses: aws-actions/configure-aws-credentials@v4
|
115 |
+
with:
|
116 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
117 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
118 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
119 |
+
|
120 |
+
- name: Stop EC2 runner
|
121 |
+
uses: machulav/ec2-github-runner@v2
|
122 |
+
with:
|
123 |
+
mode: stop
|
124 |
+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
|
125 |
+
label: ${{ needs.start-runner.outputs.label }}
|
126 |
+
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
|
127 |
+
|
128 |
+
- name: Validate EC2 termination
|
129 |
+
run: |
|
130 |
+
aws ec2 describe-instances --instance-ids ${{ needs.start-runner.outputs.ec2-instance-id }} \
|
131 |
+
--query "Reservations[].Instances[].State.Name" --output text | grep "terminated" || echo "Runner not terminated."
|
.gitignore
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
target/
|
76 |
+
|
77 |
+
# Jupyter Notebook
|
78 |
+
.ipynb_checkpoints
|
79 |
+
|
80 |
+
# IPython
|
81 |
+
profile_default/
|
82 |
+
ipython_config.py
|
83 |
+
|
84 |
+
# pyenv
|
85 |
+
.python-version
|
86 |
+
|
87 |
+
# celery beat schedule file
|
88 |
+
celerybeat-schedule
|
89 |
+
|
90 |
+
# SageMath parsed files
|
91 |
+
*.sage.py
|
92 |
+
|
93 |
+
# Environments
|
94 |
+
.env
|
95 |
+
.venv
|
96 |
+
env/
|
97 |
+
venv/
|
98 |
+
ENV/
|
99 |
+
env.bak/
|
100 |
+
venv.bak/
|
101 |
+
|
102 |
+
# Spyder project settings
|
103 |
+
.spyderproject
|
104 |
+
.spyderworkspace
|
105 |
+
|
106 |
+
# Rope project settings
|
107 |
+
.ropeproject
|
108 |
+
|
109 |
+
# mkdocs documentation
|
110 |
+
/site
|
111 |
+
|
112 |
+
# mypy
|
113 |
+
.mypy_cache/
|
114 |
+
.dmypy.json
|
115 |
+
dmypy.json
|
116 |
+
|
117 |
+
# Pyre type checker
|
118 |
+
.pyre/
|
119 |
+
|
120 |
+
# PyTorch
|
121 |
+
*.pt
|
122 |
+
*.pth
|
123 |
+
logs/
|
124 |
+
data/
|
125 |
+
checkpoints/
|
126 |
+
checkpoints/*
|
127 |
+
checkpoints/best_model.ckpt
|
128 |
+
checkpoints/last.ckpt
|
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
-----BEGIN CERTIFICATE-----
|
2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
31 |
+
-----END CERTIFICATE-----
|
.project-root
ADDED
File without changes
|
Dockerfile
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Stage 1: Build environment with Poetry and dependencies (keyword casing kept uniform: FROM ... AS)
|
2 |
+
FROM python:3.10.15-slim AS builder
|
3 |
+
|
4 |
+
LABEL maintainer="Soutrik soutrik1991@gmail.com" \
|
5 |
+
description="Docker image for running a Python app with dependencies managed by Poetry."
|
6 |
+
|
7 |
+
# Install Poetry and necessary system dependencies
|
8 |
+
RUN apt-get update && apt-get install -y --no-install-recommends curl && \
|
9 |
+
curl -sSL https://install.python-poetry.org | python3 - && \
|
10 |
+
apt-get clean && rm -rf /var/lib/apt/lists/*
|
11 |
+
|
12 |
+
# Add Poetry to the PATH explicitly
|
13 |
+
ENV PATH="/root/.local/bin:$PATH"
|
14 |
+
|
15 |
+
# Set the working directory to /app
|
16 |
+
WORKDIR /app
|
17 |
+
|
18 |
+
# Copy pyproject.toml and poetry.lock to install dependencies
|
19 |
+
COPY pyproject.toml poetry.lock /app/
|
20 |
+
|
21 |
+
# Configure Poetry environment
|
22 |
+
ENV POETRY_NO_INTERACTION=1 \
|
23 |
+
POETRY_VIRTUALENVS_IN_PROJECT=1 \
|
24 |
+
POETRY_CACHE_DIR=/tmp/poetry_cache
|
25 |
+
|
26 |
+
# Install dependencies without installing the package itself
|
27 |
+
RUN --mount=type=cache,target=/tmp/poetry_cache poetry install --only main --no-root
|
28 |
+
|
29 |
+
# Additional steps: uninstall cryptography, then re-add it so Poetry installs it again (no --lock here: that flag only rewrites poetry.lock and would leave the package missing from .venv)
|
30 |
+
RUN poetry run pip uninstall -y cryptography && \
|
31 |
+
poetry add cryptography
|
32 |
+
|
33 |
+
# Stage 2: Runtime environment (keyword casing kept uniform: FROM ... AS)
|
34 |
+
FROM python:3.10.15-slim AS runner
|
35 |
+
|
36 |
+
# Install curl for health check script
|
37 |
+
RUN apt-get update && apt-get install -y --no-install-recommends curl && \
|
38 |
+
apt-get clean && rm -rf /var/lib/apt/lists/*
|
39 |
+
|
40 |
+
# Copy application source code and necessary files
|
41 |
+
COPY src /app/src
|
42 |
+
COPY main.py /app/main.py
|
43 |
+
|
44 |
+
# Copy virtual environment from the builder stage
|
45 |
+
COPY --from=builder /app/.venv /app/.venv
|
46 |
+
|
47 |
+
# Set the working directory to /app
|
48 |
+
WORKDIR /app
|
49 |
+
|
50 |
+
# Set the environment path to use the virtual environment
|
51 |
+
ENV PATH="/app/.venv/bin:$PATH"
|
52 |
+
|
53 |
+
# Default command
|
54 |
+
CMD ["python", "-m", "main"]
|
README.md
ADDED
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: My Gradio App Mnist Classifier
|
3 |
+
emoji: 🚀
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: green
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: "5.7.1"
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
11 |
+
|
12 |
+
# aws_ec2_automation
|
13 |
+
This README explains the GitHub Actions (GHA) pipelines used in this repository.
|
14 |
+
|
15 |
+
---
|
16 |
+
|
17 |
+
# GitHub Actions Pipeline Documentation
|
18 |
+
|
19 |
+
## Name: Deploy PyTorch Training with EC2 Runner and Docker Compose
|
20 |
+
|
21 |
+
This pipeline automates the following tasks:
|
22 |
+
1. Starts an EC2 instance as a self-hosted GitHub runner.
|
23 |
+
2. Deploys a PyTorch training pipeline using Docker Compose.
|
24 |
+
3. Builds, tags, and pushes Docker images to Amazon ECR.
|
25 |
+
4. Stops the EC2 instance after the job is completed.
|
26 |
+
|
27 |
+
---
|
28 |
+
|
29 |
+
### Workflow Triggers
|
30 |
+
|
31 |
+
```yaml
|
32 |
+
on:
|
33 |
+
push:
|
34 |
+
branches:
|
35 |
+
- main
|
36 |
+
```
|
37 |
+
|
38 |
+
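- **Trigger**: This workflow runs whenever a push is made to the configured branch. The snippet above uses `main`; the workflow file in this repository (`.github/workflows/deployment.yaml`) triggers on `master` and can also be started manually via `workflow_dispatch`.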
|
39 |
+
|
40 |
+
---
|
41 |
+
|
42 |
+
## Jobs Overview
|
43 |
+
|
44 |
+
### 1. **start-runner**
|
45 |
+
Starts a self-hosted EC2 runner using the GitHub Actions Runner.
|
46 |
+
|
47 |
+
#### Steps:
|
48 |
+
1. **Configure AWS Credentials**:
|
49 |
+
```yaml
|
50 |
+
- name: Configure AWS credentials
|
51 |
+
uses: aws-actions/configure-aws-credentials@v4
|
52 |
+
with:
|
53 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
54 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
55 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
56 |
+
```
|
57 |
+
- Authenticates with AWS using access keys and the region specified in the secrets.
|
58 |
+
- Required for creating and managing the EC2 instance.
|
59 |
+
|
60 |
+
2. **Start EC2 Runner**:
|
61 |
+
```yaml
|
62 |
+
- name: Start EC2 runner
|
63 |
+
id: start-ec2-runner
|
64 |
+
uses: machulav/ec2-github-runner@v2
|
65 |
+
with:
|
66 |
+
mode: start
|
67 |
+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
|
68 |
+
ec2-image-id: ami-044b0717aadbc9dfa
|
69 |
+
ec2-instance-type: t2.xlarge
|
70 |
+
subnet-id: subnet-024811dee81325f1c
|
71 |
+
security-group-id: sg-0646c2a337a355a31
|
72 |
+
```
|
73 |
+
- Starts an EC2 instance with the specified AMI, instance type, subnet, and security group.
|
74 |
+
- Outputs:
|
75 |
+
- `label`: A unique label for the EC2 runner.
|
76 |
+
- `ec2-instance-id`: The ID of the created EC2 instance.
|
77 |
+
|
78 |
+
---
|
79 |
+
|
80 |
+
### 2. **deploy**
|
81 |
+
Deploys the PyTorch training pipeline using the EC2 runner started in the previous step.
|
82 |
+
|
83 |
+
#### Dependencies:
|
84 |
+
```yaml
|
85 |
+
needs: start-runner
|
86 |
+
runs-on: ${{ needs.start-runner.outputs.label }}
|
87 |
+
```
|
88 |
+
- **Depends on** the `start-runner` job and runs on the newly created EC2 instance.
|
89 |
+
|
90 |
+
#### Steps:
|
91 |
+
1. **Checkout Repository**:
|
92 |
+
```yaml
|
93 |
+
- name: Checkout repository
|
94 |
+
uses: actions/checkout@v4
|
95 |
+
```
|
96 |
+
- Clones the current repository to the runner.
|
97 |
+
|
98 |
+
2. **Set Up Docker Buildx**:
|
99 |
+
```yaml
|
100 |
+
- name: Set up Docker Buildx
|
101 |
+
uses: docker/setup-buildx-action@v3
|
102 |
+
```
|
103 |
+
- Configures Docker Buildx for building multi-platform Docker images.
|
104 |
+
|
105 |
+
3. **Configure AWS Credentials**:
|
106 |
+
```yaml
|
107 |
+
- name: Configure AWS credentials
|
108 |
+
uses: aws-actions/configure-aws-credentials@v4
|
109 |
+
with:
|
110 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
111 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
112 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
113 |
+
```
|
114 |
+
- Reconfigures AWS credentials for Docker ECR authentication and resource management.
|
115 |
+
|
116 |
+
4. **Log in to Amazon ECR**:
|
117 |
+
```yaml
|
118 |
+
- name: Log in to Amazon ECR
|
119 |
+
id: login-ecr
|
120 |
+
uses: aws-actions/amazon-ecr-login@v2
|
121 |
+
```
|
122 |
+
- Logs into Amazon ECR for pushing and pulling Docker images.
|
123 |
+
|
124 |
+
5. **Create `.env` File**:
|
125 |
+
```yaml
|
126 |
+
- name: Create .env file
|
127 |
+
run: |
|
128 |
+
echo "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}" >> .env
|
129 |
+
echo "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >> .env
|
130 |
+
echo "AWS_REGION=${{ secrets.AWS_REGION }}" >> .env
|
131 |
+
```
|
132 |
+
- Generates a `.env` file for the application with AWS credentials and region.
|
133 |
+
|
134 |
+
6. **Run Docker Compose for Train and Eval Services**:
|
135 |
+
```yaml
|
136 |
+
- name: Run Docker Compose for train and eval service
|
137 |
+
run: |
|
138 |
+
docker-compose build
|
139 |
+
docker-compose up --build
|
140 |
+
docker-compose logs --follow
|
141 |
+
docker-compose down --remove-orphans
|
142 |
+
```
|
143 |
+
- **Build**: Builds all services defined in the `docker-compose.yaml` file.
|
144 |
+
- **Up**: Runs all services, including training and evaluation.
|
145 |
+
- **Logs**: Outputs logs for debugging purposes.
|
146 |
+
- **Down**: Stops all services and removes orphaned containers.
|
147 |
+
|
148 |
+
7. **Build, Tag, and Push Docker Image to Amazon ECR**:
|
149 |
+
```yaml
|
150 |
+
- name: Build, tag, and push Docker image to Amazon ECR
|
151 |
+
env:
|
152 |
+
REGISTRY: ${{ steps.login-ecr.outputs.registry }}
|
153 |
+
REPOSITORY: soutrik71/mnist
|
154 |
+
IMAGE_TAG: ${{ github.sha }}
|
155 |
+
run: |
|
156 |
+
docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
|
157 |
+
docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
|
158 |
+
docker tag $REGISTRY/$REPOSITORY:$IMAGE_TAG $REGISTRY/$REPOSITORY:latest
|
159 |
+
docker push $REGISTRY/$REPOSITORY:latest
|
160 |
+
```
|
161 |
+
- **Build**: Creates a Docker image with the repository and tag.
|
162 |
+
- **Push**: Pushes the image to Amazon ECR.
|
163 |
+
- **Tag**: Updates the `latest` tag.
|
164 |
+
|
165 |
+
8. **Pull and Verify Docker Image from ECR**:
|
166 |
+
```yaml
|
167 |
+
- name: Pull Docker image from ECR and verify
|
168 |
+
env:
|
169 |
+
REGISTRY: ${{ steps.login-ecr.outputs.registry }}
|
170 |
+
REPOSITORY: soutrik71/mnist
|
171 |
+
IMAGE_TAG: ${{ github.sha }}
|
172 |
+
run: |
|
173 |
+
docker pull $REGISTRY/$REPOSITORY:$IMAGE_TAG
|
174 |
+
docker images | grep "$REGISTRY/$REPOSITORY"
|
175 |
+
```
|
176 |
+
- **Pull**: Pulls the built image from ECR.
|
177 |
+
- **Verify**: Ensures the image exists locally.
|
178 |
+
|
179 |
+
9. **Clean Up Environment**:
|
180 |
+
```yaml
|
181 |
+
- name: Clean up environment
|
182 |
+
run: |
|
183 |
+
rm -f .env
|
184 |
+
docker system prune -af
|
185 |
+
```
|
186 |
+
- Deletes the `.env` file and removes unused Docker resources.
|
187 |
+
|
188 |
+
---
|
189 |
+
|
190 |
+
### 3. **stop-runner**
|
191 |
+
Stops and terminates the EC2 runner created in the `start-runner` job.
|
192 |
+
|
193 |
+
#### Dependencies:
|
194 |
+
```yaml
|
195 |
+
needs:
|
196 |
+
- start-runner
|
197 |
+
- deploy
|
198 |
+
```
|
199 |
+
|
200 |
+
#### Steps:
|
201 |
+
1. **Configure AWS Credentials**:
|
202 |
+
```yaml
|
203 |
+
- name: Configure AWS credentials
|
204 |
+
uses: aws-actions/configure-aws-credentials@v4
|
205 |
+
with:
|
206 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
207 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
208 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
209 |
+
```
|
210 |
+
|
211 |
+
2. **Stop EC2 Runner**:
|
212 |
+
```yaml
|
213 |
+
- name: Stop EC2 runner
|
214 |
+
uses: machulav/ec2-github-runner@v2
|
215 |
+
with:
|
216 |
+
mode: stop
|
217 |
+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
|
218 |
+
label: ${{ needs.start-runner.outputs.label }}
|
219 |
+
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
|
220 |
+
```
|
221 |
+
- Stops the EC2 runner instance created in the first job.
|
222 |
+
|
223 |
+
3. **Validate EC2 Termination**:
|
224 |
+
```yaml
|
225 |
+
- name: Validate EC2 termination
|
226 |
+
run: aws ec2 describe-instances --instance-ids ${{ needs.start-runner.outputs.ec2-instance-id }}
|
227 |
+
```
|
228 |
+
- Ensures the EC2 instance has been properly terminated.
|
229 |
+
|
230 |
+
---
|
231 |
+
|
232 |
+
### Key Highlights
|
233 |
+
1. **Sequential Execution**:
|
234 |
+
- The `start-runner`, `deploy`, and `stop-runner` jobs are executed sequentially.
|
235 |
+
|
236 |
+
2. **Error Handling**:
|
237 |
+
- The `stop-runner` job runs even if previous jobs fail (`if: ${{ always() }}`).
|
238 |
+
|
239 |
+
3. **Efficiency**:
|
240 |
+
- Docker layer caching speeds up builds.
|
241 |
+
- Cleanup steps maintain a clean environment.
|
242 |
+
|
243 |
+
4. **Security**:
|
244 |
+
- Secrets are masked and removed after use.
|
245 |
+
- Proper resource cleanup ensures cost efficiency.
|
246 |
+
|
247 |
+
---
|
248 |
+
|
249 |
+
This pipeline ensures robust deployment with error handling, logging, and cleanup mechanisms. So far we have discussed the GitHub Actions pipeline: its basic structure and the steps involved.
|
250 |
+
Next we will have an interdependent pipeline where the output of one job will be used as input for the next job.
|
251 |
+
|
252 |
+
---
|
253 |
+
## Advanced Pipeline: Key Features
|
254 |
+
* Sequential Flow: Each job has clear dependencies, ensuring no step runs out of order.
|
255 |
+
* Code Checkout: Explicit repository checkout in each job ensures consistent source code.
|
256 |
+
* Secure Credential Handling: Sensitive credentials are masked and stored securely.
|
257 |
+
* Resource Cleanup: Includes Docker clean-up and EC2 instance termination validation.
|
258 |
+
* Logging: Added detailed logs to improve debugging and monitoring.
|
259 |
+
|
260 |
+
|
261 |
+
Step 1: Start EC2 Runner
|
262 |
+
Purpose: Initializes a self-hosted EC2 runner for running subsequent jobs.
|
263 |
+
Key Actions:
|
264 |
+
Configures AWS credentials.
|
265 |
+
Launches an EC2 instance using specified AMI, instance type, and networking configurations.
|
266 |
+
Outputs the runner label and instance ID for downstream jobs.
|
267 |
+
Step 2: Test PyTorch Code Using Docker Compose
|
268 |
+
Purpose: Tests the PyTorch training and evaluation services.
|
269 |
+
Key Actions:
|
270 |
+
Checks out the repository.
|
271 |
+
Sets up Docker Buildx for advanced build capabilities.
|
272 |
+
Configures AWS credentials and creates a masked .env file for secure credential sharing.
|
273 |
+
Runs all services (train, eval) using Docker Compose, monitors logs, and cleans up containers.
|
274 |
+
Step 3: Build, Tag, and Push Docker Image
|
275 |
+
Purpose: Builds a Docker image, tags it, and pushes it to Amazon ECR after successful tests.
|
276 |
+
Key Actions:
|
277 |
+
Checks out the repository again to ensure consistency.
|
278 |
+
Logs into Amazon ECR using AWS credentials.
|
279 |
+
Builds and tags the Docker image with latest and SHA-based tags.
|
280 |
+
Pushes the image to Amazon ECR and verifies by pulling it back.
|
281 |
+
Step 4: Stop and Delete EC2 Runner
|
282 |
+
Purpose: Stops and terminates the EC2 instance to ensure cost efficiency and cleanup.
|
283 |
+
Key Actions:
|
284 |
+
Configures AWS credentials.
|
285 |
+
Stops the EC2 instance using the label and instance ID from start-runner.
|
286 |
+
Validates the termination state of the EC2 instance to ensure proper cleanup.
|
app.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
import torchvision.transforms as transforms
|
4 |
+
from PIL import Image
|
5 |
+
from pathlib import Path
|
6 |
+
from loguru import logger
|
7 |
+
from src.model import LitEfficientNet
|
8 |
+
from src.utils.aws_s3_services import S3Handler
|
9 |
+
|
10 |
+
# Configure Loguru for logging
|
11 |
+
logger.add("logs/inference.log", rotation="1 MB", level="INFO")
|
12 |
+
|
13 |
+
|
14 |
+
class MNISTClassifier:
    """Serve single-image MNIST predictions from a ``LitEfficientNet`` checkpoint.

    On construction the checkpoint is loaded onto CUDA when available
    (otherwise CPU), the model is put in eval mode, and the preprocessing
    pipeline (resize to 28x28, convert to tensor, normalize with mean 0.5 and
    std 0.5) is built.
    """

    def __init__(self, checkpoint_path="./checkpoints/best_model.ckpt"):
        """
        Load the model and build the preprocessing pipeline.

        Args:
            checkpoint_path (str): Path to the Lightning checkpoint file.

        Raises:
            FileNotFoundError: If ``checkpoint_path`` does not exist.
        """
        self.checkpoint_path = checkpoint_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Inference will run on device: {self.device}")

        # Load the model and switch off dropout / batch-norm updates.
        self.model = self.load_model()
        self.model.eval()

        # Define transforms
        self.transform = transforms.Compose(
            [
                transforms.Resize((28, 28)),
                transforms.ToTensor(),
                transforms.Normalize((0.5,), (0.5,)),
            ]
        )
        self.labels = [str(i) for i in range(10)]  # MNIST labels are 0-9

    def load_model(self):
        """
        Loads the model checkpoint for inference.

        Returns:
            LitEfficientNet: The restored model, moved to ``self.device``.

        Raises:
            FileNotFoundError: If the checkpoint file is missing.
        """
        if not Path(self.checkpoint_path).exists():
            logger.error(f"Checkpoint not found: {self.checkpoint_path}")
            raise FileNotFoundError(f"Checkpoint not found: {self.checkpoint_path}")

        logger.info(f"Loading model from checkpoint: {self.checkpoint_path}")
        # map_location makes the target device explicit, so a checkpoint saved
        # on a GPU machine is deserialized directly onto the inference device.
        model = LitEfficientNet.load_from_checkpoint(
            self.checkpoint_path, map_location=self.device
        )
        return model.to(self.device)

    @torch.no_grad()
    def predict(self, image):
        """
        Perform inference on a single image.

        Args:
            image: Input image in PIL format. Images that are not already
                single-channel ("L" mode) are converted to grayscale first.

        Returns:
            dict: Predicted class probabilities keyed by digit label, or
            ``None`` when no image is provided.
        """
        if image is None:
            logger.error("No image provided for prediction.")
            return None

        # The preprocessing and model expect one channel; RGB/RGBA uploads
        # would otherwise produce a 3- or 4-channel tensor and fail.
        if image.mode != "L":
            image = image.convert("L")

        # Convert to tensor, preprocess, and add the batch dimension.
        img_tensor = self.transform(image).unsqueeze(0).to(self.device)

        # Perform inference and turn the logits of the single sample into
        # probabilities.
        output = self.model(img_tensor)
        probabilities = torch.nn.functional.softmax(output[0], dim=0)

        # Map probabilities to labels
        return dict(zip(self.labels, probabilities.tolist()))
|
71 |
+
|
72 |
+
|
73 |
+
# Instantiate the classifier
|
74 |
+
checkpoint_path = "./checkpoints/best_model.ckpt"
|
75 |
+
|
76 |
+
# Download checkpoint from S3 (if needed)
|
77 |
+
s3_handler = S3Handler(bucket_name="deep-bucket-s3")
|
78 |
+
s3_handler.download_folder(
|
79 |
+
"checkpoints_test",
|
80 |
+
"checkpoints",
|
81 |
+
)
|
82 |
+
|
83 |
+
classifier = MNISTClassifier(checkpoint_path=checkpoint_path)
|
84 |
+
|
85 |
+
# Define Gradio interface
|
86 |
+
demo = gr.Interface(
|
87 |
+
fn=classifier.predict,
|
88 |
+
inputs=gr.Image(height=160, width=160, image_mode="L", type="pil"),
|
89 |
+
outputs=gr.Label(num_top_classes=1),
|
90 |
+
title="MNIST Classifier",
|
91 |
+
description="Upload a handwritten digit image to classify it (0-9).",
|
92 |
+
)
|
93 |
+
|
94 |
+
if __name__ == "__main__":
|
95 |
+
demo.launch(share=True)
|
docker-compose.yaml
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
services:
|
2 |
+
train:
|
3 |
+
build:
|
4 |
+
context: .
|
5 |
+
command: |
|
6 |
+
python -m src.train && \
|
7 |
+
touch ./checkpoints/train_done.flag
|
8 |
+
volumes:
|
9 |
+
- ./data:/app/data
|
10 |
+
- ./checkpoints:/app/checkpoints
|
11 |
+
- ./logs:/app/logs
|
12 |
+
environment:
|
13 |
+
- PYTHONUNBUFFERED=1
|
14 |
+
- PYTHONPATH=/app
|
15 |
+
- NUM_WORKERS=4 # Set the number of workers
|
16 |
+
shm_size: '4g'
|
17 |
+
deploy:
|
18 |
+
resources:
|
19 |
+
limits:
|
20 |
+
memory: 8g # Limit to 8GB RAM
|
21 |
+
cpus: '4.0' # Use up to 4 CPU cores
|
22 |
+
reservations:
|
23 |
+
memory: 6g # Reserve 6GB RAM
|
24 |
+
cpus: '4.0' # Reserve 4 CPU cores
|
25 |
+
networks:
|
26 |
+
- default
|
27 |
+
env_file:
|
28 |
+
- .env
|
29 |
+
|
30 |
+
eval:
|
31 |
+
build:
|
32 |
+
context: .
|
33 |
+
command: |
|
34 |
+
sh -c 'while [ ! -f /app/checkpoints/train_done.flag ]; do sleep 10; done && python -m src.test'
|
35 |
+
volumes:
|
36 |
+
- ./data:/app/data
|
37 |
+
- ./checkpoints:/app/checkpoints
|
38 |
+
- ./logs:/app/logs
|
39 |
+
environment:
|
40 |
+
- PYTHONUNBUFFERED=1
|
41 |
+
- PYTHONPATH=/app
|
42 |
+
- NUM_WORKERS=2 # Set the number of workers
|
43 |
+
shm_size: '4g'
|
44 |
+
deploy:
|
45 |
+
resources:
|
46 |
+
limits:
|
47 |
+
memory: 4g # Limit to 4GB RAM
|
48 |
+
cpus: '4.0' # Use up to 4 CPU core
|
49 |
+
reservations:
|
50 |
+
memory: 2g # Reserve 2GB RAM
|
51 |
+
cpus: '2' # Reserve 2 CPU core
|
52 |
+
networks:
|
53 |
+
- default
|
54 |
+
env_file:
|
55 |
+
- .env
|
56 |
+
|
57 |
+
volumes:
|
58 |
+
data:
|
59 |
+
checkpoints:
|
60 |
+
logs:
|
61 |
+
|
62 |
+
networks:
|
63 |
+
default:
|
main.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
print(os.getcwd())
|
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.poetry]
|
2 |
+
name = "pytorch_fastapi_project"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = "Consolidated PyTorch and FastAPI project for AWS deployment and GHA testing"
|
5 |
+
authors = ["soutrik71 <soutrik.chowdhury@ab-inbev.com>"]
|
6 |
+
license = "Apache-2.0"
|
7 |
+
readme = "README.md"
|
8 |
+
|
9 |
+
[tool.poetry.dependencies]
|
10 |
+
python = "3.10.15"
|
11 |
+
black = "24.8.0"
|
12 |
+
coverage = ">=7.6.1"
|
13 |
+
hydra-colorlog = "1.2.0"
|
14 |
+
hydra-core = "1.3.2"
|
15 |
+
lightning = {version = "2.4.0", extras = ["extra"]}
|
16 |
+
loguru = "0.7.2"
|
17 |
+
pytest = "^8.3.3"
|
18 |
+
rich = "13.8.1"
|
19 |
+
rootutils = "1.0.7"
|
20 |
+
tensorboard = "2.17.1"
|
21 |
+
timm = "1.0.9"
|
22 |
+
pandas = "^2.2.3"
|
23 |
+
numpy = "^1.26.0"
|
24 |
+
ruff = "*"
|
25 |
+
torch = {version = "^2.4.1+cpu", source = "pytorch_cpu"}
|
26 |
+
torchvision = {version = "^0.19.1+cpu", source = "pytorch_cpu"}
|
27 |
+
seaborn = "^0.13.2"
|
28 |
+
pydantic = "^2.9.2"
|
29 |
+
kaggle = "^1.6.17"
|
30 |
+
pytest-cov = "^5.0.0"
|
31 |
+
pytest-mock = "^3.14.0"
|
32 |
+
flake8 = "^7.1.1"
|
33 |
+
dvc-gdrive = "^3.0.1"
|
34 |
+
dvc-azure = "^3.1.0"
|
35 |
+
transformers = "^4.45.2"
|
36 |
+
fastapi = "^0.115.4"
|
37 |
+
pydantic-settings = "^2.6.1"
|
38 |
+
uvicorn = "^0.32.0"
|
39 |
+
tenacity = "^9.0.0"
|
40 |
+
gunicorn = "^23.0.0"
|
41 |
+
aim = "^3.25.0"
|
42 |
+
mlflow = "^2.17.1"
|
43 |
+
hydra-optuna-sweeper = "^1.2.0"
|
44 |
+
dvc = "^3.56.0"
|
45 |
+
platformdirs = "3.10"
|
46 |
+
fastapi-utils = "^0.7.0"
|
47 |
+
httpx = "^0.27.2"
|
48 |
+
typing-inspect = "^0.9.0"
|
49 |
+
requests = "^2.32.3"
|
50 |
+
fastapi-restful = {extras = ["all"], version = "^0.6.0"}
|
51 |
+
aioredis = "^2.0.1"
|
52 |
+
psycopg2-binary = "^2.9.10"
|
53 |
+
asyncpg = "^0.30.0"
|
54 |
+
confluent-kafka = "^2.6.0"
|
55 |
+
aiokafka = "^0.12.0"
|
56 |
+
azure-servicebus = "^7.12.3"
|
57 |
+
aiohttp = "^3.10.10"
|
58 |
+
aiofiles = "*"
|
59 |
+
aiologger = "^0.7.0"
|
60 |
+
pyyaml = "^6.0.2"
|
61 |
+
sqlalchemy-utils = "^0.41.2"
|
62 |
+
sqlalchemy = "^2.0.36"
|
63 |
+
alembic = "^1.13.3"
|
64 |
+
fastapi-limiter = "^0.1.6"
|
65 |
+
redis = "5.0.8"
|
66 |
+
redisearch = "2.0.0"
|
67 |
+
python-multipart = "*"
|
68 |
+
python-dotenv = "^1.0.1"
|
69 |
+
celery = "^5.4.0"
|
70 |
+
fastapi-cache2 = "^0.2.2"
|
71 |
+
aiocache = "^0.12.3"
|
72 |
+
dvc-s3 = "^3.2.0"
|
73 |
+
litserve = "^0.2.4"
|
74 |
+
gpustat = "^1.1.1"
|
75 |
+
nvitop = "^1.3.2"
|
76 |
+
pyopenssl = "^23.0.0"
|
77 |
+
cryptography = "^41.0.0"
|
78 |
+
accelerate = "^1.1.1"
|
79 |
+
gradio="5.7.1"
|
80 |
+
|
81 |
+
[tool.poetry.dev-dependencies]
|
82 |
+
pytest-asyncio = "^0.20.3"
|
83 |
+
|
84 |
+
[[tool.poetry.source]]
|
85 |
+
name = "pytorch_cpu"
|
86 |
+
url = "https://download.pytorch.org/whl/cpu"
|
87 |
+
priority = "explicit"
|
88 |
+
|
89 |
+
[build-system]
|
90 |
+
requires = ["poetry-core"]
|
91 |
+
build-backend = "poetry.core.masonry.api"
|
requirements.txt
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch==2.4.1
|
2 |
+
torchvision==0.19.1
|
3 |
+
hydra-colorlog==1.2.0
|
4 |
+
hydra-core==1.3.2
|
5 |
+
lightning[extra]==2.4.0
|
6 |
+
loguru==0.7.2
|
7 |
+
rich==13.8.1
|
8 |
+
rootutils==1.0.7
|
9 |
+
tensorboard==2.17.1
|
10 |
+
timm==1.0.9
|
11 |
+
pandas>=2.2.3
|
12 |
+
numpy>=1.26.0
|
13 |
+
transformers>=4.45.2
|
14 |
+
aim>=3.25.0
|
15 |
+
mlflow>=2.17.1
|
16 |
+
hydra-optuna-sweeper>=1.2.0
|
17 |
+
aiologger>=0.7.0
|
18 |
+
pyyaml>=6.0.2
|
19 |
+
dvc-s3>=3.2.0
|
20 |
+
litserve>=0.2.4
|
21 |
+
gpustat>=1.1.1
|
22 |
+
nvitop>=1.3.2
|
23 |
+
gradio==5.7.1
|
24 |
+
gradio-client>=1.5.0
|
25 |
+
accelerate>=1.1.1
|
26 |
+
cryptography>=44.0.0
|
27 |
+
boto3
|
28 |
+
pyopenssl>=24.3.0
|
src/__init__.py
ADDED
File without changes
|
src/dataloader.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from loguru import logger
|
2 |
+
import torch
|
3 |
+
from torch.utils.data import DataLoader, Subset
|
4 |
+
from torchvision import datasets, transforms
|
5 |
+
import lightning as pl
|
6 |
+
from typing import Optional
|
7 |
+
from multiprocessing import cpu_count
|
8 |
+
from sklearn.model_selection import train_test_split
|
9 |
+
|
10 |
+
# Configure Loguru to save logs to the logs/ directory
|
11 |
+
logger.add("logs/dataloader.log", rotation="1 MB", level="INFO")
|
12 |
+
|
13 |
+
|
14 |
+
class MNISTDataModule(pl.LightningDataModule):
    """LightningDataModule that serves MNIST train/validation/test loaders.

    Training uses a random subset of the official training split; validation
    and test both use the official MNIST test split.
    """

    def __init__(
        self,
        batch_size: int = 64,
        data_dir: str = "./data",
        num_workers: int = int(cpu_count()),
        train_subset_fraction: float = 0.25,  # Fraction of training data to use
    ):
        """
        Initializes the MNIST Data Module with configurations for dataloaders.

        Args:
            batch_size (int): Batch size for training, validation, and testing.
            data_dir (str): Directory to download and store the dataset.
            num_workers (int): Number of workers for data loading. The default
                is the CPU count, evaluated once when this module is imported.
            train_subset_fraction (float): Fraction of training data to use (0.0 < fraction <= 1.0).
        """
        super().__init__()
        self.batch_size = batch_size
        self.data_dir = data_dir
        self.num_workers = num_workers
        self.train_subset_fraction = train_subset_fraction
        self.transform = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
        )
        logger.info(f"MNIST DataModule initialized with batch size {self.batch_size}")

    def prepare_data(self):
        """
        Downloads the MNIST dataset if not already downloaded.
        """
        datasets.MNIST(root=self.data_dir, train=True, download=True)
        datasets.MNIST(root=self.data_dir, train=False, download=True)
        logger.info("MNIST dataset downloaded.")

    def setup(self, stage: Optional[str] = None):
        """
        Set up the dataset for different stages.

        Args:
            stage (str, optional): One of "fit", "validate", "test", or "predict".
                ``None`` prepares the train, validation, and test datasets.
        """
        logger.info(f"Setting up data for stage: {stage}")

        if stage == "fit" or stage is None:
            full_train_dataset = datasets.MNIST(
                root=self.data_dir, train=True, transform=self.transform
            )
            # train_test_split rejects a float train_size of 1.0, although the
            # documented range includes it, so use the whole split directly.
            use_full_split = (
                isinstance(self.train_subset_fraction, float)
                and self.train_subset_fraction == 1.0
            )
            if use_full_split:
                self.mnist_train = full_train_dataset
            else:
                train_indices, _ = train_test_split(
                    range(len(full_train_dataset)),
                    train_size=self.train_subset_fraction,
                    random_state=42,  # fixed seed keeps the subset reproducible
                )
                self.mnist_train = Subset(full_train_dataset, train_indices)
            logger.info(f"Loaded training subset: {len(self.mnist_train)} samples.")

        # The validation set is also needed for the standalone "validate"
        # stage, not only during "fit".
        if stage in ("fit", "validate") or stage is None:
            self.mnist_val = datasets.MNIST(
                root=self.data_dir, train=False, transform=self.transform
            )
            logger.info(f"Loaded validation data: {len(self.mnist_val)} samples.")

        if stage == "test" or stage is None:
            self.mnist_test = datasets.MNIST(
                root=self.data_dir, train=False, transform=self.transform
            )
            logger.info(f"Loaded test data: {len(self.mnist_test)} samples.")

    def train_dataloader(self) -> DataLoader:
        """
        Returns the training DataLoader.

        Returns:
            DataLoader: Training data loader (shuffled).
        """
        logger.info("Creating training DataLoader...")
        return DataLoader(
            self.mnist_train,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
        )

    def val_dataloader(self) -> DataLoader:
        """
        Returns the validation DataLoader.

        Returns:
            DataLoader: Validation data loader (not shuffled).
        """
        logger.info("Creating validation DataLoader...")
        return DataLoader(
            self.mnist_val,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
        )

    def test_dataloader(self) -> DataLoader:
        """
        Returns the test DataLoader.

        Returns:
            DataLoader: Test data loader (not shuffled).
        """
        logger.info("Creating test DataLoader...")
        return DataLoader(
            self.mnist_test,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
        )
|
src/model.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import lightning as pl
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch
|
4 |
+
from timm import create_model
|
5 |
+
from torchmetrics.classification import Accuracy
|
6 |
+
from torch.optim.lr_scheduler import StepLR
|
7 |
+
import torch.optim as optim
|
8 |
+
from loguru import logger
|
9 |
+
|
10 |
+
logger.add("logs/model.log", rotation="1 MB", level="INFO")
|
11 |
+
|
12 |
+
|
13 |
+
class LitEfficientNet(pl.LightningModule):
    """LightningModule wrapping a single-channel TIMM classifier with accuracy metrics."""

    def __init__(
        self,
        model_name="tf_efficientnet_lite0",
        num_classes=10,
        lr=1e-3,
        custom_loss=None,
        pretrained=True,
    ):
        """
        Initializes a CNN model from TIMM and integrates TorchMetrics.

        Args:
            model_name (str): TIMM model name (e.g., "tf_efficientnet_lite0").
            num_classes (int): Number of output classes (e.g., 0–9 for MNIST).
            lr (float): Learning rate for the optimizer.
            custom_loss (callable, optional): Custom loss function. Defaults to CrossEntropyLoss.
            pretrained (bool): Forwarded to ``timm.create_model``. Defaults to
                True (the previous hard-coded value). Pass False to skip
                fetching pretrained backbone weights, e.g. when the weights
                are about to be restored from a checkpoint.
        """
        super().__init__()

        self.lr = lr
        self.model = create_model(
            model_name,
            pretrained=pretrained,
            num_classes=num_classes,
            in_chans=1,  # Set to 1 channel for grayscale input
        )
        # Compare against None explicitly so that a callable which happens to
        # be falsy (e.g. an empty container module) is not silently replaced.
        self.loss_fn = custom_loss if custom_loss is not None else nn.CrossEntropyLoss()
        # Separate metric objects per stage so their running state never mixes.
        self.train_acc = Accuracy(num_classes=num_classes, task="multiclass")
        self.val_acc = Accuracy(num_classes=num_classes, task="multiclass")
        self.test_acc = Accuracy(num_classes=num_classes, task="multiclass")
        logger.info(f"Model initialized with TIMM backbone: {model_name}")
        logger.info(f"Number of output classes: {num_classes}")

    def forward(self, x):
        """
        Forward pass of the model.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Model predictions.
        """
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """
        Compute the loss for one training batch and log loss and accuracy.

        Args:
            batch: Tuple of ``(inputs, targets)``.
            batch_idx (int): Index of the batch (unused).

        Returns:
            torch.Tensor: The training loss.
        """
        x, y = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat, y)
        self.train_acc.update(y_hat, y)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        self.log("train_acc", self.train_acc, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """
        Evaluate one validation batch and log loss and accuracy.

        Args:
            batch: Tuple of ``(inputs, targets)``.
            batch_idx (int): Index of the batch (unused).
        """
        x, y = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat, y)
        self.val_acc.update(y_hat, y)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        self.log("val_acc", self.val_acc, prog_bar=True, logger=True)

    def test_step(self, batch, batch_idx):
        """
        Evaluate one test batch and log accuracy.

        Args:
            batch: Tuple of ``(inputs, targets)``.
            batch_idx (int): Index of the batch (unused).
        """
        x, y = batch
        y_hat = self(x)
        self.test_acc.update(y_hat, y)
        self.log("test_acc", self.test_acc, prog_bar=True, logger=True)

    def configure_optimizers(self):
        """
        Build the Adam optimizer and a StepLR scheduler (step_size=1, gamma=0.9).

        Returns:
            tuple: ``([optimizer], [scheduler])``.
        """
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        scheduler = StepLR(optimizer, step_size=1, gamma=0.9)
        logger.info(f"Optimizer: Adam, Learning Rate: {self.lr}")
        logger.info("Scheduler: StepLR with step_size=1 and gamma=0.9")
        return [optimizer], [scheduler]
|
src/test.py
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from loguru import logger
|
3 |
+
from src.model import LitEfficientNet
|
4 |
+
from src.dataloader import MNISTDataModule
|
5 |
+
from torchmetrics.classification import Accuracy
|
6 |
+
from pathlib import Path
|
7 |
+
from src.utils.aws_s3_services import S3Handler
|
8 |
+
|
9 |
+
# Configure Loguru to save logs to the logs/ directory
|
10 |
+
logger.add("logs/test.log", rotation="1 MB", level="INFO")
|
11 |
+
|
12 |
+
|
13 |
+
def infer(checkpoint_path, image):
    """
    Predict the class of one image with a model restored from a checkpoint.

    Args:
        checkpoint_path (str): Path to the model checkpoint file.
        image (torch.Tensor): Image tensor to classify. A 3-D tensor gets a
            leading batch dimension added; any other rank is passed through
            unchanged.

    Returns:
        int: Index of the highest-scoring class.

    Raises:
        FileNotFoundError: If ``checkpoint_path`` does not exist.
    """
    logger.info(f"Loading model from checkpoint: {checkpoint_path} for inference...")
    checkpoint_file = Path(checkpoint_path)
    if not checkpoint_file.exists():
        logger.error(f"Checkpoint not found: {checkpoint_path}")
        raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")

    # Prefer the GPU when one is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    logger.info(f"Inference will run on device: {device}")

    # Restore the network and switch it to evaluation mode.
    network = LitEfficientNet.load_from_checkpoint(checkpoint_path).to(device)
    network.eval()

    with torch.no_grad():
        # A single un-batched image needs a leading batch dimension.
        batch = image.unsqueeze(0) if image.dim() == 3 else image
        # The input must live on the same device as the model.
        batch = batch.to(device)
        scores = network(batch)
        predicted_class = torch.argmax(scores, dim=1).item()

    logger.info(f"Predicted class: {predicted_class}")
    return predicted_class
|
47 |
+
|
48 |
+
|
49 |
+
def test_model(checkpoint_path):
    """
    Evaluate a checkpointed model on the test split and log its accuracy.

    Args:
        checkpoint_path (str): Path to the model checkpoint file.

    Returns:
        float: Accuracy accumulated over every batch of the test dataloader.

    Raises:
        FileNotFoundError: If ``checkpoint_path`` does not exist.
    """
    logger.info(f"Loading model from checkpoint: {checkpoint_path} for testing...")
    checkpoint_file = Path(checkpoint_path)
    if not checkpoint_file.exists():
        logger.error(f"Checkpoint not found: {checkpoint_path}")
        raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")

    # Prefer the GPU when one is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    logger.info(f"Testing will run on device: {device}")

    # Restore the network and switch it to evaluation mode.
    network = LitEfficientNet.load_from_checkpoint(checkpoint_path).to(device)
    network.eval()

    # Prepare the test split through the project's data module.
    data_module = MNISTDataModule()
    data_module.setup(stage="test")
    test_loader = data_module.test_dataloader()

    # The metric lives on the same device as the model outputs.
    accuracy_metric = Accuracy(task="multiclass", num_classes=10).to(device)

    logger.info("Evaluating on test dataset...")
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            accuracy_metric.update(network(images), labels)

    accuracy = accuracy_metric.compute().item()
    logger.info(f"Final Test Accuracy (TorchMetrics): {accuracy:.2%}")
    return accuracy
|
93 |
+
|
94 |
+
|
95 |
+
if __name__ == "__main__":
    # Download the checkpoint folder from S3 into the local checkpoints/ directory.
    s3_handler = S3Handler(bucket_name="deep-bucket-s3")
    s3_handler.download_folder(
        "checkpoints_test",
        "checkpoints",
    )
    checkpoint_path = "./checkpoints/best_model.ckpt"
    try:
        # Evaluate the checkpoint on the test split.
        test_accuracy = test_model(checkpoint_path)
        logger.info(f"Test completed successfully with accuracy: {test_accuracy:.2%}")

        # Smoke-test single-image inference with a random tensor.
        logger.info("Running inference on a single test image...")
        dummy_image = torch.randn(1, 28, 28)  # Replace with actual test image
        predicted_class = infer(checkpoint_path, dummy_image)
        logger.info(f"Inference result: Predicted class {predicted_class}")
    except Exception as e:
        # Log the full traceback, then re-raise so the script exits with a
        # non-zero status instead of silently reporting success.
        logger.exception(f"An error occurred: {e}")
        raise
|
src/train.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import lightning as pl
|
2 |
+
from lightning.pytorch.callbacks import (
|
3 |
+
ModelCheckpoint,
|
4 |
+
EarlyStopping,
|
5 |
+
LearningRateMonitor,
|
6 |
+
RichProgressBar,
|
7 |
+
)
|
8 |
+
from lightning.pytorch.loggers import CSVLogger, TensorBoardLogger
|
9 |
+
from lightning.pytorch.callbacks import ModelSummary
|
10 |
+
from src.dataloader import MNISTDataModule
|
11 |
+
from src.model import LitEfficientNet
|
12 |
+
from loguru import logger
|
13 |
+
import os
|
14 |
+
from src.utils.aws_s3_services import S3Handler
|
15 |
+
|
16 |
+
# Ensure the logs directory exists
|
17 |
+
os.makedirs("logs", exist_ok=True)
|
18 |
+
|
19 |
+
# Configure Loguru for logging
|
20 |
+
logger.add("logs/training.log", rotation="1 MB", level="INFO")
|
21 |
+
|
22 |
+
|
23 |
+
def main():
    """
    Train, test and publish the MNIST EfficientNet classifier.

    Builds the data module and model, fits with checkpointing, early stopping
    and learning-rate monitoring, runs the test loop, writes a completion flag
    into ``checkpoints/`` and uploads that folder to S3.

    The trainer is created with ``accelerator="auto"`` and ``devices="auto"``,
    so hardware selection is delegated to Lightning rather than pinned to CPU.
    """
    # Data Module
    logger.info("Setting up data module...")
    data_module = MNISTDataModule(batch_size=256)

    # Model
    logger.info("Setting up model...")
    model = LitEfficientNet(model_name="tf_efficientnet_lite0", num_classes=10, lr=1e-3)
    logger.info(model)

    # Callbacks
    logger.info("Setting up callbacks...")
    checkpoint_callback = ModelCheckpoint(
        monitor="val_acc",
        dirpath="checkpoints/",
        filename="best_model",
        save_top_k=1,
        mode="max",
        auto_insert_metric_name=False,
        verbose=True,
        save_last=True,
        enable_version_counter=False,
    )
    # NOTE(review): with max_epochs=2 below, a patience of 5 looks unlikely to
    # ever trigger - confirm whether that is intended.
    early_stopping_callback = EarlyStopping(
        monitor="val_acc",
        patience=5,
        mode="max",
        verbose=True,
    )
    lr_monitor = LearningRateMonitor(logging_interval="epoch")  # Log learning rate
    rich_progress = RichProgressBar()
    # Show only the first level of model layers
    model_summary = ModelSummary(max_depth=1)

    # Loggers
    logger.info("Setting up loggers...")
    csv_logger = CSVLogger("logs/", name="mnist_csv")
    tb_logger = TensorBoardLogger("logs/", name="mnist_tb")

    # Trainer configuration (accelerator and devices chosen automatically)
    logger.info("Setting up trainer...")
    trainer = pl.Trainer(
        max_epochs=2,
        callbacks=[
            checkpoint_callback,
            early_stopping_callback,
            lr_monitor,
            rich_progress,
            model_summary,
        ],
        logger=[csv_logger, tb_logger],
        deterministic=True,
        accelerator="auto",
        devices="auto",
    )

    # Train the model
    logger.info("Training the model...")
    trainer.fit(model, datamodule=data_module)

    # Test the model
    logger.info("Testing the model...")
    data_module.setup(stage="test")
    trainer.test(model, datamodule=data_module)

    # Write checkpoints/train_done.flag; make sure the directory exists first so
    # the marker can be written even if no checkpoint file was produced.
    os.makedirs("checkpoints", exist_ok=True)
    with open("checkpoints/train_done.flag", "w") as f:
        f.write("Training done.")

    # Upload checkpoints to S3
    s3_handler = S3Handler(bucket_name="deep-bucket-s3")
    s3_handler.upload_folder(
        "checkpoints",
        "checkpoints_test",
    )


if __name__ == "__main__":
    main()
|
src/utils/aws_s3_services.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import boto3
|
2 |
+
import os
|
3 |
+
from pathlib import Path
|
4 |
+
from dotenv import load_dotenv, find_dotenv
|
5 |
+
|
6 |
+
# Load environment variables from .env file
|
7 |
+
load_dotenv(find_dotenv(".env"))
|
8 |
+
|
9 |
+
|
10 |
+
class S3Handler:
    """Copy whole folders between the local filesystem and a single S3 bucket."""

    def __init__(self, bucket_name):
        """
        Create an S3 client for ``bucket_name``.

        Credentials and region are read from the ``AWS_ACCESS_KEY_ID``,
        ``AWS_SECRET_ACCESS_KEY`` and ``AWS_REGION`` environment variables.

        Args:
            bucket_name (str): Name of the bucket every operation targets.
        """
        self.bucket_name = bucket_name
        self.s3 = boto3.client(
            "s3",
            aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
            region_name=os.getenv("AWS_REGION"),
        )

    def upload_folder(self, source_folder, dest_folder, filenames=None):
        """
        Upload specified files or all files from a local folder to an S3 folder.

        Args:
            source_folder (str): Local source folder path.
            dest_folder (str): Destination folder path in S3. A trailing ``/``
                is ignored; an empty value uploads to the bucket root.
            filenames (list): Filenames to upload (relative to source_folder).
                If None or empty, every file below source_folder is uploaded.
        """
        source_folder = Path(source_folder)
        key_prefix = str(dest_folder).rstrip("/")

        if filenames:
            files_to_upload = [source_folder / file for file in filenames]
        else:
            # rglob also yields directories; they are not uploadable and must
            # not be reported as missing files.
            files_to_upload = [p for p in source_folder.rglob("*") if p.is_file()]

        for file_path in files_to_upload:
            if not file_path.is_file():
                print(f"File not found: {file_path}")
                continue

            # S3 keys always use forward slashes, whatever the local OS uses.
            relative_key = file_path.relative_to(source_folder).as_posix()
            s3_path = f"{key_prefix}/{relative_key}" if key_prefix else relative_key
            self.s3.upload_file(str(file_path), self.bucket_name, s3_path)
            print(f"Uploaded: {file_path} to {s3_path}")

    def download_folder(self, s3_folder, dest_folder):
        """
        Download all files from an S3 folder to a local folder.

        Args:
            s3_folder (str): Source folder in S3. Only keys below
                ``<s3_folder>/`` are downloaded; an empty value selects the
                whole bucket.
            dest_folder (str): Local destination folder path.
        """
        dest_folder = Path(dest_folder).resolve()

        # End the prefix on "/" so that "ckpt" does not also match sibling
        # keys such as "ckpt_old/...".
        folder_name = s3_folder.rstrip("/")
        prefix = f"{folder_name}/" if folder_name else ""

        paginator = self.s3.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=self.bucket_name, Prefix=prefix):
            for obj in page.get("Contents", []):
                s3_path = obj["Key"]
                # Skip folder placeholder objects returned by S3.
                if s3_path.endswith("/"):
                    continue

                # Compute the path relative to the folder and its local target.
                relative_path = Path(s3_path[len(prefix):].lstrip("/"))
                local_path = dest_folder / relative_path

                # Create necessary local directories.
                local_path.parent.mkdir(parents=True, exist_ok=True)

                self.s3.download_file(self.bucket_name, s3_path, str(local_path))
                print(f"Downloaded: {s3_path} to {local_path}")
|
74 |
+
|
75 |
+
|
76 |
+
# Usage Example
|
77 |
+
if __name__ == "__main__":
    # Demo round trip: push the local checkpoints_test folder to the bucket,
    # then pull the same S3 folder back into it.
    demo_folder = "checkpoints_test"
    handler = S3Handler(bucket_name="deep-bucket-s3")

    # No filenames are given, so every file in the folder is uploaded.
    handler.upload_folder(demo_folder, demo_folder)

    # Download example
    handler.download_folder(demo_folder, demo_folder)
|