From ccfd686d1e60b02a6a3fc85507527fc9e8cd0edf Mon Sep 17 00:00:00 2001 From: Oisin Date: Wed, 12 Feb 2025 12:10:16 +0000 Subject: [PATCH 01/15] Updated fleet instance types and image ami --- aws/ref/create_fleet_config.json | 14 +++++++------- aws/ref/launch_template_config.json | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/aws/ref/create_fleet_config.json b/aws/ref/create_fleet_config.json index 6a00656..9b5ddc4 100644 --- a/aws/ref/create_fleet_config.json +++ b/aws/ref/create_fleet_config.json @@ -20,6 +20,9 @@ { "InstanceType": "g4ad.4xlarge" }, + { + "InstanceType": "g4ad.8xlarge" + }, { "InstanceType": "g4dn.xlarge" }, @@ -30,19 +33,16 @@ "InstanceType": "g4dn.4xlarge" }, { - "InstanceType": "g5.xlarge" - }, - { - "InstanceType": "g5.2xlarge" + "InstanceType": "g4dn.8xlarge" }, { - "InstanceType": "g6.xlarge" + "InstanceType": "g5.xlarge" }, { - "InstanceType": "g6.2xlarge" + "InstanceType": "g5.2xlarge" }, { - "InstanceType": "g6.4xlarge" + "InstanceType": "g5.4xlarge" } ] } diff --git a/aws/ref/launch_template_config.json b/aws/ref/launch_template_config.json index d6f15e8..f08eda3 100644 --- a/aws/ref/launch_template_config.json +++ b/aws/ref/launch_template_config.json @@ -20,8 +20,8 @@ "Groups":["sg-03864b806cd78ded3"] } ], - "ImageId": "ami-00385a401487aefa4", - "InstanceType": "t2.micro", + "ImageId": "ami-0000c3ca6a299ab94", + "InstanceType": "g4ad.xlarge", "KeyName": "kaggle", "Placement": { "AvailabilityZone": "eu-west-1a" From 38c9ee8c01130114861a44a2bf63915251985735 Mon Sep 17 00:00:00 2001 From: Oisin Date: Wed, 12 Feb 2025 14:13:40 +0000 Subject: [PATCH 02/15] Updated ami number --- aws/ref/launch_template_config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aws/ref/launch_template_config.json b/aws/ref/launch_template_config.json index f08eda3..87bfdaa 100644 --- a/aws/ref/launch_template_config.json +++ b/aws/ref/launch_template_config.json @@ -7,7 +7,7 @@ { "DeviceName": "/dev/xvda", 
"Ebs": { - "VolumeSize": 50, + "VolumeSize": 70, "VolumeType": "gp3" } } @@ -20,7 +20,7 @@ "Groups":["sg-03864b806cd78ded3"] } ], - "ImageId": "ami-0000c3ca6a299ab94", + "ImageId": "ami-000b13fcd5cd7b0f8", "InstanceType": "g4ad.xlarge", "KeyName": "kaggle", "Placement": { From 33944581ac4e278b328f99a0ccf672dbaaf9a10b Mon Sep 17 00:00:00 2001 From: Oisin Date: Wed, 12 Feb 2025 14:13:59 +0000 Subject: [PATCH 03/15] Added docker run examples --- exeDocker.cmd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/exeDocker.cmd b/exeDocker.cmd index a549d57..9a00bd5 100644 --- a/exeDocker.cmd +++ b/exeDocker.cmd @@ -13,6 +13,8 @@ call docker build --no-cache -t %DOCKER_IMAGE% . :: run docker container call docker run --name %DOCKER_CONTAINER_NAME% --shm-size=512m --publish 8888:8888 --volume E:\GitHub\CatClassifier\.creds:/home/ubuntu/CatClassifier/.creds --volume E:\GitHub\CatClassifier\report:/home/ubuntu/CatClassifier/report -it %DOCKER_IMAGE% +::call docker run --entrypoint sh --name %DOCKER_CONTAINER_NAME% ---shm-size=512m --publish 8888:8888 --volume E:\GitHub\CatClassifier\.creds:/home/ubuntu/CatClassifier/.creds --volume E:\GitHub\CatClassifier\report:/home/ubuntu/CatClassifier/report -it %DOCKER_IMAGE% +::call docker run -it --entrypoint bash --name cc --shm-size=512m --volume /home/ec2-user/.creds:/home/ubuntu/CatClassifier/.creds --rm oislen/cat-classifier:latest :: useful docker commands :: docker images From 9c7a07df1ada92ddd47acab7da32194f9eacf832 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 08:53:23 +0000 Subject: [PATCH 04/15] Reordered command to place --isFleet forward --- aws/exeRunEC2.cmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws/exeRunEC2.cmd b/aws/exeRunEC2.cmd index b21da7e..df4a31a 100644 --- a/aws/exeRunEC2.cmd +++ b/aws/exeRunEC2.cmd @@ -1 +1 @@ -call python prg_run_ec2_instance.py --launch --terminate --describe --isFleet \ No newline at end of file +call python prg_run_ec2_instance.py --isFleet 
--launch --terminate --describe \ No newline at end of file From 6af99e16aecc22609b4fa7869cc77f14d0379488 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 08:54:37 +0000 Subject: [PATCH 05/15] Replaced ec2-user with ubuntu. Updated removed sed -i replace and bash linux_docker_setup --- aws/exeSetUpEC2.cmd | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/aws/exeSetUpEC2.cmd b/aws/exeSetUpEC2.cmd index 65884e9..a268627 100644 --- a/aws/exeSetUpEC2.cmd +++ b/aws/exeSetUpEC2.cmd @@ -1,5 +1,6 @@ :: set EC2 login info -SET EC2_USER=ec2-user +::SET EC2_USER=ec2-user +SET EC2_USER=ubuntu SET EC2_PEM_FPATH="C:\Users\oisin\.aws\kaggle.pem" SET EC2_CREDS_FDIR=E:\GitHub\CatClassifier\.creds SET EC2_SETUP_FPATH=E:\GitHub\CatClassifier\aws\linux_docker_setup.sh @@ -17,5 +18,6 @@ call scp -i %EC2_PEM_FPATH% -r %EC2_CREDS_FDIR% %EC2_USER%@%EC2_DNS%:~/. call scp -i %EC2_PEM_FPATH% %EC2_SETUP_FPATH% %EC2_USER%@%EC2_DNS%:~/linux_docker_setup.sh call scp -i %EC2_PEM_FPATH% %EC2_EXTRACT_FPATH% %EC2_USER%@%EC2_DNS%:~/docker_extract_data.sh :: ssh to EC2 and run linux setp -call ssh -v -i %EC2_PEM_FPATH% %EC2_USER%@%EC2_DNS% "sed -i 's/\r$//' ~/linux_docker_setup.sh; bash ~/linux_docker_setup.sh" +call ssh -v -i %EC2_PEM_FPATH% %EC2_USER%@%EC2_DNS% +::call ssh -v -i %EC2_PEM_FPATH% %EC2_USER%@%EC2_DNS% "sed -i 's/\r$//' ~/linux_docker_setup.sh; bash ~/linux_docker_setup.sh" ENDLOCAL \ No newline at end of file From 4cef83eb7a11e13e1fa1c58c32c775603b42b868 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 08:55:04 +0000 Subject: [PATCH 06/15] Updated storage to update root to 100gb --- aws/ref/launch_template_config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aws/ref/launch_template_config.json b/aws/ref/launch_template_config.json index 87bfdaa..10f688b 100644 --- a/aws/ref/launch_template_config.json +++ b/aws/ref/launch_template_config.json @@ -5,9 +5,9 @@ "LaunchTemplateData": { "BlockDeviceMappings": [ { - 
"DeviceName": "/dev/xvda", + "DeviceName": "/dev/sda1", "Ebs": { - "VolumeSize": 70, + "VolumeSize": 100, "VolumeType": "gp3" } } From 78ad0c9148cf85e9362af7eefe84197da113047a Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 08:55:46 +0000 Subject: [PATCH 07/15] Removed g4ad instances, and add additional g4dn and g5 instances. --- aws/ref/create_fleet_config.json | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/aws/ref/create_fleet_config.json b/aws/ref/create_fleet_config.json index 9b5ddc4..d296070 100644 --- a/aws/ref/create_fleet_config.json +++ b/aws/ref/create_fleet_config.json @@ -11,18 +11,6 @@ "Version": "$Latest" }, "Overrides": [ - { - "InstanceType": "g4ad.xlarge" - }, - { - "InstanceType": "g4ad.2xlarge" - }, - { - "InstanceType": "g4ad.4xlarge" - }, - { - "InstanceType": "g4ad.8xlarge" - }, { "InstanceType": "g4dn.xlarge" }, @@ -35,6 +23,9 @@ { "InstanceType": "g4dn.8xlarge" }, + { + "InstanceType": "g4dn.12xlarge" + }, { "InstanceType": "g5.xlarge" }, @@ -43,6 +34,12 @@ }, { "InstanceType": "g5.4xlarge" + }, + { + "InstanceType": "g5.8xlarge" + }, + { + "InstanceType": "g5.12xlarge" } ] } From 8ab1079e6930ddf9f4d425e8dbbf83d18a72e6a7 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 10:28:35 +0000 Subject: [PATCH 08/15] Added comments for installing anaconda --- Dockerfile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Dockerfile b/Dockerfile index c48e678..9d1d3ed 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,6 +25,15 @@ COPY . 
/home/ubuntu/CatClassifier RUN mkdir /home/${user}/CatClassifier/data RUN mkdir /home/${user}/CatClassifier/model/checkpoints +## install anaconda +#RUN wget https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh +#RUN shasum -a 256 ~/Anaconda3-2024.10-1-Linux-x86_64.sh +#RUN bash ~/Anaconda3-2024.10-1-Linux-x86_64.sh -b -p /home/ubuntu/anaconda3 +#RUN export PATH="/home/ubuntu/anaconda3/bin:PATH" +#RUN conda init +#RUN source ~/.bashrc +# RUN conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch-nightly -c nvidia -y + # add deadsnakes ppa RUN apt-get install -y software-properties-common RUN add-apt-repository ppa:deadsnakes/ppa From a58879f9d97426b56f5feab0f4f1a07e697bf9fc Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 10:29:40 +0000 Subject: [PATCH 09/15] Added comments for installing cuda toolkit --- aws/linux_docker_setup.sh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/aws/linux_docker_setup.sh b/aws/linux_docker_setup.sh index 89c1a5c..8679658 100644 --- a/aws/linux_docker_setup.sh +++ b/aws/linux_docker_setup.sh @@ -3,7 +3,7 @@ # 2. make sure to increase volume in /dev/nvme0n1 (/dev/xvda) e.g. 
100gb # linux file formatting -# sudo yum install -y dos2unix +# sudo apt install -y dos2unix # dos2unix ./linux_docker_setup.sh #-- EC2 Spot Instance Checks --# @@ -14,6 +14,9 @@ df -h lscpu # calculate percentage of used memory free -m | awk 'FNR == 2 {print $3/($3+$4)*100}' +# check gpu status +nvidia-smi +# watch -n 0.5 nvidia-smi #-- Configure Permissions and Overcommit Settings --# @@ -54,11 +57,14 @@ sudo umount /tmp #-- Download Required Programmes --# # update os -sudo yum update -y +sudo apt update -y # install required base software -sudo yum install -y htop vim tmux dos2unix docker git +sudo apt install -y htop vim tmux dos2unix docker git # remove unneed dependencies -sudo yum autoremove +sudo apt autoremove +# install nvidia cuda toolkit +#sudo apt install nvidia-cuda-toolkit +# nvcc --version #-- Pull Git Repo --# From d1d7e4182f5c00220b30aa7212b786b764726863 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 11:17:45 +0000 Subject: [PATCH 10/15] Added command for installing torch for cuda 12.1 --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 9d1d3ed..ca88cd7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,7 +32,7 @@ RUN mkdir /home/${user}/CatClassifier/model/checkpoints #RUN export PATH="/home/ubuntu/anaconda3/bin:PATH" #RUN conda init #RUN source ~/.bashrc -# RUN conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch-nightly -c nvidia -y +#RUN conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch-nightly -c nvidia -y # add deadsnakes ppa RUN apt-get install -y software-properties-common @@ -42,6 +42,7 @@ RUN apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-venv RUN python3 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" RUN /opt/venv/bin/python3 -m pip install -v -r /home/ubuntu/CatClassifier/requirements.txt +#RUN /opt/venv/bin/python3# python3 -m pip install torch==2.5.1+cu121 torchaudio==2.5.1+cu121 
torchvision==0.20.1+cu121 WORKDIR /home/${user} ENTRYPOINT ["jupyter", "lab", "--ip=0.0.0.0", "--allow-root"] \ No newline at end of file From 7c7a1dcb8d47a75ff9c4fca866946bee20c65b70 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 11:18:08 +0000 Subject: [PATCH 11/15] Running apt-get --- aws/linux_docker_setup.sh | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/aws/linux_docker_setup.sh b/aws/linux_docker_setup.sh index 8679658..e0196ff 100644 --- a/aws/linux_docker_setup.sh +++ b/aws/linux_docker_setup.sh @@ -3,7 +3,7 @@ # 2. make sure to increase volume in /dev/nvme0n1 (/dev/xvda) e.g. 100gb # linux file formatting -# sudo apt install -y dos2unix +# sudo apt-get install -y dos2unix # dos2unix ./linux_docker_setup.sh #-- EC2 Spot Instance Checks --# @@ -57,14 +57,11 @@ sudo umount /tmp #-- Download Required Programmes --# # update os -sudo apt update -y +sudo apt-get update -y # install required base software -sudo apt install -y htop vim tmux dos2unix docker git +sudo apt-get install -y htop vim tmux dos2unix docker git # remove unneed dependencies -sudo apt autoremove -# install nvidia cuda toolkit -#sudo apt install nvidia-cuda-toolkit -# nvcc --version +sudo apt-get autoremove #-- Pull Git Repo --# From 8912843230f9bb9f708119d964e1ed7693466eec Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 11:18:33 +0000 Subject: [PATCH 12/15] Modified torch to install for cuda 12.1 --- requirements.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1d71990..942b78d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,8 @@ matplotlib==3.10.0 seaborn==0.13.2 kaggle==1.6.17 jupyterlab==4.3.5 -torch==2.6.0 -torchvision==0.21.0 +torch==2.5.1+cu121 +torchaudio==2.5.1+cu121 +torchvision==0.20.1+cu121 beartype==0.19.0 boto3==1.36.13 \ No newline at end of file From 5d323c4b004c4f8c1d422b873c4a1332a8045394 Mon Sep 17 00:00:00 2001 From: Oisin Date: 
Thu, 13 Feb 2025 12:09:29 +0000 Subject: [PATCH 13/15] Added global environment variable for controlling the use of gpu when using pytorch --- model/arch/classify_image_torch.py | 2 +- model/cons.py | 3 ++- model/prg_torch_model.py | 2 +- report/torch_analysis_results.ipynb | 4 ++-- report/torch_analysis_results.qmd | 2 +- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/model/arch/classify_image_torch.py b/model/arch/classify_image_torch.py index 1703ac5..b794307 100644 --- a/model/arch/classify_image_torch.py +++ b/model/arch/classify_image_torch.py @@ -33,7 +33,7 @@ from model.torch.CustomDataset import CustomDataset # device configuration -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu') torch_transforms = transforms.Compose([ transforms.Resize(size=[cons.IMAGE_WIDTH, cons.IMAGE_HEIGHT]) # resize the input image to a uniform size diff --git a/model/cons.py b/model/cons.py index 3104f6f..1c46796 100644 --- a/model/cons.py +++ b/model/cons.py @@ -78,4 +78,5 @@ shuffle = False # multiprocessing -num_workers = os.cpu_count() \ No newline at end of file +num_workers = os.environ.get("PARAM_NUM_WORKERS", os.cpu_count()) +check_gpu = os.environ.get("PARAM_CHECK_GPU", False) \ No newline at end of file diff --git a/model/prg_torch_model.py b/model/prg_torch_model.py index 2cc37d8..ab1f1db 100644 --- a/model/prg_torch_model.py +++ b/model/prg_torch_model.py @@ -35,7 +35,7 @@ num_epochs = cons.min_epochs if cons.FAST_RUN else cons.max_epochs # device configuration -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu') torch_transforms = transforms.Compose([ transforms.Resize(size=[cons.IMAGE_WIDTH, cons.IMAGE_HEIGHT]) # resize the input image to a uniform size diff --git a/report/torch_analysis_results.ipynb b/report/torch_analysis_results.ipynb 
index fca9ada..41ce24f 100644 --- a/report/torch_analysis_results.ipynb +++ b/report/torch_analysis_results.ipynb @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "504c7d94", "metadata": {}, "outputs": [ @@ -113,7 +113,7 @@ ], "source": [ "# device configuration\n", - "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu')\n", "# load trained torch model\n", "model = VGG16_pretrained(num_classes=2).to(device)\n", "model.load(input_fpath=cons.torch_model_pt_fpath)\n", diff --git a/report/torch_analysis_results.qmd b/report/torch_analysis_results.qmd index 28c8aab..c543592 100644 --- a/report/torch_analysis_results.qmd +++ b/report/torch_analysis_results.qmd @@ -43,7 +43,7 @@ A pre-trained VGG CNN model with 16 layers was trained using the processed image ```{python} # device configuration -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu') # load trained torch model model = VGG16_pretrained(num_classes=2).to(device) model.load(input_fpath=cons.torch_model_pt_fpath) From f165cbf80fe26b5ac26ce2f242bd73a2587506a6 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 12:23:47 +0000 Subject: [PATCH 14/15] Removed torch from requirements. 
Installing separately given cuda gpu dependency --- requirements.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 942b78d..d7f7308 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,8 +11,5 @@ matplotlib==3.10.0 seaborn==0.13.2 kaggle==1.6.17 jupyterlab==4.3.5 -torch==2.5.1+cu121 -torchaudio==2.5.1+cu121 -torchvision==0.20.1+cu121 beartype==0.19.0 boto3==1.36.13 \ No newline at end of file From 5c9b43dd3dbf2976722c71297f476d831d6a0c4c Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 12:24:41 +0000 Subject: [PATCH 15/15] Installing torch separately to requirements due to cuda gpu versioning dependency --- Dockerfile | 11 +---------- conda/catclass.cmd | 3 ++- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index ca88cd7..9f46c9e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,15 +25,6 @@ COPY . /home/ubuntu/CatClassifier RUN mkdir /home/${user}/CatClassifier/data RUN mkdir /home/${user}/CatClassifier/model/checkpoints -## install anaconda -#RUN wget https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh -#RUN shasum -a 256 ~/Anaconda3-2024.10-1-Linux-x86_64.sh -#RUN bash ~/Anaconda3-2024.10-1-Linux-x86_64.sh -b -p /home/ubuntu/anaconda3 -#RUN export PATH="/home/ubuntu/anaconda3/bin:PATH" -#RUN conda init -#RUN source ~/.bashrc -#RUN conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch-nightly -c nvidia -y - # add deadsnakes ppa RUN apt-get install -y software-properties-common RUN add-apt-repository ppa:deadsnakes/ppa @@ -42,7 +33,7 @@ RUN apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-venv RUN python3 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" RUN /opt/venv/bin/python3 -m pip install -v -r /home/ubuntu/CatClassifier/requirements.txt -#RUN /opt/venv/bin/python3# python3 -m pip install torch==2.5.1+cu121 torchaudio==2.5.1+cu121 torchvision==0.20.1+cu121 +RUN /opt/venv/bin/python3 -m pip install 
torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121 WORKDIR /home/${user} ENTRYPOINT ["jupyter", "lab", "--ip=0.0.0.0", "--allow-root"] \ No newline at end of file diff --git a/conda/catclass.cmd b/conda/catclass.cmd index e223f67..2de8a44 100644 --- a/conda/catclass.cmd +++ b/conda/catclass.cmd @@ -8,4 +8,5 @@ call conda activate catclass call conda update -n base conda --yes :: install relevant libraries -call pip install -v -r ..\requirements.txt \ No newline at end of file +call pip install -v -r ..\requirements.txt +call pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121 \ No newline at end of file