oislen · oislen · Feb 13, 2025 · Feb 12, 2025 · Feb 12, 2025 · Feb 12, 2025
diff --git a/Dockerfile b/Dockerfile
@@ -33,6 +33,7 @@ RUN apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-venv
 RUN python3 -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 RUN /opt/venv/bin/python3 -m pip install -v -r /home/ubuntu/CatClassifier/requirements.txt
+RUN /opt/venv/bin/python3 -m pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121
 
 WORKDIR /home/${user}
 ENTRYPOINT ["jupyter", "lab", "--ip=0.0.0.0", "--allow-root"]
diff --git a/aws/exeRunEC2.cmd b/aws/exeRunEC2.cmd
@@ -1 +1 @@
-call python prg_run_ec2_instance.py --launch --terminate --describe --isFleet
+call python prg_run_ec2_instance.py --isFleet --launch --terminate --describe
diff --git a/aws/exeSetUpEC2.cmd b/aws/exeSetUpEC2.cmd
@@ -1,5 +1,6 @@
 :: set EC2 login info
-SET EC2_USER=ec2-user
+::SET EC2_USER=ec2-user
+SET EC2_USER=ubuntu
 SET EC2_PEM_FPATH="C:\Users\oisin\.aws\kaggle.pem"
 SET EC2_CREDS_FDIR=E:\GitHub\CatClassifier\.creds
 SET EC2_SETUP_FPATH=E:\GitHub\CatClassifier\aws\linux_docker_setup.sh
@@ -17,5 +18,6 @@ call scp -i %EC2_PEM_FPATH% -r %EC2_CREDS_FDIR% %EC2_USER%@%EC2_DNS%:~/.
 call scp -i %EC2_PEM_FPATH% %EC2_SETUP_FPATH% %EC2_USER%@%EC2_DNS%:~/linux_docker_setup.sh
 call scp -i %EC2_PEM_FPATH% %EC2_EXTRACT_FPATH% %EC2_USER%@%EC2_DNS%:~/docker_extract_data.sh
 :: ssh to EC2 and run linux setup
-call ssh -v -i %EC2_PEM_FPATH% %EC2_USER%@%EC2_DNS% "sed -i 's/\r$//' ~/linux_docker_setup.sh; bash ~/linux_docker_setup.sh"
+call ssh -v -i %EC2_PEM_FPATH% %EC2_USER%@%EC2_DNS%
+::call ssh -v -i %EC2_PEM_FPATH% %EC2_USER%@%EC2_DNS% "sed -i 's/\r$//' ~/linux_docker_setup.sh; bash ~/linux_docker_setup.sh"
 ENDLOCAL
diff --git a/aws/linux_docker_setup.sh b/aws/linux_docker_setup.sh
@@ -3,7 +3,7 @@
 # 2. make sure to increase volume in /dev/nvme0n1 (/dev/xvda) e.g. 100gb
 
 # linux file formatting
-# sudo yum install -y dos2unix 
+# sudo apt-get install -y dos2unix
 # dos2unix ./linux_docker_setup.sh 
 
 #-- EC2 Spot Instance Checks --#
@@ -14,6 +14,9 @@ df -h
 lscpu
 # calculate percentage of used memory
 free -m | awk 'FNR == 2 {print $3/($3+$4)*100}'
+# check gpu status
+nvidia-smi
+# watch -n 0.5 nvidia-smi
 
 #-- Configure Permissions and Overcommit Settings --#
 
@@ -54,11 +57,11 @@ sudo umount /tmp
 #-- Download Required Programmes --#
 
 # update os
-sudo yum update -y
+sudo apt-get update -y
 # install required base software
-sudo yum install -y htop vim tmux dos2unix docker git
+sudo apt-get install -y htop vim tmux dos2unix docker git
 # remove unneeded dependencies
-sudo yum autoremove
+sudo apt-get autoremove
 
 #-- Pull Git Repo --#
 

diff --git a/aws/ref/create_fleet_config.json b/aws/ref/create_fleet_config.json
@@ -12,22 +12,19 @@
             },
             "Overrides": [
                 {
-                    "InstanceType": "g4ad.xlarge"
-                },
-                {
-                    "InstanceType": "g4ad.2xlarge"
+                    "InstanceType": "g4dn.xlarge"
                 },
                 {
-                    "InstanceType": "g4ad.4xlarge"
+                    "InstanceType": "g4dn.2xlarge"
                 },
                 {
-                    "InstanceType": "g4dn.xlarge"
+                    "InstanceType": "g4dn.4xlarge"
                 },
                 {
-                    "InstanceType": "g4dn.2xlarge"
+                    "InstanceType": "g4dn.8xlarge"
                 },
                 {
-                    "InstanceType": "g4dn.4xlarge"
+                    "InstanceType": "g4dn.12xlarge"
                 },
                 {
                     "InstanceType": "g5.xlarge"
@@ -36,13 +33,13 @@
                     "InstanceType": "g5.2xlarge"
                 },
                 {
-                    "InstanceType": "g6.xlarge"
+                    "InstanceType": "g5.4xlarge"
                 },
                 {
-                    "InstanceType": "g6.2xlarge"
+                    "InstanceType": "g5.8xlarge"
                 },
                 {
-                    "InstanceType": "g6.4xlarge"
+                    "InstanceType": "g5.12xlarge"
                 }
             ]
         }

diff --git a/aws/ref/launch_template_config.json b/aws/ref/launch_template_config.json
@@ -5,9 +5,9 @@
     "LaunchTemplateData": {
         "BlockDeviceMappings": [
             {
-                "DeviceName": "/dev/xvda",
+                "DeviceName": "/dev/sda1",
                 "Ebs": {
-                    "VolumeSize": 50,
+                    "VolumeSize": 100,
                     "VolumeType": "gp3"
                 }
             }
@@ -20,8 +20,8 @@
                 "Groups":["sg-03864b806cd78ded3"]
             }
         ],
-        "ImageId": "ami-00385a401487aefa4",
-        "InstanceType": "t2.micro",
+        "ImageId": "ami-000b13fcd5cd7b0f8",
+        "InstanceType": "g4ad.xlarge",
         "KeyName": "kaggle",
         "Placement": {
             "AvailabilityZone": "eu-west-1a"

diff --git a/conda/catclass.cmd b/conda/catclass.cmd
@@ -8,4 +8,5 @@ call conda activate catclass
 call conda update -n base conda --yes
 
 :: install relevant libraries
-call pip install -v -r ..\requirements.txt
+call pip install -v -r ..\requirements.txt
+call pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121
diff --git a/exeDocker.cmd b/exeDocker.cmd
@@ -13,6 +13,8 @@ call docker build --no-cache -t %DOCKER_IMAGE% .
 
 :: run docker container
 call docker run --name %DOCKER_CONTAINER_NAME% --shm-size=512m --publish 8888:8888 --volume E:\GitHub\CatClassifier\.creds:/home/ubuntu/CatClassifier/.creds  --volume E:\GitHub\CatClassifier\report:/home/ubuntu/CatClassifier/report -it %DOCKER_IMAGE%
+::call docker run --entrypoint sh --name %DOCKER_CONTAINER_NAME% --shm-size=512m --publish 8888:8888 --volume E:\GitHub\CatClassifier\.creds:/home/ubuntu/CatClassifier/.creds  --volume E:\GitHub\CatClassifier\report:/home/ubuntu/CatClassifier/report -it %DOCKER_IMAGE%
+::call docker run -it --entrypoint bash --name cc --shm-size=512m --volume /home/ec2-user/.creds:/home/ubuntu/CatClassifier/.creds --rm  oislen/cat-classifier:latest
 
 :: useful docker commands
 :: docker images

diff --git a/model/arch/classify_image_torch.py b/model/arch/classify_image_torch.py
@@ -33,7 +33,7 @@
 from model.torch.CustomDataset import CustomDataset
 
 # device configuration
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu')
 
 torch_transforms = transforms.Compose([
     transforms.Resize(size=[cons.IMAGE_WIDTH, cons.IMAGE_HEIGHT])  # resize the input image to a uniform size

diff --git a/model/cons.py b/model/cons.py
@@ -78,4 +78,5 @@
 shuffle = False
 
 # multiprocessing
-num_workers = os.cpu_count()
+num_workers = os.environ.get("PARAM_NUM_WORKERS", os.cpu_count())
+check_gpu = os.environ.get("PARAM_CHECK_GPU", False)
diff --git a/model/prg_torch_model.py b/model/prg_torch_model.py
@@ -35,7 +35,7 @@
 num_epochs = cons.min_epochs if cons.FAST_RUN else cons.max_epochs
 
 # device configuration
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu')
 
 torch_transforms = transforms.Compose([
     transforms.Resize(size=[cons.IMAGE_WIDTH, cons.IMAGE_HEIGHT])  # resize the input image to a uniform size

diff --git a/report/torch_analysis_results.ipynb b/report/torch_analysis_results.ipynb
@@ -50,7 +50,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 4,
+      "execution_count": null,
       "id": "504c7d94",
       "metadata": {},
       "outputs": [
@@ -113,7 +113,7 @@
       ],
       "source": [
         "# device configuration\n",
-        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+        "device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu')\n",
         "# load trained torch model\n",
         "model = VGG16_pretrained(num_classes=2).to(device)\n",
         "model.load(input_fpath=cons.torch_model_pt_fpath)\n",

diff --git a/report/torch_analysis_results.qmd b/report/torch_analysis_results.qmd
@@ -43,7 +43,7 @@ A pre-trained VGG CNN model with 16 layers was trained using the processed image
 
 ```{python}
 # device configuration
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu')
 # load trained torch model
 model = VGG16_pretrained(num_classes=2).to(device)
 model.load(input_fpath=cons.torch_model_pt_fpath)

diff --git a/requirements.txt b/requirements.txt
@@ -11,7 +11,5 @@ matplotlib==3.10.0
 seaborn==0.13.2
 kaggle==1.6.17
 jupyterlab==4.3.5
-torch==2.6.0
-torchvision==0.21.0
 beartype==0.19.0
 boto3==1.36.13
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		call python prg_run_ec2_instance.py --launch --terminate --describe --isFleet
		call python prg_run_ec2_instance.py --isFleet --launch --terminate --describe