diff --git a/Dockerfile b/Dockerfile index c48e678..9f46c9e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,6 +33,7 @@ RUN apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-venv RUN python3 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" RUN /opt/venv/bin/python3 -m pip install -v -r /home/ubuntu/CatClassifier/requirements.txt +RUN /opt/venv/bin/python3 -m pip install --no-cache-dir torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121 WORKDIR /home/${user} ENTRYPOINT ["jupyter", "lab", "--ip=0.0.0.0", "--allow-root"] \ No newline at end of file diff --git a/aws/exeRunEC2.cmd b/aws/exeRunEC2.cmd index b21da7e..df4a31a 100644 --- a/aws/exeRunEC2.cmd +++ b/aws/exeRunEC2.cmd @@ -1 +1 @@ -call python prg_run_ec2_instance.py --launch --terminate --describe --isFleet \ No newline at end of file +call python prg_run_ec2_instance.py --isFleet --launch --terminate --describe \ No newline at end of file diff --git a/aws/exeSetUpEC2.cmd b/aws/exeSetUpEC2.cmd index 65884e9..a268627 100644 --- a/aws/exeSetUpEC2.cmd +++ b/aws/exeSetUpEC2.cmd @@ -1,5 +1,6 @@ :: set EC2 login info -SET EC2_USER=ec2-user +::SET EC2_USER=ec2-user +SET EC2_USER=ubuntu SET EC2_PEM_FPATH="C:\Users\oisin\.aws\kaggle.pem" SET EC2_CREDS_FDIR=E:\GitHub\CatClassifier\.creds SET EC2_SETUP_FPATH=E:\GitHub\CatClassifier\aws\linux_docker_setup.sh @@ -17,5 +18,6 @@ call scp -i %EC2_PEM_FPATH% -r %EC2_CREDS_FDIR% %EC2_USER%@%EC2_DNS%:~/. 
call scp -i %EC2_PEM_FPATH% %EC2_SETUP_FPATH% %EC2_USER%@%EC2_DNS%:~/linux_docker_setup.sh call scp -i %EC2_PEM_FPATH% %EC2_EXTRACT_FPATH% %EC2_USER%@%EC2_DNS%:~/docker_extract_data.sh :: ssh to EC2 and run linux setp -call ssh -v -i %EC2_PEM_FPATH% %EC2_USER%@%EC2_DNS% "sed -i 's/\r$//' ~/linux_docker_setup.sh; bash ~/linux_docker_setup.sh" +call ssh -v -i %EC2_PEM_FPATH% %EC2_USER%@%EC2_DNS% +::call ssh -v -i %EC2_PEM_FPATH% %EC2_USER%@%EC2_DNS% "sed -i 's/\r$//' ~/linux_docker_setup.sh; bash ~/linux_docker_setup.sh" ENDLOCAL \ No newline at end of file diff --git a/aws/linux_docker_setup.sh b/aws/linux_docker_setup.sh index 89c1a5c..e0196ff 100644 --- a/aws/linux_docker_setup.sh +++ b/aws/linux_docker_setup.sh @@ -3,7 +3,7 @@ # 2. make sure to increase volume in /dev/nvme0n1 (/dev/xvda) e.g. 100gb # linux file formatting -# sudo yum install -y dos2unix +# sudo apt-get install -y dos2unix # dos2unix ./linux_docker_setup.sh #-- EC2 Spot Instance Checks --# @@ -14,6 +14,9 @@ df -h lscpu # calculate percentage of used memory free -m | awk 'FNR == 2 {print $3/($3+$4)*100}' +# check gpu status +nvidia-smi +# watch -n 0.5 nvidia-smi #-- Configure Permissions and Overcommit Settings --# @@ -54,11 +57,11 @@ sudo umount /tmp #-- Download Required Programmes --# # update os -sudo yum update -y +sudo apt-get update -y # install required base software -sudo yum install -y htop vim tmux dos2unix docker git +sudo apt-get install -y htop vim tmux dos2unix docker.io git # remove unneed dependencies -sudo yum autoremove +sudo apt-get autoremove -y #-- Pull Git Repo --# diff --git a/aws/ref/create_fleet_config.json b/aws/ref/create_fleet_config.json index 6a00656..d296070 100644 --- a/aws/ref/create_fleet_config.json +++ b/aws/ref/create_fleet_config.json @@ -12,22 +12,19 @@ }, "Overrides": [ { - "InstanceType": "g4ad.xlarge" - }, - { - "InstanceType": "g4ad.2xlarge" + "InstanceType": "g4dn.xlarge" }, { - "InstanceType": "g4ad.4xlarge" + "InstanceType": "g4dn.2xlarge" }, { - 
"InstanceType": "g4dn.xlarge" + "InstanceType": "g4dn.4xlarge" }, { - "InstanceType": "g4dn.2xlarge" + "InstanceType": "g4dn.8xlarge" }, { - "InstanceType": "g4dn.4xlarge" + "InstanceType": "g4dn.12xlarge" }, { "InstanceType": "g5.xlarge" @@ -36,13 +33,13 @@ "InstanceType": "g5.2xlarge" }, { - "InstanceType": "g6.xlarge" + "InstanceType": "g5.4xlarge" }, { - "InstanceType": "g6.2xlarge" + "InstanceType": "g5.8xlarge" }, { - "InstanceType": "g6.4xlarge" + "InstanceType": "g5.12xlarge" } ] } diff --git a/aws/ref/launch_template_config.json b/aws/ref/launch_template_config.json index d6f15e8..10f688b 100644 --- a/aws/ref/launch_template_config.json +++ b/aws/ref/launch_template_config.json @@ -5,9 +5,9 @@ "LaunchTemplateData": { "BlockDeviceMappings": [ { - "DeviceName": "/dev/xvda", + "DeviceName": "/dev/sda1", "Ebs": { - "VolumeSize": 50, + "VolumeSize": 100, "VolumeType": "gp3" } } @@ -20,8 +20,8 @@ "Groups":["sg-03864b806cd78ded3"] } ], - "ImageId": "ami-00385a401487aefa4", - "InstanceType": "t2.micro", + "ImageId": "ami-000b13fcd5cd7b0f8", + "InstanceType": "g4dn.xlarge", "KeyName": "kaggle", "Placement": { "AvailabilityZone": "eu-west-1a" diff --git a/conda/catclass.cmd b/conda/catclass.cmd index e223f67..2de8a44 100644 --- a/conda/catclass.cmd +++ b/conda/catclass.cmd @@ -8,4 +8,5 @@ call conda activate catclass call conda update -n base conda --yes :: install relevant libraries -call pip install -v -r ..\requirements.txt \ No newline at end of file +call pip install -v -r ..\requirements.txt +call pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121 \ No newline at end of file diff --git a/exeDocker.cmd b/exeDocker.cmd index a549d57..9a00bd5 100644 --- a/exeDocker.cmd +++ b/exeDocker.cmd @@ -13,6 +13,8 @@ call docker build --no-cache -t %DOCKER_IMAGE% . 
:: run docker container call docker run --name %DOCKER_CONTAINER_NAME% --shm-size=512m --publish 8888:8888 --volume E:\GitHub\CatClassifier\.creds:/home/ubuntu/CatClassifier/.creds --volume E:\GitHub\CatClassifier\report:/home/ubuntu/CatClassifier/report -it %DOCKER_IMAGE% +::call docker run --entrypoint sh --name %DOCKER_CONTAINER_NAME% --shm-size=512m --publish 8888:8888 --volume E:\GitHub\CatClassifier\.creds:/home/ubuntu/CatClassifier/.creds --volume E:\GitHub\CatClassifier\report:/home/ubuntu/CatClassifier/report -it %DOCKER_IMAGE% +::call docker run -it --entrypoint bash --name cc --shm-size=512m --volume /home/ec2-user/.creds:/home/ubuntu/CatClassifier/.creds --rm oislen/cat-classifier:latest :: useful docker commands :: docker images diff --git a/model/arch/classify_image_torch.py b/model/arch/classify_image_torch.py index 1703ac5..b794307 100644 --- a/model/arch/classify_image_torch.py +++ b/model/arch/classify_image_torch.py @@ -33,7 +33,7 @@ from model.torch.CustomDataset import CustomDataset # device configuration -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu') torch_transforms = transforms.Compose([ transforms.Resize(size=[cons.IMAGE_WIDTH, cons.IMAGE_HEIGHT]) # resize the input image to a uniform size diff --git a/model/cons.py b/model/cons.py index 3104f6f..1c46796 100644 --- a/model/cons.py +++ b/model/cons.py @@ -78,4 +78,5 @@ shuffle = False # multiprocessing -num_workers = os.cpu_count() \ No newline at end of file +num_workers = int(os.environ["PARAM_NUM_WORKERS"]) if "PARAM_NUM_WORKERS" in os.environ else os.cpu_count() # env values are strings; cast so the worker count stays an int +check_gpu = os.environ.get("PARAM_CHECK_GPU", "false").strip().lower() in ("1", "true", "yes", "y", "on") # parse to a real bool so "False"/"0" do not enable the GPU \ No newline at end of file diff --git a/model/prg_torch_model.py b/model/prg_torch_model.py index 2cc37d8..ab1f1db 100644 --- a/model/prg_torch_model.py +++ b/model/prg_torch_model.py @@ -35,7 +35,7 @@ num_epochs = cons.min_epochs if cons.FAST_RUN else cons.max_epochs # device configuration -device = 
torch.device('cuda' if torch.cuda.is_available() else 'cpu') +device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu') torch_transforms = transforms.Compose([ transforms.Resize(size=[cons.IMAGE_WIDTH, cons.IMAGE_HEIGHT]) # resize the input image to a uniform size diff --git a/report/torch_analysis_results.ipynb b/report/torch_analysis_results.ipynb index fca9ada..41ce24f 100644 --- a/report/torch_analysis_results.ipynb +++ b/report/torch_analysis_results.ipynb @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "504c7d94", "metadata": {}, "outputs": [ @@ -113,7 +113,7 @@ ], "source": [ "# device configuration\n", - "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu')\n", "# load trained torch model\n", "model = VGG16_pretrained(num_classes=2).to(device)\n", "model.load(input_fpath=cons.torch_model_pt_fpath)\n", diff --git a/report/torch_analysis_results.qmd b/report/torch_analysis_results.qmd index 28c8aab..c543592 100644 --- a/report/torch_analysis_results.qmd +++ b/report/torch_analysis_results.qmd @@ -43,7 +43,7 @@ A pre-trained VGG CNN model with 16 layers was trained using the processed image ```{python} # device configuration -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu') # load trained torch model model = VGG16_pretrained(num_classes=2).to(device) model.load(input_fpath=cons.torch_model_pt_fpath) diff --git a/requirements.txt b/requirements.txt index 1d71990..d7f7308 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,5 @@ matplotlib==3.10.0 seaborn==0.13.2 kaggle==1.6.17 jupyterlab==4.3.5 -torch==2.6.0 -torchvision==0.21.0 beartype==0.19.0 boto3==1.36.13 \ No newline at end of file