diff --git a/README.md b/README.md index 192a0fa..0d23333 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,10 @@ See the analysis results notebook for a further details on the analysis; includi * https://nbviewer.org/github/oislen/CatClassifier/blob/main/report/torch_analysis_results.ipynb +Master serialised copies of the trained models are available on Kaggle: + +* https://www.kaggle.com/models/oislen/cat-classifier-cnn-models + ## Running the Application (Windows) ### Anaconda diff --git a/aws/ref/create_fleet_config.json b/aws/ref/create_fleet_config.json index b83d510..6a00656 100644 --- a/aws/ref/create_fleet_config.json +++ b/aws/ref/create_fleet_config.json @@ -12,40 +12,37 @@ }, "Overrides": [ { - "InstanceType": "c3.2xlarge" + "InstanceType": "g4ad.xlarge" }, { - "InstanceType": "c3.4xlarge" + "InstanceType": "g4ad.2xlarge" }, { - "InstanceType": "c4.2xlarge" + "InstanceType": "g4ad.4xlarge" }, { - "InstanceType": "c4.4xlarge" + "InstanceType": "g4dn.xlarge" }, { - "InstanceType": "c5.2xlarge" + "InstanceType": "g4dn.2xlarge" }, { - "InstanceType": "c5.4xlarge" + "InstanceType": "g4dn.4xlarge" }, { - "InstanceType": "c5a.2xlarge" + "InstanceType": "g5.xlarge" }, { - "InstanceType": "c5a.4xlarge" + "InstanceType": "g5.2xlarge" }, { - "InstanceType": "c5ad.2xlarge" + "InstanceType": "g6.xlarge" }, { - "InstanceType": "c5ad.4xlarge" + "InstanceType": "g6.2xlarge" }, { - "InstanceType": "c5d.2xlarge" - }, - { - "InstanceType": "c5d.4xlarge" + "InstanceType": "g6.4xlarge" } ] } diff --git a/doc/woof_meow.jpg b/doc/woof_meow.jpg new file mode 100644 index 0000000..32cb7f4 Binary files /dev/null and b/doc/woof_meow.jpg differ diff --git a/model/arch/classify_image_keras.py b/model/arch/classify_image_keras.py new file mode 100644 index 0000000..4d4993e --- /dev/null +++ b/model/arch/classify_image_keras.py @@ -0,0 +1,84 @@ +# python model/arch/classify_image_keras.py --image_fpath E:/GitHub/CatClassifier/data/train/cat.0.jpg --model_fpath 
E:/GitHub/CatClassifier/data/models/AlexNet8.keras + +import logging +import argparse +import platform +import os +import pandas as pd +import numpy as np +import sys +import re +from beartype import beartype + +# set root file directories +root_dir_re_match = re.findall(string=os.getcwd(), pattern="^.+CatClassifier") +root_fdir = root_dir_re_match[0] if len(root_dir_re_match) > 0 else os.path.join(".", "CatClassifier") +model_fdir = os.path.join(root_fdir, 'model') +sys.path.append(model_fdir) + +# load custom scripts +import cons + +# load tensorflow / keras modules +from tensorflow.keras.preprocessing.image import ImageDataGenerator +from tensorflow.keras.preprocessing.image import load_img +from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint +from keras.models import load_model +from keras import optimizers + +@beartype +def classify_image_keras(image_fpath:str, model_fpath:str=cons.keras_model_pickle_fpath): + """Classifies an input image using the keras model + + Parameters + ---------- + image_fpath : str + The full filepath to the image to classify using the keras model + model_fpath : str + The full filepath to the keras model to use for classification, default is cons.keras_model_pickle_fpath + + Returns + ------- + list + The image file classification results as a recordset + """ + + logging.info("Loading keras model...") + # load model + model = load_model(model_fpath) + + logging.info("Generating dataset...") + # prepare test data + dataframe = pd.DataFrame({'filepath': [image_fpath]}) + + logging.info("Creating dataloader...") + # set data generator + imagedatagenerator = ImageDataGenerator(rescale=cons.rescale) + generator = imagedatagenerator.flow_from_dataframe(dataframe=dataframe, directory=cons.test_fdir, x_col='filepath', y_col=None, class_mode=None, target_size=cons.IMAGE_SIZE, batch_size=cons.batch_size, shuffle=cons.shuffle) + + logging.info("Classifying image...") + # make test set predictions + predict = 
model.predict(generator, steps=int(np.ceil(dataframe.shape[0]/cons.batch_size))) + dataframe['category'] = np.argmax(predict, axis=-1) + dataframe['category'] = dataframe['category'].replace(cons.category_mapper) + response = dataframe.to_dict(orient="records") + logging.info(response) + return response + +if __name__ == "__main__": + + # set up logging + lgr = logging.getLogger() + lgr.setLevel(logging.INFO) + + # define argument parser object + parser = argparse.ArgumentParser(description="Classify Image (Keras Model)") + # add input arguments + parser.add_argument("--image_fpath", action="store", dest="image_fpath", type=str, help="String, the full file path to the image to classify") + parser.add_argument("--model_fpath", action="store", dest="model_fpath", type=str, default=cons.keras_model_pickle_fpath, help="String, the full file path to the model to use for classification") + # create an output dictionary to hold the results + input_params_dict = {} + # extract input arguments + args = parser.parse_args() + # classify image using keras model + response = classify_image_keras(image_fpath=args.image_fpath, model_fpath=args.model_fpath) \ No newline at end of file diff --git a/model/arch/classify_image_torch.py b/model/arch/classify_image_torch.py new file mode 100644 index 0000000..1703ac5 --- /dev/null +++ b/model/arch/classify_image_torch.py @@ -0,0 +1,104 @@ +# python model/arch/classify_image_torch.py --image_fpath E:/GitHub/CatClassifier/data/train/cat.0.jpg --model_fpath E:/GitHub/CatClassifier/data/models/VGG16.pt + +import logging +import argparse +import platform +import os +import pandas as pd +import numpy as np +import sys +import re +from beartype import beartype + +# set root file directories +root_dir_re_match = re.findall(string=os.getcwd(), pattern="^.+CatClassifier") +root_fdir = root_dir_re_match[0] if len(root_dir_re_match) > 0 else os.path.join(".", "CatClassifier") +model_fdir = os.path.join(root_fdir, 'model') 
+sys.path.append(model_fdir) + +# set huggingface hub directory +huggingface_hub_dir = 'E:\\huggingface' +if (platform.system() == 'Windows') and (os.path.exists(huggingface_hub_dir)): + os.environ['TORCH_HOME'] = huggingface_hub_dir + os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' + +import torch +import torch.nn as nn +from torch.utils.data import DataLoader +from torchvision import transforms + +# load custom scripts +import cons +from model.torch.VGG16_pretrained import VGG16_pretrained +from model.torch.CustomDataset import CustomDataset + +# device configuration +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +torch_transforms = transforms.Compose([ + transforms.Resize(size=[cons.IMAGE_WIDTH, cons.IMAGE_HEIGHT]) # resize the input image to a uniform size + #,transforms.RandomRotation(30) + #,transforms.RandomHorizontalFlip(p=0.05) + #,transforms.RandomPerspective(distortion_scale=0.05, p=0.05) + ,transforms.ToTensor() # convert PIL Image or numpy.ndarray to tensor and normalize to somewhere between [0,1] + ,transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # standardized processing +]) + +@beartype +def classify_image_torch(image_fpath:str, model_fpath:str=cons.torch_model_pt_fpath): + """Classifies an input image using the torch model + + Parameters + ---------- + image_fpath : str + The full filepath to the image to classify using the torch model + model_fpath : str + The full filepath to the torch model to use for classification, default is cons.torch_model_pt_fpath + + Returns + ------- + list + The image file classification results as a recordset + """ + + logging.info("Loading torch model...") + # load model + #model = AlexNet8(num_classes=2).to(device) + model = VGG16_pretrained(num_classes=2).to(device) + model.load(input_fpath=model_fpath) + + logging.info("Generating dataset...") + # prepare test data + dataframe = pd.DataFrame({'filepath': [image_fpath]}) + + logging.info("Creating dataloader...") + # 
set test data loader + dataset = CustomDataset(dataframe, transforms=torch_transforms, mode='test') + loader = DataLoader(dataset, batch_size=cons.batch_size, shuffle=False, num_workers=cons.num_workers, pin_memory=True) + + logging.info("Classifying image...") + # make test set predictions + predict = model.predict(loader, device) + dataframe['category'] = np.argmax(predict, axis=-1) + dataframe["category"] = dataframe["category"].replace(cons.category_mapper) + response = dataframe.to_dict(orient="records") + logging.info(response) + return response + +if __name__ == "__main__": + + # set up logging + lgr = logging.getLogger() + lgr.setLevel(logging.INFO) + + # define argument parser object + parser = argparse.ArgumentParser(description="Classify Image (Torch Model)") + # add input arguments + parser.add_argument("--image_fpath", action="store", dest="image_fpath", type=str, help="String, the full file path to the image to classify") + parser.add_argument("--model_fpath", action="store", dest="model_fpath", type=str, default=cons.torch_model_pt_fpath, help="String, the full file path to the model to use for classification") + # create an output dictionary to hold the results + input_params_dict = {} + # extract input arguments + args = parser.parse_args() + # classify image using torch model + response = classify_image_torch(image_fpath=args.image_fpath, model_fpath=args.model_fpath) \ No newline at end of file diff --git a/webscrapers/cons.py b/webscrapers/cons.py index 71e0f54..006d829 100644 --- a/webscrapers/cons.py +++ b/webscrapers/cons.py @@ -5,13 +5,14 @@ # set root file directories root_dir_re_match = re.findall(string=os.getcwd(), pattern="^.+CatClassifier") root_fdir = root_dir_re_match[0] if len(root_dir_re_match) > 0 else os.path.join(".", "CatClassifier") -data_fdir = os.path.join(root_fdir, 'data') -creds_fdir = os.path.join(root_fdir, '.creds') -dataprep_fdir = os.path.join(root_fdir, 'data_prep') -report_fdir = os.path.join(root_fdir, 'report') 
-test_fdir = os.path.join(data_fdir, 'test1') -train_fdir = os.path.join(data_fdir, 'train') -webscrapers_fdir = os.path.join(root_fdir, 'webscrapers') +data_fdir = os.path.join(root_fdir, "data") +creds_fdir = os.path.join(root_fdir, ".creds") +dataprep_fdir = os.path.join(root_fdir, "data_prep") +report_fdir = os.path.join(root_fdir, "report") +test_fdir = os.path.join(data_fdir, "test1") +train_fdir = os.path.join(data_fdir, "train") +models_fdir = os.path.join(data_fdir, "models") +webscrapers_fdir = os.path.join(root_fdir, "webscrapers") # set list containing all required directories root_fdirs = [root_fdir, data_fdir, dataprep_fdir, report_fdir, test_fdir, train_fdir, webscrapers_fdir] @@ -22,12 +23,15 @@ # set kaggle competition name os.environ["KAGGLE_CONFIG_DIR"] = creds_fdir -comp_name = 'dogs-vs-cats' +comp_name = "dogs-vs-cats" download_data = True unzip_data = True del_zip = True +# set kaggle model details +model_instance_url="oislen/cat-classifier-cnn-models/pyTorch/default/1" + # webscraping constants n_images = 6000 -home_url = 'https://free-images.com' -output_dir = os.path.join(data_fdir, '{search}') \ No newline at end of file +home_url = "https://free-images.com" +output_dir = os.path.join(data_fdir, "{search}") \ No newline at end of file diff --git a/webscrapers/prg_scrape_imgs.py b/webscrapers/prg_scrape_imgs.py index 7d45bc6..1230038 100644 --- a/webscrapers/prg_scrape_imgs.py +++ b/webscrapers/prg_scrape_imgs.py @@ -2,11 +2,12 @@ from beartype import beartype import cons from utilities.commandline_interface import commandline_interface -from utilities.download_comp_data import download_comp_data +from utilities.download_comp_data import download_comp_data, download_models from utilities.webscraper import webscraper @beartype def scrape_imags( + run_download_models:bool=False, run_download_comp_data:bool=False, run_webscraper:bool=False ): @@ -14,6 +15,8 @@ def scrape_imags( Parameters ---------- + run_download_models : bool + Whether to 
run the download Kaggle master models, default is False run_download_comp_data : bool Whether to run the download Kaggle competition data, default is False run_webscraper : bool @@ -22,8 +25,16 @@ def scrape_imags( Returns ------- """ + if run_download_models: + logging.info("Downloading kaggle models ...") + # download master models + download_models( + model_instance_url=cons.model_instance_url, + model_dir=cons.models_fdir + ) + if run_download_comp_data: - logging.info('Downloading kaggle data ...') + logging.info("Downloading kaggle data ...") # download competition data download_comp_data( comp_name=cons.comp_name, @@ -33,25 +44,25 @@ def scrape_imags( del_zip=cons.del_zip ) if run_webscraper: - logging.info('Running cat image webscraper ...') + logging.info("Running cat image webscraper ...") # run cat webscraper webscraper( - search='cat', + search="cat", n_images=cons.n_images, home_url=cons.home_url, output_dir=cons.train_fdir ) - logging.info('Running dog image webscraper ...') + logging.info("Running dog image webscraper ...") # run dog webscraper webscraper( - search='dog', + search="dog", n_images=cons.n_images, home_url=cons.home_url, output_dir=cons.train_fdir ) # if running as main programme -if __name__ == '__main__': +if __name__ == "__main__": # set up logging lgr = logging.getLogger() @@ -62,6 +73,7 @@ def scrape_imags( # run the scrape images programme scrape_imags( - run_download_comp_data=input_params_dict['run_download_comp_data'], - run_webscraper=input_params_dict['run_webscraper'] + run_download_models=input_params_dict["run_download_models"], + run_download_comp_data=input_params_dict["run_download_comp_data"], + run_webscraper=input_params_dict["run_webscraper"] ) \ No newline at end of file diff --git a/webscrapers/utilities/commandline_interface.py b/webscrapers/utilities/commandline_interface.py index a2abe26..df55c0e 100644 --- a/webscrapers/utilities/commandline_interface.py +++ b/webscrapers/utilities/commandline_interface.py 
@@ -15,6 +15,7 @@ def commandline_interface(): # define argument parser object parser = argparse.ArgumentParser(description="Execute Webscrapers.") # add input arguments + parser.add_argument("--run_download_models", action=argparse.BooleanOptionalAction, dest="run_download_models", type=bool, default=False, help="Boolean, whether to run the download master Kaggle models, default is False",) parser.add_argument("--run_download_comp_data", action=argparse.BooleanOptionalAction, dest="run_download_comp_data", type=bool, default=False, help="Boolean, whether to run the download Kaggle competition data, default is False",) parser.add_argument("--run_webscraper", action=argparse.BooleanOptionalAction, dest="run_webscraper", type=bool, default=False, help="Boolean, whether to run the image webscraper, default is False",) # create an output dictionary to hold the results @@ -22,6 +23,7 @@ def commandline_interface(): # extract input arguments args = parser.parse_args() # map input arguments into output dictionary + input_params_dict["run_download_models"] = args.run_download_models input_params_dict["run_download_comp_data"] = args.run_download_comp_data input_params_dict["run_webscraper"] = args.run_webscraper return input_params_dict diff --git a/webscrapers/utilities/download_comp_data.py b/webscrapers/utilities/download_comp_data.py index 3a1a70c..c46b524 100644 --- a/webscrapers/utilities/download_comp_data.py +++ b/webscrapers/utilities/download_comp_data.py @@ -6,8 +6,14 @@ from beartype import beartype @beartype -def download_comp_data(comp_name:str, data_dir:str, download_data:bool=True, unzip_data:bool=True, del_zip:bool=True): - """Download Competition Data Documentation +def download_comp_data( + comp_name:str, + data_dir:str, + download_data:bool=True, + unzip_data:bool=True, + del_zip:bool=True + ): + """Download Competition Data Parameters ---------- @@ -25,43 +31,91 @@ def download_comp_data(comp_name:str, data_dir:str, download_data:bool=True, unz Returns 
------- + + Example + ------- + download_comp_data( + comp_name="dogs-vs-cats", + data_dir="E:\\GitHub\\CatClassifier\\data", + download_data=True, + unzip_data=True, + del_zip=True + ) """ - logging.info('create zip file path ...') + logging.info("create zip file path ...") # define filenames - zip_data_fname = '{}.zip'.format(comp_name) + zip_data_fname = f"{comp_name}.zip" # create file paths zip_data_fpath = os.path.join(data_dir, zip_data_fname) - zip_train_fpath = os.path.join(data_dir, 'train.zip') - zip_test_fpath = os.path.join(data_dir, 'test1.zip') + zip_train_fpath = os.path.join(data_dir, "train.zip") + zip_test_fpath = os.path.join(data_dir, "test1.zip") # combine paths in a list zip_fpaths_list = [zip_data_fpath, zip_train_fpath, zip_test_fpath] - logging.info('checking for data directory ...') + logging.info("checking for data directory ...") # check data directory exists if os.path.exists(data_dir) == False: os.makedirs(data_dir) else: - logging.info('data directory exists: {}'.format(data_dir)) + logging.info(f"data directory exists: {data_dir}") - # if redownloading the data + # if downloading the data if download_data == True: - logging.info('downing kaggle data ..') - kaggle_cmd = 'kaggle competitions download --competition {} --path {} --force'.format(comp_name, data_dir) + logging.info("downing kaggle data ..") + kaggle_cmd = f"kaggle competitions download --competition {comp_name} --path {data_dir} --force" subprocess.run(kaggle_cmd.split()) # if unzipping the data if unzip_data == True: if os.path.exists(zip_data_fpath) == False: - raise OSError('file not found: {}'.format(zip_data_fpath)) + raise OSError(f"file not found: {zip_data_fpath}") else: for zip_fpath in zip_fpaths_list: - logging.info(f'unzipping data {zip_fpath} ...') + logging.info(f"unzipping data {zip_fpath} ...") with zipfile.ZipFile(zip_fpath, "r") as zip_ref: zip_ref.extractall(data_dir) # if deleting zip file if del_zip == True: for zip_fpath in zip_fpaths_list: - 
logging.info('deleting zip file {zip_fpath} ...') - os.remove(path = zip_fpath) \ No newline at end of file + logging.info(f"deleting zip file {zip_fpath} ...") + os.remove(path = zip_fpath) + +@beartype +def download_models( + model_instance_url:str, + model_dir:str + ): + """Download Kaggle Models + + Parameters + ---------- + + model_instance_url : str + Model Instance Version URL suffix in format ////. + model_dir : str + Folder where file(s) will be downloaded. + + Returns + ------- + + Example + ------- + download_models( + model_instance_url="oislen/cat-classifier-cnn-models/pyTorch/default/1", + model_dir="E:\\GitHub\\CatClassifier\\data\\models" + ) + """ + + logging.info("checking for data directory ...") + # check data directory exists + if os.path.exists(model_dir) == False: + os.makedirs(model_dir) + else: + logging.info(f"model directory exists: {model_dir}") + + # downloading the model + logging.info("downloading kaggle model ..") + kaggle_cmd = f"kaggle models instances versions download --path {model_dir} --untar --force {model_instance_url}" + subprocess.run(kaggle_cmd.split())