diff --git a/aws/linux_docker_setup.sh b/aws/linux_docker_setup.sh
index e0196ff..488981c 100644
--- a/aws/linux_docker_setup.sh
+++ b/aws/linux_docker_setup.sh
@@ -97,6 +97,6 @@
 cat ~/.creds/docker | docker login --username oislen --password-stdin
 docker pull $docker_image
 # run pulled docker container
 #docker run --shm-size=512m -p 8889:8888 -it $docker_image
-docker run --name $docker_container_name --shm-size=512m --publish 8888:8888 --volume /home/ubuntu/CatClassifier/.creds:/home/ubuntu/CatClassifier/.creds --volume /home/ubuntu/CatClassifier/report:/home/ubuntu/CatClassifier/report --rm -it --entrypoint bash $docker_image
+docker run --name $docker_container_name --shm-size=512m --publish 8888:8888 --volume /home/ubuntu/CatClassifier/.creds:/home/ubuntu/CatClassifier/.creds --volume /home/ubuntu/CatClassifier/report:/home/ubuntu/CatClassifier/report --gpus all --env PARAM_CHECK_GPU=True -it --entrypoint bash $docker_image
 #docker run --shm-size=512m -p 8889:8888 -d $docker_image
 #docker run -it -d /bin/bash
\ No newline at end of file
diff --git a/model/arch/load_image_v2.py b/model/arch/load_image_v2.py
new file mode 100644
index 0000000..b532985
--- /dev/null
+++ b/model/arch/load_image_v2.py
@@ -0,0 +1,88 @@
+
+import pandas as pd
+import numpy as np
+import re
+import os
+import torch
+from PIL import Image
+from matplotlib.image import imread
+from copy import deepcopy
+from multiprocessing import Pool
+
+def load_image_v2(image_fpaths):
+    """Load image files into memory as a pandas Series of PIL images."""
+    images = []
+    for image_fpath in image_fpaths:
+        temp = Image.open(image_fpath)
+        keep = deepcopy(temp)
+        images.append(keep)
+        temp.close()
+    return pd.Series(images)
+
+class TorchLoadImages():
+
+    def __init__(self, torch_transforms, n_workers=None):
+        self.torch_transforms = torch_transforms
+        self.n_workers = n_workers
+
+    def loadImage(self, filepath):
+        """Load a single image file as a record of metadata, arrays and tensors."""
+        # determine the filename and source
+        fileName = os.path.basename(filepath)
+        # determine label from image file path
+        if ("cat" in fileName) or ("dog" in fileName):
+            fileSource = "kaggle" if len(re.findall(pattern='^(cat|dog)(.[0-9]+.jpg)$', string=fileName)) > 0 else "webscraper"
+            labelName = fileName.split(".")[0]
+            label = labelName == "dog"
+            labelTensor = torch.tensor(label, dtype=torch.int64)
+        else:
+            fileSource = "kaggle"
+            labelName = np.nan
+            label = np.nan
+            labelTensor = torch.tensor(label)
+        # load image file and apply torch transforms
+        image = Image.open(filepath)
+        torch_transform_error = None
+        try:
+            imageTensor = self.torch_transforms(image)
+        except Exception as e:
+            imageTensor = None
+            torch_transform_error = str(e)
+        imageArray = np.asarray(image)
+        image.close()
+        nDims = len(imageArray.shape)
+        # create an output record
+        record = {
+            "filepaths":filepath,
+            "filenames":fileName,
+            "source":fileSource,
+            "categoryname":labelName,
+            "category":label,
+            "images":imageArray,
+            "ndims":nDims,
+            "category_tensors":labelTensor,
+            "image_tensors":imageTensor,
+            "torch_transform_error":torch_transform_error
+            }
+        return record
+
+    def multiProcess(self, func, args):
+        """Map a function across arguments using a multiprocessing pool."""
+        pool = Pool(self.n_workers)
+        results = pool.map(func, args)
+        pool.close()
+        return results
+
+    def loadImages(self, filepaths):
+        """Load image files as records, serially or across multiple workers."""
+        if self.n_workers == None:
+            records = [self.loadImage(filepath) for filepath in filepaths]
+        else:
+            records = self.multiProcess(self.loadImage, filepaths)
+        return records
\ No newline at end of file
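Worth noting in load_image_v2: PIL opens images lazily, so the loader deep-copies each image to force the pixel data into memory before the file handle is closed; without the copy, the image would be unreadable afterwards. A minimal sketch of the same pattern (the path in the usage comment is hypothetical):

```python
from copy import deepcopy
from PIL import Image

def load_images_eagerly(image_fpaths):
    """Read PIL images fully into memory so their file handles can be closed."""
    images = []
    for image_fpath in image_fpaths:
        with Image.open(image_fpath) as temp:
            # PIL loads lazily; deepcopy forces the pixel data into memory
            # so the image survives the handle being closed
            images.append(deepcopy(temp))
    return images

# hypothetical usage: images = load_images_eagerly(["data/train/cat.0.jpg"])
```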
diff --git a/model/cons.py b/model/cons.py
index 1c46796..6a34100 100644
--- a/model/cons.py
+++ b/model/cons.py
@@ -78,5 +78,5 @@
 shuffle = False
 
 # multiprocessing
-num_workers = os.environ.get("PARAM_NUM_WORKERS", os.cpu_count())
+num_workers = os.environ.get("PARAM_NUM_WORKERS", 0)
 check_gpu = os.environ.get("PARAM_CHECK_GPU", False)
\ No newline at end of file
diff --git a/model/prg_keras_model.py b/model/prg_keras_model.py
index 30570cd..b01e109 100644
--- a/model/prg_keras_model.py
+++ b/model/prg_keras_model.py
@@ -10,6 +10,7 @@
 from model.utilities.plot_image import plot_image
 from model.utilities.plot_generator import plot_generator
 from model.utilities.plot_preds import plot_preds
+from model.keras.LeNet5 import LeNet5
 from model.keras.AlexNet8 import AlexNet8
 from model.keras.VGG16_pretrained import VGG16_pretrained
 from model.utilities.plot_model import plot_model_fit
@@ -42,6 +43,7 @@
     df = pd.DataFrame({'filename': filenames, 'category': categories})
     df["category"] = df["category"].replace(cons.category_mapper)
     df['source'] = df['filename'].str.contains(pat='[cat|dog].[0-9]+.jpg', regex=True).map({True:'kaggle', False:'webscraper'})
+    logging.info(f"df.shape: {df.shape}")
     timeLogger.logTime(parentKey="DataPrep", subKey="TrainDataLoad")
 
     logging.info("Plot sample image...")
@@ -60,6 +62,8 @@
     # set data constants
     total_train = train_df.shape[0]
     total_validate = validate_df.shape[0]
+    logging.info(f"train_df.shape: {train_df.shape}")
+    logging.info(f"validate_df.shape: {validate_df.shape}")
     timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationSplit")
 
     logging.info("Creating training and validation data generators...")
@@ -80,8 +84,9 @@
     logging.info("Initiate keras model...")
     # initiate LeNet5 architecture
-    keras_model = AlexNet8(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
-    #keras_model = VGG16_pretrained(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
+    #keras_model = LeNet5(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
+    #keras_model = AlexNet8(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
+    keras_model = VGG16_pretrained(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
     keras_model.summary()
     # set gradient decent compiler
     optimizer = optimizers.SGD(learning_rate=cons.learning_rate)
@@ -129,7 +134,7 @@
     # prepare test data
     test_filenames = os.listdir(cons.test_fdir)
     test_df = pd.DataFrame({'filename': test_filenames})
-    nb_samples = test_df.shape[0]
+    logging.info(f"test_df.shape: {test_df.shape}")
    timeLogger.logTime(parentKey="TestSet", subKey="RawLoad")
 
     logging.info("Create test data generator...")
@@ -140,7 +145,7 @@
     logging.info("Generate test set predictions...")
     # make test set predictions
-    predict = keras_model.predict(test_generator, steps=int(np.ceil(nb_samples/cons.batch_size)))
+    predict = keras_model.predict(test_generator, steps=int(np.ceil(test_df.shape[0]/cons.batch_size)))
     test_df['category'] = np.argmax(predict, axis=-1)
     test_df['category'] = test_df['category'].replace(cons.category_mapper)
     timeLogger.logTime(parentKey="TestSet", subKey="ModelPredictions")
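The keras prediction step count is now derived directly from test_df rather than the removed nb_samples variable; the ceiling guarantees the final partial batch is still scored. A quick check of the arithmetic, with hypothetical sizes:

```python
import numpy as np

batch_size = 64        # hypothetical stand-in for cons.batch_size
n_test_images = 12500  # e.g. the Kaggle dogs-vs-cats test set size

# ceil ensures the last, partially filled batch gets its own step
steps = int(np.ceil(n_test_images / batch_size))
print(steps)  # 196: 195 full batches of 64 plus one final batch of 20
```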
diff --git a/model/prg_torch_model.py b/model/prg_torch_model.py
index ab1f1db..8941c43 100644
--- a/model/prg_torch_model.py
+++ b/model/prg_torch_model.py
@@ -3,8 +3,6 @@
 import logging
 import pandas as pd
 import numpy as np
-import random
-from PIL import Image
 
 # set huggingface hub directory
 huggingface_hub_dir = 'E:\\huggingface'
@@ -16,12 +14,12 @@
 import torch.nn as nn
 from torch.utils.data import DataLoader
 from torchvision import transforms
-from tensorflow.keras.preprocessing.image import load_img
 
 # load custom scripts
 import cons
 from model.torch.VGG16_pretrained import VGG16_pretrained
 from model.torch.AlexNet8 import AlexNet8
+from model.torch.LeNet5 import LeNet5
 from model.torch.CustomDataset import CustomDataset
 from model.torch.EarlyStopper import EarlyStopper
 from model.utilities.plot_model import plot_model_fit
@@ -30,13 +28,13 @@
 from model.utilities.plot_generator import plot_generator
 from model.utilities.TimeIt import TimeIt
 from model.utilities.commandline_interface import commandline_interface
-
-# hyper-parameters
-num_epochs = cons.min_epochs if cons.FAST_RUN else cons.max_epochs
+from model.arch.load_image_v2 import load_image_v2, TorchLoadImages
 
 # device configuration
 device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu')
 
+random_state = 42
+
 torch_transforms = transforms.Compose([
     transforms.Resize(size=[cons.IMAGE_WIDTH, cons.IMAGE_HEIGHT]) # resize the input image to a uniform size
     #,transforms.RandomRotation(30)
@@ -53,39 +51,48 @@
     lgr.setLevel(logging.INFO)
     timeLogger = TimeIt()
 
+    logging.info("Parsing command line arguments...")
     # handle input parameters
     input_params_dict = commandline_interface()
-
+    logging.info(input_params_dict)
+    timeLogger.logTime(parentKey="Initialisation", subKey="CommandlineArguments")
+
     if input_params_dict["run_model_training"]:
-
+
         logging.info("Generating dataframe of images...")
-        # create a dataframe of filenames and categories
-        filenames = os.listdir(cons.train_fdir)
-        categories = [1 if filename.split('.')[0] == 'dog' else 0 for filename in filenames]
-        df = pd.DataFrame({'filename': filenames, 'category': categories})
-        frac = 0.05
-        df = df.sample(frac = frac)
-        df["categoryname"] = df["category"].replace(cons.category_mapper)
-        df['source'] = df['filename'].str.contains(pat='[cat|dog].[0-9]+.jpg', regex=True).map({True:'kaggle', False:'webscraper'})
-        df["filepath"] = cons.train_fdir + '/' + df['filename']
-        df["ndims"] = df['filepath'].apply(lambda x: len(np.array(Image.open(x)).shape))
-        df = df.loc[df["ndims"] == 3, :].copy()
+        # load and shuffle the image file paths
+        np.random.seed(random_state)
+        image_filepaths = np.array([os.path.join(cons.train_fdir, x) for x in os.listdir(cons.train_fdir)])
+        np.random.shuffle(image_filepaths)
+        # create torch load images object
+        sample_size = 30000
+        torchLoadImages = TorchLoadImages(torch_transforms=torch_transforms, n_workers=None)
+        df = pd.DataFrame.from_records(torchLoadImages.loadImages(image_filepaths[0:sample_size]))
+        # only consider images with 3 dimensions
+        df = df.loc[df["ndims"]==3, :]
+        # flush data from memory
+        del image_filepaths
+        logging.info(f"df.shape: {df.shape}")
         timeLogger.logTime(parentKey="DataPrep", subKey="TrainDataLoad")
 
         logging.info("Plot sample image...")
         # random image plot
-        sample = random.choice(filenames)
-        image = load_img(os.path.join(cons.train_fdir, sample))
-        plot_image(image, output_fpath=cons.torch_random_image_fpath, show_plot=False)
+        plot_image(df['images'].values[1], output_fpath=cons.torch_random_image_fpath, show_plot=False)
         timeLogger.logTime(parentKey="Plots", subKey="SampleImage")
 
+        logging.info("Plot example data loader images...")
+        # data generator example
+        plot_generator(generator=df['image_tensors'].values[:16].tolist(), mode='torch', output_fpath=cons.torch_generator_plot_fpath, show_plot=False)
+        timeLogger.logTime(parentKey="Plots", subKey="DataLoader")
+
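Seeding NumPy before the in-place shuffle makes the 30,000-image subsample reproducible from run to run. A compact sketch of the same shuffle-then-slice pattern, using hypothetical file names:

```python
import numpy as np

random_state = 42
image_filepaths = np.array([f'train/img_{i}.jpg' for i in range(100)])

np.random.seed(random_state)        # fix the permutation
np.random.shuffle(image_filepaths)  # shuffle in place
sample = image_filepaths[0:10]      # deterministic 10-image subsample
print(sample[:3])
```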
         logging.info("Split into training, validation and test dataset...")
         # prepare data
-        random_state = 42
-        validate_df = df[df['source'] == 'kaggle'].sample(n=int(5000 * frac), random_state=random_state)
+        validate_df = df.sample(frac=0.05, random_state=random_state, replace=False)
         train_df = df[~df.index.isin(validate_df.index)]
         train_df = train_df.reset_index(drop=True)
         validate_df = validate_df.reset_index(drop=True)
+        logging.info(f"train_df.shape: {train_df.shape}")
+        logging.info(f"validate_df.shape: {validate_df.shape}")
         timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationSplit")
 
         logging.info("Creating training and validation data loaders...")
@@ -93,23 +100,28 @@
         total_train = train_df.shape[0]
         total_validate = validate_df.shape[0]
         # set train data loader
-        train_dataset = CustomDataset(train_df, transforms=torch_transforms, mode='train')
-        train_loader = DataLoader(train_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True)
+        train_dataset = CustomDataset(train_df)
+        train_loader = DataLoader(train_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True, collate_fn=CustomDataset.collate_fn)
         # set validation data loader
-        validation_dataset = CustomDataset(train_df, transforms=torch_transforms, mode='train')
-        validation_loader = DataLoader(validation_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True)
+        validation_dataset = CustomDataset(validate_df)
+        validation_loader = DataLoader(validation_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True, collate_fn=CustomDataset.collate_fn)
+        # flush data from memory
+        del df
+        del train_df
+        del train_dataset
+        del validate_df
+        del validation_dataset
         timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationDataLoaders")
 
-        logging.info("Plot example data loader images...")
-        # datagen example
-        example_generator = [(image.detach().numpy(), None) for images, labels in train_loader for image in images]
-        plot_generator(generator=example_generator, mode='torch', output_fpath=cons.torch_generator_plot_fpath, show_plot=False)
-        timeLogger.logTime(parentKey="Plots", subKey="DataLoader")
-
         logging.info("Initiate torch model...")
+        logging.info(f"device: {device}")
         # initiate cnn architecture
+        #model = LeNet5(num_classes=2)
         #model = AlexNet8(num_classes=2).to(device)
         model = VGG16_pretrained(num_classes=2).to(device)
+        if device.type == "cuda":
+            model = nn.DataParallel(model)
+        model = model.to(device)
         criterion = nn.CrossEntropyLoss()
         optimizer = torch.optim.SGD(model.parameters(), lr=cons.learning_rate)
         scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, threshold=0.0001, threshold_mode='abs')
@@ -117,8 +129,13 @@
         timeLogger.logTime(parentKey="Modelling", subKey="InitiateTorchModel")
 
         logging.info("Fit torch model...")
+        # hyper-parameters
+        num_epochs = cons.min_epochs if cons.FAST_RUN else cons.max_epochs
         # fit torch model
-        model.fit(device=device, criterion=criterion, optimizer=optimizer, train_dataloader=train_loader, num_epochs=num_epochs, scheduler=scheduler, valid_dataLoader=validation_loader, early_stopper=early_stopper)
+        model.fit(device=device, criterion=criterion, optimizer=optimizer, train_dataloader=train_loader, num_epochs=num_epochs, scheduler=scheduler, valid_dataLoader=validation_loader, early_stopper=early_stopper, checkpoints_dir=cons.checkpoints_fdir, load_epoch_checkpoint=None)
+        # flush data from memory
+        del train_loader
+        del validation_loader
         timeLogger.logTime(parentKey="Modelling", subKey="Fit")
 
         logging.info("Plot model fit results...")
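Note the multi-GPU guard branches on device.type: a torch.device object generally does not compare equal to the plain string "cuda", so the type attribute is the reliable thing to test before wrapping the model in DataParallel. A hedged sketch, with nn.Linear standing in for this repo's VGG16_pretrained:

```python
import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = nn.Linear(10, 2)  # stand-in for VGG16_pretrained(num_classes=2)

# device == "cuda" compares a torch.device against a str and is unreliable;
# device.type is always the plain string 'cuda' or 'cpu'
if device.type == 'cuda' and torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
model = model.to(device)
```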
@@ -130,30 +147,28 @@
         # save model
         model.save(output_fpath=cons.torch_model_pt_fpath)
         timeLogger.logTime(parentKey="ModelSerialisation", subKey="Write")
-
+
     if input_params_dict["run_testset_prediction"]:
 
         logging.info("Load fitted torch model from disk...")
         # load model
+        #model = LeNet5(num_classes=2).to(device)
         #model = AlexNet8(num_classes=2).to(device)
         model = VGG16_pretrained(num_classes=2).to(device)
         model.load(input_fpath=cons.torch_model_pt_fpath)
         timeLogger.logTime(parentKey="ModelSerialisation", subKey="Load")
 
         logging.info("Generate test dataset...")
-        # prepare test data
-        test_filenames = os.listdir(cons.test_fdir)
-        test_df = pd.DataFrame({'filename': test_filenames})
-        test_df["filepath"] = cons.test_fdir + '/' + test_df['filename']
-        test_df["idx"] = test_df['filename'].str.extract(pat='([0-9]+)').astype(int)
-        test_df = test_df.set_index('idx').sort_index()
-        nb_samples = test_df.shape[0]
-        timeLogger.logTime(parentKey="TestSet", subKey="RawLoad")
+        # create torch load images object
+        torchLoadImages = TorchLoadImages(torch_transforms=torch_transforms, n_workers=None)
+        test_df = pd.DataFrame.from_records(torchLoadImages.loadImages(filepaths=[os.path.join(cons.test_fdir, x) for x in os.listdir(cons.test_fdir)]))
+        logging.info(f"test_df.shape: {test_df.shape}")
+        timeLogger.logTime(parentKey="DataPrep", subKey="TestDataLoad")
 
         logging.info("Create test dataloader...")
         # set train data loader
-        test_dataset = CustomDataset(test_df, transforms=torch_transforms, mode='test')
-        test_loader = DataLoader(test_dataset, batch_size=cons.batch_size, shuffle=False, num_workers=cons.num_workers, pin_memory=True)
+        test_dataset = CustomDataset(test_df)
+        test_loader = DataLoader(test_dataset, batch_size=cons.batch_size, shuffle=False, num_workers=cons.num_workers, pin_memory=True, collate_fn=CustomDataset.collate_fn)
         timeLogger.logTime(parentKey="TestSet", subKey="DataLoader")
 
         logging.info("Generate test set predictions...")
@@ -161,6 +176,9 @@
         predict = model.predict(test_loader, device)
         test_df['category'] = np.argmax(predict, axis=-1)
         test_df["category"] = test_df["category"].replace(cons.category_mapper)
+        # flush data from memory
+        del test_dataset
+        del test_loader
         timeLogger.logTime(parentKey="TestSet", subKey="ModelPredictions")
 
         logging.info("Plot example test set predictions...")
@@ -171,7 +189,10 @@
         logging.info("Generate a sample submission file for kaggle...")
         # make submission
         submission_df = test_df.copy()
-        submission_df['id'] = submission_df['filename'].str.split('.').str[0]
+        submission_df['id'] = submission_df['filenames'].str.split('.').str[0]
         submission_df['label'] = submission_df['category'].replace(cons.category_mapper)
         submission_df.to_csv(cons.submission_csv_fpath, index=False)
+        # delete dataframes from memory
+        del test_df
+        del submission_df
         timeLogger.logTime(parentKey="TestSet", subKey="SubmissionFile")
\ No newline at end of file
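The test-set branch ends by assembling the Kaggle submission frame from the renamed filenames column. A toy end-to-end sketch of that argmax-and-rename step, with a hypothetical mapper standing in for cons.category_mapper:

```python
import numpy as np
import pandas as pd

predict = np.array([[0.9, 0.1], [0.2, 0.8]])  # hypothetical class scores
test_df = pd.DataFrame({'filenames': ['1.jpg', '2.jpg']})
category_mapper = {0: 'cat', 1: 'dog'}        # assumed mapper contents

test_df['category'] = np.argmax(predict, axis=-1)       # winning class index
test_df['id'] = test_df['filenames'].str.split('.').str[0]
test_df['label'] = test_df['category'].replace(category_mapper)
print(test_df[['id', 'label']])  # id 1 -> cat, id 2 -> dog
```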
diff --git a/model/torch/AlexNet8.py b/model/torch/AlexNet8.py
index 99892b0..bf0b264 100644
--- a/model/torch/AlexNet8.py
+++ b/model/torch/AlexNet8.py
@@ -15,6 +15,7 @@ class AlexNet8(nn.Module):
 
     def __init__(self, num_classes=1000):
         super(AlexNet8, self).__init__()
+        self.model_id = "AlexNet8"
         self.features = nn.Sequential(
             nn.Conv2d(in_channels=3, out_channels=96, kernel_size=(11, 11), stride=(4, 4), padding='valid'),
             nn.ReLU(inplace=True),
@@ -59,7 +60,7 @@ def forward(self, x):
         return x
 
     @beartype
-    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None):
+    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None, checkpoints_dir:Union[str,None]=None, load_epoch_checkpoint:Union[int,None]=None):
         """Fits model to specified data loader given the criterion and optimizer
 
         Parameters
@@ -80,11 +81,15 @@ def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimize
             The torch data loader to use for validation when fitting the model, default is None
         early_stopper : EarlyStopper
             The EarlyStopper object for halting fitting when performing validation, default is None
+        checkpoints_dir : str
+            The local folder location where model epoch checkpoints are to be read from and written to, default is None
+        load_epoch_checkpoint : int
+            The epoch checkpoint to load and start from, default is None
 
         Returns
         -------
         """
-        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper)
+        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper, checkpoints_dir, load_epoch_checkpoint)
 
     @beartype
     def validate(self, device:torch.device, dataloader:torch.utils.data.DataLoader, criterion:torch.nn.CrossEntropyLoss) -> tuple:
diff --git a/model/torch/CustomDataset.py b/model/torch/CustomDataset.py
index 1f7cf2b..5084cb4 100644
--- a/model/torch/CustomDataset.py
+++ b/model/torch/CustomDataset.py
@@ -5,24 +5,23 @@
 
 class CustomDataset(Dataset):
 
-    def __init__(self, df, transforms, mode):
-        self.mode = mode
-        self.filepath = df['filepath'].tolist()
-        if mode == 'train':
-            self.category = df['category'].tolist()
-        self.transforms = transforms
+    def __init__(self, df):
+        self.image_tensors = df['image_tensors'].values
+        self.category_tensors = df['category_tensors'].values
 
     def __len__(self):
-        return len(self.filepath)
+        return len(self.image_tensors)
 
     def __getitem__(self, idx):
-        image_filepath = self.filepath[idx]
-        image = Image.open(image_filepath)
-        image = self.transforms(image)
+        image_tensor = self.image_tensors[idx]
+        category_tensor = self.category_tensors[idx]
+        return image_tensor, category_tensor
+
+    def __getitems__(self, idx_list):
+        image_tensors = torch.stack(self.image_tensors[idx_list].tolist())
+        category_tensors = torch.stack(self.category_tensors[idx_list].tolist())
+        return image_tensors, category_tensors
 
-        if self.mode == 'train':
-            label = torch.tensor(self.category[idx], dtype=torch.int64)
-        else:
-            label = torch.tensor(np.nan)
-
-        return image, label
\ No newline at end of file
+    @staticmethod
+    def collate_fn(data):
+        arrays, categories = data
+        return arrays, categories
\ No newline at end of file
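When a Dataset defines __getitems__ (supported by recent PyTorch DataLoaders), the loader fetches the whole batch of indices in a single call and passes the result to collate_fn unchanged, which is why the pass-through collate_fn above works. A self-contained sketch of the mechanism, assuming torch 2.x:

```python
import torch
from torch.utils.data import Dataset, DataLoader

class StackedDataset(Dataset):
    def __init__(self, images, labels):
        self.images = images  # list of CHW tensors
        self.labels = labels  # list of scalar tensors

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        return self.images[idx], self.labels[idx]

    def __getitems__(self, idx_list):
        # batch fetch: stack once here instead of per-sample collation
        images = torch.stack([self.images[i] for i in idx_list])
        labels = torch.stack([self.labels[i] for i in idx_list])
        return images, labels

    @staticmethod
    def collate_fn(batch):
        # __getitems__ already returns stacked tensors, so pass them through
        return batch

ds = StackedDataset([torch.rand(3, 8, 8) for _ in range(10)],
                    [torch.tensor(i % 2) for i in range(10)])
loader = DataLoader(ds, batch_size=4, collate_fn=StackedDataset.collate_fn)
images, labels = next(iter(loader))
print(images.shape, labels.shape)  # torch.Size([4, 3, 8, 8]) torch.Size([4])
```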
diff --git a/model/torch/LeNet5.py b/model/torch/LeNet5.py
index d9edfb9..0300051 100644
--- a/model/torch/LeNet5.py
+++ b/model/torch/LeNet5.py
@@ -15,6 +15,7 @@ class LeNet5(nn.Module):
 
     def __init__(self, num_classes=1000):
         super(LeNet5, self).__init__()
+        self.model_id = "LeNet5"
         self.features = nn.Sequential(
             nn.Conv2d(in_channels = 3, out_channels = 6, kernel_size=(5, 5), stride = (1, 1), padding = 'valid'),
             nn.ReLU(inplace=True),
@@ -53,7 +54,7 @@ def forward(self, x):
         return x
 
     @beartype
-    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None):
+    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None, checkpoints_dir:Union[str,None]=None, load_epoch_checkpoint:Union[int,None]=None):
         """Fits model to specified data loader given the criterion and optimizer
 
         Parameters
@@ -74,11 +75,15 @@ def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimize
             The torch data loader to use for validation when fitting the model, default is None
         early_stopper : EarlyStopper
             The EarlyStopper object for halting fitting when performing validation, default is None
+        checkpoints_dir : str
+            The local folder location where model epoch checkpoints are to be read from and written to, default is None
+        load_epoch_checkpoint : int
+            The epoch checkpoint to load and start from, default is None
 
         Returns
         -------
         """
-        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper)
+        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper, checkpoints_dir, load_epoch_checkpoint)
 
     @beartype
     def validate(self, device:torch.device, dataloader:torch.utils.data.DataLoader, criterion:torch.nn.CrossEntropyLoss) -> tuple:
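Each architecture now carries a model_id attribute, which fit.py interpolates into checkpoint filenames so checkpoints from different architectures cannot collide. The naming scheme in miniature:

```python
model_id = "LeNet5"  # set in each architecture's __init__
epoch = 7
checkpoints_filename = f"checkpoint_{model_id}_epoch_{epoch}.pt"
print(checkpoints_filename)  # checkpoint_LeNet5_epoch_7.pt
```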
diff --git a/model/torch/ResNet50_pretrained.py b/model/torch/ResNet50_pretrained.py
index e084061..be33d6b 100644
--- a/model/torch/ResNet50_pretrained.py
+++ b/model/torch/ResNet50_pretrained.py
@@ -14,6 +14,7 @@ class ResNet50_pretrained(nn.Module):
 
     def __init__(self, num_classes=1000):
         super(ResNet50_pretrained, self).__init__()
+        self.model_id = "ResNet50_pretrained"
         self.resnet = models.resnet50(pretrained=True)
         self.num_ftrs = self.resnet.fc.out_features
@@ -38,7 +39,7 @@ def forward(self, x):
         return x
 
     @beartype
-    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None):
+    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None, checkpoints_dir:Union[str,None]=None, load_epoch_checkpoint:Union[int,None]=None):
         """Fits model to specified data loader given the criterion and optimizer
 
         Parameters
@@ -59,11 +60,15 @@ def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimize
             The torch data loader to use for validation when fitting the model, default is None
         early_stopper : EarlyStopper
             The EarlyStopper object for halting fitting when performing validation, default is None
+        checkpoints_dir : str
+            The local folder location where model epoch checkpoints are to be read from and written to, default is None
+        load_epoch_checkpoint : int
+            The epoch checkpoint to load and start from, default is None
 
         Returns
         -------
         """
-        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper)
+        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper, checkpoints_dir, load_epoch_checkpoint)
 
     @beartype
     def validate(self, device:torch.device, dataloader:torch.utils.data.DataLoader, criterion:torch.nn.CrossEntropyLoss) -> tuple:
diff --git a/model/torch/VGG16.py b/model/torch/VGG16.py
index 3b416e2..11d9d5f 100644
--- a/model/torch/VGG16.py
+++ b/model/torch/VGG16.py
@@ -15,6 +15,7 @@ class VGG16(nn.Module):
 
     def __init__(self, num_classes=1000):
         super(VGG16, self).__init__()
+        self.model_id = "VGG16"
         self.features = nn.Sequential(
             # first convulation and pooling layer
             nn.Conv2d(in_channels=3, out_channels=64, kernel_size=(3, 3), stride=(1, 1), padding='same'),
@@ -85,7 +86,7 @@ def forward(self, x):
         return x
 
     @beartype
-    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None):
+    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None, checkpoints_dir:Union[str,None]=None, load_epoch_checkpoint:Union[int,None]=None):
         """Fits model to specified data loader given the criterion and optimizer
 
         Parameters
@@ -106,11 +107,15 @@ def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimize
             The torch data loader to use for validation when fitting the model, default is None
         early_stopper : EarlyStopper
             The EarlyStopper object for halting fitting when performing validation, default is None
+        checkpoints_dir : str
+            The local folder location where model epoch checkpoints are to be read from and written to, default is None
+        load_epoch_checkpoint : int
+            The epoch checkpoint to load and start from, default is None
 
         Returns
         -------
         """
-        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper)
+        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper, checkpoints_dir, load_epoch_checkpoint)
 
     @beartype
     def validate(self, device:torch.device, dataloader:torch.utils.data.DataLoader, criterion:torch.nn.CrossEntropyLoss) -> tuple:
diff --git a/model/torch/VGG16_pretrained.py b/model/torch/VGG16_pretrained.py
index 865f3ba..4767f64 100644
--- a/model/torch/VGG16_pretrained.py
+++ b/model/torch/VGG16_pretrained.py
@@ -15,6 +15,7 @@ class VGG16_pretrained(nn.Module):
 
     def __init__(self, num_classes=1000):
         super(VGG16_pretrained, self).__init__()
+        self.model_id = "VGG16_pretrained"
         self.resnet = models.vgg16(weights ="DEFAULT")
         self.num_ftrs = self.resnet.classifier[len(self.resnet.classifier)-1].out_features
         self.classifier = nn.Sequential(nn.Linear(in_features=self.num_ftrs, out_features=num_classes))
@@ -38,7 +39,7 @@ def forward(self, x):
         return x
 
     @beartype
-    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None):
+    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None, checkpoints_dir:Union[str,None]=None, load_epoch_checkpoint:Union[int,None]=None):
         """Fits model to specified data loader given the criterion and optimizer
 
         Parameters
@@ -59,11 +60,15 @@ def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimize
             The torch data loader to use for validation when fitting the model, default is None
         early_stopper : EarlyStopper
             The EarlyStopper object for halting fitting when performing validation, default is None
+        checkpoints_dir : str
+            The local folder location where model epoch checkpoints are to be read from and written to, default is None
+        load_epoch_checkpoint : int
+            The epoch checkpoint to load and start from, default is None
 
         Returns
         -------
         """
-        self, self.model_fit=fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper)
+        self, self.model_fit=fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper, checkpoints_dir, load_epoch_checkpoint)
 
     @beartype
     def validate(self, device:torch.device, dataloader:torch.utils.data.DataLoader, criterion:torch.nn.CrossEntropyLoss) -> tuple:
diff --git a/model/torch/checkpoints.py b/model/torch/checkpoints.py
new file mode 100644
index 0000000..a031bb7
--- /dev/null
+++ b/model/torch/checkpoints.py
@@ -0,0 +1,34 @@
+import torch
+from beartype import beartype
+
+@beartype
+def save_checkpoint(state:dict, filepath:str):
+    """Save model training checkpoints to local file path
+
+    Parameters
+    ----------
+    state : dict
+        The model state dictionary to write to disk
+    filepath : str
+        The local file path to write the checkpoints to
+
+    Returns
+    -------
+    """
+    torch.save(state, filepath)
+
+@beartype
+def load_checkpoint(filepath:str) -> dict:
+    """Load model training checkpoints from local file path
+
+    Parameters
+    ----------
+    filepath : str
+        The local file path to read the checkpoints from
+
+    Returns
+    -------
+    checkpoint : dict
+        The model training checkpoint dictionary read from disk
+    """
+    # load checkpoint file
+    checkpoint = torch.load(filepath, weights_only=False)
+    return checkpoint
\ No newline at end of file
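A hedged usage sketch for the two checkpoint helpers, assuming a writable ./checkpoints directory and mirroring the filename convention used in fit.py; weights_only=False matches the repo's load_checkpoint, since the saved state holds more than bare tensors:

```python
import os
import torch
import torch.nn as nn

model = nn.Linear(4, 2)  # stand-in for one of the repo's architectures
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
epoch = 3

# save: bundle model and optimizer state under the fit.py naming scheme
state = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
os.makedirs('checkpoints', exist_ok=True)
filepath = os.path.join('checkpoints', f'checkpoint_Linear_epoch_{epoch}.pt')
torch.save(state, filepath)  # what save_checkpoint(state, filepath) does

# resume: restore both state dictionaries from the checkpoint dictionary
checkpoint = torch.load(filepath, weights_only=False)
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
```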
diff --git a/model/torch/fit.py b/model/torch/fit.py
index c39d800..b4fbaf2 100644
--- a/model/torch/fit.py
+++ b/model/torch/fit.py
@@ -1,12 +1,15 @@
+import os
 import torch
+import logging
 from model.torch.validate import validate
 from model.torch.ModelFit import ModelFit
 from model.torch.EarlyStopper import EarlyStopper
+from model.torch.checkpoints import save_checkpoint, load_checkpoint
 from typing import Union
 from beartype import beartype
 
 @beartype
-def fit(model, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None):
+def fit(model, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None, checkpoints_dir:Union[str,None]=None, load_epoch_checkpoint:Union[int,None]=None):
     """
     Fits model to specified data loader given the criterion and optimizer
 
@@ -30,23 +33,40 @@ def fit(model, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimiz
         The torch data loader to use for validation when fitting the model, default is None
     early_stopper : EarlyStopper
         The EarlyStopper object for halting fitting when performing validation
+    checkpoints_dir : str
+        The local folder location where model epoch checkpoints are to be read from and written to, default is None
+    load_epoch_checkpoint : int
+        The epoch checkpoint to load and start from, default is None
 
     Returns
    -------
     """
+    start_epoch = 1
+    end_epoch = num_epochs + 1
     train_loss_list, train_acc_list, valid_loss_list, valid_acc_list = [], [], [], []
     model = model.to(device)
     n_total_steps = len(train_dataloader)
-    for epoch in range(num_epochs):
+    # load previous model epoch checkpoint
+    if (load_epoch_checkpoint != None) and (checkpoints_dir != None):
+        checkpoints_filename = f"checkpoint_{model.model_id}_epoch_{load_epoch_checkpoint}.pt"
+        checkpoint_filepath = os.path.join(checkpoints_dir, checkpoints_filename)
+        checkpoint = load_checkpoint(filepath=checkpoint_filepath)
+        # assign state dictionaries from checkpoint dictionary
+        model.load_state_dict(checkpoint['state_dict'])
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        start_epoch = load_epoch_checkpoint + 1
+        train_loss_list, train_acc_list, valid_loss_list, valid_acc_list = checkpoint["train_loss_list"], checkpoint["train_acc_list"], checkpoint["valid_loss_list"], checkpoint["valid_acc_list"]
+        logging.info(f"Read checkpoints from: {checkpoint_filepath}")
+    for epoch in range(start_epoch, end_epoch):
         t_loss, t_corr = 0.0, 0.0
         model.train()
         for i, (images, labels) in enumerate(train_dataloader):
             # load images and labels to device
             images = images.to(device)
-            label = labels.to(device)
+            labels = labels.to(device)
             # forward pass
             preds = model.forward(images)
-            loss = criterion(preds, label)
+            loss = criterion(preds, labels)
             if scheduler != None:
                 scheduler.step(loss)
             # backward and optimise
@@ -56,23 +76,31 @@ def fit(model, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimiz
             # calculate metrics
             t_loss += loss.item() * images.size(0)
             t_corr += torch.sum(preds.argmax(1) == labels)
-            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')
-        # update training loss and accuarcy
+            logging.info(f'Epoch [{epoch}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')
+        # update training loss and accuracy
         train_loss = t_loss / len(train_dataloader.dataset)
-        train_acc = t_corr.cpu().numpy() / len(train_dataloader.dataset)
+        train_acc = t_corr.item() / len(train_dataloader.dataset)
         train_loss_list.append(train_loss)
         train_acc_list.append(train_acc)
-        print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}')
+        logging.info(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}')
         # calculate validation loss and accuracy if applicable
         if valid_dataLoader != None:
             valid_loss, valid_acc = validate(model=model, device=device, dataloader=valid_dataLoader, criterion=criterion)
             valid_loss_list.append(valid_loss)
             valid_acc_list.append(valid_acc)
-            print(f'Valid Loss: {loss.item():.4f}, Valid Accuracy: {valid_acc:.4f}')
+            logging.info(f'Valid Loss: {loss.item():.4f}, Valid Accuracy: {valid_acc:.4f}')
             # if implementing early stopping
             if early_stopper != None and early_stopper.early_stop(valid_loss):
-                print(f"Applying early stopping criteria at validation loss: {valid_loss}")
+                logging.info(f"Applying early stopping criteria at validation loss: {valid_loss}")
                 break
+        # writing model checkpoints
+        if (checkpoints_dir != None):
+            checkpoints_filename = f"checkpoint_{model.model_id}_epoch_{epoch}.pt"
+            checkpoint_filepath = os.path.join(checkpoints_dir, checkpoints_filename)
+            checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), "train_loss_list":train_loss_list, "train_acc_list":train_acc_list, "valid_loss_list":valid_loss_list, "valid_acc_list":valid_acc_list}
+            save_checkpoint(state=checkpoint, filepath=checkpoint_filepath)
+            logging.info(f"Wrote checkpoints to: {checkpoint_filepath}")
+
     # create model fit object
     model_fit = ModelFit(loss=train_loss_list, accuracy=train_acc_list, val_loss=valid_loss_list, val_accuracy=valid_acc_list)
     return model, model_fit
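With the new one-based epoch numbering, resuming from checkpoint k trains epochs k+1 through num_epochs inclusive. A quick check of the range arithmetic:

```python
num_epochs = 10
load_epoch_checkpoint = 4  # hypothetical: checkpoint_..._epoch_4.pt exists

start_epoch = load_epoch_checkpoint + 1
end_epoch = num_epochs + 1
print(list(range(start_epoch, end_epoch)))  # [5, 6, 7, 8, 9, 10]
```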
diff --git a/model/torch/predict.py b/model/torch/predict.py
index 2c8213d..74e6117 100644
--- a/model/torch/predict.py
+++ b/model/torch/predict.py
@@ -24,6 +24,6 @@ def predict(model, dataloader:torch.utils.data.DataLoader, device:torch.device)
             images = images.to(device)
             labels = labels.to(device)
             outputs = model.forward(images)
-            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
+            fin_outputs.extend(torch.sigmoid(outputs).tolist())
     proba = np.array(fin_outputs)
     return proba
\ No newline at end of file
diff --git a/model/torch/validate.py b/model/torch/validate.py
index d2a95a8..072559c 100644
--- a/model/torch/validate.py
+++ b/model/torch/validate.py
@@ -35,7 +35,7 @@ def validate(model, device:torch.device, dataloader:torch.utils.data.DataLoader,
             v_corr += torch.sum(preds.argmax(1) == labels)
     # update training loss and accuarcy
     valid_loss = v_loss / len(dataloader.dataset)
-    valid_acc = v_corr.cpu().numpy() / len(dataloader.dataset)
+    valid_acc = v_corr.item() / len(dataloader.dataset)
 
     return (valid_loss, valid_acc)
 
diff --git a/model/utilities/plot_generator.py b/model/utilities/plot_generator.py
index 5e0a7cd..6dfb8c0 100644
--- a/model/utilities/plot_generator.py
+++ b/model/utilities/plot_generator.py
@@ -36,7 +36,7 @@ def plot_generator(
             plt.imshow(image)
             break
         elif mode == 'torch':
-            X_batch, Y_batch = generator[i]
+            X_batch = generator[i]
             image = X_batch[0]
             plt.imshow(image)
     plt.tight_layout()
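The accuracy fixes in validate.py and fit.py both swap .cpu().numpy() for .item(), which moves the zero-dimensional tensor to host memory and unwraps it to a plain Python number in one call. A tiny check:

```python
import torch

preds = torch.tensor([1, 0, 1, 1])
labels = torch.tensor([1, 1, 1, 0])

# torch.sum over the elementwise comparison yields a 0-d tensor
v_corr = torch.sum(preds == labels)
valid_acc = v_corr.item() / 4  # .item() -> Python int, so this is float division
print(valid_acc)  # 0.5
```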
diff --git a/model/utilities/plot_preds.py b/model/utilities/plot_preds.py
index 7850dee..2cf2caa 100644
--- a/model/utilities/plot_preds.py
+++ b/model/utilities/plot_preds.py
@@ -29,7 +29,7 @@ def plot_preds(
     sample_test = data.head(18)
     plt.figure(figsize=(12, 24))
     for id, (index, row) in enumerate(sample_test.iterrows()):
-        filename = row['filename']
+        filename = row['filenames']
         category = row['category']
         img = load_img(os.path.join(cons.test_fdir, filename), target_size=cons.IMAGE_SIZE)
         plt.subplot(6, 3, id+1)