From 00b080093c5aa79a3b18dfa11b241af1bc9ffe99 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 19:18:33 +0000 Subject: [PATCH 01/30] Calling --gpus all and passing check gpu env variable --- aws/linux_docker_setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws/linux_docker_setup.sh b/aws/linux_docker_setup.sh index e0196ff..488981c 100644 --- a/aws/linux_docker_setup.sh +++ b/aws/linux_docker_setup.sh @@ -97,6 +97,6 @@ cat ~/.creds/docker | docker login --username oislen --password-stdin docker pull $docker_image # run pulled docker container #docker run --shm-size=512m -p 8889:8888 -it $docker_image -docker run --name $docker_container_name --shm-size=512m --publish 8888:8888 --volume /home/ubuntu/CatClassifier/.creds:/home/ubuntu/CatClassifier/.creds --volume /home/ubuntu/CatClassifier/report:/home/ubuntu/CatClassifier/report --rm -it --entrypoint bash $docker_image +docker run --name $docker_container_name --shm-size=512m --publish 8888:8888 --volume /home/ubuntu/CatClassifier/.creds:/home/ubuntu/CatClassifier/.creds --volume /home/ubuntu/CatClassifier/report:/home/ubuntu/CatClassifier/report --gpus all --env PARAM_CHECK_GPU=True -it --entrypoint bash $docker_image #docker run --shm-size=512m -p 8889:8888 -d $docker_image #docker run -it -d /bin/bash \ No newline at end of file From 3dd4c082b51244a5f2a2fde845d7dcca57c0b141 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 19:51:04 +0000 Subject: [PATCH 02/30] Added extra logging and testing with AlexNet8 --- model/prg_torch_model.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/model/prg_torch_model.py b/model/prg_torch_model.py index ab1f1db..7a3c253 100644 --- a/model/prg_torch_model.py +++ b/model/prg_torch_model.py @@ -63,13 +63,14 @@ filenames = os.listdir(cons.train_fdir) categories = [1 if filename.split('.')[0] == 'dog' else 0 for filename in filenames] df = pd.DataFrame({'filename': filenames, 'category': categories}) - frac = 0.05 + frac = 0.001 df = df.sample(frac = frac) df["categoryname"] = df["category"].replace(cons.category_mapper) df['source'] = df['filename'].str.contains(pat='[cat|dog].[0-9]+.jpg', regex=True).map({True:'kaggle', False:'webscraper'}) df["filepath"] = cons.train_fdir + '/' + df['filename'] df["ndims"] = df['filepath'].apply(lambda x: len(np.array(Image.open(x)).shape)) df = df.loc[df["ndims"] == 3, :].copy() + logging.info(f"df.shape: {df.shape}") timeLogger.logTime(parentKey="DataPrep", subKey="TrainDataLoad") logging.info("Plot sample image...") @@ -86,6 +87,8 @@ train_df = df[~df.index.isin(validate_df.index)] train_df = train_df.reset_index(drop=True) validate_df = validate_df.reset_index(drop=True) + logging.info(f"train_df.shape: {train_df.shape}") + logging.info(f"validate_df.shape: {validate_df.shape}") timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationSplit") logging.info("Creating training and validation data loaders...") @@ -102,14 +105,16 @@ logging.info("Plot example data loader images...") # datagen example - example_generator = [(image.detach().numpy(), None) for images, labels in train_loader for image in images] + example_dataset = CustomDataset(train_df.head(16), transforms=torch_transforms, mode='train') + example_loader = DataLoader(example_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True) + example_generator = [(image.detach().numpy(), None) for images, labels in example_loader for image in images] 
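A side note on the PARAM_CHECK_GPU flag wired through the docker run command in the first patch: os.environ.get always returns a string when the variable is set, so even a value of "False" would be truthy in the device check. A minimal sketch of an explicit parse, assuming the variable name and usage from model/cons.py and prg_torch_model.py:

import os
import torch

# environment variables arrive as strings; parse the flag explicitly rather
# than relying on the truthiness of a non-empty string like "False"
check_gpu = os.environ.get("PARAM_CHECK_GPU", "False").lower() in ("true", "1")
device = torch.device("cuda" if torch.cuda.is_available() and check_gpu else "cpu")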
plot_generator(generator=example_generator, mode='torch', output_fpath=cons.torch_generator_plot_fpath, show_plot=False) timeLogger.logTime(parentKey="Plots", subKey="DataLoader") logging.info("Initiate torch model...") # initiate cnn architecture - #model = AlexNet8(num_classes=2).to(device) - model = VGG16_pretrained(num_classes=2).to(device) + model = AlexNet8(num_classes=2).to(device) + #model = VGG16_pretrained(num_classes=2).to(device) criterion = nn.CrossEntropyLoss() optimizer = torch.optim.SGD(model.parameters(), lr=cons.learning_rate) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, threshold=0.0001, threshold_mode='abs') @@ -147,7 +152,7 @@ test_df["filepath"] = cons.test_fdir + '/' + test_df['filename'] test_df["idx"] = test_df['filename'].str.extract(pat='([0-9]+)').astype(int) test_df = test_df.set_index('idx').sort_index() - nb_samples = test_df.shape[0] + logging.info(f"train_df.shape: {test_df.shape}") timeLogger.logTime(parentKey="TestSet", subKey="RawLoad") logging.info("Create test dataloader...") From 7c9444c4bc27667af6880818049ef4ff034835f6 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 19:55:23 +0000 Subject: [PATCH 03/30] Restructured code and testing with LeNet5 --- model/prg_torch_model.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/model/prg_torch_model.py b/model/prg_torch_model.py index 7a3c253..10b9602 100644 --- a/model/prg_torch_model.py +++ b/model/prg_torch_model.py @@ -22,6 +22,7 @@ import cons from model.torch.VGG16_pretrained import VGG16_pretrained from model.torch.AlexNet8 import AlexNet8 +from model.torch.LeNet5 import LeNet5 from model.torch.CustomDataset import CustomDataset from model.torch.EarlyStopper import EarlyStopper from model.utilities.plot_model import plot_model_fit @@ -31,12 +32,6 @@ from model.utilities.TimeIt import TimeIt from model.utilities.commandline_interface import commandline_interface -# hyper-parameters -num_epochs = cons.min_epochs if cons.FAST_RUN else cons.max_epochs - -# device configuration -device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu') - torch_transforms = transforms.Compose([ transforms.Resize(size=[cons.IMAGE_WIDTH, cons.IMAGE_HEIGHT]) # resize the input image to a uniform size #,transforms.RandomRotation(30) @@ -112,8 +107,11 @@ timeLogger.logTime(parentKey="Plots", subKey="DataLoader") logging.info("Initiate torch model...") + # device configuration + device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu') + logging.info(f"device: {device}") # initiate cnn architecture - model = AlexNet8(num_classes=2).to(device) + model = LeNet5(num_classes=2).to(device) #model = VGG16_pretrained(num_classes=2).to(device) criterion = nn.CrossEntropyLoss() optimizer = torch.optim.SGD(model.parameters(), lr=cons.learning_rate) @@ -122,6 +120,8 @@ timeLogger.logTime(parentKey="Modelling", subKey="InitiateTorchModel") logging.info("Fit torch model...") + # hyper-parameters + num_epochs = cons.min_epochs if cons.FAST_RUN else cons.max_epochs # fit torch model model.fit(device=device, criterion=criterion, optimizer=optimizer, train_dataloader=train_loader, num_epochs=num_epochs, scheduler=scheduler, valid_dataLoader=validation_loader, early_stopper=early_stopper) timeLogger.logTime(parentKey="Modelling", subKey="Fit") From 3b7ce0cd4bc99f6e580893877cea9feae1ac54ef Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 20:13:51 +0000 Subject: [PATCH 04/30] 
Moving labels to device --- model/torch/fit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/torch/fit.py b/model/torch/fit.py index c39d800..6d9f292 100644 --- a/model/torch/fit.py +++ b/model/torch/fit.py @@ -55,7 +55,7 @@ def fit(model, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimiz optimizer.step() # calculate metrics t_loss += loss.item() * images.size(0) - t_corr += torch.sum(preds.argmax(1) == labels) + t_corr += torch.sum(preds.argmax(1) == labels.to(device)) print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}') # update training loss and accuarcy train_loss = t_loss / len(train_dataloader.dataset) From d7328f7e96121d03af38de1d1fa55d8fe613faa1 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 20:16:31 +0000 Subject: [PATCH 05/30] Reverted frac to 0.05 --- model/prg_torch_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/prg_torch_model.py b/model/prg_torch_model.py index 10b9602..ee8cad4 100644 --- a/model/prg_torch_model.py +++ b/model/prg_torch_model.py @@ -58,7 +58,7 @@ filenames = os.listdir(cons.train_fdir) categories = [1 if filename.split('.')[0] == 'dog' else 0 for filename in filenames] df = pd.DataFrame({'filename': filenames, 'category': categories}) - frac = 0.001 + frac = 0.05 df = df.sample(frac = frac) df["categoryname"] = df["category"].replace(cons.category_mapper) df['source'] = df['filename'].str.contains(pat='[cat|dog].[0-9]+.jpg', regex=True).map({True:'kaggle', False:'webscraper'}) From 13e3bb594dada24db7336f54a6144850c6909be6 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 20:25:44 +0000 Subject: [PATCH 06/30] Training with VGG16 --- model/prg_torch_model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/model/prg_torch_model.py b/model/prg_torch_model.py index ee8cad4..565bae8 100644 --- a/model/prg_torch_model.py +++ b/model/prg_torch_model.py @@ -111,8 +111,9 @@ device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu') logging.info(f"device: {device}") # initiate cnn architecture - model = LeNet5(num_classes=2).to(device) - #model = VGG16_pretrained(num_classes=2).to(device) + #model = LeNet5(num_classes=2).to(device) + #model = AlexNet8(num_classes=2).to(device) + model = VGG16_pretrained(num_classes=2).to(device) criterion = nn.CrossEntropyLoss() optimizer = torch.optim.SGD(model.parameters(), lr=cons.learning_rate) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, threshold=0.0001, threshold_mode='abs') From a7a681a3c2575c2bdf3bd55b5330f41e92b16b9f Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 13 Feb 2025 21:44:20 +0000 Subject: [PATCH 07/30] Removed cpu usage --- model/torch/fit.py | 2 +- model/torch/predict.py | 2 +- model/torch/validate.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/model/torch/fit.py b/model/torch/fit.py index 6d9f292..91ddb56 100644 --- a/model/torch/fit.py +++ b/model/torch/fit.py @@ -59,7 +59,7 @@ def fit(model, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimiz print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}') # update training loss and accuarcy train_loss = t_loss / len(train_dataloader.dataset) - train_acc = t_corr.cpu().numpy() / len(train_dataloader.dataset) + train_acc = t_corr.item() / len(train_dataloader.dataset) train_loss_list.append(train_loss) train_acc_list.append(train_acc) 
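The two changes above follow the same device rule: both operands of the == comparison must live on the same device, and a zero-dimensional result tensor yields a Python scalar via .item() on CPU and GPU alike, making the earlier .cpu().numpy() round trip unnecessary. A small self-contained illustration with made-up values:

import torch

device = torch.device("cpu")  # stands in for the cuda device used in training
preds = torch.tensor([[0.2, 0.8], [0.9, 0.1], [0.3, 0.7]]).to(device)  # model outputs
labels = torch.tensor([1, 0, 0])

# comparing tensors on different devices raises a RuntimeError, hence .to(device)
t_corr = torch.sum(preds.argmax(1) == labels.to(device))

# .item() converts the 0-dim count tensor straight to a Python number
train_acc = t_corr.item() / len(labels)  # 2 of 3 correct -> 0.666...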
print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}') diff --git a/model/torch/predict.py b/model/torch/predict.py index 2c8213d..74e6117 100644 --- a/model/torch/predict.py +++ b/model/torch/predict.py @@ -24,6 +24,6 @@ def predict(model, dataloader:torch.utils.data.DataLoader, device:torch.device) images = images.to(device) labels = labels.to(device) outputs = model.forward(images) - fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist()) + fin_outputs.extend(torch.sigmoid(outputs).tolist()) proba = np.array(fin_outputs) return proba \ No newline at end of file diff --git a/model/torch/validate.py b/model/torch/validate.py index d2a95a8..692d77c 100644 --- a/model/torch/validate.py +++ b/model/torch/validate.py @@ -32,10 +32,10 @@ def validate(model, device:torch.device, dataloader:torch.utils.data.DataLoader, loss = criterion(preds, labels) # calculate metrics v_loss += loss.item() * images.size(0) - v_corr += torch.sum(preds.argmax(1) == labels) + v_corr += torch.sum(preds.argmax(1) == labels.to(device)) # update training loss and accuarcy valid_loss = v_loss / len(dataloader.dataset) - valid_acc = v_corr.cpu().numpy() / len(dataloader.dataset) + valid_acc = v_corr.item() / len(dataloader.dataset) return (valid_loss, valid_acc) From 4c7efa3bb7ce6adfb3c28e92ff7ea3cccedc7648 Mon Sep 17 00:00:00 2001 From: Oisin Date: Fri, 14 Feb 2025 12:03:18 +0000 Subject: [PATCH 08/30] Added extra logging --- model/prg_keras_model.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/model/prg_keras_model.py b/model/prg_keras_model.py index 30570cd..6d3be61 100644 --- a/model/prg_keras_model.py +++ b/model/prg_keras_model.py @@ -42,6 +42,7 @@ df = pd.DataFrame({'filename': filenames, 'category': categories}) df["category"] = df["category"].replace(cons.category_mapper) df['source'] = df['filename'].str.contains(pat='[cat|dog].[0-9]+.jpg', regex=True).map({True:'kaggle', False:'webscraper'}) + logging.info(f"df.shape: {df.shape}") timeLogger.logTime(parentKey="DataPrep", subKey="TrainDataLoad") logging.info("Plot sample image...") @@ -60,6 +61,8 @@ # set data constants total_train = train_df.shape[0] total_validate = validate_df.shape[0] + logging.info(f"train_df.shape: {train_df.shape}") + logging.info(f"validate_df.shape: {validate_df.shape}") timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationSplit") logging.info("Creating training and validation data generators...") @@ -129,7 +132,7 @@ # prepare test data test_filenames = os.listdir(cons.test_fdir) test_df = pd.DataFrame({'filename': test_filenames}) - nb_samples = test_df.shape[0] + logging.info(f"train_df.shape: {test_df.shape}") timeLogger.logTime(parentKey="TestSet", subKey="RawLoad") logging.info("Create test data generator...") @@ -140,7 +143,7 @@ logging.info("Generate test set predictions...") # make test set predictions - predict = keras_model.predict(test_generator, steps=int(np.ceil(nb_samples/cons.batch_size))) + predict = keras_model.predict(test_generator, steps=int(np.ceil(test_df.shape[0]/cons.batch_size))) test_df['category'] = np.argmax(predict, axis=-1) test_df['category'] = test_df['category'].replace(cons.category_mapper) timeLogger.logTime(parentKey="TestSet", subKey="ModelPredictions") From bfa36627372845e60e7101c1bd2cd1ecf9b8a639 Mon Sep 17 00:00:00 2001 From: Oisin Date: Fri, 14 Feb 2025 12:03:40 +0000 Subject: [PATCH 09/30] Unpacking single value --- model/utilities/plot_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/model/utilities/plot_generator.py b/model/utilities/plot_generator.py index 5e0a7cd..6dfb8c0 100644 --- a/model/utilities/plot_generator.py +++ b/model/utilities/plot_generator.py @@ -36,7 +36,7 @@ def plot_generator( plt.imshow(image) break elif mode == 'torch': - X_batch, Y_batch = generator[i] + X_batch = generator[i] image = X_batch[0] plt.imshow(image) plt.tight_layout() From 2f93d3b4a167b130033f94e74a7d7d9a3567d111 Mon Sep 17 00:00:00 2001 From: Oisin Date: Fri, 14 Feb 2025 13:51:03 +0000 Subject: [PATCH 10/30] Created a new load images script using PIL image --- model/arch/load_image_v2.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 model/arch/load_image_v2.py diff --git a/model/arch/load_image_v2.py b/model/arch/load_image_v2.py new file mode 100644 index 0000000..bb17e82 --- /dev/null +++ b/model/arch/load_image_v2.py @@ -0,0 +1,14 @@ + +import pandas as pd +from PIL import Image + +def load_image_v2(image_fpaths): + """ + """ + images = [] + for image_fpath in image_fpaths: + temp = Image.open(image_fpath) + keep = temp.copy() + images.append(keep) + temp.close() + return pd.Series(images) \ No newline at end of file From 944bc2e8577b285c62639e85c72ae51cb463983a Mon Sep 17 00:00:00 2001 From: Oisin Date: Fri, 14 Feb 2025 13:51:50 +0000 Subject: [PATCH 11/30] Moved torch transform operations outside of custom data set --- model/prg_torch_model.py | 57 ++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/model/prg_torch_model.py b/model/prg_torch_model.py index 565bae8..9c338bf 100644 --- a/model/prg_torch_model.py +++ b/model/prg_torch_model.py @@ -16,14 +16,13 @@ import torch.nn as nn from torch.utils.data import DataLoader from torchvision import transforms -from tensorflow.keras.preprocessing.image import load_img # load custom scripts import cons from model.torch.VGG16_pretrained import VGG16_pretrained from model.torch.AlexNet8 import AlexNet8 from model.torch.LeNet5 import LeNet5 -from model.torch.CustomDataset import CustomDataset +from model.torch.CustomDataset import CustomDataset, collate_fn from model.torch.EarlyStopper import EarlyStopper from model.utilities.plot_model import plot_model_fit from model.utilities.plot_preds import plot_preds @@ -31,6 +30,10 @@ from model.utilities.plot_generator import plot_generator from model.utilities.TimeIt import TimeIt from model.utilities.commandline_interface import commandline_interface +from model.arch.load_image_v2 import load_image_v2 + +# device configuration +device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu') torch_transforms = transforms.Compose([ transforms.Resize(size=[cons.IMAGE_WIDTH, cons.IMAGE_HEIGHT]) # resize the input image to a uniform size @@ -54,24 +57,28 @@ if input_params_dict["run_model_training"]: logging.info("Generating dataframe of images...") + # TODO: rewrite this with polars # create a dataframe of filenames and categories filenames = os.listdir(cons.train_fdir) categories = [1 if filename.split('.')[0] == 'dog' else 0 for filename in filenames] - df = pd.DataFrame({'filename': filenames, 'category': categories}) + df = pd.DataFrame({'filenames': filenames, 'category': categories}) frac = 0.05 df = df.sample(frac = frac) df["categoryname"] = df["category"].replace(cons.category_mapper) - df['source'] = df['filename'].str.contains(pat='[cat|dog].[0-9]+.jpg', regex=True).map({True:'kaggle', False:'webscraper'}) - df["filepath"] = cons.train_fdir + '/' + df['filename'] - df["ndims"] = 
df['filepath'].apply(lambda x: len(np.array(Image.open(x)).shape)) + df['source'] = df['filenames'].str.contains(pat='[cat|dog].[0-9]+.jpg', regex=True).map({True:'kaggle', False:'webscraper'}) + df["filepaths"] = cons.train_fdir + '/' + df['filenames'] + df["images"] = df["filepaths"].apply(lambda x: Image.open(x)) + df["ndims"] = df['images'].apply(lambda x: len(np.array(x).shape)) df = df.loc[df["ndims"] == 3, :].copy() + # apply transforms and convert to arrays + df["image_tensors"] = df["images"].apply(lambda x: torch_transforms(x))#.detach().cpu().numpy()) + df["category_tensors"] = df["category"].apply(lambda x: torch.tensor(x, dtype=torch.int64)) logging.info(f"df.shape: {df.shape}") timeLogger.logTime(parentKey="DataPrep", subKey="TrainDataLoad") logging.info("Plot sample image...") # random image plot - sample = random.choice(filenames) - image = load_img(os.path.join(cons.train_fdir, sample)) + image = Image.open(os.path.join(cons.train_fdir, filenames[1])) plot_image(image, output_fpath=cons.torch_random_image_fpath, show_plot=False) timeLogger.logTime(parentKey="Plots", subKey="SampleImage") @@ -91,29 +98,28 @@ total_train = train_df.shape[0] total_validate = validate_df.shape[0] # set train data loader - train_dataset = CustomDataset(train_df, transforms=torch_transforms, mode='train') + train_dataset = CustomDataset(train_df) train_loader = DataLoader(train_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True) # set validation data loader - validation_dataset = CustomDataset(train_df, transforms=torch_transforms, mode='train') + validation_dataset = CustomDataset(train_df) validation_loader = DataLoader(validation_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True) timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationDataLoaders") logging.info("Plot example data loader images...") - # datagen example - example_dataset = CustomDataset(train_df.head(16), transforms=torch_transforms, mode='train') - example_loader = DataLoader(example_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True) - example_generator = [(image.detach().numpy(), None) for images, labels in example_loader for image in images] - plot_generator(generator=example_generator, mode='torch', output_fpath=cons.torch_generator_plot_fpath, show_plot=False) + # datagen examplec + example_data = [train_dataset.__getitem__(idx)[0] for idx in range(16)] + plot_generator(generator=example_data, mode='torch', output_fpath=cons.torch_generator_plot_fpath, show_plot=False) timeLogger.logTime(parentKey="Plots", subKey="DataLoader") logging.info("Initiate torch model...") - # device configuration - device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu') logging.info(f"device: {device}") # initiate cnn architecture - #model = LeNet5(num_classes=2).to(device) + model = LeNet5(num_classes=2) #model = AlexNet8(num_classes=2).to(device) - model = VGG16_pretrained(num_classes=2).to(device) + #model = VGG16_pretrained(num_classes=2).to(device) + if device == "cuda": + model = nn.DataParallel(model) + model = model.to(device) criterion = nn.CrossEntropyLoss() optimizer = torch.optim.SGD(model.parameters(), lr=cons.learning_rate) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, threshold=0.0001, threshold_mode='abs') @@ -142,23 +148,28 @@ logging.info("Load fitted torch model from disk...") # load model 
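One caveat on the DataParallel guard added earlier in this patch: device is a torch.device object, so the test device == "cuda" compares a device against a plain string and may never fire; device.type is the unambiguous field to check. A short sketch of the intended wrapping, with a placeholder module standing in for the repo's CNN classes:

import torch
import torch.nn as nn

model = nn.Linear(16, 2)  # placeholder for LeNet5 / AlexNet8 / VGG16_pretrained
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# device.type is the bare string "cuda" or "cpu", even for devices like cuda:0
if device.type == "cuda":
    model = nn.DataParallel(model)  # replicate the module across visible GPUs
model = model.to(device)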
#model = AlexNet8(num_classes=2).to(device) - model = VGG16_pretrained(num_classes=2).to(device) + model = LeNet5(num_classes=2).to(device) model.load(input_fpath=cons.torch_model_pt_fpath) timeLogger.logTime(parentKey="ModelSerialisation", subKey="Load") logging.info("Generate test dataset...") # prepare test data test_filenames = os.listdir(cons.test_fdir) - test_df = pd.DataFrame({'filename': test_filenames}) - test_df["filepath"] = cons.test_fdir + '/' + test_df['filename'] + test_df = pd.DataFrame({'filename': test_filenames}).head() test_df["idx"] = test_df['filename'].str.extract(pat='([0-9]+)').astype(int) + test_df["filepaths"] = test_df['filename'].apply(lambda x: os.path.join(cons.test_fdir, x)) + test_df["images"] = load_image_v2(test_df["filepaths"]) + test_df["category"] = np.nan test_df = test_df.set_index('idx').sort_index() + # apply transforms and convert to arrays + test_df["image_tensors"] = test_df["images"].apply(lambda x: torch_transforms(x))#.detach().cpu().numpy()) + test_df["category_tensors"] = test_df["category"].apply(lambda x: torch.tensor(torch.tensor(x), dtype=torch.int64)) logging.info(f"train_df.shape: {test_df.shape}") timeLogger.logTime(parentKey="TestSet", subKey="RawLoad") logging.info("Create test dataloader...") # set train data loader - test_dataset = CustomDataset(test_df, transforms=torch_transforms, mode='test') + test_dataset = CustomDataset(test_df) test_loader = DataLoader(test_dataset, batch_size=cons.batch_size, shuffle=False, num_workers=cons.num_workers, pin_memory=True) timeLogger.logTime(parentKey="TestSet", subKey="DataLoader") From 1764349a64c145ac97552c4de62cf6210e6968cc Mon Sep 17 00:00:00 2001 From: Oisin Date: Fri, 14 Feb 2025 13:52:12 +0000 Subject: [PATCH 12/30] Moved torch transform operations outside of custom data set. 
Added logic to return multiple items via array operations --- model/torch/CustomDataset.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/model/torch/CustomDataset.py b/model/torch/CustomDataset.py index 1f7cf2b..2a9c752 100644 --- a/model/torch/CustomDataset.py +++ b/model/torch/CustomDataset.py @@ -5,24 +5,23 @@ class CustomDataset(Dataset): - def __init__(self, df, transforms, mode): - self.mode = mode - self.filepath = df['filepath'].tolist() - if mode == 'train': - self.category = df['category'].tolist() - self.transforms = transforms + def __init__(self, df): + self.image_tensors = df['image_tensors'].values + self.category_tensors = df['category_tensors'].values def __len__(self): - return len(self.filepath) + return len(self.image_tensors) def __getitem__(self, idx): - image_filepath = self.filepath[idx] - image = Image.open(image_filepath) - image = self.transforms(image) + image_tensor = self.image_tensors[idx] + category_tensor = self.category_tensors[idx] + return image_tensor, category_tensor + + #def __getitems__(self, idx_list): + # image_tensors = self.image_tensors[idx_list].tolist() + # category_tensors = self.category_tensors[idx_list].tolist() + # return image_tensors, category_tensors - if self.mode == 'train': - label = torch.tensor(self.category[idx], dtype=torch.int64) - else: - label = torch.tensor(np.nan) - - return image, label \ No newline at end of file +def collate_fn(data): + arrays, categories = data + return arrays, categories \ No newline at end of file From d6caf0c95cc90df13aecaa41728a3ae311142547 Mon Sep 17 00:00:00 2001 From: Oisin Date: Fri, 14 Feb 2025 13:53:14 +0000 Subject: [PATCH 13/30] Added comment for iterating over dataset instead of dataloader --- model/torch/fit.py | 17 ++++++++++------- model/torch/predict.py | 1 + model/torch/validate.py | 3 ++- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/model/torch/fit.py b/model/torch/fit.py index 91ddb56..73e3b1f 100644 --- a/model/torch/fit.py +++ b/model/torch/fit.py @@ -1,4 +1,5 @@ import torch +import logging from model.torch.validate import validate from model.torch.ModelFit import ModelFit from model.torch.EarlyStopper import EarlyStopper @@ -40,13 +41,15 @@ def fit(model, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimiz for epoch in range(num_epochs): t_loss, t_corr = 0.0, 0.0 model.train() + #train_features, train_labels = next(iter(train_dataloader)) for i, (images, labels) in enumerate(train_dataloader): + #for i, (images, labels) in enumerate(zip(train_dataloader.dataset.image_tensors, train_dataloader.dataset.category_tensors)): # load images and labels to device images = images.to(device) - label = labels.to(device) + labels = labels.to(device) # forward pass preds = model.forward(images) - loss = criterion(preds, label) + loss = criterion(preds, labels) if scheduler != None: scheduler.step(loss) # backward and optimise @@ -55,23 +58,23 @@ def fit(model, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimiz optimizer.step() # calculate metrics t_loss += loss.item() * images.size(0) - t_corr += torch.sum(preds.argmax(1) == labels.to(device)) - print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}') + t_corr += torch.sum(preds.argmax(1) == labels) + logging.info(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}') # update training loss and accuarcy train_loss = t_loss / len(train_dataloader.dataset) train_acc = 
t_corr.item() / len(train_dataloader.dataset) train_loss_list.append(train_loss) train_acc_list.append(train_acc) - print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}') + logging.info(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}') # calculate validation loss and accuracy if applicable if valid_dataLoader != None: valid_loss, valid_acc = validate(model=model, device=device, dataloader=valid_dataLoader, criterion=criterion) valid_loss_list.append(valid_loss) valid_acc_list.append(valid_acc) - print(f'Valid Loss: {loss.item():.4f}, Valid Accuracy: {valid_acc:.4f}') + logging.info(f'Valid Loss: {loss.item():.4f}, Valid Accuracy: {valid_acc:.4f}') # if implementing early stopping if early_stopper != None and early_stopper.early_stop(valid_loss): - print(f"Applying early stopping criteria at validation loss: {valid_loss}") + logging.info(f"Applying early stopping criteria at validation loss: {valid_loss}") break # create model fit object model_fit = ModelFit(loss=train_loss_list, accuracy=train_acc_list, val_loss=valid_loss_list, val_accuracy=valid_acc_list) diff --git a/model/torch/predict.py b/model/torch/predict.py index 74e6117..1676923 100644 --- a/model/torch/predict.py +++ b/model/torch/predict.py @@ -21,6 +21,7 @@ def predict(model, dataloader:torch.utils.data.DataLoader, device:torch.device) fin_outputs = [] with torch.no_grad(): for i, (images, labels) in enumerate(dataloader): + #for i, (images, labels) in enumerate(zip(dataloader.dataset.image_tensors, dataloader.dataset.category_tensors)): images = images.to(device) labels = labels.to(device) outputs = model.forward(images) diff --git a/model/torch/validate.py b/model/torch/validate.py index 692d77c..4017d8e 100644 --- a/model/torch/validate.py +++ b/model/torch/validate.py @@ -24,6 +24,7 @@ def validate(model, device:torch.device, dataloader:torch.utils.data.DataLoader, with torch.no_grad(): v_loss, v_corr = 0.0, 0.0 for i, (images, labels) in enumerate(dataloader): + #for i, (images, labels) in enumerate(zip(dataloader.dataset.image_tensors, dataloader.dataset.category_tensors)): # load images and labels to device images = images.to(device) labels = labels.to(device) @@ -32,7 +33,7 @@ def validate(model, device:torch.device, dataloader:torch.utils.data.DataLoader, loss = criterion(preds, labels) # calculate metrics v_loss += loss.item() * images.size(0) - v_corr += torch.sum(preds.argmax(1) == labels.to(device)) + v_corr += torch.sum(preds.argmax(1) == labels) # update training loss and accuarcy valid_loss = v_loss / len(dataloader.dataset) valid_acc = v_corr.item() / len(dataloader.dataset) From bfa14bfef9d0c5890724633087767528f70cd6f5 Mon Sep 17 00:00:00 2001 From: Oisin Date: Fri, 14 Feb 2025 15:35:23 +0000 Subject: [PATCH 14/30] Further reduced data size base on multiples of the batch size --- model/prg_torch_model.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/model/prg_torch_model.py b/model/prg_torch_model.py index 9c338bf..4f08670 100644 --- a/model/prg_torch_model.py +++ b/model/prg_torch_model.py @@ -62,12 +62,12 @@ filenames = os.listdir(cons.train_fdir) categories = [1 if filename.split('.')[0] == 'dog' else 0 for filename in filenames] df = pd.DataFrame({'filenames': filenames, 'category': categories}) - frac = 0.05 - df = df.sample(frac = frac) + random_state = 42 + df = df.sample(n=cons.batch_size*4, random_state=random_state).reset_index(drop=True) df["categoryname"] = df["category"].replace(cons.category_mapper) df['source'] = 
df['filenames'].str.contains(pat='[cat|dog].[0-9]+.jpg', regex=True).map({True:'kaggle', False:'webscraper'}) - df["filepaths"] = cons.train_fdir + '/' + df['filenames'] - df["images"] = df["filepaths"].apply(lambda x: Image.open(x)) + df["filepaths"] = df['filenames'].apply(lambda x: os.path.join(cons.train_fdir, x)) + df["images"] = load_image_v2(df["filepaths"]) df["ndims"] = df['images'].apply(lambda x: len(np.array(x).shape)) df = df.loc[df["ndims"] == 3, :].copy() # apply transforms and convert to arrays @@ -84,8 +84,7 @@ logging.info("Split into training, validation and test dataset...") # prepare data - random_state = 42 - validate_df = df[df['source'] == 'kaggle'].sample(n=int(5000 * frac), random_state=random_state) + validate_df = df.sample(n=cons.batch_size, random_state=random_state) train_df = df[~df.index.isin(validate_df.index)] train_df = train_df.reset_index(drop=True) validate_df = validate_df.reset_index(drop=True) @@ -155,9 +154,9 @@ logging.info("Generate test dataset...") # prepare test data test_filenames = os.listdir(cons.test_fdir) - test_df = pd.DataFrame({'filename': test_filenames}).head() - test_df["idx"] = test_df['filename'].str.extract(pat='([0-9]+)').astype(int) - test_df["filepaths"] = test_df['filename'].apply(lambda x: os.path.join(cons.test_fdir, x)) + test_df = pd.DataFrame({'filenames': test_filenames}).head() + test_df["idx"] = test_df['filenames'].str.extract(pat='([0-9]+)').astype(int) + test_df["filepaths"] = test_df['filenames'].apply(lambda x: os.path.join(cons.test_fdir, x)) test_df["images"] = load_image_v2(test_df["filepaths"]) test_df["category"] = np.nan test_df = test_df.set_index('idx').sort_index() From d3b742aafc58bfd68ad0d795ffbc8f51cc6783cb Mon Sep 17 00:00:00 2001 From: Oisin Date: Fri, 14 Feb 2025 15:35:42 +0000 Subject: [PATCH 15/30] Applying deep copy --- model/arch/load_image_v2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/model/arch/load_image_v2.py b/model/arch/load_image_v2.py index bb17e82..303f105 100644 --- a/model/arch/load_image_v2.py +++ b/model/arch/load_image_v2.py @@ -1,6 +1,8 @@ import pandas as pd +import numpy as np from PIL import Image +from copy import deepcopy def load_image_v2(image_fpaths): """ @@ -8,7 +10,7 @@ def load_image_v2(image_fpaths): images = [] for image_fpath in image_fpaths: temp = Image.open(image_fpath) - keep = temp.copy() + keep = deepcopy(temp) images.append(keep) temp.close() return pd.Series(images) \ No newline at end of file From 5ef18442d91045c441a1b8ce581a589d1cf433f4 Mon Sep 17 00:00:00 2001 From: Oisin Date: Sat, 15 Feb 2025 14:36:38 +0000 Subject: [PATCH 16/30] Defaulting n workers to 0 --- model/cons.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/cons.py b/model/cons.py index 1c46796..6a34100 100644 --- a/model/cons.py +++ b/model/cons.py @@ -78,5 +78,5 @@ shuffle = False # multiprocessing -num_workers = os.environ.get("PARAM_NUM_WORKERS", os.cpu_count()) +num_workers = os.environ.get("PARAM_NUM_WORKERS", 0) check_gpu = os.environ.get("PARAM_CHECK_GPU", False) \ No newline at end of file From 8d502b6a64aedcb0fbd00bfe26b58f5403a4e9b2 Mon Sep 17 00:00:00 2001 From: Oisin Date: Sat, 15 Feb 2025 14:37:00 +0000 Subject: [PATCH 17/30] Fixed filenames extraction --- model/utilities/plot_preds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/utilities/plot_preds.py b/model/utilities/plot_preds.py index 7850dee..2cf2caa 100644 --- a/model/utilities/plot_preds.py +++ b/model/utilities/plot_preds.py @@ -29,7 
+29,7 @@ def plot_preds( sample_test = data.head(18) plt.figure(figsize=(12, 24)) for id, (index, row) in enumerate(sample_test.iterrows()): - filename = row['filename'] + filename = row['filenames'] category = row['category'] img = load_img(os.path.join(cons.test_fdir, filename), target_size=cons.IMAGE_SIZE) plt.subplot(6, 3, id+1) From 69d2e00b4e825fcb8cc3d3537c62054d1eb914f9 Mon Sep 17 00:00:00 2001 From: Oisin Date: Sat, 15 Feb 2025 14:37:59 +0000 Subject: [PATCH 18/30] Removed comment to iterate over dataset object within data loader --- model/torch/fit.py | 2 -- model/torch/predict.py | 1 - model/torch/validate.py | 1 - 3 files changed, 4 deletions(-) diff --git a/model/torch/fit.py b/model/torch/fit.py index 73e3b1f..f2057dd 100644 --- a/model/torch/fit.py +++ b/model/torch/fit.py @@ -41,9 +41,7 @@ def fit(model, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimiz for epoch in range(num_epochs): t_loss, t_corr = 0.0, 0.0 model.train() - #train_features, train_labels = next(iter(train_dataloader)) for i, (images, labels) in enumerate(train_dataloader): - #for i, (images, labels) in enumerate(zip(train_dataloader.dataset.image_tensors, train_dataloader.dataset.category_tensors)): # load images and labels to device images = images.to(device) labels = labels.to(device) diff --git a/model/torch/predict.py b/model/torch/predict.py index 1676923..74e6117 100644 --- a/model/torch/predict.py +++ b/model/torch/predict.py @@ -21,7 +21,6 @@ def predict(model, dataloader:torch.utils.data.DataLoader, device:torch.device) fin_outputs = [] with torch.no_grad(): for i, (images, labels) in enumerate(dataloader): - #for i, (images, labels) in enumerate(zip(dataloader.dataset.image_tensors, dataloader.dataset.category_tensors)): images = images.to(device) labels = labels.to(device) outputs = model.forward(images) diff --git a/model/torch/validate.py b/model/torch/validate.py index 4017d8e..072559c 100644 --- a/model/torch/validate.py +++ b/model/torch/validate.py @@ -24,7 +24,6 @@ def validate(model, device:torch.device, dataloader:torch.utils.data.DataLoader, with torch.no_grad(): v_loss, v_corr = 0.0, 0.0 for i, (images, labels) in enumerate(dataloader): - #for i, (images, labels) in enumerate(zip(dataloader.dataset.image_tensors, dataloader.dataset.category_tensors)): # load images and labels to device images = images.to(device) labels = labels.to(device) From 46646fae732970de70ba9399d8ec7cfd930085ee Mon Sep 17 00:00:00 2001 From: Oisin Date: Sat, 15 Feb 2025 14:38:34 +0000 Subject: [PATCH 19/30] Integrated __getitems__ and collate_fn into CustomDataset --- model/torch/CustomDataset.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/model/torch/CustomDataset.py b/model/torch/CustomDataset.py index 2a9c752..5084cb4 100644 --- a/model/torch/CustomDataset.py +++ b/model/torch/CustomDataset.py @@ -17,11 +17,11 @@ def __getitem__(self, idx): category_tensor = self.category_tensors[idx] return image_tensor, category_tensor - #def __getitems__(self, idx_list): - # image_tensors = self.image_tensors[idx_list].tolist() - # category_tensors = self.category_tensors[idx_list].tolist() - # return image_tensors, category_tensors + def __getitems__(self, idx_list): + image_tensors = torch.stack(self.image_tensors[idx_list].tolist()) + category_tensors = torch.stack(self.category_tensors[idx_list].tolist()) + return image_tensors, category_tensors - def
collate_fn(data): + arrays, categories = data + return arrays, categories \ No newline at end of file From d1512cfbc4ba1883b0fe7a07cdd09ac6af3255c7 Mon Sep 17 00:00:00 2001 From: Oisin Date: Sat, 15 Feb 2025 14:38:56 +0000 Subject: [PATCH 20/30] Fixed dataloader latency issue --- model/prg_torch_model.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/model/prg_torch_model.py b/model/prg_torch_model.py index 4f08670..d9d7edb 100644 --- a/model/prg_torch_model.py +++ b/model/prg_torch_model.py @@ -22,7 +22,7 @@ from model.torch.VGG16_pretrained import VGG16_pretrained from model.torch.AlexNet8 import AlexNet8 from model.torch.LeNet5 import LeNet5 -from model.torch.CustomDataset import CustomDataset, collate_fn +from model.torch.CustomDataset import CustomDataset from model.torch.EarlyStopper import EarlyStopper from model.utilities.plot_model import plot_model_fit from model.utilities.plot_preds import plot_preds @@ -71,7 +71,7 @@ df["ndims"] = df['images'].apply(lambda x: len(np.array(x).shape)) df = df.loc[df["ndims"] == 3, :].copy() # apply transforms and convert to arrays - df["image_tensors"] = df["images"].apply(lambda x: torch_transforms(x))#.detach().cpu().numpy()) + df["image_tensors"] = df["images"].apply(lambda x: torch_transforms(x)) df["category_tensors"] = df["category"].apply(lambda x: torch.tensor(x, dtype=torch.int64)) logging.info(f"df.shape: {df.shape}") timeLogger.logTime(parentKey="DataPrep", subKey="TrainDataLoad") @@ -98,15 +98,15 @@ total_validate = validate_df.shape[0] # set train data loader train_dataset = CustomDataset(train_df) - train_loader = DataLoader(train_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True) + train_loader = DataLoader(train_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True, collate_fn=CustomDataset.collate_fn) # set validation data loader validation_dataset = CustomDataset(train_df) - validation_loader = DataLoader(validation_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True) + validation_loader = DataLoader(validation_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True, collate_fn=CustomDataset.collate_fn) timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationDataLoaders") logging.info("Plot example data loader images...") # datagen examplec - example_data = [train_dataset.__getitem__(idx)[0] for idx in range(16)] + example_data = train_df['image_tensors'].values[:16].tolist() plot_generator(generator=example_data, mode='torch', output_fpath=cons.torch_generator_plot_fpath, show_plot=False) timeLogger.logTime(parentKey="Plots", subKey="DataLoader") @@ -161,15 +161,15 @@ test_df["category"] = np.nan test_df = test_df.set_index('idx').sort_index() # apply transforms and convert to arrays - test_df["image_tensors"] = test_df["images"].apply(lambda x: torch_transforms(x))#.detach().cpu().numpy()) - test_df["category_tensors"] = test_df["category"].apply(lambda x: torch.tensor(torch.tensor(x), dtype=torch.int64)) + test_df["image_tensors"] = test_df["images"].apply(lambda x: torch_transforms(x)) + test_df["category_tensors"] = test_df["category"].apply(lambda x: torch.tensor(x)) logging.info(f"train_df.shape: {test_df.shape}") timeLogger.logTime(parentKey="TestSet", subKey="RawLoad") logging.info("Create test dataloader...") # set train data loader test_dataset = CustomDataset(test_df) - test_loader = 
DataLoader(test_dataset, batch_size=cons.batch_size, shuffle=False, num_workers=cons.num_workers, pin_memory=True) + test_loader = DataLoader(test_dataset, batch_size=cons.batch_size, shuffle=False, num_workers=cons.num_workers, pin_memory=True, collate_fn=CustomDataset.collate_fn) timeLogger.logTime(parentKey="TestSet", subKey="DataLoader") logging.info("Generate test set predictions...") @@ -187,7 +187,7 @@ logging.info("Generate a sample submission file for kaggle...") # make submission submission_df = test_df.copy() - submission_df['id'] = submission_df['filename'].str.split('.').str[0] + submission_df['id'] = submission_df['filenames'].str.split('.').str[0] submission_df['label'] = submission_df['category'].replace(cons.category_mapper) submission_df.to_csv(cons.submission_csv_fpath, index=False) timeLogger.logTime(parentKey="TestSet", subKey="SubmissionFile") \ No newline at end of file From 605ccff9c0e9086d6c70697fc97ddd2c3eb5566e Mon Sep 17 00:00:00 2001 From: Oisin Date: Sun, 16 Feb 2025 12:28:47 +0000 Subject: [PATCH 21/30] Created class for loading and applying torch transformation over single iteration --- model/arch/load_image_v2.py | 68 ++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/model/arch/load_image_v2.py b/model/arch/load_image_v2.py index 303f105..4156068 100644 --- a/model/arch/load_image_v2.py +++ b/model/arch/load_image_v2.py @@ -1,7 +1,13 @@ import pandas as pd import numpy as np +import re +import os +import torch from PIL import Image +from matplotlib.image import imread +from copy import deepcopy +from multiprocessing import Pool from copy import deepcopy def load_image_v2(image_fpaths): @@ -13,4 +19,64 @@ def load_image_v2(image_fpaths): keep = deepcopy(temp) images.append(keep) temp.close() - return pd.Series(images) \ No newline at end of file + return pd.Series(images) + +class TorchLoadImages(): + + def __init__(self, torch_transforms, n_workers=None): + self.torch_transforms = torch_transforms + self.n_workers = n_workers + + def loadImage(self, filepath): + """ + """ + # determine the filename and source + fileName = os.path.basename(filepath) + # determine label from image file path + if ("cat" in fileName) or ("dog" in fileName): + fileSource = "kaggle" if len(re.findall(pattern='^(cat|dog)(.[0-9]+.jpg)$', string=fileName)) > 0 else "webscraper" + labelName = fileName.split(".")[0] + label = labelName == "dog" + labelTensor = torch.tensor(label, dtype=torch.int64) + else: + fileSource = "kaggle" + labelName = np.nan + label = np.nan + labelTensor = torch.tensor(label) + # load image file and apply torch transforms + image = Image.open(filepath) + imageTensor = self.torch_transforms(image) + imageArray = np.asarray(image) + image.close() + nDims = len(imageArray.shape) + # create an output record + record = { + "filepaths":filepath, + "filenames":fileName, + "source":fileSource, + "categoryname":labelName, + "category":label, + "images":imageArray, + "ndims":nDims, + "category_tensors":labelTensor, + "image_tensors":imageTensor + } + # close open image + return record + + def multiProcess(self, func, args): + """ + """ + pool = Pool(self.n_workers) + results = pool.map(func, args) + pool.close() + return results + + def loadImages(self, filepaths): + """ + """ + if self.n_workers == None: + records = [self.loadImage(filepath) for filepath in filepaths] + else: + records = self.multiProcess(self.loadImage, filepaths) + return records \ No newline at end of file From 
1f2f9c82c5b07b4ed36cb58c9531924936432a9a Mon Sep 17 00:00:00 2001 From: Oisin Date: Sun, 16 Feb 2025 12:29:34 +0000 Subject: [PATCH 22/30] Increased sample size to 15k images. Using new load torch data object. --- model/prg_torch_model.py | 63 +++++++++++++++------------------------- 1 file changed, 23 insertions(+), 40 deletions(-) diff --git a/model/prg_torch_model.py b/model/prg_torch_model.py index d9d7edb..8f02d48 100644 --- a/model/prg_torch_model.py +++ b/model/prg_torch_model.py @@ -3,8 +3,6 @@ import logging import pandas as pd import numpy as np -import random -from PIL import Image # set huggingface hub directory huggingface_hub_dir = 'E:\\huggingface' @@ -30,11 +28,13 @@ from model.utilities.plot_generator import plot_generator from model.utilities.TimeIt import TimeIt from model.utilities.commandline_interface import commandline_interface -from model.arch.load_image_v2 import load_image_v2 +from model.arch.load_image_v2 import load_image_v2, TorchLoadImages # device configuration device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu') +random_state = 42 + torch_transforms = transforms.Compose([ transforms.Resize(size=[cons.IMAGE_WIDTH, cons.IMAGE_HEIGHT]) # resize the input image to a uniform size #,transforms.RandomRotation(30) @@ -51,40 +51,30 @@ lgr.setLevel(logging.INFO) timeLogger = TimeIt() + logging.info("Parsing command line arguments...") # handle input parameters input_params_dict = commandline_interface() + logging.info(input_params_dict) + timeLogger.logTime(parentKey="DataPrep", subKey="TrainDataLoad") if input_params_dict["run_model_training"]: logging.info("Generating dataframe of images...") - # TODO: rewrite this with polars - # create a dataframe of filenames and categories - filenames = os.listdir(cons.train_fdir) - categories = [1 if filename.split('.')[0] == 'dog' else 0 for filename in filenames] - df = pd.DataFrame({'filenames': filenames, 'category': categories}) - random_state = 42 - df = df.sample(n=cons.batch_size*4, random_state=random_state).reset_index(drop=True) - df["categoryname"] = df["category"].replace(cons.category_mapper) - df['source'] = df['filenames'].str.contains(pat='[cat|dog].[0-9]+.jpg', regex=True).map({True:'kaggle', False:'webscraper'}) - df["filepaths"] = df['filenames'].apply(lambda x: os.path.join(cons.train_fdir, x)) - df["images"] = load_image_v2(df["filepaths"]) - df["ndims"] = df['images'].apply(lambda x: len(np.array(x).shape)) - df = df.loc[df["ndims"] == 3, :].copy() - # apply transforms and convert to arrays - df["image_tensors"] = df["images"].apply(lambda x: torch_transforms(x)) - df["category_tensors"] = df["category"].apply(lambda x: torch.tensor(x, dtype=torch.int64)) + # create torch load images object + sample_size = 15000 + torchLoadImages = TorchLoadImages(torch_transforms=torch_transforms, n_workers=None) + df = pd.DataFrame.from_records(torchLoadImages.loadImages(filepaths=[os.path.join(cons.train_fdir, x) for x in os.listdir(cons.train_fdir)[0:sample_size]])) logging.info(f"df.shape: {df.shape}") timeLogger.logTime(parentKey="DataPrep", subKey="TrainDataLoad") logging.info("Plot sample image...") # random image plot - image = Image.open(os.path.join(cons.train_fdir, filenames[1])) - plot_image(image, output_fpath=cons.torch_random_image_fpath, show_plot=False) + plot_image(df['images'].values[1], output_fpath=cons.torch_random_image_fpath, show_plot=False) timeLogger.logTime(parentKey="Plots", subKey="SampleImage") logging.info("Split into training, validation and test 
dataset...") # prepare data - validate_df = df.sample(n=cons.batch_size, random_state=random_state) + validate_df = df.sample(n=cons.batch_size*3, random_state=random_state) train_df = df[~df.index.isin(validate_df.index)] train_df = train_df.reset_index(drop=True) validate_df = validate_df.reset_index(drop=True) @@ -100,12 +90,12 @@ train_dataset = CustomDataset(train_df) train_loader = DataLoader(train_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True, collate_fn=CustomDataset.collate_fn) # set validation data loader - validation_dataset = CustomDataset(train_df) + validation_dataset = CustomDataset(validate_df) validation_loader = DataLoader(validation_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True, collate_fn=CustomDataset.collate_fn) timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationDataLoaders") logging.info("Plot example data loader images...") - # datagen examplec + # datagen example example_data = train_df['image_tensors'].values[:16].tolist() plot_generator(generator=example_data, mode='torch', output_fpath=cons.torch_generator_plot_fpath, show_plot=False) timeLogger.logTime(parentKey="Plots", subKey="DataLoader") @@ -113,9 +103,9 @@ logging.info("Initiate torch model...") logging.info(f"device: {device}") # initiate cnn architecture - model = LeNet5(num_classes=2) + #model = LeNet5(num_classes=2) #model = AlexNet8(num_classes=2).to(device) - #model = VGG16_pretrained(num_classes=2).to(device) + model = VGG16_pretrained(num_classes=2).to(device) if device == "cuda": model = nn.DataParallel(model) model = model.to(device) @@ -146,25 +136,18 @@ logging.info("Load fitted torch model from disk...") # load model + #model = LeNet5(num_classes=2).to(device) #model = AlexNet8(num_classes=2).to(device) - model = LeNet5(num_classes=2).to(device) + model = VGG16_pretrained(num_classes=2).to(device) model.load(input_fpath=cons.torch_model_pt_fpath) timeLogger.logTime(parentKey="ModelSerialisation", subKey="Load") logging.info("Generate test dataset...") - # prepare test data - test_filenames = os.listdir(cons.test_fdir) - test_df = pd.DataFrame({'filenames': test_filenames}).head() - test_df["idx"] = test_df['filenames'].str.extract(pat='([0-9]+)').astype(int) - test_df["filepaths"] = test_df['filenames'].apply(lambda x: os.path.join(cons.test_fdir, x)) - test_df["images"] = load_image_v2(test_df["filepaths"]) - test_df["category"] = np.nan - test_df = test_df.set_index('idx').sort_index() - # apply transforms and convert to arrays - test_df["image_tensors"] = test_df["images"].apply(lambda x: torch_transforms(x)) - test_df["category_tensors"] = test_df["category"].apply(lambda x: torch.tensor(x)) - logging.info(f"train_df.shape: {test_df.shape}") - timeLogger.logTime(parentKey="TestSet", subKey="RawLoad") + # create torch load images object + torchLoadImages = TorchLoadImages(torch_transforms=torch_transforms, n_workers=None) + test_df = pd.DataFrame.from_records(torchLoadImages.loadImages(filepaths=[os.path.join(cons.test_fdir, x) for x in os.listdir(cons.test_fdir)])) + logging.info(f"test_df.shape: {test_df.shape}") + timeLogger.logTime(parentKey="DataPrep", subKey="TrainDataLoad") logging.info("Create test dataloader...") # set train data loader From dc154ab0a092fff5e06c5b1ad472ebb5a57c53bd Mon Sep 17 00:00:00 2001 From: Oisin Date: Sun, 16 Feb 2025 16:15:35 +0000 Subject: [PATCH 23/30] Added functions to handle torch checkpoint read and writes --- model/torch/checkpoints.py | 
34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 model/torch/checkpoints.py diff --git a/model/torch/checkpoints.py b/model/torch/checkpoints.py new file mode 100644 index 0000000..a031bb7 --- /dev/null +++ b/model/torch/checkpoints.py @@ -0,0 +1,34 @@ +import torch +from beartype import beartype + +@beartype +def save_checkpoint(state:dict, filepath:str): + """Save model training checkpoints to local file path + + Parameters + ---------- + state : dict + The model state dictionary to write to disk + filepath : str + The local file path to write the checkpoints to + + Returns + ------- + """ + torch.save(state, filepath) + +@beartype +def load_checkpoint(filepath:str) -> dict: + """Load model training checkpoints from local file path + + Parameters + ---------- + filepath : str + The local file path to read the checkpoints from + + Returns + ------- + """ + # load checkpoint file + checkpoint = torch.load(filepath, weights_only=False) + return checkpoint \ No newline at end of file From 5915d919887ec38c6d0932ecd8f7543a0e0e2ac7 Mon Sep 17 00:00:00 2001 From: Oisin Date: Sun, 16 Feb 2025 16:17:51 +0000 Subject: [PATCH 24/30] Deleting dataframes to restore memory. Moved generate plots to after train validation split. Added logic to write torch training epoch checkpoints to local file directory --- model/prg_torch_model.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/model/prg_torch_model.py b/model/prg_torch_model.py index 8f02d48..071c5fe 100644 --- a/model/prg_torch_model.py +++ b/model/prg_torch_model.py @@ -61,7 +61,7 @@ logging.info("Generating dataframe of images...") # create torch load images object - sample_size = 15000 + sample_size = 20000 torchLoadImages = TorchLoadImages(torch_transforms=torch_transforms, n_workers=None) df = pd.DataFrame.from_records(torchLoadImages.loadImages(filepaths=[os.path.join(cons.train_fdir, x) for x in os.listdir(cons.train_fdir)[0:sample_size]])) logging.info(f"df.shape: {df.shape}") @@ -74,7 +74,7 @@ logging.info("Split into training, validation and test dataset...") # prepare data - validate_df = df.sample(n=cons.batch_size*3, random_state=random_state) + validate_df = df.sample(frac=0.05, random_state=random_state, replace=False) train_df = df[~df.index.isin(validate_df.index)] train_df = train_df.reset_index(drop=True) validate_df = validate_df.reset_index(drop=True) @@ -82,6 +82,11 @@ logging.info(f"validate_df.shape: {validate_df.shape}") timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationSplit") + logging.info("Plot example data loader images...") + # data generator example + plot_generator(generator=train_df['image_tensors'].values[:16].tolist(), mode='torch', output_fpath=cons.torch_generator_plot_fpath, show_plot=False) + timeLogger.logTime(parentKey="Plots", subKey="DataLoader") + logging.info("Creating training and validation data loaders...") # set data constants total_train = train_df.shape[0] @@ -92,14 +97,12 @@ # set validation data loader validation_dataset = CustomDataset(validate_df) validation_loader = DataLoader(validation_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True, collate_fn=CustomDataset.collate_fn) + # flush dataframes from memory + del df + del train_df + del validate_df timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationDataLoaders") - logging.info("Plot example data loader images...") - # datagen example - example_data = 
train_df['image_tensors'].values[:16].tolist() - plot_generator(generator=example_data, mode='torch', output_fpath=cons.torch_generator_plot_fpath, show_plot=False) - timeLogger.logTime(parentKey="Plots", subKey="DataLoader") - logging.info("Initiate torch model...") logging.info(f"device: {device}") # initiate cnn architecture @@ -119,7 +122,7 @@ # hyper-parameters num_epochs = cons.min_epochs if cons.FAST_RUN else cons.max_epochs # fit torch model - model.fit(device=device, criterion=criterion, optimizer=optimizer, train_dataloader=train_loader, num_epochs=num_epochs, scheduler=scheduler, valid_dataLoader=validation_loader, early_stopper=early_stopper) + model.fit(device=device, criterion=criterion, optimizer=optimizer, train_dataloader=train_loader, num_epochs=num_epochs, scheduler=scheduler, valid_dataLoader=validation_loader, early_stopper=early_stopper, checkpoints_dir=cons.checkpoints_fdir, load_epoch_checkpoint=None) timeLogger.logTime(parentKey="Modelling", subKey="Fit") logging.info("Plot model fit results...") @@ -173,4 +176,6 @@ submission_df['id'] = submission_df['filenames'].str.split('.').str[0] submission_df['label'] = submission_df['category'].replace(cons.category_mapper) submission_df.to_csv(cons.submission_csv_fpath, index=False) + # delete dataframes from memory + del test_df timeLogger.logTime(parentKey="TestSet", subKey="SubmissionFile") \ No newline at end of file From e56da3aaf53988adf4752063b4f9213855e9d367 Mon Sep 17 00:00:00 2001 From: Oisin Date: Sun, 16 Feb 2025 16:18:39 +0000 Subject: [PATCH 25/30] Calling fit with model checkpoints and load epach checkpoint arguments --- model/torch/VGG16_pretrained.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/model/torch/VGG16_pretrained.py b/model/torch/VGG16_pretrained.py index 865f3ba..4767f64 100644 --- a/model/torch/VGG16_pretrained.py +++ b/model/torch/VGG16_pretrained.py @@ -15,6 +15,7 @@ class VGG16_pretrained(nn.Module): def __init__(self, num_classes=1000): super(VGG16_pretrained, self).__init__() + self.model_id = "VGG16_pretrained" self.resnet = models.vgg16(weights ="DEFAULT") self.num_ftrs = self.resnet.classifier[len(self.resnet.classifier)-1].out_features self.classifier = nn.Sequential(nn.Linear(in_features=self.num_ftrs, out_features=num_classes)) @@ -38,7 +39,7 @@ def forward(self, x): return x @beartype - def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None): + def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None, checkpoints_dir:Union[str,None]=None, load_epoch_checkpoint:Union[int,None]=None): """Fits model to specified data loader given the criterion and optimizer Parameters @@ -59,11 +60,15 @@ def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimize The torch data loader to use for validation when fitting the model, default is None early_stopper : EarlyStopper The EarlyStopper object for halting fitting when performing validation, default is None + checkpoints_dir : str + The 
From e56da3aaf53988adf4752063b4f9213855e9d367 Mon Sep 17 00:00:00 2001
From: Oisin
Date: Sun, 16 Feb 2025 16:18:39 +0000
Subject: [PATCH 25/30] Calling fit with model checkpoints and load epoch checkpoint arguments

---
 model/torch/VGG16_pretrained.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/model/torch/VGG16_pretrained.py b/model/torch/VGG16_pretrained.py
index 865f3ba..4767f64 100644
--- a/model/torch/VGG16_pretrained.py
+++ b/model/torch/VGG16_pretrained.py
@@ -15,6 +15,7 @@ class VGG16_pretrained(nn.Module):
 
     def __init__(self, num_classes=1000):
         super(VGG16_pretrained, self).__init__()
+        self.model_id = "VGG16_pretrained"
         self.resnet = models.vgg16(weights ="DEFAULT")
         self.num_ftrs = self.resnet.classifier[len(self.resnet.classifier)-1].out_features
         self.classifier = nn.Sequential(nn.Linear(in_features=self.num_ftrs, out_features=num_classes))
@@ -38,7 +39,7 @@ def forward(self, x):
         return x
 
     @beartype
-    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None):
+    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None, checkpoints_dir:Union[str,None]=None, load_epoch_checkpoint:Union[int,None]=None):
         """Fits model to specified data loader given the criterion and optimizer
 
         Parameters
@@ -59,11 +60,15 @@ def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimize
             The torch data loader to use for validation when fitting the model, default is None
         early_stopper : EarlyStopper
             The EarlyStopper object for halting fitting when performing validation, default is None
+        checkpoints_dir : str
+            The local folder location where model epoch checkpoints are read from and written to, default is None
+        load_epoch_checkpoint : int
+            The epoch checkpoint to load and start from, default is None
 
         Returns
         -------
         """
-        self, self.model_fit=fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper)
+        self, self.model_fit=fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper, checkpoints_dir, load_epoch_checkpoint)
 
     @beartype
     def validate(self, device:torch.device, dataloader:torch.utils.data.DataLoader, criterion:torch.nn.CrossEntropyLoss) -> tuple:

From b394fedf35c2c0d24ef1c86215167fc7c63e837c Mon Sep 17 00:00:00 2001
From: Oisin
Date: Sun, 16 Feb 2025 16:19:43 +0000
Subject: [PATCH 26/30] Added logic to write model training epoch checkpoints to specified local directory. Added extra logic to load specific model checkpoint and train from next epoch onwards.

---
 model/torch/fit.py | 35 +++++++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/model/torch/fit.py b/model/torch/fit.py
index f2057dd..b4fbaf2 100644
--- a/model/torch/fit.py
+++ b/model/torch/fit.py
@@ -1,13 +1,15 @@
+import os
 import torch
 import logging
 from model.torch.validate import validate
 from model.torch.ModelFit import ModelFit
 from model.torch.EarlyStopper import EarlyStopper
+from model.torch.checkpoints import save_checkpoint, load_checkpoint
 from typing import Union
 from beartype import beartype
 
 @beartype
-def fit(model, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None):
+def fit(model, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None, checkpoints_dir:Union[str,None]=None, load_epoch_checkpoint:Union[int,None]=None):
     """
     Fits model to specified data loader given the criterion and optimizer
 
@@ -31,14 +33,31 @@ def fit(model, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimiz
         The torch data loader to use for validation when fitting the model, default is None
     early_stopper : EarlyStopper
         The EarlyStopper object for halting fitting when performing validation
+    checkpoints_dir : str
+        The local folder location where model epoch checkpoints are read from and written to, default is None
+    load_epoch_checkpoint : int
+        The epoch checkpoint to load and start from, default is None
 
     Returns
    -------
     """
+    start_epoch = 1
+    end_epoch = num_epochs + 1
     train_loss_list, train_acc_list, valid_loss_list, valid_acc_list = [], [], [], []
     model = model.to(device)
     n_total_steps = len(train_dataloader)
-    for epoch in range(num_epochs):
+    # load previous model epoch checkpoint
+    if (load_epoch_checkpoint != None) and (checkpoints_dir != None):
+        checkpoints_filename = f"checkpoint_{model.model_id}_epoch_{load_epoch_checkpoint}.pt"
+        checkpoint_filepath = os.path.join(checkpoints_dir, checkpoints_filename)
+        checkpoint = load_checkpoint(filepath=checkpoint_filepath)
+        # assign state dictionaries from checkpoint dictionary
+        model.load_state_dict(checkpoint['state_dict'])
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        start_epoch = load_epoch_checkpoint + 1
+        train_loss_list, train_acc_list, valid_loss_list, valid_acc_list = checkpoint["train_loss_list"], checkpoint["train_acc_list"], checkpoint["valid_loss_list"], checkpoint["valid_acc_list"]
+        logging.info(f"Read checkpoints from: {checkpoint_filepath}")
+    for epoch in range(start_epoch, end_epoch):
         t_loss, t_corr = 0.0, 0.0
         model.train()
         for i, (images, labels) in enumerate(train_dataloader):
@@ -57,8 +76,8 @@ def fit(model, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimiz
             # calculate metrics
             t_loss += loss.item() * images.size(0)
             t_corr += torch.sum(preds.argmax(1) == labels)
-            logging.info(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')
-        # update training loss and accuarcy
+            logging.info(f'Epoch [{epoch}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')
+        # update training loss and accuracy
         train_loss = t_loss / len(train_dataloader.dataset)
         train_acc = t_corr.item() / len(train_dataloader.dataset)
         train_loss_list.append(train_loss)
@@ -74,6 +93,14 @@ def fit(model, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimiz
         if early_stopper != None and early_stopper.early_stop(valid_loss):
             logging.info(f"Applying early stopping criteria at validation loss: {valid_loss}")
             break
+        # writing model checkpoints
+        if (checkpoints_dir != None):
+            checkpoints_filename = f"checkpoint_{model.model_id}_epoch_{epoch}.pt"
+            checkpoint_filepath = os.path.join(checkpoints_dir, checkpoints_filename)
+            checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), "train_loss_list":train_loss_list, "train_acc_list":train_acc_list, "valid_loss_list":valid_loss_list, "valid_acc_list":valid_acc_list}
+            save_checkpoint(state=checkpoint, filepath=checkpoint_filepath)
+            logging.info(f"Wrote checkpoints to: {checkpoint_filepath}")
+
     # create model fit object
     model_fit = ModelFit(loss=train_loss_list, accuracy=train_acc_list, val_loss=valid_loss_list, val_accuracy=valid_acc_list)
     return model, model_fit
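With the resume branch above in place, restarting a halted run only needs the epoch number to pick up from. A minimal sketch reusing the objects already defined in model/prg_torch_model.py; the epoch number 5 is an illustrative assumption:

    # resume from the epoch-5 checkpoint previously written to cons.checkpoints_fdir;
    # model weights, optimizer state and metric histories are restored and
    # training continues at epoch 6
    model.fit(device=device, criterion=criterion, optimizer=optimizer,
              train_dataloader=train_loader, num_epochs=num_epochs,
              scheduler=scheduler, valid_dataLoader=validation_loader,
              early_stopper=early_stopper, checkpoints_dir=cons.checkpoints_fdir,
              load_epoch_checkpoint=5)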
From 65dd8f9c83d82214bbde75a2efb80a0f7309cbf4 Mon Sep 17 00:00:00 2001
From: Oisin
Date: Sun, 16 Feb 2025 17:35:45 +0000
Subject: [PATCH 27/30] Added try except to catch torch transform errors

---
 model/arch/load_image_v2.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/model/arch/load_image_v2.py b/model/arch/load_image_v2.py
index 4156068..b532985 100644
--- a/model/arch/load_image_v2.py
+++ b/model/arch/load_image_v2.py
@@ -45,7 +45,12 @@ def loadImage(self, filepath):
         labelTensor = torch.tensor(label)
         # load image file and apply torch transforms
         image = Image.open(filepath)
-        imageTensor = self.torch_transforms(image)
+        torch_transform_error = None
+        try:
+            imageTensor = self.torch_transforms(image)
+        except Exception as e:
+            imageTensor = None
+            torch_transform_error = str(e)
         imageArray = np.asarray(image)
         image.close()
         nDims = len(imageArray.shape)
@@ -59,7 +64,8 @@ def loadImage(self, filepath):
             "images":imageArray,
             "ndims":nDims,
             "category_tensors":labelTensor,
-            "image_tensors":imageTensor
+            "image_tensors":imageTensor,
+            "torch_transform_error":torch_transform_error
         }
         # close open image
         return record
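The new torch_transform_error field makes failed transforms easy to filter out downstream; a small self-contained illustration (the records are made up):

    import pandas as pd

    records = [
        {"filepath": "cat.0.jpg", "torch_transform_error": None},
        {"filepath": "corrupt.jpg", "torch_transform_error": "image file is truncated"},
    ]
    df = pd.DataFrame.from_records(records)
    # keep only the rows whose torch transform succeeded
    df = df.loc[df["torch_transform_error"].isnull(), :]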
From 613c12d06c7c0bd0216212562b0fece2db5714c3 Mon Sep 17 00:00:00 2001
From: Oisin
Date: Sun, 16 Feb 2025 17:36:19 +0000
Subject: [PATCH 28/30] Added LeNet5 import and switched to VGG16

---
 model/prg_keras_model.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/model/prg_keras_model.py b/model/prg_keras_model.py
index 6d3be61..b01e109 100644
--- a/model/prg_keras_model.py
+++ b/model/prg_keras_model.py
@@ -10,6 +10,7 @@
 from model.utilities.plot_image import plot_image
 from model.utilities.plot_generator import plot_generator
 from model.utilities.plot_preds import plot_preds
+from model.keras.LeNet5 import LeNet5
 from model.keras.AlexNet8 import AlexNet8
 from model.keras.VGG16_pretrained import VGG16_pretrained
 from model.utilities.plot_model import plot_model_fit
@@ -83,8 +84,9 @@
     logging.info("Initiate keras model...")
     # initiate LeNet5 architecture
-    keras_model = AlexNet8(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
-    #keras_model = VGG16_pretrained(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
+    #keras_model = LeNet5(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
+    #keras_model = AlexNet8(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
+    keras_model = VGG16_pretrained(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
     keras_model.summary()
     # set gradient decent compiler
     optimizer = optimizers.SGD(learning_rate=cons.learning_rate)
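Across these patches the active architecture is chosen by hand-editing comment toggles. One hypothetical alternative, not part of the patch series, is a name-to-constructor registry so the choice can come from configuration instead:

    from model.keras.LeNet5 import LeNet5
    from model.keras.AlexNet8 import AlexNet8
    from model.keras.VGG16_pretrained import VGG16_pretrained

    # map architecture names to constructors
    ARCHITECTURES = {"LeNet5": LeNet5, "AlexNet8": AlexNet8, "VGG16_pretrained": VGG16_pretrained}

    def build_keras_model(name, input_shape, n_classes=2):
        # raises KeyError for an unknown architecture name
        return ARCHITECTURES[name](input_shape=input_shape, n_classes=n_classes, output_activation='softmax')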
From 29d7b0f4ce464c4443e46cc1f9a13b0b2b5b2e9b Mon Sep 17 00:00:00 2001
From: Oisin
Date: Sun, 16 Feb 2025 17:37:52 +0000
Subject: [PATCH 29/30] Clearing more objects from memory. Set sample size to 30k. Shuffling images with random seed. Moved generator plot forward.

---
 model/prg_torch_model.py | 41 ++++++++++++++++++++++++++++------------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/model/prg_torch_model.py b/model/prg_torch_model.py
index 071c5fe..8941c43 100644
--- a/model/prg_torch_model.py
+++ b/model/prg_torch_model.py
@@ -55,15 +55,23 @@
     # handle input parameters
     input_params_dict = commandline_interface()
     logging.info(input_params_dict)
-    timeLogger.logTime(parentKey="DataPrep", subKey="TrainDataLoad")
-
+    timeLogger.logTime(parentKey="Initialisation", subKey="CommandlineArguments")
+
     if input_params_dict["run_model_training"]:
-
+
         logging.info("Generating dataframe of images...")
+        # load and shuffle the image file paths
+        np.random.seed(random_state)
+        image_filepaths=np.array([os.path.join(cons.train_fdir, x) for x in os.listdir(cons.train_fdir)])
+        np.random.shuffle(image_filepaths)
         # create torch load images object
-        sample_size = 20000
+        sample_size = 30000
         torchLoadImages = TorchLoadImages(torch_transforms=torch_transforms, n_workers=None)
-        df = pd.DataFrame.from_records(torchLoadImages.loadImages(filepaths=[os.path.join(cons.train_fdir, x) for x in os.listdir(cons.train_fdir)[0:sample_size]]))
+        df = pd.DataFrame.from_records(torchLoadImages.loadImages(image_filepaths[0:sample_size]))
+        # only consider images with 3 dimensions
+        df = df.loc[df["ndims"]==3, :]
+        # flush data from memory
+        del image_filepaths
         logging.info(f"df.shape: {df.shape}")
         timeLogger.logTime(parentKey="DataPrep", subKey="TrainDataLoad")
@@ -72,6 +80,11 @@
         logging.info("Plot sample image...")
        # plot sample image
         plot_image(df['images'].values[1], output_fpath=cons.torch_random_image_fpath, show_plot=False)
         timeLogger.logTime(parentKey="Plots", subKey="SampleImage")
 
+        logging.info("Plot example data loader images...")
+        # data generator example
+        plot_generator(generator=df['image_tensors'].values[:16].tolist(), mode='torch', output_fpath=cons.torch_generator_plot_fpath, show_plot=False)
+        timeLogger.logTime(parentKey="Plots", subKey="DataLoader")
+
         logging.info("Split into training, validation and test dataset...")
         # prepare data
         validate_df = df.sample(frac=0.05, random_state=random_state, replace=False)
         train_df = df[~df.index.isin(validate_df.index)]
         train_df = train_df.reset_index(drop=True)
         validate_df = validate_df.reset_index(drop=True)
@@ -82,11 +95,6 @@
         logging.info(f"validate_df.shape: {validate_df.shape}")
         timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationSplit")
 
-        logging.info("Plot example data loader images...")
-        # data generator example
-        plot_generator(generator=train_df['image_tensors'].values[:16].tolist(), mode='torch', output_fpath=cons.torch_generator_plot_fpath, show_plot=False)
-        timeLogger.logTime(parentKey="Plots", subKey="DataLoader")
-
         logging.info("Creating training and validation data loaders...")
         # set data constants
         total_train = train_df.shape[0]
@@ -97,10 +105,12 @@
         # set validation data loader
         validation_dataset = CustomDataset(validate_df)
         validation_loader = DataLoader(validation_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True, collate_fn=CustomDataset.collate_fn)
-        # flush dataframes from memory
+        # flush data from memory
         del df
         del train_df
+        del train_dataset
         del validate_df
+        del validation_dataset
         timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationDataLoaders")
 
         logging.info("Initiate torch model...")
         logging.info(f"device: {device}")
         # initiate cnn architecture
@@ -123,6 +133,9 @@
         num_epochs = cons.min_epochs if cons.FAST_RUN else cons.max_epochs
         # fit torch model
         model.fit(device=device, criterion=criterion, optimizer=optimizer, train_dataloader=train_loader, num_epochs=num_epochs, scheduler=scheduler, valid_dataLoader=validation_loader, early_stopper=early_stopper, checkpoints_dir=cons.checkpoints_fdir, load_epoch_checkpoint=None)
+        # flush data from memory
+        del train_loader
+        del validation_loader
         timeLogger.logTime(parentKey="Modelling", subKey="Fit")
 
         logging.info("Plot model fit results...")
@@ -134,7 +147,7 @@
         # save model
         model.save(output_fpath=cons.torch_model_pt_fpath)
         timeLogger.logTime(parentKey="ModelSerialisation", subKey="Write")
-    
+
     if input_params_dict["run_testset_prediction"]:
 
         logging.info("Load fitted torch model from disk...")
@@ -163,6 +176,9 @@
         predict = model.predict(test_loader, device)
         test_df['category'] = np.argmax(predict, axis=-1)
         test_df["category"] = test_df["category"].replace(cons.category_mapper)
+        # flush data from memory
+        del test_dataset
+        del test_loader
         timeLogger.logTime(parentKey="TestSet", subKey="ModelPredictions")
 
         logging.info("Plot example test set predictions...")
@@ -178,4 +194,5 @@
         submission_df.to_csv(cons.submission_csv_fpath, index=False)
         # delete dataframes from memory
         del test_df
+        del submission_df
         timeLogger.logTime(parentKey="TestSet", subKey="SubmissionFile")
\ No newline at end of file
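A small demonstration of why the seeded shuffle above makes the 30k sample reproducible; the file names are toy values and 42 stands in for the script's random_state constant:

    import numpy as np

    np.random.seed(42)  # fixing the seed fixes the permutation
    image_filepaths = np.array([f"img_{i}.jpg" for i in range(10)])
    np.random.shuffle(image_filepaths)  # shuffles in place
    print(image_filepaths[0:4])  # the same 4-image sample on every run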
From 7ea326517c69e409f071a67f4d96550081d24f05 Mon Sep 17 00:00:00 2001
From: Oisin
Date: Sun, 16 Feb 2025 18:06:56 +0000
Subject: [PATCH 30/30] Added argument calls to checkpoint directory and load epoch checkpoints. Added model id.

---
 model/torch/AlexNet8.py            | 9 +++++++--
 model/torch/LeNet5.py              | 9 +++++++--
 model/torch/ResNet50_pretrained.py | 9 +++++++--
 model/torch/VGG16.py               | 9 +++++++--
 4 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/model/torch/AlexNet8.py b/model/torch/AlexNet8.py
index 99892b0..bf0b264 100644
--- a/model/torch/AlexNet8.py
+++ b/model/torch/AlexNet8.py
@@ -15,6 +15,7 @@ class AlexNet8(nn.Module):
 
     def __init__(self, num_classes=1000):
         super(AlexNet8, self).__init__()
+        self.model_id = "AlexNet8"
         self.features = nn.Sequential(
             nn.Conv2d(in_channels=3, out_channels=96, kernel_size=(11, 11), stride=(4, 4), padding='valid'),
             nn.ReLU(inplace=True),
@@ -59,7 +60,7 @@ def forward(self, x):
         return x
 
     @beartype
-    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None):
+    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None, checkpoints_dir:Union[str,None]=None, load_epoch_checkpoint:Union[int,None]=None):
         """Fits model to specified data loader given the criterion and optimizer
 
         Parameters
@@ -80,11 +81,15 @@ def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimize
             The torch data loader to use for validation when fitting the model, default is None
         early_stopper : EarlyStopper
             The EarlyStopper object for halting fitting when performing validation, default is None
+        checkpoints_dir : str
+            The local folder location where model epoch checkpoints are read from and written to, default is None
+        load_epoch_checkpoint : int
+            The epoch checkpoint to load and start from, default is None
 
         Returns
         -------
         """
-        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper)
+        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper, checkpoints_dir, load_epoch_checkpoint)
 
     @beartype
     def validate(self, device:torch.device, dataloader:torch.utils.data.DataLoader, criterion:torch.nn.CrossEntropyLoss) -> tuple:
diff --git a/model/torch/LeNet5.py b/model/torch/LeNet5.py
index d9edfb9..0300051 100644
--- a/model/torch/LeNet5.py
+++ b/model/torch/LeNet5.py
@@ -15,6 +15,7 @@ class LeNet5(nn.Module):
 
     def __init__(self, num_classes=1000):
         super(LeNet5, self).__init__()
+        self.model_id = "LeNet5"
         self.features = nn.Sequential(
             nn.Conv2d(in_channels = 3, out_channels = 6, kernel_size=(5, 5), stride = (1, 1), padding = 'valid'),
             nn.ReLU(inplace=True),
@@ -53,7 +54,7 @@ def forward(self, x):
         return x
 
     @beartype
-    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None):
+    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None, checkpoints_dir:Union[str,None]=None, load_epoch_checkpoint:Union[int,None]=None):
         """Fits model to specified data loader given the criterion and optimizer
 
         Parameters
@@ -74,11 +75,15 @@ def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimize
             The torch data loader to use for validation when fitting the model, default is None
         early_stopper : EarlyStopper
             The EarlyStopper object for halting fitting when performing validation, default is None
+        checkpoints_dir : str
+            The local folder location where model epoch checkpoints are read from and written to, default is None
+        load_epoch_checkpoint : int
+            The epoch checkpoint to load and start from, default is None
 
         Returns
         -------
         """
-        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper)
+        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper, checkpoints_dir, load_epoch_checkpoint)
 
     @beartype
     def validate(self, device:torch.device, dataloader:torch.utils.data.DataLoader, criterion:torch.nn.CrossEntropyLoss) -> tuple:
diff --git a/model/torch/ResNet50_pretrained.py b/model/torch/ResNet50_pretrained.py
index e084061..be33d6b 100644
--- a/model/torch/ResNet50_pretrained.py
+++ b/model/torch/ResNet50_pretrained.py
@@ -14,6 +14,7 @@
 class ResNet50_pretrained(nn.Module):
 
     def __init__(self, num_classes=1000):
         super(ResNet50_pretrained, self).__init__()
+        self.model_id = "ResNet50_pretrained"
         self.resnet = models.resnet50(pretrained=True)
         self.num_ftrs = self.resnet.fc.out_features
@@ -38,7 +39,7 @@ def forward(self, x):
         return x
 
     @beartype
-    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None):
+    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None, checkpoints_dir:Union[str,None]=None, load_epoch_checkpoint:Union[int,None]=None):
         """Fits model to specified data loader given the criterion and optimizer
 
         Parameters
@@ -59,11 +60,15 @@ def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimize
             The torch data loader to use for validation when fitting the model, default is None
         early_stopper : EarlyStopper
             The EarlyStopper object for halting fitting when performing validation, default is None
+        checkpoints_dir : str
+            The local folder location where model epoch checkpoints are read from and written to, default is None
+        load_epoch_checkpoint : int
+            The epoch checkpoint to load and start from, default is None
 
         Returns
         -------
         """
-        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper)
+        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper, checkpoints_dir, load_epoch_checkpoint)
 
     @beartype
     def validate(self, device:torch.device, dataloader:torch.utils.data.DataLoader, criterion:torch.nn.CrossEntropyLoss) -> tuple:
diff --git a/model/torch/VGG16.py b/model/torch/VGG16.py
index 3b416e2..11d9d5f 100644
--- a/model/torch/VGG16.py
+++ b/model/torch/VGG16.py
@@ -15,6 +15,7 @@ class VGG16(nn.Module):
 
     def __init__(self, num_classes=1000):
         super(VGG16, self).__init__()
+        self.model_id = "VGG16"
         self.features = nn.Sequential(
             # first convulation and pooling layer
             nn.Conv2d(in_channels=3, out_channels=64, kernel_size=(3, 3), stride=(1, 1), padding='same'),
@@ -85,7 +86,7 @@ def forward(self, x):
         return x
 
     @beartype
-    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None):
+    def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimizer:torch.optim.SGD, train_dataloader:torch.utils.data.DataLoader, num_epochs:int=4, scheduler:Union[torch.optim.lr_scheduler.ReduceLROnPlateau,None]=None, valid_dataLoader:Union[torch.utils.data.DataLoader,None]=None, early_stopper:Union[EarlyStopper,None]=None, checkpoints_dir:Union[str,None]=None, load_epoch_checkpoint:Union[int,None]=None):
         """Fits model to specified data loader given the criterion and optimizer
 
         Parameters
@@ -106,11 +107,15 @@ def fit(self, device:torch.device, criterion:torch.nn.CrossEntropyLoss, optimize
             The torch data loader to use for validation when fitting the model, default is None
         early_stopper : EarlyStopper
             The EarlyStopper object for halting fitting when performing validation, default is None
+        checkpoints_dir : str
+            The local folder location where model epoch checkpoints are read from and written to, default is None
+        load_epoch_checkpoint : int
+            The epoch checkpoint to load and start from, default is None
 
         Returns
         -------
         """
-        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper)
+        self, self.model_fit = fit_module(self, device, criterion, optimizer, train_dataloader, num_epochs, scheduler, valid_dataLoader, early_stopper, checkpoints_dir, load_epoch_checkpoint)
 
     @beartype
     def validate(self, device:torch.device, dataloader:torch.utils.data.DataLoader, criterion:torch.nn.CrossEntropyLoss) -> tuple:
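Tying the series together: every architecture now exposes a model_id, and fit() in model/torch/fit.py composes the per-epoch checkpoint path from it. A short illustrative sketch; the directory name is hypothetical, while the filename pattern is the one fit() uses:

    import os
    from model.torch.AlexNet8 import AlexNet8

    model = AlexNet8(num_classes=2)
    checkpoints_dir = "checkpoints"  # hypothetical directory
    epoch = 3
    # the naming convention used by fit() when writing checkpoints
    checkpoints_filename = f"checkpoint_{model.model_id}_epoch_{epoch}.pt"
    print(os.path.join(checkpoints_dir, checkpoints_filename))
    # -> checkpoints/checkpoint_AlexNet8_epoch_3.pt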