Merged

30 commits
00b0800
Calling --gpus all and passing check gpu env variable
oislen Feb 13, 2025
3dd4c08
Added extra logging and testing with AlexNet8
oislen Feb 13, 2025
7c9444c
Restructured code and testing with LeNet5
oislen Feb 13, 2025
3b7ce0c
Moving labels to device
oislen Feb 13, 2025
d7328f7
Reverted frac to 0.05
oislen Feb 13, 2025
13e3bb5
Training with VGG16
oislen Feb 13, 2025
a7a681a
Removed cpu usage
oislen Feb 13, 2025
4c7efa3
Added extra logging
oislen Feb 14, 2025
bfa3662
Unpacking single value
oislen Feb 14, 2025
2f93d3b
Created a new load images script using PIL image
oislen Feb 14, 2025
944bc2e
Moved torch transform operations outside of custom data set
oislen Feb 14, 2025
1764349
Moved torch transform operations outside of custom data set. Added lo…
oislen Feb 14, 2025
d6caf0c
Added comment for iterating over dataset instead of dataloader
oislen Feb 14, 2025
bfa14bf
Further reduced data size based on multiples of the batch size
oislen Feb 14, 2025
d3b742a
Applying deep copy
oislen Feb 14, 2025
5ef1844
Defaulting n workers to 0
oislen Feb 15, 2025
8d502b6
Fixed filenames extraction
oislen Feb 15, 2025
69d2e00
Removed comment to iterate over dataset object within data loader
oislen Feb 15, 2025
46646fa
Integrated __getitems__ and collate_fn into CustomDataset
oislen Feb 15, 2025
d1512cf
Fixed dataloader latency issue
oislen Feb 15, 2025
605ccff
Created class for loading and applying torch transformation over sing…
oislen Feb 16, 2025
1f2f9c8
Increased sample size to 15k images. Using new load torch data object.
oislen Feb 16, 2025
dc154ab
Added functions to handle torch checkpoint read and writes
oislen Feb 16, 2025
5915d91
Deleting dataframes to restore memory. Moved generate plots to after …
oislen Feb 16, 2025
e56da3a
Calling fit with model checkpoints and load epoch checkpoint arguments
oislen Feb 16, 2025
b394fed
Added logic to write model training epoch checkpoints to specified lo…
oislen Feb 16, 2025
65dd8f9
Added try except to catch torch transform errors
oislen Feb 16, 2025
613c12d
Added comments to LeNet5 and using VGG16
oislen Feb 16, 2025
29d7b0f
Clearing more objects from memory; set sample size to 30k. Shuffling i…
oislen Feb 16, 2025
7ea3265
Added argument calls to checkpoint directory and load epoch checkpoin…
oislen Feb 16, 2025
2 changes: 1 addition & 1 deletion aws/linux_docker_setup.sh
@@ -97,6 +97,6 @@ cat ~/.creds/docker | docker login --username oislen --password-stdin
docker pull $docker_image
# run pulled docker container
#docker run --shm-size=512m -p 8889:8888 -it $docker_image
-docker run --name $docker_container_name --shm-size=512m --publish 8888:8888 --volume /home/ubuntu/CatClassifier/.creds:/home/ubuntu/CatClassifier/.creds --volume /home/ubuntu/CatClassifier/report:/home/ubuntu/CatClassifier/report --rm -it --entrypoint bash $docker_image
+docker run --name $docker_container_name --shm-size=512m --publish 8888:8888 --volume /home/ubuntu/CatClassifier/.creds:/home/ubuntu/CatClassifier/.creds --volume /home/ubuntu/CatClassifier/report:/home/ubuntu/CatClassifier/report --gpus all --env PARAM_CHECK_GPU=True -it --entrypoint bash $docker_image
#docker run --shm-size=512m -p 8889:8888 -d $docker_image
#docker run -it -d <container_id_or_name> /bin/bash
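The `--gpus all` flag only exposes the host GPU to the container; whether training actually uses it is gated by the `PARAM_CHECK_GPU` environment variable read in `model/cons.py` (changed below). A quick in-container sanity check, as a standalone sketch that is not part of the repository:

```python
# run inside the container started above to confirm the GPU is visible
import torch

print(torch.cuda.is_available())           # expect True when run with --gpus all
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))   # prints the host GPU model
```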
88 changes: 88 additions & 0 deletions model/arch/load_image_v2.py
@@ -0,0 +1,88 @@

import pandas as pd
import numpy as np
import re
import os
import torch
from PIL import Image
from copy import deepcopy
from multiprocessing import Pool

def load_image_v2(image_fpaths):
    """Load image files from disk into a pandas Series of in-memory PIL images."""
    images = []
    for image_fpath in image_fpaths:
        temp = Image.open(image_fpath)
        # deep copy the pixel data so the underlying file handle can be closed
        keep = deepcopy(temp)
        images.append(keep)
        temp.close()
    return pd.Series(images)

class TorchLoadImages:
    """Loads image files and applies torch transforms, optionally in parallel."""

    def __init__(self, torch_transforms, n_workers=None):
        self.torch_transforms = torch_transforms
        self.n_workers = n_workers

    def loadImage(self, filepath):
        """Load a single image file and return a record of its metadata, label and tensors."""
        # determine the filename and source
        fileName = os.path.basename(filepath)
        # determine label from image file path
        if ("cat" in fileName) or ("dog" in fileName):
            fileSource = "kaggle" if len(re.findall(pattern=r'^(cat|dog)(\.[0-9]+\.jpg)$', string=fileName)) > 0 else "webscraper"
            labelName = fileName.split(".")[0]
            label = labelName == "dog"
            labelTensor = torch.tensor(label, dtype=torch.int64)
        else:
            fileSource = "kaggle"
            labelName = np.nan
            label = np.nan
            labelTensor = torch.tensor(label)
        # load image file and apply torch transforms, capturing any errors
        image = Image.open(filepath)
        torch_transform_error = None
        try:
            imageTensor = self.torch_transforms(image)
        except Exception as e:
            imageTensor = None
            torch_transform_error = str(e)
        imageArray = np.asarray(image)
        # close the open image once its pixel data has been copied out
        image.close()
        nDims = len(imageArray.shape)
        # create an output record
        record = {
            "filepaths": filepath,
            "filenames": fileName,
            "source": fileSource,
            "categoryname": labelName,
            "category": label,
            "images": imageArray,
            "ndims": nDims,
            "category_tensors": labelTensor,
            "image_tensors": imageTensor,
            "torch_transform_error": torch_transform_error
        }
        return record

    def multiProcess(self, func, args):
        """Map a function over arguments using a multiprocessing pool."""
        with Pool(self.n_workers) as pool:
            results = pool.map(func, args)
        return results

    def loadImages(self, filepaths):
        """Load a collection of image files, serially or via a worker pool."""
        if self.n_workers is None:
            records = [self.loadImage(filepath) for filepath in filepaths]
        else:
            records = self.multiProcess(self.loadImage, filepaths)
        return records
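For context, here is how `TorchLoadImages` is meant to be driven, mirroring its usage later in this diff; the transform sizes and directory path below are placeholders, not values from the repo:

```python
import os
import pandas as pd
from torchvision import transforms
from model.arch.load_image_v2 import TorchLoadImages

# placeholder transform pipeline and image directory for illustration
torch_transforms = transforms.Compose([
    transforms.Resize(size=[128, 128]),
    transforms.ToTensor(),
])
train_fdir = "data/train"

# load images serially (n_workers=None) into a dataframe of records
torchLoadImages = TorchLoadImages(torch_transforms=torch_transforms, n_workers=None)
filepaths = [os.path.join(train_fdir, x) for x in os.listdir(train_fdir)]
df = pd.DataFrame.from_records(torchLoadImages.loadImages(filepaths))

# keep only 3-channel images and surface any transform failures
df = df.loc[df["ndims"] == 3, :]
print(df["torch_transform_error"].dropna())
```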
2 changes: 1 addition & 1 deletion model/cons.py
@@ -78,5 +78,5 @@
shuffle = False

# multiprocessing
-num_workers = os.environ.get("PARAM_NUM_WORKERS", os.cpu_count())
+num_workers = os.environ.get("PARAM_NUM_WORKERS", 0)
check_gpu = os.environ.get("PARAM_CHECK_GPU", False)
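One caveat with these defaults: `os.environ.get` returns strings, so a container that sets `PARAM_NUM_WORKERS=4` hands `DataLoader` the string `"4"`, and `PARAM_CHECK_GPU=True` (as in the docker run above) arrives as the truthy string `"True"` — which happens to work, but so would `"False"`. A defensive parsing sketch, not what the file currently does:

```python
import os

# cast the worker count to int so DataLoader(num_workers=...) accepts it
num_workers = int(os.environ.get("PARAM_NUM_WORKERS", 0))
# parse the GPU flag explicitly; any value other than "true" disables it
check_gpu = os.environ.get("PARAM_CHECK_GPU", "False").strip().lower() == "true"
```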
13 changes: 9 additions & 4 deletions model/prg_keras_model.py
@@ -10,6 +10,7 @@
from model.utilities.plot_image import plot_image
from model.utilities.plot_generator import plot_generator
from model.utilities.plot_preds import plot_preds
+from model.keras.LeNet5 import LeNet5
from model.keras.AlexNet8 import AlexNet8
from model.keras.VGG16_pretrained import VGG16_pretrained
from model.utilities.plot_model import plot_model_fit
@@ -42,6 +43,7 @@
df = pd.DataFrame({'filename': filenames, 'category': categories})
df["category"] = df["category"].replace(cons.category_mapper)
df['source'] = df['filename'].str.contains(pat='[cat|dog].[0-9]+.jpg', regex=True).map({True:'kaggle', False:'webscraper'})
logging.info(f"df.shape: {df.shape}")
timeLogger.logTime(parentKey="DataPrep", subKey="TrainDataLoad")

logging.info("Plot sample image...")
@@ -60,6 +62,8 @@
# set data constants
total_train = train_df.shape[0]
total_validate = validate_df.shape[0]
logging.info(f"train_df.shape: {train_df.shape}")
logging.info(f"validate_df.shape: {validate_df.shape}")
timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationSplit")

logging.info("Creating training and validation data generators...")
@@ -80,8 +84,9 @@

logging.info("Initiate keras model...")
# initiate cnn architecture
-keras_model = AlexNet8(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
-#keras_model = VGG16_pretrained(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
+#keras_model = LeNet5(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
+#keras_model = AlexNet8(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
+keras_model = VGG16_pretrained(input_shape=cons.input_shape, n_classes=2, output_activation='softmax')
keras_model.summary()
# set gradient decent compiler
optimizer = optimizers.SGD(learning_rate=cons.learning_rate)
@@ -129,7 +134,7 @@
# prepare test data
test_filenames = os.listdir(cons.test_fdir)
test_df = pd.DataFrame({'filename': test_filenames})
-nb_samples = test_df.shape[0]
+logging.info(f"test_df.shape: {test_df.shape}")
timeLogger.logTime(parentKey="TestSet", subKey="RawLoad")

logging.info("Create test data generator...")
@@ -140,7 +145,7 @@

logging.info("Generate test set predictions...")
# make test set predictions
-predict = keras_model.predict(test_generator, steps=int(np.ceil(nb_samples/cons.batch_size)))
+predict = keras_model.predict(test_generator, steps=int(np.ceil(test_df.shape[0]/cons.batch_size)))
test_df['category'] = np.argmax(predict, axis=-1)
test_df['category'] = test_df['category'].replace(cons.category_mapper)
timeLogger.logTime(parentKey="TestSet", subKey="ModelPredictions")
115 changes: 68 additions & 47 deletions model/prg_torch_model.py
@@ -3,8 +3,6 @@
import logging
import pandas as pd
import numpy as np
-import random
-from PIL import Image

# set huggingface hub directory
huggingface_hub_dir = 'E:\\huggingface'
@@ -16,12 +14,12 @@
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
-from tensorflow.keras.preprocessing.image import load_img

# load custom scripts
import cons
from model.torch.VGG16_pretrained import VGG16_pretrained
from model.torch.AlexNet8 import AlexNet8
+from model.torch.LeNet5 import LeNet5
from model.torch.CustomDataset import CustomDataset
from model.torch.EarlyStopper import EarlyStopper
from model.utilities.plot_model import plot_model_fit
@@ -30,13 +28,13 @@
from model.utilities.plot_generator import plot_generator
from model.utilities.TimeIt import TimeIt
from model.utilities.commandline_interface import commandline_interface

-# hyper-parameters
-num_epochs = cons.min_epochs if cons.FAST_RUN else cons.max_epochs
+from model.arch.load_image_v2 import load_image_v2, TorchLoadImages

# device configuration
device = torch.device('cuda' if torch.cuda.is_available() and cons.check_gpu else 'cpu')

+random_state = 42

torch_transforms = transforms.Compose([
transforms.Resize(size=[cons.IMAGE_WIDTH, cons.IMAGE_HEIGHT]) # resize the input image to a uniform size
#,transforms.RandomRotation(30)
@@ -53,72 +51,91 @@
lgr.setLevel(logging.INFO)
timeLogger = TimeIt()

logging.info("Parsing command line arguments...")
# handle input parameters
input_params_dict = commandline_interface()

logging.info(input_params_dict)
timeLogger.logTime(parentKey="Initialisation", subKey="CommandlineArguments")

if input_params_dict["run_model_training"]:

logging.info("Generating dataframe of images...")
-# create a dataframe of filenames and categories
-filenames = os.listdir(cons.train_fdir)
-categories = [1 if filename.split('.')[0] == 'dog' else 0 for filename in filenames]
-df = pd.DataFrame({'filename': filenames, 'category': categories})
-frac = 0.05
-df = df.sample(frac = frac)
-df["categoryname"] = df["category"].replace(cons.category_mapper)
-df['source'] = df['filename'].str.contains(pat='[cat|dog].[0-9]+.jpg', regex=True).map({True:'kaggle', False:'webscraper'})
-df["filepath"] = cons.train_fdir + '/' + df['filename']
-df["ndims"] = df['filepath'].apply(lambda x: len(np.array(Image.open(x)).shape))
-df = df.loc[df["ndims"] == 3, :].copy()
+# load and shuffle the image file paths
+np.random.seed(random_state)
+image_filepaths=np.array([os.path.join(cons.train_fdir, x) for x in os.listdir(cons.train_fdir)])
+np.random.shuffle(image_filepaths)
+# create torch load images object
+sample_size = 30000
+torchLoadImages = TorchLoadImages(torch_transforms=torch_transforms, n_workers=None)
+df = pd.DataFrame.from_records(torchLoadImages.loadImages(image_filepaths[0:sample_size]))
+# only consider images with 3 dimensions
+df = df.loc[df["ndims"]==3, :]
+# flush data from memory
+del image_filepaths
+logging.info(f"df.shape: {df.shape}")
timeLogger.logTime(parentKey="DataPrep", subKey="TrainDataLoad")

logging.info("Plot sample image...")
# random image plot
-sample = random.choice(filenames)
-image = load_img(os.path.join(cons.train_fdir, sample))
-plot_image(image, output_fpath=cons.torch_random_image_fpath, show_plot=False)
+plot_image(df['images'].values[1], output_fpath=cons.torch_random_image_fpath, show_plot=False)
timeLogger.logTime(parentKey="Plots", subKey="SampleImage")

logging.info("Plot example data loader images...")
# data generator example
plot_generator(generator=df['image_tensors'].values[:16].tolist(), mode='torch', output_fpath=cons.torch_generator_plot_fpath, show_plot=False)
timeLogger.logTime(parentKey="Plots", subKey="DataLoader")

logging.info("Split into training, validation and test dataset...")
# prepare data
-random_state = 42
-validate_df = df[df['source'] == 'kaggle'].sample(n=int(5000 * frac), random_state=random_state)
+validate_df = df.sample(frac=0.05, random_state=random_state, replace=False)
train_df = df[~df.index.isin(validate_df.index)]
train_df = train_df.reset_index(drop=True)
validate_df = validate_df.reset_index(drop=True)
logging.info(f"train_df.shape: {train_df.shape}")
logging.info(f"validate_df.shape: {validate_df.shape}")
timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationSplit")

logging.info("Creating training and validation data loaders...")
# set data constants
total_train = train_df.shape[0]
total_validate = validate_df.shape[0]
# set train data loader
-train_dataset = CustomDataset(train_df, transforms=torch_transforms, mode='train')
-train_loader = DataLoader(train_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True)
+train_dataset = CustomDataset(train_df)
+train_loader = DataLoader(train_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True, collate_fn=CustomDataset.collate_fn)
# set validation data loader
-validation_dataset = CustomDataset(train_df, transforms=torch_transforms, mode='train')
-validation_loader = DataLoader(validation_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True)
+validation_dataset = CustomDataset(validate_df)
+validation_loader = DataLoader(validation_dataset, batch_size=cons.batch_size, shuffle=True, num_workers=cons.num_workers, pin_memory=True, collate_fn=CustomDataset.collate_fn)
+# flush data from memory
+del df
+del train_df
+del train_dataset
+del validate_df
+del validation_dataset
timeLogger.logTime(parentKey="DataPrep", subKey="TrainValidationDataLoaders")

logging.info("Plot example data loader images...")
# datagen example
example_generator = [(image.detach().numpy(), None) for images, labels in train_loader for image in images]
plot_generator(generator=example_generator, mode='torch', output_fpath=cons.torch_generator_plot_fpath, show_plot=False)
timeLogger.logTime(parentKey="Plots", subKey="DataLoader")

logging.info("Initiate torch model...")
logging.info(f"device: {device}")
# initiate cnn architecture
#model = LeNet5(num_classes=2)
#model = AlexNet8(num_classes=2).to(device)
model = VGG16_pretrained(num_classes=2).to(device)
if device == "cuda":
model = nn.DataParallel(model)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=cons.learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, threshold=0.0001, threshold_mode='abs')
early_stopper = EarlyStopper(patience=3, min_delta=0.3)
timeLogger.logTime(parentKey="Modelling", subKey="InitiateTorchModel")

logging.info("Fit torch model...")
+# hyper-parameters
+num_epochs = cons.min_epochs if cons.FAST_RUN else cons.max_epochs
# fit torch model
-model.fit(device=device, criterion=criterion, optimizer=optimizer, train_dataloader=train_loader, num_epochs=num_epochs, scheduler=scheduler, valid_dataLoader=validation_loader, early_stopper=early_stopper)
+model.fit(device=device, criterion=criterion, optimizer=optimizer, train_dataloader=train_loader, num_epochs=num_epochs, scheduler=scheduler, valid_dataLoader=validation_loader, early_stopper=early_stopper, checkpoints_dir=cons.checkpoints_fdir, load_epoch_checkpoint=None)
+# flush data from memory
+del train_loader
+del validation_loader
timeLogger.logTime(parentKey="Modelling", subKey="Fit")

logging.info("Plot model fit results...")
@@ -130,37 +147,38 @@
# save model
model.save(output_fpath=cons.torch_model_pt_fpath)
timeLogger.logTime(parentKey="ModelSerialisation", subKey="Write")

if input_params_dict["run_testset_prediction"]:

logging.info("Load fitted torch model from disk...")
# load model
#model = LeNet5(num_classes=2).to(device)
#model = AlexNet8(num_classes=2).to(device)
model = VGG16_pretrained(num_classes=2).to(device)
model.load(input_fpath=cons.torch_model_pt_fpath)
timeLogger.logTime(parentKey="ModelSerialisation", subKey="Load")

logging.info("Generate test dataset...")
# prepare test data
-test_filenames = os.listdir(cons.test_fdir)
-test_df = pd.DataFrame({'filename': test_filenames})
-test_df["filepath"] = cons.test_fdir + '/' + test_df['filename']
-test_df["idx"] = test_df['filename'].str.extract(pat='([0-9]+)').astype(int)
-test_df = test_df.set_index('idx').sort_index()
-nb_samples = test_df.shape[0]
+# create torch load images object
+torchLoadImages = TorchLoadImages(torch_transforms=torch_transforms, n_workers=None)
+test_df = pd.DataFrame.from_records(torchLoadImages.loadImages(filepaths=[os.path.join(cons.test_fdir, x) for x in os.listdir(cons.test_fdir)]))
+logging.info(f"test_df.shape: {test_df.shape}")
timeLogger.logTime(parentKey="TestSet", subKey="RawLoad")

logging.info("Create test dataloader...")
# set test data loader
-test_dataset = CustomDataset(test_df, transforms=torch_transforms, mode='test')
-test_loader = DataLoader(test_dataset, batch_size=cons.batch_size, shuffle=False, num_workers=cons.num_workers, pin_memory=True)
+test_dataset = CustomDataset(test_df)
+test_loader = DataLoader(test_dataset, batch_size=cons.batch_size, shuffle=False, num_workers=cons.num_workers, pin_memory=True, collate_fn=CustomDataset.collate_fn)
timeLogger.logTime(parentKey="TestSet", subKey="DataLoader")

logging.info("Generate test set predictions...")
# make test set predictions
predict = model.predict(test_loader, device)
test_df['category'] = np.argmax(predict, axis=-1)
test_df["category"] = test_df["category"].replace(cons.category_mapper)
+# flush data from memory
+del test_dataset
+del test_loader
timeLogger.logTime(parentKey="TestSet", subKey="ModelPredictions")

logging.info("Plot example test set predictions...")
@@ -171,7 +189,10 @@
logging.info("Generate a sample submission file for kaggle...")
# make submission
submission_df = test_df.copy()
-submission_df['id'] = submission_df['filename'].str.split('.').str[0]
+submission_df['id'] = submission_df['filenames'].str.split('.').str[0]
submission_df['label'] = submission_df['category'].replace(cons.category_mapper)
submission_df.to_csv(cons.submission_csv_fpath, index=False)
+# delete dataframes from memory
+del test_df
+del submission_df
timeLogger.logTime(parentKey="TestSet", subKey="SubmissionFile")