diff --git a/CNN-TS/M4-Method-Description-CNN-TS.pdf b/CNN-TS/M4-Method-Description-CNN-TS.pdf
new file mode 100644
index 0000000..5997c54
Binary files /dev/null and b/CNN-TS/M4-Method-Description-CNN-TS.pdf differ
diff --git a/CNN-TS/environment.txt b/CNN-TS/environment.txt
new file mode 100644
index 0000000..0baa8e7
--- /dev/null
+++ b/CNN-TS/environment.txt
@@ -0,0 +1,163 @@
+absl-py==0.2.0
+alabaster==0.7.10
+anaconda-client==1.6.3
+anaconda-navigator==1.6.2
+anaconda-project==0.6.0
+asn1crypto==0.22.0
+astor==0.6.2
+astroid==1.4.9
+astropy==1.3.2
+Babel==2.4.0
+backports.shutil-get-terminal-size==1.0.0
+beautifulsoup4==4.6.0
+bitarray==0.8.1
+blaze==0.10.1
+bleach==1.5.0
+bokeh==0.12.5
+boto==2.46.1
+Bottleneck==1.2.1
+cffi==1.10.0
+chardet==3.0.3
+click==6.7
+cloudpickle==0.2.2
+clyent==1.2.2
+colorama==0.3.9
+comtypes==1.1.2
+contextlib2==0.5.5
+cryptography==1.8.1
+cycler==0.10.0
+Cython==0.25.2
+cytoolz==0.8.2
+dask==0.14.3
+datashape==0.5.4
+decorator==4.0.11
+distributed==1.16.3
+docutils==0.13.1
+entrypoints==0.2.2
+et-xmlfile==1.0.1
+fastcache==1.0.2
+Flask==0.12.2
+Flask-Cors==3.0.2
+gast==0.2.0
+gevent==1.2.1
+greenlet==0.4.12
+grpcio==1.11.0
+h5py==2.7.0
+HeapDict==1.0.0
+html5lib==0.9999999
+idna==2.5
+imagesize==0.7.1
+ipykernel==4.6.1
+ipython==5.3.0
+ipython-genutils==0.2.0
+ipywidgets==6.0.0
+isort==4.2.5
+itsdangerous==0.24
+jdcal==1.3
+jedi==0.10.2
+Jinja2==2.9.6
+jsonschema==2.6.0
+jupyter==1.0.0
+jupyter-client==5.0.1
+jupyter-console==5.1.0
+jupyter-core==4.3.0
+Keras==2.1.6
+lazy-object-proxy==1.2.2
+lightgbm==2.1.1
+llvmlite==0.18.0
+locket==0.2.0
+lxml==3.7.3
+Markdown==2.6.11
+MarkupSafe==0.23
+matplotlib==2.0.2
+menuinst==1.4.7
+mistune==0.7.4
+mpmath==0.19
+msgpack-python==0.4.8
+multipledispatch==0.4.9
+navigator-updater==0.1.0
+nbconvert==5.1.1
+nbformat==4.3.0
+networkx==1.11
+nltk==3.2.3
+nose==1.3.7
+notebook==5.0.0
+numba==0.33.0
+numexpr==2.6.2
+numpy==1.14.3
+numpydoc==0.6.0
+odo==0.5.0
+olefile==0.44
+openpyxl==2.4.7
+packaging==16.8
+pandas==0.20.1
+pandocfilters==1.4.1
+partd==0.3.8
+path.py==10.3.1
+pathlib2==2.2.1
+patsy==0.4.1
+pep8==1.7.0
+pickleshare==0.7.4
+Pillow==4.1.1
+ply==3.10
+prompt-toolkit==1.0.14
+protobuf==3.5.2.post1
+psutil==5.2.2
+py==1.4.33
+pycosat==0.6.2
+pycparser==2.17
+pycrypto==2.6.1
+pycurl==7.43.0
+pyflakes==1.5.0
+Pygments==2.2.0
+pylint==1.6.4
+pyodbc==4.0.16
+pyOpenSSL==17.0.0
+pyparsing==2.1.4
+pytest==3.0.7
+python-dateutil==2.6.0
+pytz==2017.2
+PyWavelets==0.5.2
+pywin32==220
+PyYAML==3.12
+pyzmq==16.0.2
+QtAwesome==0.4.4
+qtconsole==4.3.0
+QtPy==1.2.1
+requests==2.14.2
+rope-py3k==0.9.4.post1
+scikit-image==0.13.0
+scikit-learn==0.18.1
+scipy==0.19.0
+seaborn==0.7.1
+simplegeneric==0.8.1
+singledispatch==3.4.0.3
+six==1.11.0
+snowballstemmer==1.2.1
+sortedcollections==0.5.3
+sortedcontainers==1.5.7
+sphinx==1.5.6
+spyder==3.1.4
+SQLAlchemy==1.1.9
+statsmodels==0.8.0
+sympy==1.0
+tables==3.2.2
+tblib==1.3.2
+tensorboard==1.8.0
+tensorflow==1.8.0
+termcolor==1.1.0
+testpath==0.3
+toolz==0.8.2
+tornado==4.5.1
+traitlets==4.3.2
+unicodecsv==0.14.1
+wcwidth==0.1.7
+Werkzeug==0.14.1
+widgetsnbextension==2.0.0
+win-unicode-console==0.5
+wrapt==1.10.10
+xlrd==1.0.0
+XlsxWriter==0.9.6
+xlwings==0.10.4
+xlwt==1.2.0
+zict==0.1.2
diff --git a/CNN-TS/metrics.py b/CNN-TS/metrics.py
new file mode 100644
index 0000000..f082303
--- /dev/null
+++ b/CNN-TS/metrics.py
@@ -0,0 +1,38 @@
+"""Evaluation metrics, taken from the competition Github repo
+https://github.com/M4Competition/M4-methods
+The factor of 2 in SMAPE has been changed to 200 to be consistent with the R code.
+"""
+import numpy as np
+
+
+
+def smape(a, b):
+    """
+    Calculates sMAPE
+
+    :param a: actual values
+    :param b: predicted values
+    :return: sMAPE
+    """
+    a = np.reshape(a, (-1,))
+    b = np.reshape(b, (-1,))
+    return np.mean(200.0 * np.abs(a - b) / (np.abs(a) + np.abs(b))).item()  # BT changed 2 to 200
+
+
+def mase(insample, y_test, y_hat_test, freq):
+    """
+    Calculates MASE
+
+    :param insample: insample data
+    :param y_test: out of sample target values
+    :param y_hat_test: predicted values
+    :param freq: data frequency
+    :return: MASE
+    """
+    y_hat_naive = []
+    for i in range(freq, len(insample)):
+        y_hat_naive.append(insample[(i - freq)])
+
+    masep = np.mean(abs(insample[freq:] - y_hat_naive))
+
+    return np.mean(abs(y_test - y_hat_test)) / masep
diff --git a/CNN-TS/model_definitions.py b/CNN-TS/model_definitions.py
new file mode 100644
index 0000000..c7937b2
--- /dev/null
+++ b/CNN-TS/model_definitions.py
@@ -0,0 +1,179 @@
+
+import keras as ks
+import pandas as pd
+import numpy as np
+import metrics
+from collections import namedtuple
+import tensorflow as tf
+import datetime as dt
+import os
+from utilities import *
+
+
+def yearly_model(series_length):
+    input = ks.layers.Input((series_length,))
+    yearly_input = ks.layers.Reshape((series_length, 1))(input)
+    yearly_hidden1 = ks.layers.Dense(units=50, activation='relu')(ks.layers.Flatten()(yearly_input))
+    yearly_hidden2 = ks.layers.Dense(units=50, activation='relu')(yearly_hidden1)
+    yearly_output = ks.layers.Dense(units=1, activation='linear')(yearly_hidden2)
+    est = ks.Model(inputs=input, outputs=yearly_output)
+    est.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse', metrics=['mse'])
+    epochs = 250
+    batch_size = 1000
+    return est, epochs, batch_size
+
+
+def quarterly_model(series_length):
+    input = ks.layers.Input((series_length,))
+    yearly_input = ks.layers.Reshape((series_length, 1))(input)
+    yearly_avg = ks.layers.AveragePooling1D(pool_size=4, strides=4, padding='valid')(yearly_input)
+    yearly_hidden1 = ks.layers.Dense(units=50, activation='relu')(ks.layers.Flatten()(yearly_avg))
+    yearly_hidden2 = ks.layers.Dense(units=50, activation='relu')(yearly_hidden1)
+    yearly_output = ks.layers.Dense(units=1, activation='linear')(yearly_hidden2)
+
+    yearly_avg_up = ks.layers.UpSampling1D(size=4)(yearly_avg)
+    periodic_diff = ks.layers.Subtract()([input, ks.layers.Flatten()(yearly_avg_up)])
+    periodic_input = ks.layers.Reshape((series_length // 4, 4, 1))(periodic_diff)
+    periodic_conv = ks.layers.Conv2D(filters=3, kernel_size=(1, 4), strides=(1, 4),
+                                     padding='valid')(periodic_input)
+    periodic_hidden1 = ks.layers.Dense(units=50, activation='relu')(ks.layers.Flatten()(periodic_conv))
+    periodic_hidden2 = ks.layers.Dense(units=50, activation='relu')(periodic_hidden1)
+    periodic_output = ks.layers.Dense(units=1, activation='linear')(periodic_hidden2)
+    output = ks.layers.Add()([yearly_output, periodic_output])
+    est = ks.Model(inputs=input, outputs=output)
+    est.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse', metrics=['mse'])
+    epochs = 250
+    batch_size = 1000
+    return est, epochs, batch_size
+
+
+def monthly_model(series_length):
+    input = ks.layers.Input((series_length,))
+    yearly_input = ks.layers.Reshape((series_length, 1))(input)
+    yearly_avg = ks.layers.AveragePooling1D(pool_size=12, strides=12, padding='valid')(yearly_input)
+    yearly_hidden1 = ks.layers.Dense(units=20, activation='relu')(ks.layers.Flatten()(yearly_avg))
+    yearly_hidden2 = ks.layers.Dense(units=20, activation='relu')(yearly_hidden1)
+    yearly_output = ks.layers.Dense(units=1, activation='linear')(yearly_hidden2)
+
+    yearly_avg_up = ks.layers.UpSampling1D(size=12)(yearly_avg)
+    periodic_diff = ks.layers.Subtract()([input, ks.layers.Flatten()(yearly_avg_up)])
+    periodic_input = ks.layers.Reshape((series_length // 12, 12, 1))(periodic_diff)
+    periodic_conv = ks.layers.Conv2D(filters=3, kernel_size=(1, 12), strides=(1, 12),
+                                     padding='valid')(periodic_input)
+    periodic_hidden1 = ks.layers.Dense(units=20, activation='relu')(ks.layers.Flatten()(periodic_conv))
+    periodic_hidden2 = ks.layers.Dense(units=20, activation='relu')(periodic_hidden1)
+    periodic_output = ks.layers.Dense(units=1, activation='linear')(periodic_hidden2)
+    output = ks.layers.Add()([yearly_output, periodic_output])
+    est = ks.Model(inputs=input, outputs=output)
+    est.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse', metrics=['mse'])
+    epochs = 250
+    batch_size = 1000
+    return est, epochs, batch_size
+
+
+def weekly_model(series_length):
+    if series_length == 52:
+        input = ks.layers.Input((series_length,))
+        hidden1 = ks.layers.Dense(units=20, activation='relu')(input)
+        hidden2 = ks.layers.Dense(units=20, activation='relu')(hidden1)
+        output = ks.layers.Dense(units=1, activation='linear')(hidden2)
+        est = ks.Model(inputs=input, outputs=output)
+        est.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse', metrics=['mse'])
+        epochs = 250
+        batch_size = 1000
+    else:
+        input = ks.layers.Input((series_length,))
+        yearly_input = ks.layers.Reshape((series_length, 1))(input)
+        yearly_avg = ks.layers.AveragePooling1D(pool_size=52, strides=52, padding='valid')(yearly_input)
+        yearly_hidden1 = ks.layers.Dense(units=20, activation='relu')(ks.layers.Flatten()(yearly_avg))
+        yearly_hidden2 = ks.layers.Dense(units=20, activation='relu')(yearly_hidden1)
+        yearly_output = ks.layers.Dense(units=1, activation='linear')(yearly_hidden2)
+
+        yearly_avg_up = ks.layers.UpSampling1D(size=52)(yearly_avg)
+        periodic_diff = ks.layers.Subtract()([input, ks.layers.Flatten()(yearly_avg_up)])
+        periodic_input = ks.layers.Reshape((series_length // 52, 52, 1))(periodic_diff)
+        periodic_conv = ks.layers.Conv2D(filters=3, kernel_size=(1, 52), strides=(1, 52),
+                                         padding='valid')(periodic_input)
+        periodic_hidden1 = ks.layers.Dense(units=20, activation='relu')(ks.layers.Flatten()(periodic_conv))
+        periodic_hidden2 = ks.layers.Dense(units=20, activation='relu')(periodic_hidden1)
+        periodic_output = ks.layers.Dense(units=1, activation='linear')(periodic_hidden2)
+        output = ks.layers.Add()([yearly_output, periodic_output])
+        est = ks.Model(inputs=input, outputs=output)
+        est.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse', metrics=['mse'])
+        epochs = 250
+        batch_size = 1000
+    return est, epochs, batch_size
+
+
+def daily_model(series_length):
+    input = ks.layers.Input((series_length,))
+    weekly_input = ks.layers.Reshape((series_length, 1))(input)
+    weekly_avg = ks.layers.AveragePooling1D(pool_size=7, strides=7, padding='valid')(weekly_input)
+    weekly_hidden1 = ks.layers.Dense(units=20, activation='relu')(ks.layers.Flatten()(weekly_avg))
+    weekly_hidden2 = ks.layers.Dense(units=20, activation='relu')(weekly_hidden1)
+    weekly_output = ks.layers.Dense(units=1, activation='linear')(weekly_hidden2)
+
+    weekly_avg_up = ks.layers.UpSampling1D(size=7)(weekly_avg)
+    periodic_diff = ks.layers.Subtract()([input, ks.layers.Flatten()(weekly_avg_up)])
+    periodic_input = ks.layers.Reshape((series_length // 7, 7, 1))(periodic_diff)
+    periodic_conv = ks.layers.Conv2D(filters=3, kernel_size=(1, 7), strides=(1, 7),
+                                     padding='valid')(periodic_input)
+    periodic_hidden1 = ks.layers.Dense(units=20, activation='relu')(ks.layers.Flatten()(periodic_conv))
+    periodic_hidden2 = ks.layers.Dense(units=20, activation='relu')(periodic_hidden1)
+    periodic_output = ks.layers.Dense(units=1, activation='linear')(periodic_hidden2)
+    output = ks.layers.Add()([weekly_output, periodic_output])
+    est = ks.Model(inputs=input, outputs=output)
+    est.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse', metrics=['mse'])
+    epochs = 250
+    batch_size = 1000
+    return est, epochs, batch_size
+
+
+def hourly_model(series_length):
+    input = ks.layers.Input((series_length,))
+    weekly_input = ks.layers.Reshape((series_length, 1))(input)
+    weekly_avg = ks.layers.AveragePooling1D(pool_size=168, strides=168, padding='valid')(weekly_input)
+    weekly_hidden1 = ks.layers.Dense(units=20, activation='relu')(ks.layers.Flatten()(weekly_avg))
+    weekly_hidden2 = ks.layers.Dense(units=20, activation='relu')(weekly_hidden1)
+    weekly_output = ks.layers.Dense(units=1, activation='linear')(weekly_hidden2)
+    weekly_avg_up = ks.layers.UpSampling1D(size=168)(weekly_avg)
+
+    daily_diff = ks.layers.Subtract()([input, ks.layers.Flatten()(weekly_avg_up)])
+    daily_diff_input = ks.layers.Reshape((series_length // 7, 7, 1))(daily_diff)
+    daily_diff_conv = ks.layers.Conv2D(filters=3, kernel_size=(1, 7), strides=(1, 7),
+                                       padding='valid')(daily_diff_input)
+    daily_diff_hidden1 = ks.layers.Dense(units=20, activation='relu')(ks.layers.Flatten()(daily_diff_conv))
+    daily_diff_hidden2 = ks.layers.Dense(units=20, activation='relu')(daily_diff_hidden1)
+    daily_diff_output = ks.layers.Dense(units=1, activation='linear')(daily_diff_hidden2)
+
+    daily_avg = ks.layers.AveragePooling1D(pool_size=24, strides=24, padding='valid')(weekly_input)
+    daily_avg_up = ks.layers.UpSampling1D(size=24)(daily_avg)
+    hourly_diff = ks.layers.Subtract()([input, ks.layers.Flatten()(daily_avg_up)])
+    hourly_diff_input = ks.layers.Reshape((series_length // 24, 24, 1))(hourly_diff)
+    hourly_diff_conv = ks.layers.Conv2D(filters=3, kernel_size=(1, 24), strides=(1, 24),
+                                        padding='valid')(hourly_diff_input)
+    hourly_diff_hidden1 = ks.layers.Dense(units=20, activation='relu')(ks.layers.Flatten()(hourly_diff_conv))
+    hourly_diff_hidden2 = ks.layers.Dense(units=20, activation='relu')(hourly_diff_hidden1)
+    hourly_diff_output = ks.layers.Dense(units=1, activation='linear')(hourly_diff_hidden2)
+
+    output = ks.layers.Add()([weekly_output, daily_diff_output, hourly_diff_output])
+
+    est = ks.Model(inputs=input, outputs=output)
+    est.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse', metrics=['mse'])
+    epochs = 250
+    batch_size = 1000
+    return est, epochs, batch_size
+
+
+def get_model_params():
+    # define the model parameters
+    Model = namedtuple('Model', ['freq_name', 'horizon', 'freq', 'model_constructor', 'training_lengths',
+                                 'cycle_length', 'augment'])
+    yearly = Model('Yearly', 6, 1, yearly_model, [10, 20, 30], 1, False)
+    quarterly = Model('Quarterly', 8, 4, quarterly_model, [20, 48, 100], 4, False)
+    monthly = Model('Monthly', 18, 12, monthly_model, [48, 120, 240], 12, False)
+    weekly = Model('Weekly', 13, 1, weekly_model, [52, 520, 1040], 52, True)
+    daily = Model('Daily', 14, 1, daily_model, [98], 7, True)
+    hourly = Model('Hourly', 48, 24, hourly_model, [672], 7*24, True)
+    return [yearly, quarterly, monthly, weekly, daily, hourly]
+
diff --git a/CNN-TS/predict.py b/CNN-TS/predict.py
new file mode 100644
index 0000000..44458c6
--- /dev/null
+++ b/CNN-TS/predict.py
@@ -0,0 +1,107 @@
+"""Script to make predictions by loading trained models."""
+
+import keras as ks
+import pandas as pd
+import numpy as np
+import metrics
+from collections import namedtuple
+import tensorflow as tf
+import datetime as dt
+import os
+from scipy import stats
+from utilities import *
+from model_definitions import *
+
+
+model_arr = get_model_params()
+
+fc_arr = {'fc': [], 'lower': [], 'upper': []}
+for m in model_arr:
+
+    # read data
+    series = pd.read_csv(os.path.join('data', '{}-train.csv'.format(m.freq_name)), header=0, index_col=0)
+    num_values = series.notnull().sum(axis=1)
+
+    # dictionary of predictions
+    prediction = dict()
+    valid_prediction = dict()
+
+    # loop over the different training lengths
+    training_lengths = m.training_lengths
+    for series_length in training_lengths:
+        horizon = m.horizon
+
+        # data for prediction
+        train_x = get_base_data(series, series_length, 0, m.cycle_length)
+        train_x, train_mean, train_std = normalise(train_x)
+
+        # data to measure errors calculating the prediction intervals
+        valid_x = get_base_data(series, series_length, horizon, m.cycle_length)[:, :-horizon]
+        valid_x, valid_mean, valid_std = normalise(valid_x)
+
+        # only predict on series that are long enough
+        train_length_ok = np.logical_or(series_length == min(training_lengths), num_values.values >= series_length)
+        valid_length_ok = np.logical_or(series_length == min(training_lengths),
+                                        num_values.values >= series_length + horizon)
+
+        # initialise array for forecast
+        prediction[series_length] = np.zeros([len(train_x), horizon])
+        curr_prediction = prediction[series_length]
+        curr_prediction[:] = np.nan
+
+        # initialise prediction array for training period, used to calculate prediction intervals
+        valid_prediction[series_length] = np.zeros([len(train_x), horizon])
+        curr_valid_prediction = valid_prediction[series_length]
+        curr_valid_prediction[:] = np.nan
+
+        # different models for each future time step
+        for horizon_step in range(horizon):
+            print(m.freq_name, series_length, horizon_step)
+
+            # clear session and reset default graph, as suggested here, to speed up prediction
+            # https://stackoverflow.com/questions/45796167/training-of-keras-model-gets-slower-after-each-repetition
+            ks.backend.clear_session()
+            tf.reset_default_graph()
+
+            # load model and predict
+            model_file = os.path.join('trained_models',
+                                      '{}_length_{}_step_{}.h5'.format(m.freq_name, series_length,
+                                                                       horizon_step))
+            est = ks.models.load_model(model_file)
+            curr_prediction[:, horizon_step] = np.where(train_length_ok, est.predict(train_x).flatten(),
+                                                        curr_prediction[:, horizon_step])
+            curr_valid_prediction[:, horizon_step] \
+                = np.where(valid_length_ok, est.predict(valid_x).flatten(),
+                           curr_valid_prediction[:, horizon_step])
+
+        # denormalise
+        prediction[series_length] = denormalise(prediction[series_length], train_mean, train_std)
+        valid_prediction[series_length] = denormalise(valid_prediction[series_length], valid_mean, valid_std)
+
+    # blend predictions
+    blend_predictions(prediction, num_values.values)
+    blend_predictions(valid_prediction, num_values.values - horizon)
+
+    # calculate prediction intervals
+    actual = get_base_data(series, horizon, 0, m.cycle_length)
+    err = (actual - valid_prediction[-1]) / valid_prediction[-1]
+    lower = np.percentile(err, 2.5, axis=0)
+    upper = np.percentile(err, 97.5, axis=0)
+
+    for fc_type in fc_arr:
+        output = pd.DataFrame(index=series.index, columns=['F' + str(i) for i in range(1, 49)])
+        output.index.name = 'id'
+        if fc_type == 'fc':
+            output.iloc[:, :m.horizon] = prediction[-1]
+        elif fc_type == 'lower':
+            output.iloc[:, :m.horizon] = prediction[-1] * (1 + lower)
+        elif fc_type == 'upper':
+            output.iloc[:, :m.horizon] = prediction[-1] * (1 + upper)
+        fc_arr[fc_type].append(output)
+
+for fc_type in fc_arr:
+    # order as in sample submission and save
+    all_output = pd.concat(fc_arr[fc_type], axis=0)
+    sample = pd.read_csv(os.path.join('data', 'template_Naive.csv'), index_col=0)
+    all_output = all_output.reindex(sample.index)
+    all_output.to_csv('Submission_{}_{}.csv'.format(fc_type, dt.datetime.now().strftime('%y%m%d_%H%M')))
diff --git a/CNN-TS/shuffle.py b/CNN-TS/shuffle.py
new file mode 100644
index 0000000..3e5fb30
--- /dev/null
+++ b/CNN-TS/shuffle.py
@@ -0,0 +1,6 @@
+import pandas as pd
+
+for freq in ['Daily', 'Hourly', 'Monthly', 'Quarterly', 'Weekly', 'Yearly']:
+    df = pd.read_csv('data\\{}-train.csv'.format(freq), header=0, index_col=0)
+    df = df.sample(frac=1, random_state=0)
+    df.to_csv('data\\{}_train_shuffle.csv'.format(freq), index=True)
\ No newline at end of file
diff --git a/CNN-TS/train_models.py b/CNN-TS/train_models.py
new file mode 100644
index 0000000..99a113f
--- /dev/null
+++ b/CNN-TS/train_models.py
@@ -0,0 +1,134 @@
+"""Script to train the models. The training code runs twice: in the first run, 20% of the data is excluded from
+training and used as a test set to estimate model accuracy; in the second run all the data is used for training,
+and the trained models are saved."""
+
+import keras as ks
+import pandas as pd
+import numpy as np
+import metrics
+from collections import namedtuple
+import tensorflow as tf
+import datetime as dt
+import os
+from utilities import *
+from model_definitions import *
+
+
+model_arr = get_model_params()
+log_file = 'log_{}.txt'.format(dt.datetime.now().strftime('%y%m%d_%H%M'))
+
+for test_mode in [True, False]:
+    for m in model_arr:
+
+        # read data
+        series = pd.read_csv(os.path.join('data', '{}_train_shuffle.csv'.format(m.freq_name)), header=0, index_col=0)
+        num_values = series.notnull().sum(axis=1)
+
+        # in test mode, make predictions on holdout set of training data
+        prediction = dict()
+
+        # loop over the different training lengths
+        training_lengths = m.training_lengths
+        for series_length in training_lengths:
+            horizon = m.horizon
+
+            all_data = get_base_data(series, series_length, horizon, m.cycle_length)
+
+            # get extended training set
+            train_x_ext, train_y_ext, weights = get_extended_data(series, series_length, min(training_lengths),
+                                                                  horizon, m.cycle_length, m.augment, test_mode)
+            train_x_ext, train_mean_ext, train_std_ext = normalise(train_x_ext)
+
+            # only train on series that are long enough
+            train_length_ok = np.logical_or(series_length == min(training_lengths),
+                                            num_values.values - horizon >= series_length)
+
+            if test_mode:
+                # initialise prediction array
+                prediction[series_length] = np.zeros([len(all_data), horizon])
+                curr_prediction = prediction[series_length]
+                curr_prediction[:] = np.nan
+
+            # different models for each future time step
+            for horizon_step in range(horizon):
+                print('\n{} model, series length {}, horizon step {}'.format(m.freq_name, series_length, horizon_step))
+
+                train_y = train_y_ext[:, horizon_step]
+                train_y = (train_y - train_mean_ext) / train_std_ext
+
+                # exclude outliers
+                max_y = 5 * np.std(train_y)
+                train_y[train_y > max_y] = max_y
+                train_y[train_y < -max_y] = -max_y
+
+                # clear session and reset default graph, as suggested here, to speed up training
+                # https://stackoverflow.com/questions/45796167/training-of-keras-model-gets-slower-after-each-repetition
+                ks.backend.clear_session()
+                tf.reset_default_graph()
+
+                # set random seeds as described here:
+                # https://keras.io/getting-started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development
+                np.random.seed(0)
+                session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+                tf.set_random_seed(0)
+                tf_session = tf.Session(graph=tf.get_default_graph(), config=session_conf)
+                ks.backend.set_session(tf_session)
+                # end setting random seeds
+
+                # define model and train
+                nn_model_def = m.model_constructor
+                est, epochs, batch_size = nn_model_def(series_length)
+                est.fit(train_x_ext, train_y, epochs=epochs, batch_size=batch_size, shuffle=True, validation_split=0.1,
+                        callbacks=[ks.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=10)])
+
+                if test_mode:
+                    train_x = all_data[:, :series_length]
+                    train_x, train_mean, train_std = normalise(train_x)
+                    curr_prediction[:, horizon_step] = np.where(train_length_ok, est.predict(train_x).flatten(),
+                                                                curr_prediction[:, horizon_step])
+                else:
+                    # save model
+                    if not os.path.exists('trained_models'):
+                        os.mkdir('trained_models')
+                    model_file = os.path.join('trained_models',
+                                              '{}_length_{}_step_{}.h5'.format(m.freq_name, series_length,
+                                                                               horizon_step))
+                    est.save(model_file)
+
+            if test_mode:
+                # denormalise
+                prediction[series_length] = denormalise(prediction[series_length], train_mean, train_std)
+
+        if test_mode:
+
+            blend_predictions(prediction, num_values.values - horizon)
+
+            # evaluate on test set and write to log
+            with open(log_file, 'a') as f:
+                f.write('\n\n{} ACCURACY'.format(m.freq_name.upper()))
+            for series_length in training_lengths:
+                train_length_ok = np.logical_or(series_length == min(training_lengths),
+                                                num_values.values - horizon >= series_length)
+                with open(log_file, 'a') as f:
+                    f.write('\nAccuracy for series with training_length >= {}'.format(series_length))
+                for train_length in training_lengths + [-1]:
+                    if train_length > series_length:
+                        continue
+                    curr_prediction = prediction[train_length]
+                    mase_arr = np.zeros([len(all_data)])
+                    smape_arr = np.zeros(len(all_data))
+                    for i in range(len(all_data)):
+                        row = series.iloc[i].values
+                        row = row[np.logical_not(np.isnan(row))]
+                        mase_arr[i] = metrics.mase(row[:-horizon], row[-horizon:], curr_prediction[i, :], m.freq)
+                        smape_arr[i] = metrics.smape(row[-horizon:], curr_prediction[i, :])
+
+                    last_test_row = int(np.round(len(all_data) * 0.2, 0))
+                    test_period = np.arange(len(all_data)) < last_test_row
+                    bool_ind = train_length_ok & test_period
+                    with open(log_file, 'a') as f:
+                        f.write('\nMASE {}: {}'.format(train_length,
+                                                       np.mean(mase_arr[bool_ind][np.logical_not(np.isinf(mase_arr[bool_ind]))])))
+                        f.write('\nSMAPE {}: {}'.format(train_length,
+                                                        np.mean(smape_arr[bool_ind][np.logical_not(np.isinf(smape_arr[bool_ind]))])))
+
diff --git a/CNN-TS/trained_models.zip b/CNN-TS/trained_models.zip
new file mode 100644
index 0000000..32fe9bc
Binary files /dev/null and b/CNN-TS/trained_models.zip differ
diff --git a/CNN-TS/utilities.py b/CNN-TS/utilities.py
new file mode 100644
index 0000000..8f22eb3
--- /dev/null
+++ b/CNN-TS/utilities.py
@@ -0,0 +1,89 @@
+"""Utility functions used by training and prediction code."""
+
+import pandas as pd
+import numpy as np
+
+
+def get_base_data(series, series_length, horizon, cycle_length):
+    """Get data from series dataframe, padding series that are too short."""
+    all_data = np.zeros([len(series), series_length + horizon])
+    for i in range(len(series)):
+        row = series.iloc[i].values
+        row = row[np.logical_not(np.isnan(row))]
+        if len(row) < series_length + horizon:
+            num_missing = series_length + horizon - len(row)
+            all_data[i, -len(row):] = row[-(series_length + horizon):]
+            if num_missing > cycle_length:
+                all_data[i, :num_missing] = np.mean(row[:-horizon])  # mean of non-test values
+            else:
+                all_data[i, :num_missing] = all_data[i, cycle_length:(cycle_length + num_missing)]  # copy from same period in cycle
+        else:
+            all_data[i, :] = row[-(series_length + horizon):]
+    return all_data
+
+
+def get_extended_data(series, series_length, min_length, horizon, cycle_length, augment=True, test_mode=True):
+    """Get data from series dataframe, padding series that are too short."""
+    ext_arr = []
+    weights = []
+    first_row = int(len(series) * 0.2) if test_mode else 0
+    for i in range(first_row, len(series)):
+        row = series.iloc[i].values
+        row = row[np.logical_not(np.isnan(row))]
+        if len(row) <= series_length + horizon:
+            if series_length == min_length:
+                num_missing = series_length + horizon - len(row)
+                row_to_add = np.zeros([series_length + horizon])
+                row_to_add[-len(row):] = row[-(series_length + horizon):]
+                if num_missing > cycle_length:
+                    row_to_add[:num_missing] = np.mean(row[:-horizon])  # mean of non-test values
+                else:
+                    row_to_add[:num_missing] = row[cycle_length - num_missing:cycle_length]  # copy from same period in cycle
+                ext_arr.append(row_to_add)
+                weights.append(1)
+        else:
+            num_to_add = cycle_length if augment else 1
+            num_extra = min(num_to_add, len(row) - series_length - horizon)
+            for j in range(num_extra):
+                if j == 0:
+                    ext_arr.append(row[-(series_length + horizon):])
+                else:
+                    ext_arr.append(row[-(series_length + horizon + j):-j])
+                weights.append(1 / num_extra)
+    train_ext = np.stack(ext_arr, 0)
+    train_x_ext = train_ext[:, :-horizon]
+    train_y_ext = train_ext[:, -horizon:]
+    weights = np.array(weights)
+    return train_x_ext, train_y_ext, weights
+
+
+def normalise(arr):
+    mean_arr = np.mean(arr, 1)
+    series_length = arr.shape[1]
+    std_arr = np.std(arr, axis=1)
+    std_arr = np.where(std_arr == 0, mean_arr, std_arr)
+    std_arr = np.where(std_arr == 0, np.ones(mean_arr.shape), std_arr)
+    norm_arr = (arr - np.repeat(mean_arr[:, np.newaxis], series_length, 1)) \
+        / np.repeat(std_arr[:, np.newaxis], series_length, 1)
+    return norm_arr, mean_arr, std_arr
+
+
+def denormalise(norm_arr, mean_arr, std_arr):
+    series_length = norm_arr.shape[1]
+    denorm_arr = norm_arr * np.repeat(std_arr[:, np.newaxis], repeats=series_length, axis=1) \
+        + np.repeat(mean_arr[:, np.newaxis], repeats=series_length, axis=1)
+    return denorm_arr
+
+
+def blend_predictions(prediction_dict, num_values):
+    """Given the original series and dictionary of predictions indexed by the training length,
+    produce a blended prediction and store it in prediction[-1]."""
+
+    training_lengths = list(prediction_dict.keys())
+    prediction_dict[-1] = np.copy(prediction_dict[min(training_lengths)])
+    for series_length in training_lengths:
+        train_length_ok = np.logical_or(series_length == min(training_lengths), num_values >= series_length)
+        comb_prediction = np.mean(np.stack([prediction_dict[s] for s in training_lengths if s <= series_length], 2),
+                                  2)
+        prediction_dict[-1] = np.where(np.repeat(train_length_ok[:, np.newaxis], prediction_dict[-1].shape[1], 1),
+                                       comb_prediction, prediction_dict[-1])
\ No newline at end of file
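
As a quick way to exercise the helpers added above, the snippet below sanity-checks the metric and normalisation functions on toy data. It is an illustrative sketch, not part of the change itself: the toy arrays and the hand-made forecast values are invented for the example, and it assumes metrics.py and utilities.py are importable from the working directory.

import numpy as np

import metrics
from utilities import normalise, denormalise

# toy batch: two series of eight observations each (hypothetical values)
arr = np.array([[1., 2., 3., 4., 5., 6., 7., 8.],
                [10., 10., 12., 12., 14., 14., 16., 16.]])

# normalise works per series, returning the scaled batch plus each series' mean and std
norm, mean_arr, std_arr = normalise(arr)
assert np.allclose(denormalise(norm, mean_arr, std_arr), arr)  # denormalise inverts normalise

# score a hand-made two-step "forecast" for the first series against its last two values
insample, actual = arr[0, :-2], arr[0, -2:]
forecast = np.array([6.5, 7.5])
print(metrics.smape(actual, forecast))                   # sMAPE with the 200 factor, i.e. in percent
print(metrics.mase(insample, actual, forecast, freq=1))  # error scaled by the in-sample naive error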