diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/M4-methods.Rproj b/M4-methods.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/M4-methods.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/ThiyangaTalagala/m4competition_report.pdf b/ThiyangaTalagala/m4competition_report.pdf new file mode 100644 index 0000000..cd900c2 Binary files /dev/null and b/ThiyangaTalagala/m4competition_report.pdf differ diff --git a/ThiyangaTalagala/src/code_daily.R b/ThiyangaTalagala/src/code_daily.R new file mode 100644 index 0000000..4f93ed7 --- /dev/null +++ b/ThiyangaTalagala/src/code_daily.R @@ -0,0 +1,123 @@ +## ---- load-pkgs +library(tidyverse) +library(forecast) +library(Mcomp) +library(forecTheta) +# devtools::install_github("thiyangt/seer") +library(seer) +# devtools::install_github("robjhyndman/tsfeatures") +library(tsfeatures) +library(foreach) + +## ---- load-data +data(M4) +M4_daily <- subset(M4, "daily") + +## --- convert the time series into suitable msts object +M4_daily_msts <- lapply(M4_daily, function(temp){ + temp$x <- convert_msts(temp$x, "daily") + return(temp) +}) + +## ---- load-rmConstantSeries +M4_daily_constant_train <- sapply(M4_daily_msts, function(temp){ + ts1 <- temp$x + training <- head_ts(ts1, h=14) + if (is.constant(training)==TRUE){print(temp$st)} +}) +# D2085 + +# split the M4 daily series into training and test +names_m4_use_d <- names(M4_daily_rm) +set.seed(8) +index_test_d <- sample(names_m4_use_d, 226) +save(index_test_d, file="data/daily/index_test_d.rda") +M4_training_daily <- M4_daily_rm[!names(M4_daily_rm) %in% index_test_d] +length(M4_training_daily) # 4000 +save(M4_training_daily, file="data/daily/M4_training_daily.rda") +M4_test_daily <- M4_daily_rm[names(M4_daily_rm) %in% index_test_d] +length(M4_test_daily) #100 +save(M4_test_daily, file="data/daily/M4_test_daily.rda") + + +# simulation +set.seed(8) # +M4Dmstlets <- lapply(M4_daily_msts, sim_mstlbased, Future=TRUE, Nsim=10, extralength=14, Combine=FALSE, mtd="ets") + +set.seed(8)# +M4Dmstlarima <- lapply(M4_daily_msts, sim_mstlbased, Future=TRUE, Nsim=10, extralength=14, Combine=FALSE, mtd="arima") + + +# convert to msts object +set.seed(8) +M4Dets_msts <- lapply(M4_daily_msts, sim_mstlbased, Future=TRUE, Nsim=5, extralength=14, Combine=FALSE, mtd="ets") +M4simets_daily_msts <- lapply(M4Dets_msts, function(temp){ + lapply(temp, convert_msts, category="daily")}) + + +set.seed(8) +M4Darima_msts <- lapply(M4_daily_msts, sim_mstlbased, Future=TRUE, Nsim=5, extralength=14, Combine=FALSE, mtd="arima") +M4simarima_daily_msts <- lapply(M4Darima_msts, function(temp){ + lapply(temp, convert_msts, category="daily")}) + + +# Calculate features: M4-training +M4D_train <- lapply(M4_training_daily, function(temp){temp$x}) +featuresM4D_training <- cal_features(M4D_train, seasonal=TRUE, h=14, m=7, lagmax=8L, database="other", highfreq=TRUE) +# labels M4-training +data_train <- lapply(M4_training_daily, function(temp){temp$x}) +m4daily_train <- fcast_accuracy(data_train, + models=c("rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstlets","mstlarima", "tbats"), + database="other", + h=14, accuracyFun=cal_m4measures, length_out=2) +m0 <- m4daily_train$accuracy +ARIMA <- rep("arima", 4000) +ETS <- rep("ets", 4000) +acc_list <- list(accuracy=m0, ARIMA=ARIMA, ETS=ETS) +M4D_ms <- cal_medianscaled(acc_list) +M4D_training <- prepare_trainingset(accuracy_set=M4D_ms, + feature_set=featuresM4D_training) +m4dtraining <- M4D_training$trainingset + +# classlabel for simulated data +M4Dmstlets <- lapply(M4simets_daily_msts, fcast_accuracy, + models=c("rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstlets","mstlarima", "tbats"), + database="other", + h=14, accuracyFun=cal_m4measures, length_out=2) +m4d_accuracy <- lapply(M4Dmstlets, function(temp){temp$accuracy}) +m4d_mat <- do.call(rbind, m4d_accuracy) +accuracy <- m4d_mat +ARIMA <- rep("arima", 42270) +ETS <- rep("ets", 42270) +accsim_list_sim <- list(accuracy=accuracy, ARIMA=ARIMA, ETS=ETS) +M4D_msim <- cal_medianscaled(accsim_list_sim) + +# features - simulated data +M4D_ets <- lapply(M4Dmstlets, function(temp){ + lapply(temp, function(temp){convert_msts(temp, "daily")})}) +features_M4Dets <-lapply(M4D_ets, function(temp){ + lapply(temp, cal_features,seasonal=TRUE, h=14, m=7, lagmax=8L, database="other", highfreq=TRUE)}) +features_M4DS <- lapply(features_M4Dets, function(temp){ + do.call(rbind, temp) +}) +featuresM4DS <- data.table::rbindlist(features_M4DS, use.names = TRUE, fill = TRUE) +featuresM4DS <- as.data.frame(featuresM4DS) +dim(featuresM4DS) # 42270 27 +featuresM4DS$seasonal_strength1[is.na(featuresM4DS$seasonal_strength1)==TRUE] = + featuresM4DS$seasonality[is.na(featuresM4DS$seasonality)==FALSE] +featuresM4DS$seasonal_strength2[is.na(featuresM4DS$seasonal_strength2)==TRUE]=0 +dim(featuresM4DS) + +featuresM4DS <- featuresM4DS %>% dplyr::select(-dplyr::one_of("seasonality")) +featuresM4DS <- featuresM4DS %>% dplyr::select(-dplyr::one_of("seas_pacf")) + +M4Dsim_rf <- prepare_trainingset(accuracy_set=M4D_msim, + feature_set=featuresM4DS) + +# combine the data frames for daily series +daily_training <- dplyr::bind_rows(m4dtraining, M4D_training_sim) +daily_training <- daily_training %>% dplyr::select(-dplyr::one_of("seas_pacf")) +save(daily_training, file="data/daily_training.rda") + diff --git a/ThiyangaTalagala/src/code_hourly.R b/ThiyangaTalagala/src/code_hourly.R new file mode 100644 index 0000000..621fb0b --- /dev/null +++ b/ThiyangaTalagala/src/code_hourly.R @@ -0,0 +1,91 @@ +## ---- load-pkgs +library(tidyverse) +library(forecast) +library(Mcomp) +library(forecTheta) +# devtools::install_github("thiyangt/seer") +library(seer) +# devtools::install_github("robjhyndman/tsfeatures") +library(tsfeatures) +library(foreach) + +## ---- load-data +data(M4) +M4_hourly <- subset(M4, "hourly") + +## --- convert the time series into suitable msts object +M4_hourly_msts <- lapply(M4_hourly, function(temp){ + temp$x <- convert_msts(temp$x, "hourly") + return(temp) +}) + +## ---- load-rmConstantSeries +M4_hourly_constant_train <- sapply(M4_hourly_msts, function(temp){ + ts1 <- temp$x + training <- head_ts(ts1, h=48) + if (is.constant(training)==TRUE){print(temp$st)} +}) +# No hourly series with constant values + +# split the M4 hourly series into training and test +names_m4_use_h <- names(M4_hourly_msts) +set.seed(8) +index_test_h <- sample(names_m4_use_h, 64) +M4_training_hourly <- M4_hourly_msts[!names(M4_hourly_msts) %in% index_test_h] +M4_test_hourly <- M4_hourly_msts[names(M4_hourly_msts) %in% index_test_h] + + +# simulation +set.seed(8) +M4Hmstlets <- lapply(M4_hourly_msts, sim_mstlbased, Future=TRUE, Nsim=10, extralength=48, Combine=FALSE, mtd="ets") +set.seed(8) +M4Hmstlarima <- lapply(M4_hourly_msts, sim_mstlbased, Future=TRUE, Nsim=10, extralength=48, Combine=FALSE, mtd="arima") + +# convert simulate ts to msts +M4Hmstlarima_msts <- lapply(M4Hmstlarima, function(temp){ + lapply(temp, function(temp){convert_msts(temp, "hourly")})}) + +# features +M4H_train <- lapply(M4_training_hourly, function(temp){temp$x}) +featuresM4H_training <- cal_features(M4H_train, seasonal=TRUE, h=48, m=24, lagmax=25L, database="other", highfreq=TRUE) + +features_m4hmstlets <- lapply(M4Hmstlarima_msts, function(temp){ + lapply(temp, cal_features,seasonal=TRUE, h=48, m=24, lagmax=25L, database="other", highfreq=TRUE)}) +features_M4H_mstl <- lapply(features_m4hmstlets, function(temp){ + do.call(rbind, temp) +}) +features_M4Hmstl_DF <- do.call(rbind, features_M4H_mstl) + +# Class label +data_train <- lapply(M4_training_hourly, function(temp){temp$x}) +M4Htraining_label <- fcast_accuracy(data_train, + models=c("rw", "rwd", "wn", "stlar", "nn", "snaive", "mstlets","mstlarima", "tbats"), + database="other", + h=48, accuracyFun=cal_m4measures, length_out=2) +m0 <- M4Htraining_label$accuracy +ARIMA <- rep("arima", 350) +ETS <- rep("ets", 350) +acc_list <- list(accuracy=m0, ARIMA=ARIMA, ETS=ETS) +M4H_ms <- cal_medianscaled(acc_list) +M4H_training <- prepare_trainingset(accuracy_set=M4H_ms, + feature_set=featuresM4H_training) + +# simulated series +m4hets <- lapply(M4Hmstlets, fcast_accuracy, + models=c("rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstlets","mstlarima", "tbats"), + database="other", + h=48, accuracyFun=cal_m4measures, length_out=2) +accuracy <- m4hets$accuracy +ARIMA <- rep("arima", 4140) +ETS <- rep("ets", 4140) +accsim_list <- list(accuracy=accuracy, ARIMA=ARIMA, ETS=ETS) +M4H_msim <- cal_medianscaled(accsim_list) +M4H_training_sim <- prepare_trainingset(accuracy_set=M4H_msim, + feature_set=features_M4Hmstl_DF) + + +# combine dataframes +hourly_training <- dplyr::bind_rows(M4H_training$trainingset, M4H_training_sim$trainingset) +save(hourly_training, file="data/hourly_training.rda") + diff --git a/ThiyangaTalagala/src/code_monthly.R b/ThiyangaTalagala/src/code_monthly.R new file mode 100644 index 0000000..ee6bd8a --- /dev/null +++ b/ThiyangaTalagala/src/code_monthly.R @@ -0,0 +1,142 @@ +## ---- load-pkgs +library(tidyverse) +library(forecast) +library(Mcomp) +library(forecTheta) +# devtools::install_github("thiyangt/seer") +library(seer) +# devtools::install_github("robjhyndman/tsfeatures") +library(tsfeatures) +library(foreach) + +## ---- load-data +M1_monthly <- subset(M1, "monthly") +M3_monthly <- subset(M3, "monthly") +data(M4) +M4_monthly <- subset(M4, "monthly") + +## ---- load-rmConstantSeries +M4_monthly_constant_train <- sapply(M4_monthly, function(temp){ + ts1 <- temp$x + training <- head_ts(ts1, h=18) + if (is.constant(training)==TRUE){print(temp$st)} +}) +## for monthly data there is no series with constant values + +## ---- extract series number + +sn <- sapply(M4_monthly, function(temp){temp$st}) +set.seed("27-4-2018") +index_test <- sample(sn, 1000) +M4_monthly_training <- M4_monthly[ !names(M4_monthly) %in% index_test] +length(M4_monthly_training) # 47000 +save(M4_monthly_training, file="data/monthly/M4_monthly_training.rda") + +M4_monthly_test <- M4_monthly[names(M4_monthly) %in% index_test] +length(M4_monthly_test) #1000 +save(M4_monthly_test, file="data/monthly/M4_monthly_test.rda") + +# ---- simulation +data(M4) +m4_monthly <- subset(M4, "monthly") +set.seed(8) +M4MAS <- lapply(m4_monthly, sim_etsbased, Future=TRUE, Nsim=5, extralength=18, Combine=FALSE) + +M4MES <- lapply(m4_monthly, sim_arimabased, Future=TRUE, Nsim=5, extralength=18, Combine=FALSE) + + +# ---- classlabel +## M1-monthly +classlabelM1M <- fcast_accuracy(monthly_m1, + models=c("ets", "arima", "rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstl", "tbats"), + database="Mcomp", + h=18, accuracyFun=cal_m4measures, length_out = 2) + +## M3-monthly +classlabelM3M <- fcast_accuracy(monthly_m3, + models=c("ets", "arima", "rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstl", "tbats"), + database="Mcomp", + h=18, accuracyFun=cal_m4measures, length_out = 2) + +## M4-monthly-training +M4Mtraining_fcast_accuracy <- fcast_accuracy(M4_monthly_training, + models=c("ets", "arima", "rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstl", "tbats"), + database="other", + h=18, accuracyFun=cal_m4measures, length_out = 2) + + +## M4-monthly_simulate based on ARIMA +M4MAS_fcast_accuracy <- lapply(M4MAS, fcast_accuracy, + models=c("ets", "arima", "rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstl", "tbats"), + database="other", + h=18, accuracyFun=cal_m4measures, length_out=2) + +## M4-monthly_simulate based on ETS +m4_monthly_training <- load("M4/data/M4MES.rda") # length 48000 +M4MES_fcast_accuracy <- lapply(data, fcast_accuracy, + models=c("ets", "arima", "rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstl", "tbats"), + database="other", + h=18, accuracyFun=cal_m4measures, length_out=2) + +# ---- features +## M1-monthly +featuresM1M <- cal_features(M1_monthly, seasonal=TRUE, h=18, m=12, lagmax=13L, database="M1", highfreq = FALSE) + +## M3-monthly +featuresM3M <- cal_features(M3_monthly, seasonal=TRUE, h=18, m=12, lagmax=13L, database="M3", highfreq = FALSE) + +## M4-monthly(training) +M4M_training <- lapply(M4_monthly_training, function(temp){temp$x}) +featuresM4M_training <- cal_features(M4M_training, seasonal=TRUE, h=18, m=12, lagmax=13L, database="other", highfreq = FALSE) + +# calculate features on simulated data - simulated based on ARIMA +featuresM4MSA <- lapply(M4MAS, function(temp){ + lapply(temp, cal_features,seasonal=TRUE, h=18, m=12, lagmax=13L, database="other", highfreq=FALSE)}) + +# calculate features on simulated data - simulated based on ETS +featuresM4MSE <- lapply(M4MES, function(temp){ + lapply(temp, cal_features,seasonal=TRUE, h=18, m=12, lagmax=13L, database="other", highfreq=FALSE)}) + + +# processing class labels for monthly data + +# M1 series +classlabelM1M_ms <- cal_medianscaled(classlabelM1M) +m1m_df <- prepare_trainingset(accuracy_set = classlabelM1M_ms, featuresM1M) +m1m_df_training <- m1m_df$training + +# M3 series +classlabelM3M_ms <- cal_medianscaled(classlabelM3M) +m3m_df <- prepare_trainingset(accuracy_set = classlabelM3M_ms, featuresM3M) +m3m_df_training <- m3m_df$training + +# M4-training set +M4Mtraining_fcast_accuaracy_ms <- cal_medianscaled(M4Mtraining_fcast_accuaracy) +features_training <- featuresM4M_training[1:44650, ] +M4M_trainingset <- prepare_trainingset(accuracy_set=M4Mtraining_fcast_accuaracy_ms, + feature_set=features_training) +M4M_training <- M4M_trainingset$trainingset + +# M4 - simulate based on ARIMA +M4MAS_fcast_accuaracy_ms <- cal_medianscaled(M4MAS_fcast_accuaracy) +features_M4MSA <- lapply(featuresM4MSA, function(temp){ + do.call(rbind, temp) +}) +features_M4MSA_DF <- do.call(rbind, features_M4MSA) +M4MAS_rfset_arima <- prepare_trainingset(accuracy_set=M4MAS_fcast_accuaracy_ms, + feature_set=features_M4MSA_DF) + +M4MAS_rfset <- M4MAS_rfset_arima$trainingset + +# training set +monthly_training <- dplyr::bind_rows(m1m_df_training, m3m_df_training) +monthly_training <- dplyr::bind_rows(monthly_training,M4M_training) +monthly_training <- dplyr::bind_rows(monthly_training, M4MAS_rfset) +save(monthly_training, file="data/monthly_training.rda") + + diff --git a/ThiyangaTalagala/src/code_quarterly.R b/ThiyangaTalagala/src/code_quarterly.R new file mode 100644 index 0000000..70970e2 --- /dev/null +++ b/ThiyangaTalagala/src/code_quarterly.R @@ -0,0 +1,166 @@ +## ---- load-pkgs +library(tidyverse) +library(Mcomp) +library(forecast) +library(forecTheta) +# devtools::install_github("thiyangt/seer") +library(seer) +# devtools::install_github("robjhyndman/tsfeatures") +library(tsfeatures) + +## ---- load-data +M1_quarterly <- subset(M1, "Quarterly") +M3_quarterly <- subset(M3, "Quarterly") +data(M4) +M4_quarterly <- subset(M4, "Quarterly") + +# ---- summary of length of series +M4_length <- sapply(M4_quarterly, function(temp){length(temp$x)}) +table(M4_length) # min-16 and max - 866 + +length_16 <- as.vector(which(M4_length==16)) + +## ---- load-rmConstantSeries +M4_quarterly_constant_train <- sapply(M4_quarterly, function(temp){ + ts1 <- temp$x + training <- head_ts(ts1, h=8) + if (is.constant(training)==TRUE){print(temp$st)} +}) + +# constant series: Q5619 +# plot(M4[["Q5619"]]$x) +M4_quarterly_rm <- M4_quarterly[-c(5619, length_16)] +length(M4_quarterly_rm) # 23998 + +# split the M4 series into training and test +names_m4_use_q <- names(M4_quarterly_rm) +set.seed(8) +index_test_q <- sample(names_m4_use_q, 1000) +M4_training_quarterly <- M4_quarterly_rm[!names(M4_quarterly_rm) %in% index_test_q] +M4_test_quarterly <- M4_quarterly_rm[names(M4_quarterly_rm) %in% index_test_q] + +# Simulate based on ARIMA +set.seed(8) +M4QAS <- lapply(data, sim_arimabased, Future=TRUE, Nsim=5, extralength=8, Combine=FALSE) + +# Simulate based on ETS +set.seed(8) +M4QES <- lapply(data, sim_etsbased, Future=TRUE, Nsim=5, extralength=8, Combine=FALSE) + + +## ---- load-classlabelM1Q +classlabelM1Q <- fcast_accuracy(quarterly_m1, + models=c("ets", "arima", "rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstl", "tbats"), + database="Mcomp", + h=8, accuracyFun=cal_m4measures, length_out=2) + +## ---- load-classlabelM3Q +classlabelM3Q <- fcast_accuracy(quarterly_m3, + models=c("ets", "arima", "rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstl", "tbats"), + database="Mcomp", + h=8, accuracyFun=cal_m4measures, length_out=2) + +## --- class label M4Q(training) +data_train <- lapply(M4_training_quarterly, function(temp){temp$x}) +M4Qtraininglab <- fcast_accuracy(data_train, + models=c("ets", "arima", "rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstl", "tbats"), + database="other", + h=8, accuracyFun=cal_m4measures) + +## ---- load-classlabelM4QAS +M4Qtraining_fcast_accuaracy <- lapply(M4QAS, fcast_accuracy, + models=c("ets", "arima", "rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstl", "tbats"), + database="other", + h=8, accuracyFun=cal_m4measures, length_out=2) + +## ---- load-classlabelM4QES +M4QES_fcast_accuaracy <- lapply(M4QES, fcast_accuracy, + models=c("ets", "arima", "rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstl", "tbats"), + database="other", + h=8, accuracyFun=cal_m4measures, length_out=2) + +## ---- load-classlabelM4Q(test) +data_test <- lapply(M4_test_quarterly, function(temp){temp$x}) +M4Q_test_label <- fcast_accuracy(data_test, + models=c("ets", "arima", "rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstl", "tbats"), + database="other", + h=8, accuracyFun=cal_m4measures, length_out=2) + +## ---- calculate features +# M1Q +featuresM1Q <- cal_features(M1_quarterly, seasonal=TRUE, h=8, m=4, lagmax=5L, database="M1") + +# M3Q +featuresM3Q <- cal_features(M3_quarterly, seasonal=TRUE, h=8, m=4, lagmax=5L, database="M1") + +# M4Q - training +M4Q_train <- lapply(M4_training_quarterly, function(temp){temp$x}) +featuresM4Q_taining <- cal_features(M4Q_train, seasonal=TRUE, h=8, m=4, lagmax=5L, database="other") + +# M4Q - test +M4Q_test <- lapply(M4_test_quarterly, function(temp){temp$x}) +featuresM4Q_test <- cal_features(M4Q_test, seasonal=TRUE, h=8, m=4, lagmax=5L, database="other") + +# M4Q - simulate based on ETS-features +featuresM4QSE <- lapply(M4QES, function(temp){ + lapply(temp, cal_features,seasonal=TRUE, h=8, m=4, lagmax=5L, database="other")}) + +# M4Q - simulate based on ARIMA-featues +featuresM4QSA <- lapply(M4QAS, function(temp){ + lapply(temp, cal_features,seasonal=TRUE, h=8, m=4, lagmax=5L, database="other")}) + + +# Random forest data preparation +## based on m1 +classlabelM1Q_ms <- cal_medianscaled(classlabelM1Q) +m1q_df <- prepare_trainingset(accuracy_set = classlabelM1Q_ms, featuresM1Q) +m1q_df_training <- m1q_df$training + +## based on m3 +classlabelM3Q_ms <- cal_medianscaled(classlabelM3Q) +m3q_df <- prepare_trainingset(classlabelM3Q_ms, featuresM3Q) +m3q_df_training <- m3q_df$training + +# M4 training +M4Qtraining_fcast_accuaracy_ms <- cal_medianscaled(M4Qtraining_fcast_accuaracy) +M4Q_trainingset <- prepare_trainingset(accuracy_set=M4Qtraining_fcast_accuaracy_ms, + feature_set=featuresM4Q_taining) +M4Q_training <- M4Q_trainingset$trainingset + + +# M4- arima based +M4QAS_fcast_accuaracy_ms <- cal_medianscaled(M4QAS_fcast_accuaracy) +features_M4QSA <- lapply(featuresM4QSA, function(temp){ + do.call(rbind, temp) +}) +features_M4QSA_DF <- do.call(rbind, features_M4QSA) +M4QAS_rfset <- prepare_trainingset(accuracy_set=M4QAS_fcast_accuaracy_ms, + feature_set=features_M4QSA_DF) +M4QAS_training <- M4QAS_rfset$trainingset + + +# M4 ets based +M4QES_fcast_accuaracy_ms <- cal_medianscaled(M4QES_fcast_accuaracy) +features_M4QSE <- lapply(featuresM4QSE, function(temp){ + do.call(rbind, temp) +}) +features_M4QSE_DF <- do.call(rbind, features_M4QSE) +M4QES_rfset <- prepare_trainingset(accuracy_set=M4QES_fcast_accuaracy_ms, + feature_set=features_M4QSE_DF) +M4QES_training <- M4QES_rfset$trainingset + +# Combine dataframes--------------------------------- +quarterly_training <- dplyr::bind_rows(m1q_df_training, m3q_df_training) +quarterly_training <- dplyr::bind_rows(quarterly_training,M4Q_training) +quarterly_training <- dplyr::bind_rows(quarterly_training, M4QAS_training) +quarterly_training <- dplyr::bind_rows(quarterly_training, M4QES_training) +dim(quarterly_training) #263957 31 (120000*2+75+22998+203) +table(quarterly_training$classlabels) +save(quarterly_training, file="data/quarterly_training.rda") + diff --git a/ThiyangaTalagala/src/code_reproducibility.R b/ThiyangaTalagala/src/code_reproducibility.R new file mode 100644 index 0000000..987d2bd --- /dev/null +++ b/ThiyangaTalagala/src/code_reproducibility.R @@ -0,0 +1,168 @@ +# Note ########################################################### +# +# The R package: seer is built to run the all codes. The seer +# package is available at: https://github.com/thiyangt/seer +# +# +# Note: download the data folder from: https://github.com/thiyangt/M4Competition/tree/master/data +# Preparation of these training files ae computationally expensive hence file are uploaded +# to the above mentioned git repository. Further, codes to construct this training data files are +# available at: src floder of the git repository at: https://github.com/thiyangt/M4Competition +# +# Before you start download the data folder from: +# https://github.com/thiyangt/M4Competition/tree/master/data +# +# To generate the files according to the format specify in the competition download the +# template folder at: https://github.com/thiyangt/M4Competition +################################################################## + +#---- load and install necessary packages +library(tidyverse) +library(forecast) +library(Mcomp) +library(forecTheta) +library(foreach) +library(readr) +devtools::install_github("robjhyndman/tsfeatures") +library(tsfeatures) +devtools::install_github("thiyangt/seer") +library(seer) + +#---- load data +## load M4 competition data +data(M4) +yearly_m4 <- subset(M4, "yearly") +quarterly_m4 <- subset(M4, "quaterly") +monthly_m4 <- subset(M4, "monthly") +weekly_m4 <- subset(M4, "weekly") +daily_m4 <- subset(M4, "daily") +hourly_m4 <- subset(M4, "hourly") + +# load training data files +y_train <- load(file="data/yearly_training.rda") +q_train <- load(file="data/quarterly_training.rda") +m_train <- load(file="data/monthly_training.rda") +w_train <- load(file="data/weekly_training.rda") +d_train <- load(file="data/daily_training.rda") +h_train <- load(file="data/hourly_training.rda") + +#---- generate foecasts for yearly series +features_M4Y<- seer::cal_features(yearly_m4, database="M4", highfreq = FALSE) +yearly_rcp <- seer::build_rf(yearly_training, features_M4Y, rf_type="rcp", ntree=1000, seed=1) +predictions_yearlym4_rcp <- yearly_rcp$predictions +m4yearly_forecast <- seer::rf_forecast(predictions_yearlym4_rcp, + yearly_m4, "M4", "cal_MASE", h=6, + accuracy = FALSE) + +na_y <- matrix(NA, ncol=42, nrow=23000) +ymean <- cbind(m4yearly_forecast$mean, na_y) +ylower <- cbind(m4yearly_forecast$lower, na_y) +yupper <- cbind(m4yearly_forecast$upper, na_y) + + +#---- generate foecasts for quarterly series +features_M4Q<- seer::cal_features(quarterly_m4, seasonal=TRUE, m=4,lagmax=5L, + database="M4", highfreq = FALSE) +quarterly_rcp <- seer::build_rf(quarterly_training, features_M4Q, rf_type="rcp", ntree=1000, seed=1) +predictions_quarterlym4_rcp <- quarterly_rcp$predictions +m4quarterly_forecast <- rf_forecast(predictions_quarterlym4_rcp, + quarterly_m4, "M4", "cal_MASE", h=8, + accuracy = FALSE) + +na_q <- matrix(NA, ncol=40, nrow=24000) +qmean <- cbind(m4quarterly_forecast$mean, na_q) +qlower <- cbind(m4quarterly_forecast$lower, na_q) +qupper <- cbind(m4quarterly_forecast$upper, na_q) + +#---- generate foecasts for monthly series +features_M4M<- seer::cal_features(monthly_m4, seasonal=TRUE, m=12,lagmax=13L, + database="M4", highfreq = FALSE) +monthly_rcp <- seer::build_rf(monthly_training, features_M4M, rf_type="rcp", ntree=1000, seed=1) +predictions_monthlym4_rcp <- monthly_rcp$predictions +m4monthly_forecast <- seer::rf_forecast(predictions_monthlym4_rcp, monthly_m4, "M4", "cal_MASE", h=18, + accuracy = FALSE) + +na_m <- matrix(NA, ncol=30, nrow=48000) +mmean <- cbind(m4monthly_forecast$mean, na_m) +mlower <- cbind(m4monthly_forecast$lower, na_m) +mupper <- cbind(m4monthly_forecast$upper, na_m) + +#---- generate foecasts for weekly series +features_M4W <- seer::cal_features(weekly_m4, seasonal=TRUE, m=52,lagmax=53L, + database="M4", highfreq = FALSE) +weekly_rcp <- seer::build_rf(weekly_training, features_M4W, rf_type="rcp", ntree=1000, seed=1) +predictions_weeklym4_rcp <- weekly_rcp$predictions +m4weekly_forecast <- seer::rf_forecast(predictions_weeklym4_rcp, + weekly_m4, "M4", "cal_MASE", h=13, + accuracy = FALSE) +na_w <- matrix(NA, ncol=35, nrow=359) +wmean <- cbind(m4weekly_forecast$mean, na_w) +wlower <- cbind(m4weekly_forecast$lower, na_w) +wupper <- cbind(m4weekly_forecast$upper, na_w) + + +#---- generate foecasts for daily series +## convert data into msts object +dailym4_msts <- lapply(daily_m4, function(temp){ + temp$x <- convert_msts(temp$x, "daily") + return(temp) +}) +features_M4D <- seer::cal_features(dailym4_msts, seasonal=TRUE, m=7,lagmax=8L, + database="M4", highfreq = TRUE) +daily_rcp <- seer::build_rf(daily_training, features_M4D, rf_type="rcp", ntree=1000, seed=1) +predictions_dailym4_rcp <- daily_rcp$predictions +m4daily_forecast <- seer::rf_forecast(predictions_dailym4_rcp, dailym4_msts, "M4", "cal_MASE", h=14, + accuracy = FALSE) + +na_d <- matrix(NA, ncol=34, nrow=4227) +dmean <- cbind(m4daily_forecast$mean, na_d) +dlower <- cbind(m4daily_forecast$lower, na_d) +dupper <- cbind(m4daily_forecast$upper, na_d) + + +#---- generate foecasts for hourly series +## convert data into msts object +hourlym4_msts <- lapply(hourly_m4, function(temp){ + temp$x <- convert_msts(temp$x, "hourly") + return(temp) +}) +features_M4H <- seer::cal_features(hourlym4_msts, seasonal=TRUE, m=24,lagmax=25L, + database="M4", highfreq = TRUE) +hourly_rcp <- seer::build_rf(hourly_training, features_M4H, rf_type="rcp", ntree=1000, seed=1) +predictions_hourlym4_rcp <- hourly_rcp$predictions +m4hourly_forecast <- rf_forecast(predictions_hourlym4_rcp, + hourlym4_msts, "M4", "cal_MASE", h=48, + accuracy = FALSE) +hmean <- m4hourly_forecast$mean +hlower <- m4hourly_forecast$lower +hupper <- m4hourly_forecast$upper + +#---- processing final csv files +mean_m4 <- rbind(ymean, qmean, mmean, wmean, dmean, hmean) +lower_m4 <- rbind(ylower, qlower, mlower, wlower, dlower, hlower) +upper_m4 <- rbind(yupper, qupper, mupper, wupper, dupper, hupper) + +mean_m4[mean_m4 < 0] <- 0 +lower_m4[lower_m4 < 0] <- 0 +upper_m4[upper_m4 < 0] <- 0 + +#---- reading the column names and rownames from the template +# before running download the template folder at +template <- read.csv(file = "template/template_Naive.csv") +col_name <- colnames(template) +id <- template$id +colnames(mean_m4) <- col_name[2:49] +colnames(lower_m4) <- col_name[2:49] +colnames(upper_m4) <- col_name[2:49] + +mean_m4 <- data.frame(id , mean_m4) +write.csv(mean_m4, file="mean_m4.csv", + row.names = FALSE) + +lower_m4 <- data.frame(id , lower_m4) +write.csv(lower_m4, file="lower_m4.csv", + row.names = FALSE) + +upper_m4 <- data.frame(id , upper_m4) +write.csv(upper_m4, file="upper_m4.csv", + row.names = FALSE) \ No newline at end of file diff --git a/ThiyangaTalagala/src/code_weekly.R b/ThiyangaTalagala/src/code_weekly.R new file mode 100644 index 0000000..ed63006 --- /dev/null +++ b/ThiyangaTalagala/src/code_weekly.R @@ -0,0 +1,94 @@ +## ---- load-pkgs +library(tidyverse) +library(forecast) +library(Mcomp) +library(forecTheta) +# devtools::install_github("thiyangt/seer") +library(seer) +# devtools::install_github("robjhyndman/tsfeatures") +library(tsfeatures) +library(foreach) + +## ---- load-data +data(M4) +M4_weekly <- subset(M4, "weekly") + +## ---- load-rmConstantSeries +M4_weekly_constant_train <- sapply(M4_weekly, function(temp){ + ts1 <- temp$x + training <- head_ts(ts1, h=13) + if (is.constant(training)==TRUE){print(temp$st)} +}) +## for weekly data there is no series with constant values + +# split the M4 series into training and test +names_m4_use_w <- names(M4_weekly) +set.seed(8) +index_test_w <- sample(names_m4_use_w, 100) +save(index_test_w, file="data/weekly/index_test_w.rda") +M4_training_weekly <- M4_weekly[!names(M4_weekly) %in% index_test_w] +length(M4_training_weekly) # 259 +save(M4_training_weekly, file="data/weekly/M4_training_weekly.rda") +M4_test_weekly <- M4_weekly[names(M4_weekly) %in% index_test_w] +length(M4_test_weekly) #100 +save(M4_test_weekly, file="data/weekly/M4_test_weekly.rda") + + +# ---- simulation +M4Wmstl <- lapply(M4_hourly, sim_mstlbased, Future=TRUE, Nsim=10, extralength=13, Combine=FALSE) + +# ---- classlabel +## M4-weekly +data_train_m4W <- lapply(M4_training_weekly, function(temp){temp$x}) +classlabelM4W <- fcast_accuracy(data_train_m4W, + models=c("arima", "rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstl", "tbats"), + database="other", + h=13, accuracyFun=cal_m4measures) + +## M4-weekly_simulate based on mstl +naomit_list <- function(y) { return(y[!sapply(y, function(x) all(is.na(x)))]) } +M4Wmstl_nafree <- naomit_list(M4Wmstl) +classlabelM4WSmstl <- lapply(M4Wmstl_nafree, fcast_accuracy, + models=c("ets", "arima", "rw", "rwd", "wn", + "theta", "stlar", "nn", "snaive", "mstl", "tbats"), + database="other", + h=13, accuracyFun=cal_m4measures,length_out = 2) + +# ---- features +M4W_train <- lapply(M4_training_weekly, function(temp){temp$x}) +featuresM4W_training <- cal_features(M4W_train, seasonal=TRUE, h=13, m=52, lagmax=53L, database="other", highfreq=FALSE) + +## M4-weekly_simulate based on mstlETS +featuresM4Wmstl <- lapply(M4Wmstl_nafree, function(temp){ + lapply(temp, cal_features,seasonal=TRUE, h=13, m=52, lagmax=53L, database="other", highfreq=FALSE)}) +features_M4WS <- lapply(featuresM4Wmstl, function(temp){ + do.call(rbind, temp) +}) +features_M4WS_DF <- do.call(rbind, features_M4WS) +features_M4WS_DF <- within(features_M4WS_DF, rm(hwalpha, hwbeta, hwgamma)) + +# Random forest preparation +# M4 data +classlabelM4W_ms <- cal_medianscaled(classlabelM4W) +classlabelM4W_ms$ETS <- rep("ets", 259) +featuresM4W_training <- within(featuresM4W_training, rm(hwalpha, hwbeta, hwgamma, sediff_seacf1)) +M4W_rfset <- prepare_trainingset(accuracy_set=classlabelM4W_ms, + feature_set=featuresM4W_training) + +# simulated data +acc <- lapply(classlabelM4WSmstl, function(temp){temp$accuracy}) +acc_mat <- do.call(rbind, acc) +w_arima <- lapply(classlabelM4WSmstl, function(temp){temp$ARIMA}) +arima_mat <- do.call(rbind, w_arima) +w_ets <- lapply(classlabelM4WSmstl, function(temp){temp$ETS}) +ets_mat <- do.call(rbind, w_ets) +acc_info <- list(accuracy=acc_mat, ARIMA=arima_mat, ETS=ets_mat) +classlabelM4WSmstl_ms <- cal_medianscaled(acc_info) +classlabelM4WSmstl_ms$ETS <- rep("ets", 2940) +M4WS_rfset <- prepare_trainingset(accuracy_set=classlabelM4WSmstl_ms, + feature_set=features_M4WS_DF) + +weekly_training <- dplyr::bind_rows(M4W_rfset$trainingset, M4WS_rfset$trainingset) +dim(weekly_training) + diff --git a/ThiyangaTalagala/src/code_yearly.R b/ThiyangaTalagala/src/code_yearly.R new file mode 100644 index 0000000..8284ab6 --- /dev/null +++ b/ThiyangaTalagala/src/code_yearly.R @@ -0,0 +1,186 @@ +## ---- load-pkgs +library(tidyverse) +library(forecast) +library(Mcomp) +library(forecTheta) +# devtools::install_github("thiyangt/seer") +library(seer) +# devtools::install_github("robjhyndman/tsfeatures") +library(tsfeatures) +library(foreach) + +## ---- load-data +M1_yearly <- subset(M1, "Yearly") +M3_yearly <- subset(M3, "Yearly") +data(M4) +M4_yearly <- subset(M4, "Yearly") + +## ---- load-rmConstantSeries +M4_yearly_constant_train <- sapply(M4_yearly, function(temp){ + ts1 <- temp$x + training <- head_ts(ts1, h=6) + if (is.constant(training)==TRUE){print(temp$st)} +}) + +# Y12146 +# Y21168 + +M4_yearly_rm <- M4_yearly[-c(12146, 21168)] +length(M4_yearly_rm) # 22998 + +# split the M4 series into training and test +names_m4_use_y <- names(M4_yearly_rm) +set.seed(8) +index_test_y <- sample(names_m4_use_y, 1000) +M4_training_yearly <- M4_yearly_rm[!names(M4_yearly_rm) %in% index_test_y] +M4_test_yearly <- M4_yearly_rm[names(M4_yearly_rm) %in% index_test_y] + + +# simulate- arima based +set.seed(8) +M4YSimARIMA <- lapply(M4_yearly, sim_arimabased, Future=TRUE, Nsim=10, extralength=6, Combine=FALSE) +length(M4YSimARIMA) # 23000 + +# simulate- ets based +set.seed(8) +M4YSimETS <- lapply(M4_yearly, sim_etsbased, Future=TRUE, Nsim=10, extralength=6, Combine=FALSE) +length(M4YSimETS) + +## ---- load-calculate forecast accuracy measures +classlabelM1Y <- fcast_accuracy(M1_yearly, + models=c("ets","arima", "rw", "rwd", "wn", "theta", "nn"), + database="Mcomp", + h=6, accuracyFun=cal_m4measures, length_out = 2) + +## ---- load-classlabelM3Y +classlabelM3Y <- fcast_accuracy(M3_yearly, + models=c("ets","arima", "rw", "rwd", "wn", "theta", "nn"), + database="Mcomp", + h=6, accuracyFun=cal_m4measures, length_out=2) + +## ---- load - classlabelM4Y-training +data_train_m4y <- lapply(M4_training_yearly, function(temp){temp$x}) +classlabelM4Y_train <- fcast_accuracy(data_train_m4y, + models=c("ets","arima", "rw", "rwd", "wn", "theta", "nn"), + database="other", + h=6, accuracyFun=cal_m4measures, length_out=2) + + +## ---- load - simulatebased on ARIMA (run on Monash Computer Cluster) +classlabelM4YSA <- lapply(M4YSimARIMA, fcast_accuracy, + models=c("ets","arima", "rw", "rwd", "wn", "theta", "nn"), + database="other", + h=6, accuracyFun=cal_m4measures) + +## ---- load - simulatebased on ETS (run on Monash Computer Cluster) +classlabelM4YSE <- lapply(M4YSimETS, fcast_accuracy, + models=c("ets","arima", "rw", "rwd", "wn", "theta", "nn"), + database="other", + h=6, accuracyFun=cal_m4measures) + + +## ---- calculate features +# M1Y +featuresM1Y <- cal_features(M1_yearly,h=6,database="M1", highfreq = FALSE) + +# M3Y +featuresM3Y <- cal_features(M3_yearly, h=6,database="M3", highfreq = FALSE) + +# M4Y - training +M4Y_train <- lapply(M4_training_yearly, function(temp){temp$x}) +featuresM4Y_training <- cal_features(M4Y_train, h=6, database="other", highfreq = FALSE) + +# M4Y - simulate based on ETS +featuresM4YSE <- lapply(M4YSimETS, function(temp){ + lapply(temp, cal_features, h=6, database="other", highfreq=FALSE)}) + +# M4Y - simulate based on ARIMA +featuresM4YSA <- lapply(M4YSimARIMA, function(temp){ + lapply(temp, cal_features, h=6, database="other", highfreq=FALSE)}) + + +# prepare data for random forest training +## M1 +classlabel_m1y <- cal_meadianscaled(classlabelM1Y) +m1_df <- prepare_trainingset(classlabel_m1y, featuresM1Y) +m1_df_training <- m1_df$trainingset + +## M3 +classlabel_m3y <- cal_meadianscaled(classlabelM3Y) +m3_df <- prepare_trainingset(classlabel_m3y, M3yearly_features) +m3_df_training <- m3_df$trainingset + + +## M4 - training +classlabel_m4y <- cal_meadianscaled(classlabelM4Y_train) +m4_df <- prepare_trainingset(classlabel_m4y, featuresM4Y_training) +m4_df_training <- m4_df$trainingset + +## m4 simulation-arima based +features_M4YSA <- lapply(featuresM4YSA, function(temp){ + do.call(rbind, temp) +}) +features_M4YSA_DF <- do.call(rbind, features_M4YSA) +accuracym4ysa <- lapply(classlabelM4YSA, function(temp){ + ACCURACY <- temp$accuracy +}) +accuracym4ysa_mat <- do.call(rbind, accuracym4ysa) +arimam4ysa <- lapply(classlabelM4YSA, function(temp){ + temp$ARIMA +}) +arimam4ysa <- unlist(arimam4ysa) +etsm4ysa <- lapply(classlabelM4YSA, function(temp){ + temp$ETS +}) +etsm4ysa <- unlist(etsm4ysa) +accuracy_m4ysa <- list(accuracy=accuracym4ysa_mat, ARIMA=arimam4ysa, + ETS=etsm4ysa) +classlabel_m4ysa <- cal_meadianscaled(accuracy_m4ysa) +m4ysa_df <- prepare_trainingset(classlabel_m4ysa, features_M4YSA_DF) +m4ysa_df_training <- m4ysa_df$trainingset + +## m4-ets based +# the places with non-matrix +M4_yearly_ets <- sapply(classlabelM4YSE, function(temp){ + if (class(temp$accuracy)!="matrix"){print(parent.frame()$i[])} +}) +# Y21312 +# remove Y21312 from features and classlabels +featuresM4YSE2 <- featuresM4YSE[-21312] +length(featuresM4YSE) # 23000 +length(featuresM4YSE2) #22999 +classlabelM4YSE2 <- classlabelM4YSE[-21312] +length(classlabelM4YSE2) # 22999 +length(classlabelM4YSE) # 23000 + +# print matrix with NA +features_M4YSE <- lapply(featuresM4YSE2, function(temp){ + do.call(rbind, temp) +}) +features_M4YSE_DF <- do.call(rbind, features_M4YSE) +accuracym4yse <- lapply(classlabelM4YSE2, function(temp){ + ACCURACY <- temp$accuracy +}) +accuracym4yse_mat <- do.call(rbind, accuracym4yse) +arimam4yse <- lapply(classlabelM4YSE2, function(temp){ + temp$ARIMA +}) +arimam4yse <- unlist(arimam4yse) +etsm4yse <- lapply(classlabelM4YSE2, function(temp){ + temp$ETS +}) +etsm4yse <- unlist(etsm4yse) +accuracy_m4yse <- list(accuracy=accuracym4yse_mat, ARIMA=arimam4yse, + ETS=etsm4yse) +classlabel_m4yse <- cal_meadianscaled(accuracy_m4yse) +m4yse_df <- prepare_trainingset(classlabel_m4yse, features_M4YSE_DF) +m4yse_df_training <- m4yse_df$trainingset + +# Combine dataframes--------------------------------- +yearly_training <- dplyr::bind_rows(m1_df_training, m3_df_training) +yearly_training <- dplyr::bind_rows(yearly_training,m4_df_training) +yearly_training <- dplyr::bind_rows(yearly_training, m4ysa_df_training) +yearly_training <- dplyr::bind_rows(yearly_training, m4yse_df_training) +dim(yearly_training) #482813 26 +table(yearly_training$classlabels) +save(yearly_training, file="data/yearly_training.rda")