Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added 118 - slaweks17/ES_RNN_SlawekSmyl.pdf
Binary file not shown.
143 changes: 143 additions & 0 deletions 118 - slaweks17/R/merge.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# Merging outputs, per category, M4 competition, for point forecasts, so for ES_RNN and ES_RNN_E
# Author: Slawek Smyl, Mar-May 2018


#The c++ executables write to one (occasinally two, sorry :-), so in such case move files to one dir before continuing) directories.
#(One logical run of several instances of the same program will produce a number files, e.g. outputs with different ibig value)
#This script merges, averages values, and writes them down to the same directory - FOREC_DIR
###############################################################################

#directory that should include all *-train.csv files, as well as M4-info.csv
DATA_DIR="F:/progs/data/M4DataSet/"
m4Info_df=read.csv(paste0(DATA_DIR,"M4-info.csv"))
options(stringsAsFactors =FALSE)

#directory with all the output files produced by the c++ code we want to merge
FOREC_DIR='F:\\progs\\data\\M4\\Quarterly2018-05-31_09_30' #do not end with separator

LBACK=1 #shoud be as in the c++ code, LBACK>0 means backtesting
SP="Quarterly"
#SP="Yearly"
#SP="Daily"
#SP="Hourly"

#//----------------PARAMS ---------- comment/uncomment following 3 variables
#for ES_RNN_E, so for all except Monthly and Quarterly runs:
#NUM_OF_SEEDS=1
#NUM_OF_CHUNKS=1
#IBIGS=<number of input files n FOREC_DIR>

#for ES_RNN (do for Monthly and Quarterly):
NUM_OF_CHUNKS=2 #same as NUM_OF_CHUNKS constant the the c++ cource code, changing it is not recommended.
NUM_OF_SEEDS=3 #It is equal to the number of seeds in the startup script, (or number of teams of worker processes)
# so number_of_concurrent_executables==number_of_lines_in_the_running script/NUM_OF_CHUNKS, and number_of_chunks
#E.g if using following script for ES_RNN:
# start <this_executable> 10 1 0
# start <this_executable> 10 2 0
# start <this_executable> 20 1 5
# start <this_executable> 20 2 5
# start <this_executable> 30 1 10
# start <this_executable> 30 2 10
# we have here three seeds: 10,20,30, and two chunks: 1,2. (The pairs of workes have IBIG offsets of 0,5,10)
IBIGS=3 #number of complete runs by each executables, so if programs are not interrupted, this should be equal to the constant BIG_LOOP in the c++ code, by default 3.


m4_df=read.csv(paste0(DATA_DIR,SP,"-train.csv"))

sMAPE<-function(forec,actual) {
mean(abs(forec-actual)/(abs(forec)+abs(actual)))*200
}
errorFunc=sMAPE


spInfo_df=m4Info_df[m4Info_df$SP==SP,]
ids=spInfo_df$M4id
horizon=spInfo_df[1,"Horizon"]

#VARIABLE + "_" + to_string(seedForChunks) + "_" + to_string(chunkNo) + "_" + to_string(ibigDb)+"_LB"+ to_string(LBACK)+ ".csv";
inputFiles=list.files(path = FOREC_DIR, pattern = paste0(SP,".*LB",LBACK), full.names = T)
if (length(inputFiles)!=NUM_OF_SEEDS*NUM_OF_CHUNKS*IBIGS) {
stop("length(inputFiles)!=NUM_OF_SEEDS*NUM_OF_CHUNKS*IBIGS")
}


comp_df=NULL
fil=inputFiles[1]
for (fil in inputFiles) {
print(fil)
c_df=read.csv(fil, header=F)
comp_df=rbind(comp_df,c_df)
}
names(comp_df)[1]='id'

forecSeries=sort(unique(comp_df$id))
if (length(forecSeries)!=length(ids) && LBACK==0) {
stop(paste0("Expected number of cases:",length(ids)," but got:",length(forecSeries)))
}

SIZE_OF_CHUNK=1000
out_df=NULL; ou_df=NULL
fSeries=forecSeries[1]
for (fSeries in forecSeries) {
oneSeriesForecs_df=comp_df[comp_df$id==fSeries,]
o1=colMeans(oneSeriesForecs_df[,2:ncol(oneSeriesForecs_df)])
o_df=data.frame(id=fSeries, as.list(o1), stringsAsFactors =F)
ou_df=rbind(ou_df, o_df)
if (nrow(ou_df)>=SIZE_OF_CHUNK) {
out_df=rbind(out_df,ou_df)
ou_df=NULL
print(nrow(out_df))
}
}
out_df=rbind(out_df,ou_df)
print(nrow(out_df))
out_df=out_df[order(as.integer(substring(out_df$id, 2))),]

#FOREC_DIR="e:\\temp"
outPath=paste0(FOREC_DIR,'\\',SP,"Forec.csv")
write.csv(out_df,file=outPath,row.names = F)

################ Main work done, now just diagnostics calculations and plots

#display a sample of forecasts and, if LBACK>0, actuals
MAX_NUM_OF_POINTS_TO_SHOW=200
for (i in 1:100) {
irand=sample(1:length(forecSeries),1)
fSeries=forecSeries[irand]
forec=as.numeric(out_df[out_df$id==fSeries,2:ncol(out_df)])
actual=as.numeric(m4_df[m4_df$V1==fSeries,2:ncol(m4_df)])
actual=actual[!is.na(actual)]
if (length(actual)>MAX_NUM_OF_POINTS_TO_SHOW) {
actual=actual[(length(actual)-MAX_NUM_OF_POINTS_TO_SHOW):length(actual)]
}
if (LBACK==0) {
plot(c(actual,forec), col=c(rep(1,length(actual)),rep(2,length(forec))), main=fSeries)
} else {
ymin=min(actual,forec)
ymax=max(actual,forec)
plot(1:length(actual),actual, main=fSeries, ylim=c(ymin,ymax))
lines((length(actual)-length(forec)+1):length(actual), forec, col=2, type='p')
}

Sys.sleep(5)
}


#calc error metrics
if (LBACK>0) {
summErrors=0
fSeries=forecSeries[1]
i=1
for (fSeries in forecSeries) {
if (i%%1000==0)
cat(".")
forec=as.numeric(out_df[out_df$id==fSeries,2:ncol(out_df)])
actual=as.numeric(m4_df[m4_df$V1==fSeries,2:ncol(m4_df)])
actual=actual[!is.na(actual)]
actual=actual[(length(actual)-LBACK*horizon+1):(length(actual)-(LBACK-1)*horizon)]
summErrors=summErrors+errorFunc(forec,actual)
i=i+1
}
print(".")
print(paste0("avg error:",round(summErrors/length(forecSeries),2)))
}
210 changes: 210 additions & 0 deletions 118 - slaweks17/R/merge_PI.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
# Merging outputs, per category, M4 competition, for Prediction Intervals , so for ES_RNN_PI and ES_RNN_E_PI
# Author: Slawek Smyl, Mar-May 2018


#The c++ executables write to one (occasinally two, sorry :-), so in such case move files to one dir before continuing) directories.
#(One logical run of several instances of the same program will produce a number files, e.g. outputs with different ibig value)
#This script merges, averages values, and writes them down to the same directory - FOREC_DIR
###############################################################################

#directory that should include all *-train.csv files, as well as M4-info.csv
DATA_DIR="F:/progs/data/M4DataSet/"
m4Info_df=read.csv(paste0(DATA_DIR,"M4-info.csv"))
options(stringsAsFactors =FALSE)
memory.limit(10000)

#directory with all the output files produced by the c++ code we want to merge
FOREC_DIR='F:\\progs\\data\\M4\\Hourlygood' #do not end with separator

LBACK=1 #shoud be as in the c++ code, LBACK>0 means backtesting
#SP="Quarterly"
#SP="Yearly"
#SP="Daily"
SP="Hourly"
m4_df=read.csv(paste0(DATA_DIR,SP,"-train.csv"))


#//----------------PARAMS ---------- comment/uncomment following 3 variables
#for ES_RNN_E_PI, so for all except Monthly and Quarterly runs:
NUM_OF_SEEDS=1
NUM_OF_CHUNKS=1
#IBIGS=<number of input files n FOREC_DIR>/2
IBIGS=6

#for ES_RNN_PI (do for Monthly and Quarterly):
#NUM_OF_CHUNKS=2 #same as NUM_OF_CHUNKS constant the the c++ cource code, changing it is not recommended.
#NUM_OF_SEEDS=3 #It is equal to the number of seeds in the startup script, (or number of teams of worker processes)
# so number_of_concurrent_executables==number_of_lines_in_the_running script/NUM_OF_CHUNKS, and number_of_chunks
#E.g if using following script for ES_RNN:
# start <this_executable> 10 1 0
# start <this_executable> 10 2 0
# start <this_executable> 20 1 5
# start <this_executable> 20 2 5
# start <this_executable> 30 1 10
# start <this_executable> 30 2 10
# we have here three seeds: 10,20,30, and two chunks: 1,2. (The pairs of workes have IBIG offsets of 0,5,10)
#IBIGS=3 #number of complete runs by each executables, so if programs are not interrupted, this should be equal to the constant BIG_LOOP in the c++ code, by default 3.

ALPHA = 0.05;
ALPHA_MULTIP = 2 / ALPHA;

MSIS<-function(forecL,forecH,actual) {
sumDiffs=0
for (i in 1:(length(actual)-seasonality)) {
sumDiffs=sumDiffs+abs(actual[i+seasonality]-actual[i])
}
avgAbsDiff=sumDiffs/(length(actual)-seasonality)

actual=actual[(length(actual)-LBACK*horizon+1):(length(actual)-(LBACK-1)*horizon)]

msis=sum(forecH-forecL)+sum(pmax(0,forecL-actual))*ALPHA_MULTIP+sum(pmax(0,actual-forecH))*ALPHA_MULTIP
msis/horizon/avgAbsDiff
}
errorFunc=MSIS

spInfo_df=m4Info_df[m4Info_df$SP==SP,]
ids=spInfo_df$M4id
horizon=spInfo_df[1,"Horizon"]
seasonality=spInfo_df[1,"Frequency"]


#lower
#VARIABLE + "_" + to_string(seedForChunks) + "_" + to_string(chunkNo) + "_" + to_string(ibigDb)+"_LB"+ to_string(LBACK)+ ".csv";
inputFiles=list.files(path = FOREC_DIR, pattern = paste0(SP,".*LLB",LBACK), full.names = T)
if (length(inputFiles)!=NUM_OF_SEEDS*NUM_OF_CHUNKS*IBIGS) {
stop("length(inputFiles)!=NUM_OF_SEEDS*NUM_OF_CHUNKS*IBIGS")
}

comp_df=NULL
fil=inputFiles[1]
for (fil in inputFiles) {
print(fil)
c_df=read.csv(fil, header=F)
comp_df=rbind(comp_df,c_df)
}
names(comp_df)[1]='id'

forecSeries=sort(unique(comp_df$id))
if (length(forecSeries)!=length(ids) && LBACK==0) {
stop(paste0("Expected number of cases:",length(ids)," but got:",length(forecSeries)))
}

SIZE_OF_CHUNK=1000
out_df=NULL; ou_df=NULL
fSeries=forecSeries[1]
for (fSeries in forecSeries) {
oneSeriesForecs_df=comp_df[comp_df$id==fSeries,]
o1=colMeans(oneSeriesForecs_df[,2:ncol(oneSeriesForecs_df)])
o_df=data.frame(id=fSeries, as.list(o1), stringsAsFactors =F)
ou_df=rbind(ou_df, o_df)
if (nrow(ou_df)>=SIZE_OF_CHUNK) {
out_df=rbind(out_df,ou_df)
ou_df=NULL
print(nrow(out_df))
}
}
out_df=rbind(out_df,ou_df)
print(nrow(out_df))
out_df=out_df[order(as.integer(substring(out_df$id, 2))),]

outPath=paste0(FOREC_DIR,'\\',SP,"ForecL.csv")
write.csv(out_df,file=outPath,row.names = F)

lower_df=out_df

#####################################
#higher
inputFiles=list.files(path = FOREC_DIR, pattern = paste0(SP,".*HLB",LBACK), full.names = T)
if (length(inputFiles)!=NUM_OF_SEEDS*NUM_OF_CHUNKS*IBIGS) {
stop("length(inputFiles)!=NUM_OF_SEEDS*NUM_OF_CHUNKS*IBIGS")
}

comp_df=NULL
fil=inputFiles[1]
for (fil in inputFiles) {
print(fil)
c_df=read.csv(fil, header=F)
comp_df=rbind(comp_df,c_df)
}
names(comp_df)[1]='id'

forecSeries=sort(unique(comp_df$id))
if (length(forecSeries)!=length(ids) && LBACK==0) {
print(paste0("Warning. Expected number of cases:",length(ids)," but got:",length(forecSeries)))
}

SIZE_OF_CHUNK=1000
out_df=NULL; ou_df=NULL
fSeries=forecSeries[1]
for (fSeries in forecSeries) {
oneSeriesForecs_df=comp_df[comp_df$id==fSeries,]
o1=colMeans(oneSeriesForecs_df[,2:ncol(oneSeriesForecs_df)])
o_df=data.frame(id=fSeries, as.list(o1), stringsAsFactors =F)
ou_df=rbind(ou_df, o_df)
if (nrow(ou_df)>=SIZE_OF_CHUNK) {
out_df=rbind(out_df,ou_df)
ou_df=NULL
print(nrow(out_df))
}
}
out_df=rbind(out_df,ou_df)
print(nrow(out_df))
out_df=out_df[order(as.integer(substring(out_df$id, 2))),]

outPath=paste0(FOREC_DIR,'\\',SP,"ForecH.csv")
write.csv(out_df,file=outPath,row.names = F)

higher_df=out_df


################ Main work done, now just diagnostics calculations and plots

#display a sample of forecasts and, if LBACK>0, actuals
MAX_NUM_OF_POINTS_TO_SHOW=200
i=1
for (i in 1:100) {
irand=sample(1:length(forecSeries),1)
fSeries=forecSeries[irand]
forecL=as.numeric(lower_df[lower_df$id==fSeries,2:ncol(lower_df)])
forecH=as.numeric(higher_df[higher_df$id==fSeries,2:ncol(higher_df)])
actual=as.numeric(m4_df[m4_df$V1==fSeries,2:ncol(m4_df)])
actual=actual[!is.na(actual)]
if (length(actual)>MAX_NUM_OF_POINTS_TO_SHOW) {
actual=actual[(length(actual)-MAX_NUM_OF_POINTS_TO_SHOW):length(actual)]
}
if (LBACK==0) {
plot(c(actual,forecH), col=c(rep(1,length(actual)),rep(2,length(forecH))), main=fSeries)
lines(c(actual,forecL), col=c(rep(1,length(actual)),rep(3,length(forecL))), type='p')
} else {
ymin=min(actual,forecL)
ymax=max(actual,forecH)
plot(1:length(actual),actual, main=fSeries, ylim=c(ymin,ymax))
lines((length(actual)-length(forecH)+1):length(actual), forecH, col=2, type='p')
lines((length(actual)-length(forecL)+1):length(actual), forecL, col=3, type='p')
}

Sys.sleep(5)
}



#calc error metric: MSIS
if (LBACK>0) {
summErrors=0
fSeries=forecSeries[1]
i=1
for (fSeries in forecSeries) {
if (i%%1000==0)
cat(".")
forecL=as.numeric(lower_df[lower_df$id==fSeries,2:ncol(lower_df)])
forecH=as.numeric(higher_df[higher_df$id==fSeries,2:ncol(higher_df)])
actual=as.numeric(m4_df[m4_df$V1==fSeries,2:ncol(m4_df)])
actual=actual[!is.na(actual)]
summErrors=summErrors+errorFunc(forecL, forecH, actual)
i=i+1
}
print(".")
print(paste0("avg error:",round(summErrors/length(forecSeries),2)))
}


8 changes: 8 additions & 0 deletions 118 - slaweks17/R/readme.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
When the c++ workers run, they output results (forecasts) to a directory or two.
(Sorry occasionally two directories are filled, in such case first "manually" put all the output files to a single dir)
These scripts merge them into one file and save it, show a sample of graphs, and if this is backtesting run (LBACK>0), calculate some accuracy metrics.

Both scripts needs to be updated with your input, output dirs, and other params, see inside, there are a lot of comments there.

merge.R is meant to be used for point forecst runs, so for ES_RNN and ES_RNN_E programs.
mergePI.R - for Prediction Interval runs, so for ES_RNN_PI and ES_RNN_E_PI programs.
Loading