Skip to content

Code donation: Possible Enhancement: from PE call caret::train.formula. Does case weights. #2

@AndreMikulec

Description

@AndreMikulec

The following code allows. package performanceEstimation to to call package caret's caret::train.formula.

As far as I can tell it does work.

# TIMESTAMP APRIL 3 2016 runPerformanceEstimation3.R

options(error = recover)

runPerformanceEstimation3 <- function() {

  runPerformanceEstimation3_inner <- function() {

    require(performanceEstimation)
    require(quantmod)
    require(randomForest)

    # GSPC <- getSymbols('^GSPC',from='2008-01-01',to='2012-12-31', auto.assign = FALSE)
    load(file = "GSPCshrange.RData")

    data.model <- specifyModel(Next(100*Delt(Ad(GSPC))) ~ Delt(Ad(GSPC),k=1:10))

    bigdata <- as.data.frame(modelData(data.model))

    colnames(bigdata)[1]    <- 'PercVarClose'
    bigdata[,"scalesrowid"] <-  1:NROW(bigdata) # required for non MC scaling # optional(ignored) MC

    # required for task@dataSource TO SEE
    assign("bigdata", bigdata, envir = .GlobalEnv )

    attach_me <- list(

      trainCaret = function(frmula, dat, ..., scales = NULL, scalescolname = NULL, otherArgsText = "",  trControlText = "caret::trainControl(method = 'none')", tuneGridText  = "") { 

        restargs    <- list(...) 

        scales      <- eval(parse(text=scales)) 
        if(!is.null(scalescolname)) scalesrowid <- as.vector(unlist( dat[,scalescolname] ))
        NROWdat     <- NROW(dat)
        NROWscales  <- NROW(scales)

        # expecting NON-MonteCarlo
        # if LARGE length of case weights is sent, then collectION is expected to be non-ordered by random-ish positions 
        if(!is.null(scales) && ( NROWdat != NROWscales ) )  { 
           samplescales <- scales[scalesrowid]
        }

        # expecting    MonteCarlo # user may typically send '1:NROW(dat)' # function(dat) # function of sample size
        # if SMALL length of case weights are sent, then collection is expected to be ordered by exact position

        # scalescolname IS NOT required ( and Not used) in MonteCarlo, 
        # For user programming consisency/ease
        #  or for an 'extra' check, scalescolname, may be sent anyways
        if(  ( !is.null(scales)       ) && 
             ( NROWdat == NROWscales  ) && 
             ( if(!is.null(scalescolname)) { !is.unsorted(as.vector(unlist(dat[,scalescolname]))) } else { TRUE  } ) ) { 
          samplescales <- scales 
        }

        # if no case weights are sent
         if( is.null(scales) )                               { samplescales <- NULL }

         if( NROW(samplescales) != NROW(dat) ) stop("Incorrect case weights ( scales )")

        others       <- if(nchar(otherArgsText) == 0) { NULL }  else { eval(parse(text=otherArgsText)) }

        # if it has a scalesrowid, Remove it. It is not part of the dat.
        if(!is.null(scalescolname)) { 

           # remove the scalescolname variable so it does not exist
           dat[,scalescolname] <- NULL 

           # dynamically remove the scalescolname variable from the formula
           # I do not care if the variable exists or NOT
           frmula <- eval(substitute(update(frmula, ~ . - scalescolname_var),list(scalescolname_var = as.symbol(scalescolname))))

        }

        f <- substitute(
          do.call(caret::train,c(list(form = frmula, data = dat, weights = samplescales), others, restargs, list( tuneGrid=eval(parse( text=tuneGridText))), 
                                 list(trControl=eval(parse(text=trControlText)))  )) 
        )
        eval(f)

      }

    )

    pkgpos <-  which("package:me" == search())
    if(length(pkgpos) > 0){ detach(pos = pkgpos)}
    attach(what=attach_me, pos = 2L, name = "package:me", warn.conflicts = TRUE) 

    # if doing case weights, scales and scalescolname, are required.
    # if doing case weights, and doing Monte Carlo, scalecolnames is not required ( and ignored )

    spExp1 <- performanceEstimation(
      PredTask(PercVarClose ~ .,bigdata,'SP500_2012'),c(
        workflowVariants(wf='standardWF',wfID="CVstandGBM",   # THIS ONE IS O.K.
                learner="trainCaret", 
                learner.pars=list(method="gbm", 
                  distribution = c('gaussian'), 
                  tuneGridText  = "data.frame(n.trees = 10, interaction.depth = 2, shrinkage = 0.25, n.minobsinnode = 2)", 
                  scales = "1:NROW(bigdata)", scalescolname = 'scalesrowid'))    # CONSTANT dat population size
      ),
      EstimationTask(metrics="acc",method=CV(nReps=2,nFolds=5))
    ) 

    spExp2 <- performanceEstimation(
      PredTask(PercVarClose ~ .,bigdata,'SP500_2012'),c(
        workflowVariants(wf='standardWF',wfID="MCstandGBM",   # THIS ONE IS O.K. ( NOT LOGICALLY SENSIBLE)
                         learner="trainCaret", 
                         learner.pars=list(method="gbm", 
                                           distribution = c('gaussian'), 
                                           tuneGridText  = "data.frame(n.trees = 10, interaction.depth = 2, shrinkage = 0.25, n.minobsinnode = 2)", 
                                           scales = "1:NROW(dat)", scalescolname = 'scalesrowid')),        # CONSTANT dat sample size
        workflowVariants(wf='timeseriesWF',wfID="MCgrowGBM",  # THIS ONE IS O.K.
                         learner="trainCaret", 
                         learner.pars=list(method="gbm", 
                                           distribution = c('gaussian'), 
                                           tuneGridText  = "data.frame(n.trees = 10, interaction.depth = 2, shrinkage = 0.25, n.minobsinnode = 2)", 
                                           scales = "1:NROW(dat)", scalescolname = 'scalesrowid'),         #  VARIABLE(GROWING) dat sample size
                         type="grow", relearn.step=30),
        workflowVariants(wf='timeseriesWF',wfID="MCslideGBM",  # THIS ONE IS O.K
                         learner="trainCaret", 
                         learner.pars=list(method="gbm", 
                                           distribution = c('gaussian'), 
                                           tuneGridText  = "data.frame(n.trees = 10, interaction.depth = 2, shrinkage = 0.25, n.minobsinnode = 2)", 
                                           scales = "1:NROW(dat)", scalescolname = 'scalesrowid'),          #  CONSTANT dat sample size
                         type="slide", relearn.step=30)
      ),
      EstimationTask(metrics=c("mse","theil"),method=MonteCarlo(nReps=5,szTrain=0.5,szTest=0.25))
    ) 



    bookmarkhere <- 1 

  } 
  runPerformanceEstimation3_inner()

  return(invisible(NULL))
} 

# restart R, then
# rm(list = ls(envir = .GlobalEnv), envir = .GlobalEnv)
# devtools::load_all("~/doParallel")
# devtools::load_all("~/foreach")
# devtools::load_all("~/performanceEstimation")
# debugSource(paste0("~/runPerformanceEstimation3.R"))

Andre Mikulec
Andre_Mikulec@Hotmail.com

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions