125 changes: 88 additions & 37 deletions tests/testthat/test-hyperparameterTuning.R
@@ -18,52 +18,103 @@ testthat::test_that(
    , Fold3 = as.integer(seq(3,nrow(agaricus.train$data),by = 3))
  )

-  scoringFunction <- function(
-      max_depth
-    , max_leaves
-    , min_child_weight
-    , subsample
-    , colsample_bytree
-    , gamma
-    , lambda
-    , alpha
-  ) {
-
-    dtrain <- xgb.DMatrix(agaricus.train$data,label = agaricus.train$label)
+  scoringFunction <- function(max_depth, max_leaves,
+                              min_child_weight, subsample,
+                              colsample_bytree, gamma, lambda, alpha,
+                              .debug = FALSE) {
+
+    # ---- Type coercion & scalarization ----
+    max_depth        <- as.integer(max_depth)[1]
+    max_leaves       <- as.integer(max_leaves)[1]
+    min_child_weight <- as.numeric(min_child_weight)[1]
+    subsample        <- as.numeric(subsample)[1]
+    colsample_bytree <- as.numeric(colsample_bytree)[1]
+    gamma            <- as.numeric(gamma)[1]
+    lambda           <- as.numeric(lambda)[1]
+    alpha            <- as.numeric(alpha)[1]
+
+    # ---- Data (assumes 'agaricus.train' and 'Folds' exist) ----
+    dtrain <- xgboost::xgb.DMatrix(
+      data = agaricus.train$data,
+      label = agaricus.train$label
+    )

+    # Base params
    Pars <- list(
-        booster = "gbtree"
-      , eta = 0.001
-      , max_depth = max_depth
-      , max_leaves = max_leaves
-      , min_child_weight = min_child_weight
-      , subsample = subsample
-      , colsample_bytree = colsample_bytree
-      , gamma = gamma
-      , lambda = lambda
-      , alpha = alpha
-      , objective = "binary:logistic"
-      , eval_metric = "auc"
+      booster = "gbtree",
+      eta = 0.01,
+      max_depth = max_depth,
+      min_child_weight = min_child_weight,
+      subsample = subsample,
+      colsample_bytree = colsample_bytree,
+      gamma = gamma,
+      lambda = lambda,   # L2 reg
+      alpha = alpha,     # L1 reg
+      objective = "binary:logistic",
+      eval_metric = "auc"
    )

-    xgbcv <- xgb.cv(
-        params = Pars
-      , data = dtrain
-      , nround = 100
-      , folds = Folds
-      , early_stopping_rounds = 5
-      , maximize = TRUE
-      , verbose = 0
-    )
+    # If max_leaves is requested, enable histogram/lossguide (so xgboost uses it)
+    if (!is.na(max_leaves) && max_leaves > 0L) {
+      Pars$tree_method <- "hist"
+      Pars$grow_policy <- "lossguide"
+      Pars$max_leaves  <- max_leaves
+      # It's common to leave max_depth as-is; alternatively set max_depth = 0
+      # Pars$max_depth <- 0L
+    }
+
+    # ---- Safe CV wrapper ----
+    xgbcv <- try(
+      xgboost::xgb.cv(
+        params = Pars,
+        data = dtrain,
+        nrounds = 100,
+        folds = Folds,
+        prediction = FALSE,
+        showsd = TRUE,
+        early_stopping_rounds = 5,
+        maximize = TRUE,
+        verbose = 0
+      ),
+      silent = TRUE
+    )

-    return(
-      list(
-          Score = max(xgbcv$evaluation_log$test_auc_mean)
-        , nrounds = xgbcv$best_iteration
-      )
-    )
+    # On error: return worst score but keep scalars so bayesOpt can proceed
+    if (inherits(xgbcv, "try-error")) {
+      if (isTRUE(.debug)) message("xgb.cv error: ", as.character(xgbcv))
+      return(list(Score = as.numeric(-Inf), BestNrounds = as.integer(1L)))
+    }
+
+    # ---- Scalar Score ----
+    score_vec <- as.numeric(xgbcv$evaluation_log$test_auc_mean)
+    if (!is.null(names(score_vec))) names(score_vec) <- NULL
+    Score <- as.numeric(max(score_vec, na.rm = TRUE))[1]
+
+    # ---- Scalar best nrounds ----
+    bi <- xgbcv$best_iteration
+    if (is.null(bi) || length(bi) != 1L || is.na(bi)) {
+      bi <- which.max(score_vec)
+      if (length(bi) != 1L || is.na(bi)) bi <- 1L
+    }
+    BestNrounds <- as.integer(bi)[1]
+
+    if (isTRUE(.debug)) {
+      cat(sprintf(
+        "DEBUG | Score len=%d val=%.6f | BestNrounds len=%d val=%d\n",
+        length(Score), Score, length(BestNrounds), BestNrounds
+      ))
+    }
+
+    list(
+      Score = Score,              # must be scalar
+      BestNrounds = BestNrounds   # must be scalar
+    )
  }

  bounds <- list(
      max_depth = c(1L, 5L)
    , max_leaves = c(2L,25L)
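As a quick aside (not part of the diff): a minimal sanity check of the revised scoring function, assuming xgboost is installed and rebuilding the same `agaricus` data and fold definitions as the test setup above. It simply confirms that the returned `Score` and `BestNrounds` are scalars, which is the property the test is guarding.

```r
library(xgboost)
data(agaricus.train, package = "xgboost")

# Same fold construction as the test setup above
Folds <- list(
  Fold1 = as.integer(seq(1, nrow(agaricus.train$data), by = 3)),
  Fold2 = as.integer(seq(2, nrow(agaricus.train$data), by = 3)),
  Fold3 = as.integer(seq(3, nrow(agaricus.train$data), by = 3))
)

# Illustrative hyperparameter values, chosen arbitrarily within the bounds
res <- scoringFunction(
  max_depth = 3L, max_leaves = 10L, min_child_weight = 1,
  subsample = 0.8, colsample_bytree = 0.8,
  gamma = 0, lambda = 1, alpha = 0
)

# Both elements must be length-1 (scalar) for bayesOpt to proceed
stopifnot(length(res$Score) == 1L, length(res$BestNrounds) == 1L)
```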
74 changes: 48 additions & 26 deletions vignettes/tuningHyperparameters.Rmd
@@ -53,38 +53,60 @@ Folds <- list(
Now we need to define the scoring function. At a minimum, this function must return a list with a ```Score``` element, the model evaluation metric we want to maximize. We can also retain other pieces of information produced by the scoring function by including them as named elements of the returned list. In this case, we want to retain the optimal number of rounds determined by ```xgb.cv```:

```{r eval = xgbAvail}
scoringFunction <- function(max_depth, min_child_weight, subsample) {

-  dtrain <- xgb.DMatrix(agaricus.train$data,label = agaricus.train$label)
-
-  Pars <- list(
-      booster = "gbtree"
-    , eta = 0.01
-    , max_depth = max_depth
-    , min_child_weight = min_child_weight
-    , subsample = subsample
-    , objective = "binary:logistic"
-    , eval_metric = "auc"
-  )
+  # Coerce types explicitly
+  max_depth        <- as.integer(max_depth)[1]
+  min_child_weight <- as.numeric(min_child_weight)[1]
+  subsample        <- as.numeric(subsample)[1]
+
+  dtrain <- xgboost::xgb.DMatrix(
+    data = agaricus.train$data,
+    label = agaricus.train$label
+  )
+
+  Pars <- list(
+    booster = "gbtree",
+    eta = 0.01,
+    max_depth = max_depth,
+    min_child_weight = min_child_weight,
+    subsample = subsample,
+    objective = "binary:logistic",
+    eval_metric = "auc"
+  )

-  xgbcv <- xgb.cv(
-      params = Pars
-    , data = dtrain
-    , nround = 100
-    , folds = Folds
-    , prediction = TRUE
-    , showsd = TRUE
-    , early_stopping_rounds = 5
-    , maximize = TRUE
-    , verbose = 0)
-
-  return(
-    list(
-        Score = max(xgbcv$evaluation_log$test_auc_mean)
-      , nrounds = xgbcv$best_iteration
-    )
-  )
+  xgbcv <- xgboost::xgb.cv(
+    params = Pars,
+    data = dtrain,
+    nrounds = 100,         # <- canonical argument name
+    folds = Folds,
+    prediction = FALSE,    # set TRUE only if you actually need CV preds
+    showsd = TRUE,
+    early_stopping_rounds = 5,
+    maximize = TRUE,
+    verbose = 0
+  )
+
+  # Compute Score robustly (scalar)
+  score_vec <- xgbcv$evaluation_log$test_auc_mean
+  Score <- as.numeric(max(score_vec, na.rm = TRUE))[1]
+
+  # Derive a scalar "best nrounds" robustly
+  bi <- xgbcv$best_iteration
+  if (is.null(bi) || length(bi) != 1 || is.na(bi)) {
+    # fallback: the iteration where test AUC is maximized
+    bi <- which.max(score_vec)
+    if (length(bi) != 1 || is.na(bi)) bi <- 1L
+  }
+  BestNrounds <- as.integer(bi)[1]
+
+  list(
+    Score = Score,
+    BestNrounds = BestNrounds
+  )
}
```


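For context, here is a minimal sketch of how the vignette's scoring function feeds into the optimizer. The bounds, `initPoints`, and `iters.n` values below are illustrative assumptions, not settings prescribed by the package:

```r
library(ParBayesianOptimization)

# Search space over the three parameters the scoring function accepts
bounds <- list(
  max_depth = c(1L, 5L),
  min_child_weight = c(0, 25),
  subsample = c(0.25, 1)
)

set.seed(0)
optObj <- bayesOpt(
  FUN = scoringFunction,   # the vignette version above (3 parameters)
  bounds = bounds,
  initPoints = 4,          # initial random points; must exceed the number of tuned parameters
  iters.n = 3              # Bayesian optimization iterations
)

getBestPars(optObj)  # best hyperparameters found so far
```

Because the function returns ```BestNrounds``` alongside ```Score```, the optimal round count for each evaluated parameter set is retained in ```optObj$scoreSummary``` and can be reused when fitting the final model.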