diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index 029c987..99f9f73 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,3 @@ Version: 2.4.0 -Date: 2025-04-17 13:57:56 UTC -SHA: 9fa3109a41a9bd0ca095a30527ca34ba233c1e34 +Date: 2025-06-17 20:25:38 UTC +SHA: 32dc9e0e856f599cdf2dab33ac994973fadb5c2f diff --git a/DESCRIPTION b/DESCRIPTION index 4a00307..affb231 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,7 +21,8 @@ Imports: parallel, tidyr, dplyr, - ggplot2 + ggplot2, + stringr Suggests: testthat, bookdown, diff --git a/NAMESPACE b/NAMESPACE index 22aebc4..fd95900 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,8 @@ S3method(plot,gg_variable) S3method(plot,gg_vimp) export(calc_auc) export(calc_roc) +export(df_partial) +export(df_partialpro) export(gg_error) export(gg_error.randomForest) export(gg_error.randomForest.formula) @@ -35,6 +37,7 @@ export(kaplan) export(nelson) export(quantile_pts) export(r_data_types) +export(varpro_feature_name) importFrom(dplyr,across) importFrom(dplyr,mutate) importFrom(dplyr,n_distinct) @@ -53,6 +56,7 @@ importFrom(stats,predict) importFrom(stats,qnorm) importFrom(stats,quantile) importFrom(stats,xtabs) +importFrom(stringr,str_sub) importFrom(survival,Surv) importFrom(survival,strata) importFrom(survival,survfit) diff --git a/R/gg_partial_df.R b/R/gg_partial_df.R new file mode 100644 index 0000000..c7bb9d1 --- /dev/null +++ b/R/gg_partial_df.R @@ -0,0 +1,58 @@ +##============================================================================= +#' Split partial lots into continuous or categorical datasets +#' @param part_dta partial plot data from \code{rfsrc::plot.variable} +#' @param nvars how many of the partial plot variables to calculate +#' @param cat_limit Categorical features are build when there are fewer than +#' cat_limit unique features. +#' @param name a label name applied to all features. Useful when combining +#' multiple partial plot objects in figures. 
##=============================================================================
#' Split partial plot data into continuous and categorical data sets
#'
#' Walks the first \code{nvars} entries of \code{part_dta$plotthis} and stacks
#' each feature's \code{x}/\code{yhat} values into one of two long data
#' frames, depending on how many distinct \code{x} values the feature has.
#'
#' @param part_dta partial plot data from \code{rfsrc::plot.variable}. Only
#'   the \code{plotthis} element is read: a named list whose entries each
#'   carry \code{x} and \code{yhat}.
#' @param nvars how many of the partial plot variables to process, counted
#'   from the first. Defaults to all of them; larger values are capped at the
#'   number available.
#' @param cat_limit a feature is treated as continuous when it has more than
#'   \code{cat_limit} unique \code{x} values, and as categorical otherwise.
#' @param name a label name applied to all features, stored in a
#'   \code{model} column of both data sets. Useful when combining multiple
#'   partial plot objects in figures. \code{NULL} (the default) or \code{NA}
#'   adds no column.
#'
#' @return A list with two elements, \code{continuous} and
#'   \code{categorical}. When features were found, each is a data frame with
#'   columns \code{x}, \code{yhat} and \code{name} (the feature name), plus
#'   \code{model} when \code{name} is supplied.
#'
#' @export
df_partial <- function(part_dta, nvars = NULL, cat_limit = 10, name = NULL) {
  ## Prepare the partial dependencies data for panel plots
  features <- part_dta$plotthis
  n_available <- length(features)
  if (is.null(nvars)) {
    nvars <- n_available
  }
  ## Never index past the features that actually exist.
  nvars <- min(nvars, n_available)

  cont_list <- vector("list", nvars)
  cat_list <- vector("list", nvars)
  ## seq_len() keeps the loop empty when there is nothing to process;
  ## seq(0) would iterate over c(1, 0).
  for (feature in seq_len(nvars)) {
    x <- features[[feature]]$x
    yhat <- features[[feature]]$yhat

    if (length(unique(x)) > cat_limit) {
      ## Continuous features: more than cat_limit unique values
      plt_df <- as.data.frame(cbind(x = x, yhat = yhat))
      plt_df$name <- names(features)[[feature]]

      cont_list[[feature]] <- plt_df
    } else {
      ## Categorical features
      ##
      ## Though VarPro works with logical or continuous only. Factors are
      ## one hot encoded internal to the varPro call.
      ##
      ## NOTE(review): cbind() discards the factor class, so `x` ends up as
      ## the integer level codes rather than the original labels. Kept as is
      ## so the returned column type does not change; confirm this is what
      ## the plotting code expects.
      plt_df <- as.data.frame(cbind(x = factor(x), yhat = yhat))
      plt_df$name <- names(features)[[feature]]

      cat_list[[feature]] <- plt_df
    }
  }
  ## Unused slots stay NULL and are ignored by bind_rows().
  continuous <- dplyr::bind_rows(cont_list)
  categorical <- dplyr::bind_rows(cat_list)

  ## is.na(NULL) is logical(0), which `if` rejects, so check the length
  ## first; this is what lets the default name = NULL work.
  if (length(name) > 0L && !is.na(name)) {
    continuous$model <- name
    categorical$model <- name
  }

  list(
    continuous = continuous,
    categorical = categorical
  )
}
##=============================================================================
#' Split partial-pro data into continuous and categorical data sets
#'
#' Walks the first \code{nvars} entries of \code{part_dta} and stacks each
#' feature's parametric, nonparametric and causal estimates into one of two
#' long data frames.
#'
#' @param part_dta a named list with one entry per feature. Each entry is
#'   read for \code{xvirtual}, \code{xorg}, \code{yhat.par},
#'   \code{yhat.nonpar} and \code{yhat.causal}; the \code{yhat.*} elements
#'   are indexed by column, one column per \code{xvirtual} value or per
#'   unique \code{xorg} value. (Presumably the output of varPro's partialpro
#'   call; confirm against the caller.)
#' @param nvars how many of the features to process, counted from the first.
#'   Defaults to all of them; larger values are capped at the number
#'   available.
#' @param cat_limit a feature is treated as continuous when its
#'   \code{xvirtual} has more than \code{cat_limit} entries, and as
#'   categorical otherwise.
#' @param name a label name applied to all features, stored in a
#'   \code{model} column of both data sets. Useful when combining multiple
#'   partial plot objects in figures. \code{NULL} (the default) or \code{NA}
#'   adds no column.
#'
#' @return A list with two elements, \code{continuous} and
#'   \code{categorical}. When features were found, each is a data frame with
#'   columns \code{variable}, \code{parametric}, \code{nonparametric},
#'   \code{causal} and \code{name} (the feature name), plus \code{model} when
#'   \code{name} is supplied. Continuous rows hold column means of the
#'   \code{yhat.*} matrices; categorical rows hold the raw column values for
#'   each unique \code{xorg} level.
#'
#' @export
#'
df_partialpro <- function(part_dta, nvars = NULL, cat_limit = 12, name = NULL) {
  ## Prepare the partial pro dependencies data for panel plots
  n_available <- length(part_dta)
  if (is.null(nvars)) {
    nvars <- n_available
  }
  ## Never index past the features that actually exist.
  nvars <- min(nvars, n_available)

  cont_list <- vector("list", nvars)
  cat_list <- vector("list", nvars)
  ## seq_len() keeps the loop empty when there is nothing to process;
  ## seq(0) would iterate over c(1, 0).
  for (feature in seq_len(nvars)) {
    part <- part_dta[[feature]]

    if (length(part$xvirtual) > cat_limit) {
      ## Continuous features: more than cat_limit virtual x values.
      ## Each estimate is averaged down the rows, one value per column.
      plt_df <- as.data.frame(
        cbind(
          variable = part$xvirtual,
          parametric = colMeans(part$yhat.par, na.rm = TRUE),
          nonparametric = colMeans(part$yhat.nonpar, na.rm = TRUE),
          causal = colMeans(part$yhat.causal, na.rm = TRUE)
        )
      )
      plt_df$name <- names(part_dta)[[feature]]

      cont_list[[feature]] <- plt_df
    } else {
      ## Categorical features
      ##
      ## Though VarPro works with logical or continuous only. Factors are
      ## one hot encoded internal to the varPro call.
      lvls <- unique(part$xorg)
      if (length(lvls) == 0L) {
        ## Nothing to report for a feature without observed levels.
        next
      }

      ## Each yhat has at least 2 columns, for logical values...
      ## Column `ind` of every yhat matrix is paired with the ind-th level.
      per_level <- lapply(seq_along(lvls), function(ind) {
        level_df <- as.data.frame(
          cbind(
            parametric = part$yhat.par[, ind],
            nonparametric = part$yhat.nonpar[, ind],
            causal = part$yhat.causal[, ind]
          )
        )
        level_df$variable <- lvls[ind]
        level_df
      })
      ## Bind once rather than growing the frame level by level.
      plt_df <- dplyr::bind_rows(per_level)
      plt_df$name <- names(part_dta)[[feature]]

      cat_list[[feature]] <- plt_df
    }
  }

  ## Unused slots stay NULL and are ignored by bind_rows().
  continuous <- dplyr::bind_rows(cont_list)
  categorical <- dplyr::bind_rows(cat_list)

  ## is.na(NULL) is logical(0), which `if` rejects, so check the length
  ## first; this is what lets the default name = NULL work.
  if (length(name) > 0L && !is.na(name)) {
    continuous$model <- name
    categorical$model <- name
  }

  list(
    continuous = continuous,
    categorical = categorical
  )
}
##=============================================================================
#' Recover original variable names from varpro feature names
#'
#' varpro one hot encodes features, so we need to get the "raw" original
#' variable names. Names already present in \code{dataset} are kept as they
#' are. Every other name has one character cut off the end, repeatedly, until
#' the shortened name is a column of \code{dataset}. A name that never
#' matches, even after being trimmed down to nothing, is dropped from the
#' result.
#'
#' @param varpro_names vector of names output from varpro analysis
#' @param dataset the dataset used for varpro input. Only its column names
#'   are used.
#'
#' @return A character vector of column names of \code{dataset}: first the
#'   entries of \code{varpro_names} that matched directly, then the names
#'   recovered by trimming, without repeating a recovered name.
#'
#' @importFrom stringr str_sub
#' @export
varpro_feature_name <- function(varpro_names, dataset) {
  data_cols <- colnames(dataset)

  is_column <- varpro_names %in% data_cols
  found <- varpro_names[is_column]
  pending <- varpro_names[!is_column]

  ## NA and "" can never be trimmed into a column name; leaving them in
  ## would keep the loop below running forever.
  pending <- pending[!is.na(pending) & nzchar(pending)]

  while (length(pending) > 0) {
    ## Drop the last character of every unresolved name.
    trimmed <- stringr::str_sub(pending, 1, -2)
    hit <- trimmed %in% data_cols
    found <- union(found, trimmed[hit])
    ## Names trimmed down to "" have no match in the data: give up on them
    ## so the loop is guaranteed to terminate.
    pending <- trimmed[!hit & nzchar(trimmed)]
  }
  found
}
Useful when combining +multiple partial plot objects in figures.} +} +\description{ +Split partial lots into continuous or categorical datasets +} diff --git a/man/df_partialpro.Rd b/man/df_partialpro.Rd new file mode 100644 index 0000000..5e485ef --- /dev/null +++ b/man/df_partialpro.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gg_partialpro_df.R +\name{df_partialpro} +\alias{df_partialpro} +\title{Split partial lots into continuous or categorical datasets} +\usage{ +df_partialpro(part_dta, nvars = NULL, cat_limit = 12, name = NULL) +} +\arguments{ +\item{part_dta}{partial plot data from \code{rfsrc::plot.variable}} + +\item{nvars}{how many of the partial plot variables to calculate} + +\item{cat_limit}{Categorical features are build when there are fewer than +cat_limit unique features.} + +\item{name}{a label name applied to all features. Useful when combining +multiple partial plot objects in figures.} +} +\description{ +Split partial lots into continuous or categorical datasets +} diff --git a/man/varpro_feature_name.Rd b/man/varpro_feature_name.Rd new file mode 100644 index 0000000..e00c8ec --- /dev/null +++ b/man/varpro_feature_name.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/varpro_feature_names.R +\name{varpro_feature_name} +\alias{varpro_feature_name} +\title{varpro one hot encodes features, so we need to get the "raw" +original variable names. This loops through the variable names +not in the original dataset, and cuts one character off the end +until we find the variable name in the original data.} +\usage{ +varpro_feature_name(varpro_names, dataset) +} +\arguments{ +\item{varpro_names}{vector of names output from varpro analysis} + +\item{dataset}{the dataset used for varpro input.} +} +\description{ +varpro one hot encodes features, so we need to get the "raw" +original variable names. 
This loops through the variable names +not in the original dataset, and cuts one character off the end +until we find the variable name in the original data. +}