From ae769fcf13b023728c5bd505b24df6bc536763aa Mon Sep 17 00:00:00 2001
From: John Ehrlinger
Date: Thu, 19 Jun 2025 08:19:32 -0400
Subject: [PATCH 1/3] partial plot

---
 CRAN-SUBMISSION          |  4 ++--
 R/gg_partial_df.R        | 50 ++++++++++++++++++++++++++++++++++++++++
 R/varpro_feature_names.R | 18 +++++++++++++++
 3 files changed, 70 insertions(+), 2 deletions(-)
 create mode 100644 R/gg_partial_df.R
 create mode 100644 R/varpro_feature_names.R

diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION
index 029c987..99f9f73 100644
--- a/CRAN-SUBMISSION
+++ b/CRAN-SUBMISSION
@@ -1,3 +1,3 @@
 Version: 2.4.0
-Date: 2025-04-17 13:57:56 UTC
-SHA: 9fa3109a41a9bd0ca095a30527ca34ba233c1e34
+Date: 2025-06-17 20:25:38 UTC
+SHA: 32dc9e0e856f599cdf2dab33ac994973fadb5c2f
diff --git a/R/gg_partial_df.R b/R/gg_partial_df.R
new file mode 100644
index 0000000..68154da
--- /dev/null
+++ b/R/gg_partial_df.R
@@ -0,0 +1,50 @@
+##=============================================================================
+## Split partial plots into continuous or categorical datasets
+df_partial = function(part_dta, nvars = NULL, cat_limit = 10, name=NULL) {
+  ## Prepare the partial dependencies data for panel plots
+  if (is.null(nvars)) {
+    nvars = length(part_dta$plotthis)
+  }
+
+  cont_list = list()
+  cat_list = list()
+  for (feature in seq_len(nvars)) {
+    ## Format any continuous features (those with more than cat_limit unique values)
+    if (length(unique(part_dta$plotthis[[feature]]$x)) > cat_limit) {
+      plt.df = as.data.frame(
+        cbind(
+          x = part_dta$plotthis[[feature]]$x,
+          yhat = part_dta$plotthis[[feature]]$yhat
+        )
+      )
+      plt.df$name = names(part_dta$plotthis)[[feature]]
+
+      cont_list[[feature]] <- plt.df
+    } else{
+      ## Categorical features
+
+      ## Though VarPro works with logical or continuous only. Factors are
+      ## one hot encoded internal to the varPro call.
+      plt.df = data.frame(
+        ## data.frame(), not cbind(): cbind() would reduce the factor to
+        ## its integer codes and lose the level labels.
+        x = factor(part_dta$plotthis[[feature]]$x),
+        yhat = part_dta$plotthis[[feature]]$yhat
+      )
+      plt.df$name = names(part_dta$plotthis)[[feature]]
+
+      cat_list[[feature]] <- plt.df
+    }
+  }
+  continuous = bind_rows(cont_list)
+  categorical = bind_rows(cat_list)
+
+  if (!is.null(name)) {
+    continuous$model <- categorical$model <- name
+  }
+
+  return(list(
+    continuous = continuous,
+    categorical = categorical
+  ))
+}
\ No newline at end of file
diff --git a/R/varpro_feature_names.R b/R/varpro_feature_names.R
new file mode 100644
index 0000000..18d1769
--- /dev/null
+++ b/R/varpro_feature_names.R
@@ -0,0 +1,18 @@
+
+##=============================================================================
+## varpro one hot encodes features, so we need to get the "raw"
+## original variable names. This loops through the variable names
+## not in the original dataset, and cuts one character off the end
+## until we find the variable name in the original data.
+
+varpro_feature_name <- function(varpro_names, dataset) {
+  inc_set <- varpro_names[which(varpro_names %in% colnames(dataset))]
+  one_set <- varpro_names[which(!varpro_names %in% colnames(dataset))]
+  while (length(one_set) > 0) {
+    orig <- unlist(lapply(one_set, str_sub, 1,-2))
+    inc_set <- union(inc_set, orig[which(orig %in% colnames(dataset))])
+    ## Drop exhausted (empty) names so unmatched names cannot loop forever
+    one_set <- orig[!(orig %in% colnames(dataset)) & nzchar(orig)]
+  }
+  return(inc_set)
+}
From 3c7ffab9588ca975554ea3ab5cb4e1d92dd0ddd9 Mon Sep 17 00:00:00 2001
From: John Ehrlinger
Date: Thu, 19 Jun 2025 09:01:17 -0400
Subject: [PATCH 2/3] Add some functionality

---
 DESCRIPTION                |  3 ++-
 NAMESPACE                  |  3 +++
 R/gg_partial_df.R          | 14 +++++++++++---
 R/varpro_feature_names.R   | 17 +++++++++++------
 man/df_partial.Rd          | 22 ++++++++++++++++++++++
 man/varpro_feature_name.Rd | 22 ++++++++++++++++++++++
 6 files changed, 71 insertions(+), 10 deletions(-)
 create mode 100644 man/df_partial.Rd
 create mode 100644 man/varpro_feature_name.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 4a00307..affb231 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -21,7 +21,8 @@ Imports:
     parallel,
     tidyr,
     dplyr,
-    ggplot2
+    ggplot2,
+    stringr
 Suggests:
     testthat,
     bookdown,
diff --git a/NAMESPACE b/NAMESPACE
index 22aebc4..15f6e69 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -22,6 +22,7 @@ S3method(plot,gg_variable)
 S3method(plot,gg_vimp)
 export(calc_auc)
 export(calc_roc)
+export(df_partial)
 export(gg_error)
 export(gg_error.randomForest)
 export(gg_error.randomForest.formula)
@@ -35,6 +36,7 @@ export(kaplan)
 export(nelson)
 export(quantile_pts)
 export(r_data_types)
+export(varpro_feature_name)
 importFrom(dplyr,across)
 importFrom(dplyr,mutate)
 importFrom(dplyr,n_distinct)
@@ -53,6 +55,7 @@ importFrom(stats,predict)
 importFrom(stats,qnorm)
 importFrom(stats,quantile)
 importFrom(stats,xtabs)
+importFrom(stringr,str_sub)
 importFrom(survival,Surv)
 importFrom(survival,strata)
 importFrom(survival,survfit)
diff --git a/R/gg_partial_df.R b/R/gg_partial_df.R
index 68154da..c7bb9d1 100644
--- a/R/gg_partial_df.R
+++ b/R/gg_partial_df.R
@@ -1,5 +1,13 @@
 ##=============================================================================
-## Split partial plots into continuous or categorical datasets
+#' Split partial plots into continuous or categorical datasets
+#' @param part_dta partial plot data from \code{rfsrc::plot.variable}
+#' @param nvars how many of the partial plot variables to calculate
+#' @param cat_limit Categorical features are built when there are no more than
+#' cat_limit unique values.
+#' @param name a label name applied to all features. Useful when combining
+#' multiple partial plot objects in figures.
+#'
+#' @export
 df_partial = function(part_dta, nvars = NULL, cat_limit = 10, name=NULL) {
   ## Prepare the partial dependencies data for panel plots
   if (is.null(nvars)) {
@@ -36,8 +44,8 @@ df_partial = function(part_dta, nvars = NULL, cat_limit = 10, name=NULL) {
       cat_list[[feature]] <- plt.df
     }
   }
-  continuous = bind_rows(cont_list)
-  categorical = bind_rows(cat_list)
+  continuous = dplyr::bind_rows(cont_list)
+  categorical = dplyr::bind_rows(cat_list)
 
   if (!is.null(name)) {
     continuous$model <- categorical$model <- name
diff --git a/R/varpro_feature_names.R b/R/varpro_feature_names.R
index 18d1769..5e85efc 100644
--- a/R/varpro_feature_names.R
+++ b/R/varpro_feature_names.R
@@ -1,15 +1,20 @@
 
 ##=============================================================================
-## varpro one hot encodes features, so we need to get the "raw"
-## original variable names. This loops through the variable names
-## not in the original dataset, and cuts one character off the end
-## until we find the variable name in the original data.
-
+#' varpro one hot encodes features, so we need to get the "raw"
+#' original variable names. This loops through the variable names
+#' not in the original dataset, and cuts one character off the end
+#' until we find the variable name in the original data.
+#'
+#' @param varpro_names vector of names output from varpro analysis
+#' @param dataset the dataset used for varpro input.
+#'
+#' @importFrom stringr str_sub
+#' @export
 varpro_feature_name <- function(varpro_names, dataset) {
   inc_set <- varpro_names[which(varpro_names %in% colnames(dataset))]
   one_set <- varpro_names[which(!varpro_names %in% colnames(dataset))]
   while (length(one_set) > 0) {
-    orig <- unlist(lapply(one_set, str_sub, 1,-2))
+    orig <- unlist(lapply(one_set, stringr::str_sub, 1,-2))
     inc_set <- union(inc_set, orig[which(orig %in% colnames(dataset))])
     ## Drop exhausted (empty) names so unmatched names cannot loop forever
     one_set <- orig[!(orig %in% colnames(dataset)) & nzchar(orig)]
diff --git a/man/df_partial.Rd b/man/df_partial.Rd
new file mode 100644
index 0000000..3f1158d
--- /dev/null
+++ b/man/df_partial.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gg_partial_df.R
+\name{df_partial}
+\alias{df_partial}
+\title{Split partial plots into continuous or categorical datasets}
+\usage{
+df_partial(part_dta, nvars = NULL, cat_limit = 10, name = NULL)
+}
+\arguments{
+\item{part_dta}{partial plot data from \code{rfsrc::plot.variable}}
+
+\item{nvars}{how many of the partial plot variables to calculate}
+
+\item{cat_limit}{Categorical features are built when there are no more than
+cat_limit unique values.}
+
+\item{name}{a label name applied to all features. Useful when combining
+multiple partial plot objects in figures.}
+}
+\description{
+Split partial plots into continuous or categorical datasets
+}
diff --git a/man/varpro_feature_name.Rd b/man/varpro_feature_name.Rd
new file mode 100644
index 0000000..e00c8ec
--- /dev/null
+++ b/man/varpro_feature_name.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/varpro_feature_names.R
+\name{varpro_feature_name}
+\alias{varpro_feature_name}
+\title{varpro one hot encodes features, so we need to get the "raw"
+original variable names. This loops through the variable names
+not in the original dataset, and cuts one character off the end
+until we find the variable name in the original data.}
+\usage{
+varpro_feature_name(varpro_names, dataset)
+}
+\arguments{
+\item{varpro_names}{vector of names output from varpro analysis}
+
+\item{dataset}{the dataset used for varpro input.}
+}
+\description{
+varpro one hot encodes features, so we need to get the "raw"
+original variable names. This loops through the variable names
+not in the original dataset, and cuts one character off the end
+until we find the variable name in the original data.
+}
From 2fb60119e15592da36779b4a2d46a061c2da08a7 Mon Sep 17 00:00:00 2001
From: John Ehrlinger
Date: Thu, 19 Jun 2025 16:11:36 -0400
Subject: [PATCH 3/3] varpro partial dataframe

---
 NAMESPACE            |  1 +
 R/gg_partialpro_df.R | 79 ++++++++++++++++++++++++++++++++++++++++++++
 man/df_partialpro.Rd | 22 ++++++++++++
 3 files changed, 102 insertions(+)
 create mode 100644 R/gg_partialpro_df.R
 create mode 100644 man/df_partialpro.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 15f6e69..fd95900 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -23,6 +23,7 @@ S3method(plot,gg_vimp)
 export(calc_auc)
 export(calc_roc)
 export(df_partial)
+export(df_partialpro)
 export(gg_error)
 export(gg_error.randomForest)
 export(gg_error.randomForest.formula)
diff --git a/R/gg_partialpro_df.R b/R/gg_partialpro_df.R
new file mode 100644
index 0000000..0965e63
--- /dev/null
+++ b/R/gg_partialpro_df.R
@@ -0,0 +1,79 @@
+##=============================================================================
+##=============================================================================
+#' Split partial plots into continuous or categorical datasets
+#' @param part_dta partial plot data from \code{rfsrc::plot.variable}
+#' @param nvars how many of the partial plot variables to calculate
+#' @param cat_limit Categorical features are built when there are no more than
+#' cat_limit unique values.
+#' @param name a label name applied to all features. Useful when combining
+#' multiple partial plot objects in figures.
+#'
+#' @export
+#'
+df_partialpro = function(part_dta, nvars = NULL, cat_limit=12, name=NULL) {
+  ## Prepare the partial pro dependencies data for panel plots
+  if (is.null(nvars)) {
+    nvars = length(part_dta)
+  }
+
+  cont_list = list()
+  cat_list = list()
+  for (feature in seq_len(nvars)) {
+    ## Format any continuous features (those with more than cat_limit values)
+    if (length(part_dta[[feature]]$xvirtual) > cat_limit) {
+      plt.df = as.data.frame(
+        cbind(
+          variable = part_dta[[feature]]$xvirtual,
+          parametric = colMeans(part_dta[[feature]]$yhat.par, na.rm =
+                                  TRUE),
+          nonparametric = colMeans(part_dta[[feature]]$yhat.nonpar, na.rm =
+                                     TRUE),
+          causal = colMeans(part_dta[[feature]]$yhat.causal, na.rm =
+                              TRUE)
+        )
+      )
+      plt.df$name = names(part_dta)[[feature]]
+
+      cont_list[[feature]] <- plt.df
+    } else{
+      ## Categorical features
+
+      ## Though VarPro works with logical or continuous only. Factors are
+      ## one hot encoded internal to the varPro call.
+      cat_feat = list()
+      ## Each yhat has at least 2 columns, for logical values...
+      for (ind in seq_along(unique(part_dta[[feature]]$xorg))) {
+        cat_feat[[ind]] = as.data.frame(
+          cbind(
+            parametric = part_dta[[feature]]$yhat.par[, ind],
+            nonparametric = part_dta[[feature]]$yhat.nonpar[, ind],
+            causal = part_dta[[feature]]$yhat.causal[, ind]
+          )
+        )
+        cat_feat[[ind]]$variable <-
+          unique(part_dta[[feature]]$xorg)[ind]
+        if (ind == 1) {
+          plt.df <- cat_feat[[ind]]
+        } else{
+          plt.df <- dplyr::bind_rows(plt.df, cat_feat[[ind]])
+        }
+      }
+
+      plt.df$name = names(part_dta)[[feature]]
+
+      cat_list[[feature]] <- plt.df
+    }
+  }
+
+  continuous = dplyr::bind_rows(cont_list)
+  categorical = dplyr::bind_rows(cat_list)
+
+  if (!is.null(name)) {
+    continuous$model <- categorical$model <- name
+  }
+
+  return(list(
+    continuous = continuous,
+    categorical = categorical
+  ))
+}
\ No newline at end of file
diff --git a/man/df_partialpro.Rd b/man/df_partialpro.Rd
new file mode 100644
index 0000000..5e485ef
--- /dev/null
+++ b/man/df_partialpro.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gg_partialpro_df.R
+\name{df_partialpro}
+\alias{df_partialpro}
+\title{Split partial plots into continuous or categorical datasets}
+\usage{
+df_partialpro(part_dta, nvars = NULL, cat_limit = 12, name = NULL)
+}
+\arguments{
+\item{part_dta}{partial plot data from \code{rfsrc::plot.variable}}
+
+\item{nvars}{how many of the partial plot variables to calculate}
+
+\item{cat_limit}{Categorical features are built when there are no more than
+cat_limit unique values.}
+
+\item{name}{a label name applied to all features. Useful when combining
+multiple partial plot objects in figures.}
+}
+\description{
+Split partial plots into continuous or categorical datasets
+}