From 17cebc056909e7c715cf7243b8704ac762a45ef8 Mon Sep 17 00:00:00 2001 From: Roberto Villegas-Diaz Date: Sat, 20 Sep 2025 18:30:10 +0100 Subject: [PATCH 1/8] Update workflow to avoid relying on external PAT (secrets.GH_TOKEN) and create an artifact with test suite ouputs, instead of pushing to testStatus. Additionally, a new workflow was added to testStatus to 'consume' these artifacts --- .github/workflows/dsBase_test_suite.yaml | 48 ++++++++++-------------- 1 file changed, 20 insertions(+), 28 deletions(-) mode change 100644 => 100755 .github/workflows/dsBase_test_suite.yaml diff --git a/.github/workflows/dsBase_test_suite.yaml b/.github/workflows/dsBase_test_suite.yaml old mode 100644 new mode 100755 index 44020e83..8f03b10f --- a/.github/workflows/dsBase_test_suite.yaml +++ b/.github/workflows/dsBase_test_suite.yaml @@ -37,6 +37,7 @@ jobs: BRANCH_NAME: ${{ github.ref_name }} REPO_OWNER: ${{ github.repository_owner }} R_KEEP_PKG_SOURCE: yes + GITHUB_TOKEN: ${{ github.token || 'placeholder-token' }} steps: - name: Checkout dsBase @@ -45,12 +46,14 @@ jobs: path: dsBase - name: Checkout testStatus + if: ${{ github.actor != 'nektos/act' }} # for local deployment only uses: actions/checkout@v4 with: repository: ${{ env.REPO_OWNER }}/testStatus - token: ${{ secrets.GH_TOKEN }} ref: master path: testStatus + persist-credentials: false + token: ${{ env.GITHUB_TOKEN }} - uses: r-lib/actions/setup-pandoc@v2 @@ -156,44 +159,33 @@ jobs: run: | Rscript --verbose --vanilla ../testStatus/source/parse_test_report.R logs/ working-directory: dsBase - - - name: Commit results to testStatus + + - name: Render report run: | - git config --global user.email "github-actions[bot]@users.noreply.github.com" - git config --global user.name "github-actions[bot]" cd testStatus - # Reconfigure remote to use GitHub token for authentication - git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ env.REPO_OWNER }}/testStatus.git - git checkout master - git pull origin master - - mkdir -p logs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/${{ env.WORKFLOW_ID }}/ - mkdir -p docs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/${{ env.WORKFLOW_ID }}/ - mkdir -p docs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/latest/ - # clear the latest directory - rm -rf docs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/latest/* + mkdir -p new/logs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/${{ env.WORKFLOW_ID }}/ + mkdir -p new/docs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/${{ env.WORKFLOW_ID }}/ + mkdir -p new/docs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/latest/ # Copy logs to new logs directory location - cp -rv ../dsBase/logs/* logs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/${{ env.WORKFLOW_ID }}/ - cp -rv ../dsBase/logs/${{ env.WORKFLOW_ID }}.txt logs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/${{ env.WORKFLOW_ID }}/ + cp -rv ../${{ env.PROJECT_NAME }}/logs/* new/logs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/${{ env.WORKFLOW_ID }}/ + cp -rv ../${{ env.PROJECT_NAME }}/logs/${{ env.WORKFLOW_ID }}.txt new/logs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/${{ env.WORKFLOW_ID }}/ - # Create symbolic links - ln -sf ${{ env.WORKFLOW_ID }}/ logs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/.LATEST - # ln -sf docs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/${{ env.WORKFLOW_ID }}/ docs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/${{ env.WORKFLOW_ID }}/latest - - R -e 'input_dir <- file.path("../logs", Sys.getenv("PROJECT_NAME"), Sys.getenv("BRANCH_NAME"), Sys.getenv("WORKFLOW_ID")); quarto::quarto_render("source/test_report.qmd", execute_params = list(input_dir = input_dir))' - mv source/test_report.html docs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/${{ env.WORKFLOW_ID }}/index.html - cp -r docs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/${{ env.WORKFLOW_ID }}/* docs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/latest - - git add . - git commit -m "Auto test for ${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }} @ ${{ env.WORKFLOW_ID }}" || echo "No changes to commit" - git push origin master + R -e 'input_dir <- file.path("../new/logs", Sys.getenv("PROJECT_NAME"), Sys.getenv("BRANCH_NAME"), Sys.getenv("WORKFLOW_ID")); quarto::quarto_render("source/test_report.qmd", execute_params = list(input_dir = input_dir))' + mv source/test_report.html new/docs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/${{ env.WORKFLOW_ID }}/index.html + cp -r new/docs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/${{ env.WORKFLOW_ID }}/* new/docs/${{ env.PROJECT_NAME }}/${{ env.BRANCH_NAME }}/latest env: PROJECT_NAME: ${{ env.PROJECT_NAME }} BRANCH_NAME: ${{ env.BRANCH_NAME }} WORKFLOW_ID: ${{ env.WORKFLOW_ID }} + + - name: Upload test logs + uses: actions/upload-artifact@v4 + with: + name: dsbase-logs + path: testStatus/new - name: Dump environment info run: | From f49bc73aa72a386b1eda946b1aeb4c3741ef8fac Mon Sep 17 00:00:00 2001 From: Tim Cadman <41470917+timcadman@users.noreply.github.com> Date: Tue, 7 Oct 2025 12:33:19 +0200 Subject: [PATCH 2/8] added privacy level check --- R/standardiseDfDS.R | 103 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 R/standardiseDfDS.R diff --git a/R/standardiseDfDS.R b/R/standardiseDfDS.R new file mode 100644 index 00000000..1a8b79fe --- /dev/null +++ b/R/standardiseDfDS.R @@ -0,0 +1,103 @@ +#' Get the Class of All Columns in a Data Frame +#' @param df.name A string representing the name of the data frame. +#' @return A tibble with the class of each column in the data frame. +#' @importFrom dplyr %>% +#' @importFrom tibble as_tibble +#' @importFrom purrr map +#' @export +getClassAllColsDS <- function(df.name){ + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + df.name <- eval(parse(text = df.name), envir = parent.frame()) + all_classes <- map(df.name, class) %>% as_tibble() + return(all_classes) +} + +#' Change Class of Target Variables in a Data Frame +#' @param df.name A string representing the name of the data frame. +#' @param target_vars A character vector specifying the columns to be modified. +#' @param target_class A character vector specifying the new classes for each column (1 = factor, +#' 2 = integer, 3 = numeric, 4 = character, 5 = logical). +#' @return A modified data frame with the specified columns converted to the target classes. +#' @importFrom dplyr mutate across +#' @importFrom tidyselect all_of +#' @export +fixClassDS <- function(df.name, target_vars, target_class) { + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + df <- eval(parse(text = df.name), envir = parent.frame()) + df_transformed <- df %>% + mutate( + across(all_of(target_vars), + ~ .convertClass(.x, target_class[which(target_vars == cur_column())]))) + return(df_transformed) +} + +#' Convert a Vector to a Specified Class +#' @param x The vector to be converted. +#' @param class_name A string indicating the target class (1 = factor, 2 = integer, 3 = numeric, +#' 4 = character, 5 = logical). +#' @return The converted vector. +#' @noRd +.convertClass <- function(target_var, target_class_code) { + switch(target_class_code, + "1" = as.factor(target_var), + "2" = as.integer(target_var), + "3" = as.numeric(target_var), + "4" = as.character(target_var), + "5" = as.logical(target_var) + ) +} + +#' Add Missing Columns with NA Values +#' @param .data A string representing the name of the data frame. +#' @param cols A character vector specifying the columns to be added if missing. +#' @return A modified data frame with missing columns added and filled with NA. +#' @importFrom dplyr mutate select +#' @importFrom tidyselect peek_vars +#' @importFrom purrr set_names +#' @export +fixColsDS <- function(.data, cols) { + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + .data <- eval(parse(text = .data), envir = parent.frame()) + missing <- setdiff(cols, colnames(.data)) + out <- .data %>% + mutate(!!!set_names(rep(list(NA), length(missing)), missing)) %>% + select(sort(peek_vars())) + return(out) +} + +#' Retrieve Factor Levels for Specific Columns +#' @param df.name A string representing the name of the data frame. +#' @param factor_vars A character vector specifying the factor columns. +#' @return A list of factor levels for the specified columns. +#' @importFrom tidyselect all_of +#' @importFrom purrr map +#' @export +getAllLevelsDS <- function(df.name, factor_vars) { + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + df <- eval(parse(text = df.name), envir = parent.frame()) + factor_vars_split <- strsplit(factor_vars, ",\\s*")[[1]] + levels <- df %>% dplyr::select(all_of(factor_vars_split)) %>% map(levels) + browser() + return() +} + + + + +#' Set Factor Levels for Specific Columns in a Data Frame +#' @param df.name A string representing the name of the data frame to modify. +#' @param vars A character vector specifying the columns to be modified. +#' @param levels A named list where each element contains the levels for the corresponding factor variable. +#' @return A modified data frame with the specified columns converted to factors with the provided levels. +#' @export +fixLevelsDS <- function(df.name, vars, levels) { + dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) + + df.name <- eval(parse(text = df.name), envir = parent.frame()) + out <- df.name %>% + mutate(across(all_of(vars), ~factor(., levels = levels[[dplyr::cur_column()]]))) +} From 9d82aaec380a91d44aa2209e89950d0bb41d3264 Mon Sep 17 00:00:00 2001 From: Tim Cadman <41470917+timcadman@users.noreply.github.com> Date: Tue, 7 Oct 2025 12:54:47 +0200 Subject: [PATCH 3/8] add disclosure check when levels are returned --- R/standardiseDfDS.R | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/R/standardiseDfDS.R b/R/standardiseDfDS.R index 1a8b79fe..1a8dab72 100644 --- a/R/standardiseDfDS.R +++ b/R/standardiseDfDS.R @@ -81,12 +81,29 @@ getAllLevelsDS <- function(df.name, factor_vars) { df <- eval(parse(text = df.name), envir = parent.frame()) factor_vars_split <- strsplit(factor_vars, ",\\s*")[[1]] levels <- df %>% dplyr::select(all_of(factor_vars_split)) %>% map(levels) - browser() - return() + + disclosure_check <- imap(levels, function(lvls, var) { + .checkLevelsDisclosure(df.name = df.name, var = var, levels = lvls) + }) + + failed <- paste(names(disclosure_check)[unlist(disclosure_check)], collapse = ", ") + + if(length(failed) > 0) { + stop("Based on the value of nfilter.levels.density, these factor variables", " {", failed, "} ", "have too many levels compared to the length of the variable. Please reduce the numnber of levels or change the variable type and try again") + } else { + return(levels) + } } - - +.checkLevelsDisclosure <- function(df.name, var, levels) { + + thr <- dsBase::listDisclosureSettingsDS() + nfilter.levels.density <- as.numeric(thr$nfilter.levels.density) + n_levels <- length(levels) + length_var <- length(get(df.name)[[var]]) + fail <- (length_var * nfilter.levels.density) < n_levels + return(fail) +} #' Set Factor Levels for Specific Columns in a Data Frame #' @param df.name A string representing the name of the data frame to modify. From 5dd562152e9d4fd72b8539b62077c638031bb71d Mon Sep 17 00:00:00 2001 From: Tim Cadman <41470917+timcadman@users.noreply.github.com> Date: Tue, 7 Oct 2025 13:36:48 +0200 Subject: [PATCH 4/8] fixed environment scoping issue --- R/standardiseDfDS.R | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/R/standardiseDfDS.R b/R/standardiseDfDS.R index 1a8dab72..b3ffcd1e 100644 --- a/R/standardiseDfDS.R +++ b/R/standardiseDfDS.R @@ -73,7 +73,7 @@ fixColsDS <- function(.data, cols) { #' @param factor_vars A character vector specifying the factor columns. #' @return A list of factor levels for the specified columns. #' @importFrom tidyselect all_of -#' @importFrom purrr map +#' @importFrom purrr map imap #' @export getAllLevelsDS <- function(df.name, factor_vars) { dsBase::checkPermissivePrivacyControlLevel(c('permissive', 'banana', 'carrot')) @@ -83,24 +83,37 @@ getAllLevelsDS <- function(df.name, factor_vars) { levels <- df %>% dplyr::select(all_of(factor_vars_split)) %>% map(levels) disclosure_check <- imap(levels, function(lvls, var) { - .checkLevelsDisclosure(df.name = df.name, var = var, levels = lvls) + .checkLevelsDisclosure(df = df, var = var, levels = lvls) }) - failed <- paste(names(disclosure_check)[unlist(disclosure_check)], collapse = ", ") + failed_vars <- names(disclosure_check)[unlist(disclosure_check)] - if(length(failed) > 0) { - stop("Based on the value of nfilter.levels.density, these factor variables", " {", failed, "} ", "have too many levels compared to the length of the variable. Please reduce the numnber of levels or change the variable type and try again") + if(length(failed_vars) > 0) { + stop("Based on the value of nfilter.levels.density, these factor variables", " {", failed_vars, "} ", "have too many levels compared to the length of the variable. Please reduce the numnber of levels or change the variable type and try again") } else { return(levels) } } -.checkLevelsDisclosure <- function(df.name, var, levels) { - - thr <- dsBase::listDisclosureSettingsDS() +#' Check variable levels against disclosure thresholds +#' +#' Internal helper function to verify whether the number of levels in a variable +#' exceeds the allowed density threshold defined by `dsBase::listDisclosureSettingsDS()`. +#' +#' @param df A data frame containing the variable. +#' @param var Character string. Name of the variable to check. +#' @param levels Character vector. Levels of the variable. +#' +#' @return Logical. `TRUE` if the check fails (i.e., disclosure threshold is violated), +#' otherwise `FALSE`. +#' +#' @keywords internal +#' @noRd +.checkLevelsDisclosure <- function(df, var, levels) { + thr <- dsBase::listDisclosureSettingsDS() nfilter.levels.density <- as.numeric(thr$nfilter.levels.density) n_levels <- length(levels) - length_var <- length(get(df.name)[[var]]) + length_var <- length(df[[var]]) fail <- (length_var * nfilter.levels.density) < n_levels return(fail) } From 7a4d418d51863b52fca08ad04dacb8f864b76eab Mon Sep 17 00:00:00 2001 From: Tim Cadman <41470917+timcadman@users.noreply.github.com> Date: Tue, 7 Oct 2025 14:18:54 +0200 Subject: [PATCH 5/8] explitly use base levels --- R/standardiseDfDS.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/standardiseDfDS.R b/R/standardiseDfDS.R index b3ffcd1e..24e3eecf 100644 --- a/R/standardiseDfDS.R +++ b/R/standardiseDfDS.R @@ -80,7 +80,7 @@ getAllLevelsDS <- function(df.name, factor_vars) { df <- eval(parse(text = df.name), envir = parent.frame()) factor_vars_split <- strsplit(factor_vars, ",\\s*")[[1]] - levels <- df %>% dplyr::select(all_of(factor_vars_split)) %>% map(levels) + levels <- purrr::map(df[factor_vars_split], base::levels) disclosure_check <- imap(levels, function(lvls, var) { .checkLevelsDisclosure(df = df, var = var, levels = lvls) From 5aad33006456c2a1f5050a100079c086f81a834b Mon Sep 17 00:00:00 2001 From: Tim Cadman <41470917+timcadman@users.noreply.github.com> Date: Tue, 7 Oct 2025 14:19:10 +0200 Subject: [PATCH 6/8] add test for disclosure check --- tests/testthat/test-smk-standardiseDfDS.R | 171 ++++++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 tests/testthat/test-smk-standardiseDfDS.R diff --git a/tests/testthat/test-smk-standardiseDfDS.R b/tests/testthat/test-smk-standardiseDfDS.R new file mode 100644 index 00000000..4397fe65 --- /dev/null +++ b/tests/testthat/test-smk-standardiseDfDS.R @@ -0,0 +1,171 @@ +#------------------------------------------------------------------------------- +# Copyright (c) 2025 University Medical Center Groningen (UCMG), Netherlands. All rights reserved. +# +# This program and the accompanying materials +# are made available under the terms of the GNU Public License v3.0. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +#------------------------------------------------------------------------------- + +# +# Set up +# + +context("standardiseDfDS::smk::setup") + +df <- create_mixed_dataframe() +df_list <- create_additional_dataframes(df) + +df_1 <- df +df_2 <- df_list[[1]] +df_3 <- df_list[[2]] +df_4 <- df_list[[3]] + +context("standardiseDfDS::smk") +test_that("getClassAllColsDS returns correct classes", { + expect_equal( + getClassAllColsDS("df_1"), + tibble( + fac_col1 = "factor", fac_col2 = "factor", fac_col3 = "factor", fac_col4 = "factor", fac_col5 = "factor", + fac_col6 = "factor", fac_col7 = "factor", fac_col8 = "factor", fac_col9 = "factor", fac_col10 = "factor", + fac_col11 = "factor", fac_col12 = "factor", fac_col13 = "factor", fac_col14 = "factor", fac_col15 = "factor", + col16 = "integer", col17 = "integer", col18 = "numeric", col19 = "numeric", col20 = "character", + col21 = "character", col22 = "integer", col23 = "numeric", col24 = "character", col25 = "integer", + col26 = "numeric", col27 = "character", col28 = "integer", col29 = "numeric", col30 = "character" + ) + ) +}) + +test_that("fixClassDS sets classes correctly", { + + cols_to_set <- c("fac_col13", "fac_col5", "col22", "col19", "col25", "col20", "col28", + "fac_col14", "fac_col3", "fac_col8") + + classes_to_set <- c("4", "1", "3", "5", "3", "2", "5", "5", "3", "2") + + expect_warning( + classes_changed_df <- fixClassDS("df_1", cols_to_set, classes_to_set) + ) + + expect_equal( + classes_changed_df %>% purrr::map_chr(class) %>% unname(), + c("factor", "factor", "numeric", "factor", "factor", "factor", "factor", "integer", "factor", + "factor", "factor", "factor", "character", "logical", "factor", "integer", "integer", + "numeric", "logical", "integer", "character", "numeric", "numeric", "character", "numeric", + "numeric", "character", "logical", "numeric", "character") + ) +}) + +test_that("convert_class calls the correct function", { + + result <- .convertClass(c(1, 2, 3), "1") + expect_true(is.factor(result)) + + result <- .convertClass(c(1.5, 2.5, 3.7), "2") + expect_true(is.integer(result)) + + result <- .convertClass(c("1", "2", "3"), "3") + expect_true(is.numeric(result)) + + result <- .convertClass(c(1, 2, 3), "4") + expect_true(is.character(result)) + + result <- .convertClass(c(0, 1, 0), "5") + expect_true(is.logical(result)) + +}) + +test_that("fixColsDS correctly adds missing columns", { + + all_cols <- unique(c(colnames(df_1), colnames(df_2), colnames(df_3), colnames(df_4))) + out <- fixColsDS("df_3", all_cols) + + expect_equal( + colnames(out), + sort(all_cols)) + +}) + +test_that("getAllLevelsDS correctly retrieves the levels of specified factor columns", { + + factor_vars <- "fac_col1, fac_col2, fac_col3, fac_col4, fac_col5, fac_col6, fac_col7, fac_col8, fac_col9, fac_col10, fac_col11, fac_col12, fac_col14, fac_col15, col27" + + observed <- getAllLevelsDS("df_3", factor_vars) + + expected <- list( + fac_col1 = c("High", "Low", "Medium"), + fac_col2 = c("Blue", "Green", "Red"), + fac_col3 = c("No", "Yes"), + fac_col4 = c("A", "B", "C"), + fac_col5 = c("One", "Three", "Two"), + fac_col6 = c("Bird", "Cat", "Dog"), + fac_col7 = c("Large", "Medium", "Small"), + fac_col8 = c("Alpha", "Beta", "Gamma"), + fac_col9 = c("False", "True"), + fac_col10 = c("Left", "Right"), + fac_col11 = c("East", "North", "South", "West"), + fac_col12 = c("Day", "Night"), + fac_col14 = c("Female", "Male"), + fac_col15 = c("Fall", "Spring", "Summer", "Winter"), + col27 = letters + ) + + expect_equal(expected, observed) + +}) + +example_df <- data.frame( + col1 = c("A", "B", "A", "C"), + col2 = c("X", "Y", "X", "Z"), + col3 = c("Yes", "No", "Yes", "No"), + stringsAsFactors = FALSE +) + +test_that("fixLevelsDS sets factor levels correctly", { + + levels <- list( + col1 = c("A", "B", "C"), + col2 = c("X", "Y", "Z"), + col3 = c("Yes", "No") + ) + + modified_df <- fixLevelsDS("example_df", c("col1", "col2", "col3"), levels) + + expect_s3_class(modified_df$col1, "factor") + expect_s3_class(modified_df$col2, "factor") + expect_s3_class(modified_df$col3, "factor") + + expect_equal(levels(modified_df$col1), levels$col1) + expect_equal(levels(modified_df$col2), levels$col2) + expect_equal(levels(modified_df$col3), levels$col3) + +}) + +test_that("fixLevelsDS throws an error for invalid input", { + + levels <- list( + col1 = c("A", "B", "C"), + col2 = c("X", "Y", "Z") + ) + + expect_error(fixLevelsDS("example_df", c("col1", "non_existent_col"), levels)) +}) + +test_that("getAllLevelsDS stops when a variable exceeds disclosure threshold", { + + mtcars_bad_group <- mtcars %>% mutate(qsec = as.factor(qsec)) + + expect_error( + getAllLevelsDS("mtcars_bad_group", "qsec"), + regexp = "Based on the value of nfilter.levels.density" + ) +}) + +# +# Done +# + +context("standardiseDfDS::smk::shutdown") + +context("standardiseDfDS::smk::done") From cd876c6f8cb7e50282249a97f0dae2b0b9d0398c Mon Sep 17 00:00:00 2001 From: Tim Cadman <41470917+timcadman@users.noreply.github.com> Date: Tue, 7 Oct 2025 14:19:42 +0200 Subject: [PATCH 7/8] export functions --- NAMESPACE | 9 +++++++++ inst/DATASHIELD | 12 +++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 897148d1..09705258 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -140,3 +140,12 @@ import(gamlss.dist) import(mice) importFrom(gamlss.dist,pST3) importFrom(gamlss.dist,qST3) +<<<<<<< HEAD +======= +importFrom(purrr,imap) +importFrom(purrr,map) +importFrom(purrr,set_names) +importFrom(tibble,as_tibble) +importFrom(tidyselect,all_of) +importFrom(tidyselect,peek_vars) +>>>>>>> acb2b95 (export functions) diff --git a/inst/DATASHIELD b/inst/DATASHIELD index c9dd9390..804f4780 100644 --- a/inst/DATASHIELD +++ b/inst/DATASHIELD @@ -69,7 +69,9 @@ AggregateMethods: is.null=base::is.null, is.numeric=base::is.numeric, NROW=base::NROW, - t.test=stats::t.test + t.test=stats::t.test, + getClassAllColsDS, + getAllLevelsDS AssignMethods: absDS, asCharacterDS, @@ -160,6 +162,7 @@ AssignMethods: acos=base::acos, atan=base::atan, sum=base::sum, +<<<<<<< HEAD unlist=base::unlist Options: datashield.privacyLevel=5, @@ -173,3 +176,10 @@ Options: default.nfilter.noise=0.25, default.nfilter.levels.density=0.33, default.nfilter.levels.max=40 +======= + unlist=base::unlist, + fixClassDS, + fixColsDS, + fixLevelsDS + +>>>>>>> acb2b95 (export functions) From 54150425de192c42688d80d66e49b6aff160493f Mon Sep 17 00:00:00 2001 From: Tim Cadman <41470917+timcadman@users.noreply.github.com> Date: Tue, 7 Oct 2025 14:20:06 +0200 Subject: [PATCH 8/8] test: explicitly use namespace --- tests/testthat/helper.R | 162 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 tests/testthat/helper.R diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R new file mode 100644 index 00000000..1a51bbe4 --- /dev/null +++ b/tests/testthat/helper.R @@ -0,0 +1,162 @@ +#' Create a mixed dataframe with factor and other types of columns +#' +#' This function generates a dataframe with a specified number of rows, +#' factor columns, and other columns (integer, numeric, and string). +#' +#' @param n_rows Number of rows in the dataframe. Default is 10,000. +#' @param n_factor_cols Number of factor columns in the dataframe. Default is 15. +#' @param n_other_cols Number of other columns (integer, numeric, and string) in the dataframe. Default is 15. +#' +#' @return A dataframe with the specified number of rows and columns, containing mixed data types. +#' @importFrom dplyr bind_cols +#' @importFrom purrr purrr::map_dfc +#' @examples +#' df <- create_mixed_dataframe(n_rows = 100, n_factor_cols = 10, n_other_cols = 5) +create_mixed_dataframe <- function(n_rows = 10000, n_factor_cols = 15, n_other_cols = 15) { + + create_factor_column <- function(levels, n = n_rows) { + set.seed(123) # Set seed before sample + factor(sample(levels, n, replace = TRUE)) + } + + factor_levels <- list( + c("Low", "Medium", "High"), + c("Red", "Green", "Blue"), + c("Yes", "No"), + c("A", "B", "C"), + c("One", "Two", "Three"), + c("Cat", "Dog", "Bird"), + c("Small", "Medium", "Large"), + c("Alpha", "Beta", "Gamma"), + c("True", "False"), + c("Left", "Right"), + c("North", "South", "East", "West"), + c("Day", "Night"), + c("Up", "Down"), + c("Male", "Female"), + c("Summer", "Winter", "Spring", "Fall") + ) + + factor_columns <- purrr::map_dfc(factor_levels[1:n_factor_cols], create_factor_column) + colnames(factor_columns) <- paste0("fac_col", 1:n_factor_cols) + + create_other_column <- function(type, n = n_rows) { + set.seed(123) # Set seed before sample + switch(type, + "int" = sample(1:100, n, replace = TRUE), + "num" = runif(n, 0, 100), + "str" = sample(letters, n, replace = TRUE) + ) + } + + column_types <- c( + "int", "int", "num", "num", "str", + "str", "int", "num", "str", "int", + "num", "str", "int", "num", "str" + ) + + other_columns <- purrr::map_dfc(column_types[1:n_other_cols], create_other_column) + colnames(other_columns) <- paste0("col", (n_factor_cols + 1):(n_factor_cols + n_other_cols)) + df <- bind_cols(factor_columns, other_columns) + + return(df) +} + + +#' Modify factor levels for partial overlap +#' +#' This function takes two sets of factor levels, computes the common and unique levels, +#' and returns a new set of levels with partial overlap. +#' +#' @param levels1 First set of factor levels. +#' @param levels2 Second set of factor levels. +#' +#' @return A character vector of new factor levels with partial overlap. +#' @examples +#' new_levels <- partial_overlap_levels(c("A", "B", "C"), c("B", "C", "D")) +partial_overlap_levels <- function(levels1, levels2) { + common <- intersect(levels1, levels2) + unique1 <- setdiff(levels1, common) + unique2 <- setdiff(levels2, common) + + # Set seed before each sample call + set.seed(123) + sampled_unique1 <- sample(unique1, length(unique1) * 0.5) + + set.seed(123) + sampled_unique2 <- sample(unique2, length(unique2) * 0.5) + + new_levels <- c(common, sampled_unique1, sampled_unique2) + return(new_levels) +} + + +#' Create additional dataframes with specific conditions +#' +#' This function generates additional dataframes based on an input dataframe, modifying column classes and levels, +#' and adding new columns with unique names. Different seeds are used for each iteration of the loop, +#' ensuring reproducibility of the generated dataframes. +#' +#' @param base_df The base dataframe used to create the additional dataframes. +#' @param n_rows Number of rows in the additional dataframes. Default is 10,000. +#' @param df_names Names of the additional dataframes to be created. Default is c("df1", "df2", "df3"). +#' +#' @return A list of dataframes with the specified modifications. +#' @importFrom dplyr bind_cols +#' @examples +#' base_df <- create_mixed_dataframe(n_rows = 100, n_factor_cols = 10, n_other_cols = 5) +#' additional_dfs <- create_additional_dataframes(base_df, n_rows = 1000, df_names = c("df1", "df2")) +create_additional_dataframes <- function(base_df, n_rows = 10000, df_names = c("df1", "df2", "df3")) { + + # Define a fixed sequence of seeds, one for each dataframe to be created + seeds <- c(123, 456, 789, 101112) + + df_list <- list() + + for (i in seq_along(df_names)) { + # Set the seed for this iteration based on the pre-defined seeds + set.seed(seeds[i]) + + # Proceed with the dataframe generation process + overlap_cols <- sample(colnames(base_df), size = round(0.8 * ncol(base_df))) + df <- base_df + cols_to_modify_class <- sample(overlap_cols, size = round(0.2 * length(overlap_cols))) + + # Modify columns to have different data types + for (col in cols_to_modify_class) { + current_class <- class(df[[col]]) + new_class <- switch(current_class, + "factor" = as.character(df[[col]]), + "character" = as.factor(df[[col]]), + "numeric" = as.integer(df[[col]]), + "integer" = as.numeric(df[[col]]), + df[[col]]) + df[[col]] <- new_class + } + + # Modify factor levels for partial overlap + factor_cols <- colnames(base_df)[sapply(base_df, is.factor)] + overlap_factor_cols <- intersect(overlap_cols, factor_cols) + cols_to_modify_levels <- sample(overlap_factor_cols, size = round(0.5 * length(overlap_factor_cols))) + + for (col in cols_to_modify_levels) { + original_levels <- levels(base_df[[col]]) + new_levels <- partial_overlap_levels(original_levels, original_levels) + df[[col]] <- factor(df[[col]], levels = new_levels) + } + + # Create new random columns for each dataframe (these will vary by seed) + set.seed(seeds[i]) # Set the seed again for generating new columns + n_new_cols <- round(0.2 * ncol(base_df)) + new_col_names <- paste0(df_names[i], "_new_col_", 1:n_new_cols) + new_cols <- data.frame(matrix(runif(n_rows * n_new_cols), ncol = n_new_cols)) + colnames(new_cols) <- new_col_names + + # Bind new columns to the dataframe + df <- bind_cols(df, new_cols) + df_list[[df_names[i]]] <- df + } + + return(df_list) +} +