diff --git a/.Rbuildignore b/.Rbuildignore index 0bfe703..8995890 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,4 +1,4 @@ -^veupathUtils\.Rproj$ +^mbioUtils\.Rproj$ ^\.Rproj\.user$ ^\.github$ ^codecov\.yml$ diff --git a/.dev/megastudy-helpers.R b/.dev/megastudy-helpers.R deleted file mode 100644 index 7b9e094..0000000 --- a/.dev/megastudy-helpers.R +++ /dev/null @@ -1,208 +0,0 @@ -## ignore these. they are helpers i use during testing, to recreate what java does outside the data service -## ive just had to do this too many times to not have them in git right next to the thing im testing now - -# to read in root-vocab endpoint results -# rootVocabHandle is a file handle to a two column data.table w no header -# first column is internal study id, second is a variable value from the vocab -# it is produced by the root-vocab endpoint in the subsetting service -getStudySpecificVocabularyByVariable <- function(rootVocabHandle, entityId, variableId, studyIdColName) { - rootVocab <- readRootVocab(rootVocabHandle) - tbl <- getStudyVocabTibble(rootVocab, entityId, variableId, studyIdColName) - - veupathUtils::StudySpecificVocabulariesByVariable(S4Vectors::SimpleList(eval(parse(text=paste0('c(',paste(tbl$values, collapse=','),')'))))) -} - -readRootVocab <- function(rootVocabHandle) { - rootVocab <- data.table::fread(rootVocabHandle, header=FALSE) - names(rootVocab) <- c('studyId', 'value') - - if (nrow(rootVocab) == 0) { - return(veupathUtils::StudySpecificVocabulariesByVariable()) - } - - return(rootVocab) -} - -getStudyVocabTibble <- function(rootVocab, entityId, variableId, studyIdColName) { - varSpecString <- getVarSpecAsString(entityId, variableId) - - tbl <- - dplyr::reframe( - dplyr::group_by( - rootVocab, - studyId - ), - values=paste0( - "veupathUtils::StudySpecificVocabulary(variableSpec=", - varSpecString, - ", vocabulary=c('", - paste(value, collapse='\',\''), - "'),study='", - studyId, - "',studyIdColumnName='", - studyIdColName, - "')" - ) - ) - - return(unique(tbl)) -} - -getVarSpecAsString <- function(entityId, variableId) { - paste0("veupathUtils::VariableSpec(entityId='", entityId, "', variableId='", variableId, "')") -} - -# an example output from the `values` column of the output of getStudyVocabTibble function -#veupathUtils::StudySpecificVocabulary( -# variableSpec=veupathUtils::VariableSpec( -# entityId='EUPATH_0000609', -# variableId='PATO_0000047' -# ), -# vocabulary=c('female,male'), -# study='1969-Iowa-surveillance', -# studyIdColumnName='EUPATH_0000605.Study_stable_id' -#) - -## for reading in some vocabs -sexVocab <- getStudySpecificVocabularyByVariable( - '../rootVocab_bobs_analysis_sex.tsv', - 'EUPATH_0000609', - 'PATO_0000047', - 'EUPATH_0000605.Study_stable_id' -) - -speciesVocab <- getStudySpecificVocabularyByVariable( - '../rootVocab_bobs_analysis_species.tsv', - 'EUPATH_0000609', - 'OBI_0001909', - 'EUPATH_0000605.Study_stable_id' -) - -lifeStageVocab <- getStudySpecificVocabularyByVariable( - '../rootVocab_bobs_analysis_lifeStage.tsv', - 'EUPATH_0000609', - 'UBERON_0000105', - 'EUPATH_0000605.Study_stable_id' -) - -feedingStatusVocab <- getStudySpecificVocabularyByVariable( - '../rootVocab_bobs_analysis_feedingStatus.tsv', - 'EUPATH_0000609', - 'EUPATH_0043227', - 'EUPATH_0000605.Study_stable_id' -) - -## some 'real life' test data, though minified for brevity -# some stuff for bobs analysis - -sexVocabReal.mini <- - StudySpecificVocabulariesByVariable(S4Vectors::SimpleList(StudySpecificVocabulary( - studyIdColumnName='EUPATH_0000605.Study_stable_id', - study='2023-abundance-SLCMCD-2022', - variableSpec=VariableSpec( - entityId='EUPATH_0000609', - variableId='PATO_0000047' - ), - vocabulary=c('male','mixed sex') - ))) - -speciesVocabReal.mini <- - StudySpecificVocabulariesByVariable(S4Vectors::SimpleList(StudySpecificVocabulary( - studyIdColumnName='EUPATH_0000605.Study_stable_id', - study='2023-abundance-SLCMCD-2022', - variableSpec=VariableSpec( - entityId='EUPATH_0000609', - variableId='OBI_0001909' - ), - vocabulary=c( - 'Aedes vexans nipponii', - 'Anopheles freeborni', - 'Coquillettidia perturbans', - 'Culex erythrothorax', - 'Culex pipiens', - 'Culex salinarius', - 'Culex tarsalis', - 'Culiseta incidens', - 'Culiseta inornata', - 'Ochlerotatus dorsalis', - 'Ochlerotatus increpitus', - 'Ochlerotatus nigromaculis', - 'Ochlerotatus sierrensis' - ) - ))) - - lifeStageVocabReal.mini <- - StudySpecificVocabulariesByVariable(S4Vectors::SimpleList(StudySpecificVocabulary( - studyIdColumnName='EUPATH_0000605.Study_stable_id', - study='2023-abundance-SLCMCD-2022', - variableSpec=VariableSpec( - entityId='EUPATH_0000609', - variableId='UBERON_0000105' - ), - vocabulary=c('prime adult stage') - ))) - - feedingStatusVocabReal.mini <- - StudySpecificVocabulariesByVariable(S4Vectors::SimpleList(StudySpecificVocabulary( - studyIdColumnName='EUPATH_0000605.Study_stable_id', - study='2023-abundance-SLCMCD-2022', - variableSpec=VariableSpec( - entityId='EUPATH_0000609', - variableId='EUPATH_0043227' - ), - vocabulary=c('') - ))) - - megastudyVariablesReal <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'POPBIO_8000017', entityId = 'EUPATH_0000609'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS'), - hasStudyDependentVocabulary = FALSE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'PATO_0000047', entityId = 'EUPATH_0000609'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='POPBIO_8000017',entityId='EUPATH_0000609'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'OBI_0001909', entityId = 'EUPATH_0000609'), - plotReference = new("PlotReference", value = 'overlay'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='POPBIO_8000017',entityId='EUPATH_0000609'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'UBERON_0000105', entityId = 'EUPATH_0000609'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='POPBIO_8000017',entityId='EUPATH_0000609'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'EUPATH_0043227', entityId = 'EUPATH_0000609'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='POPBIO_8000017',entityId='EUPATH_0000609'), - hasStudyDependentVocabulary = TRUE) - )) - - ## mini vocabs - megastudyReal.mini <- Megastudy( - data=megastudyDataReal, - ancestorIdColumns=c('EUPATH_0000605.Study_stable_id', 'GAZ_00000448.GeographicLocation_stable_id','OBI_0000659.ParentOfSample_stable_id','EUPATH_0000609.Sample_stable_id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(sexVocabReal.mini, speciesVocabReal.mini, lifeStageVocabReal.mini, feedingStatusVocabReal.mini)) - ) - - ## full vocabs - megastudyReal <- Megastudy( - data=megastudyDataReal, - ancestorIdColumns=c('EUPATH_0000605.Study_stable_id', 'GAZ_00000448.GeographicLocation_stable_id','OBI_0000659.ParentOfSample_stable_id','EUPATH_0000609.Sample_stable_id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(sexVocab, speciesVocab, lifeStageVocab, feedingStatusVocab)) - ) \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 77c09f5..4b9ab5c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,39 +1,39 @@ -Package: veupathUtils -Title: General Helper Functions for MicrobiomeDB Projects -Version: 2.6.7 +Package: mbioUtils +Title: Utility Functions for MicrobiomeDB R Packages +Version: 0.1.0 Authors@R: c(person(given = "Danielle", family = "Callan", - role = c("aut"), + role = c("aut", "cre"), email = "dcallan@upenn.edu"), person(given = "Ann", family = "Blevins", - role = c("aut", "cre"), - email = "annsize@upenn.edu")) + role = c("aut"), + email = "annsize@upenn.edu", + comment = "Original veupathUtils author")) biocViews: Imports: boot, data.table, - digest, Hmisc, - microbenchmark, + methods, + purrr, S4Vectors, - SpiecEasi (>= 1.1.1), - stringi, - purrr + SpiecEasi (>= 1.0.7), + stringi Depends: R (>= 2.10), jsonlite Remotes: - zdk123/SpiecEasi -URL: https://github.com/microbiomeDB/veupathUtils -BugReports: https://github.com/microbiomeDB/veupathUtils/issues -Description: veupathUtils contains various R helper functions intended to be useful across a variety of projects, including the plot.data and microbiomeComputations packages. + zdk123/SpiecEasi@v1.0.7 +URL: https://github.com/microbiomeDB/mbioUtils +BugReports: https://github.com/microbiomeDB/mbioUtils/issues +Description: mbioUtils contains utility functions and data structures for microbiomeDB R packages, including microbiomeComputations, MicrobiomeDB, and microbiomeData. Forked from veupathUtils 2.6.7. License: Apache License (>= 2) Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.3 Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 @@ -46,22 +46,16 @@ Collate: 'class-VariableMetadata.R' 'class-ComputeResult.R' 'class-CorrelationResult.R' - 'class-Megastudy.R' - 'class-Range.R' - 'class-Statistic.R' - 'data.R' + 'mbioUtils-package.R' 'method-correlation.R' 'methods-Bin.R' 'methods-CollectionWithMetadata.R' 'methods-Collections.R' 'methods-VariableMetadata.R' 'methods-ComputeResult.R' - 'methods-Megastudy.R' 'methods-SampleMetadata.R' - 'methods-Statistic.R' 'utils-classes.R' 'utils-cut.R' 'utils-numeric.R' 'utils-string.R' 'utils.R' - 'veupathUtils-package.R' diff --git a/NAMESPACE b/NAMESPACE index b8c035b..d7e9bab 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,12 +1,5 @@ # Generated by roxygen2: do not edit by hand -S3method(naToZero,data.frame) -S3method(naToZero,data.table) -S3method(naToZero,default) -S3method(naToZero,list) -S3method(validateNumericCols,data.table) -S3method(validateNumericCols,default) -S3method(validateNumericCols,list) export("name<-") export(Bin) export(BinList) @@ -17,15 +10,9 @@ export(ComputeResult) export(CorrelationResult) export(DataShape) export(DataType) -export(Megastudy) export(PlotReference) -export(Range) export(S4SimpleListToJSON) export(SampleMetadata) -export(Statistic) -export(StatisticList) -export(StudySpecificVocabulariesByVariable) -export(StudySpecificVocabulariesByVariableList) export(VariableClass) export(VariableMetadata) export(VariableMetadataList) @@ -68,37 +55,27 @@ export(getCollectionName) export(getCollectionNames) export(getCollectionVariableNames) export(getCollectionsList) -export(getDTWithImputedZeroes) export(getDataFromSource) -export(getDataTable) -export(getDiscretizedBins) -export(getEntityId) export(getHasStudyDependentVocabulary) export(getIdColumns) export(getMetadataVariableNames) export(getMetadataVariableSummary) export(getSampleMetadata) export(getSampleMetadataIdColumns) -export(getStudyIdColumnName) -export(getVariableSpec) -export(getVariableSpecColumnName) export(is.POSIXct) export(is.error) export(isOneToManyWithAncestor) export(logWithTime) export(matchArg) export(merge) -export(naToZero) export(name) export(new_data_frame) -export(nonZeroRound) export(predicateFactory) export(pruneFeatures) export(removeAttr) export(removeIncompleteRecords) export(selfCorrelation) export(setAttrFromList) -export(setNaToZero) export(setroworder) export(shiftToNonNeg) export(strSplit) @@ -108,7 +85,6 @@ export(toStringOrNull) export(toStringOrPoint) export(trim) export(updateAttrById) -export(validateNumericCols) export(whichValuesInBin) export(whichValuesInBinList) export(writeData) @@ -122,14 +98,8 @@ exportClasses(ComputeResult) exportClasses(CorrelationResult) exportClasses(DataShape) exportClasses(DataType) -exportClasses(Megastudy) exportClasses(PlotReference) -exportClasses(Range) exportClasses(SampleMetadata) -exportClasses(Statistic) -exportClasses(StatisticList) -exportClasses(StudySpecificVocabulariesByVariable) -exportClasses(StudySpecificVocabulariesByVariableList) exportClasses(VariableClass) exportClasses(VariableMetadata) exportClasses(VariableMetadataList) @@ -152,13 +122,7 @@ exportMethods(findVariablesNeedingWeightingVariableMetadata) exportMethods(findWeightingVariableSpecs) exportMethods(findWeightingVariablesMetadata) exportMethods(getColName) -exportMethods(getDTWithImputedZeroes) -exportMethods(getDataTable) -exportMethods(getEntityId) exportMethods(getHasStudyDependentVocabulary) -exportMethods(getStudyIdColumnName) -exportMethods(getVariableSpec) -exportMethods(getVariableSpecColumnName) exportMethods(merge) exportMethods(predicateFactory) exportMethods(toJSON) @@ -173,8 +137,13 @@ importFrom(S4Vectors,SimpleList) importFrom(SpiecEasi,pval.sparccboot) importFrom(SpiecEasi,sparcc) importFrom(SpiecEasi,sparccboot) -importFrom(digest,digest) -importFrom(microbenchmark,microbenchmark) +importFrom(methods,new) +importFrom(methods,slotNames) +importFrom(methods,validObject) importFrom(purrr,map) -importFrom(purrr,map_lgl) +importFrom(stats,complete.cases) +importFrom(stats,median) +importFrom(stats,sd) +importFrom(stats,var) importFrom(stringi,stri_detect_regex) +importFrom(utils,tail) diff --git a/R/class-Collections.R b/R/class-Collections.R index 9ee578e..4c8a01b 100644 --- a/R/class-Collections.R +++ b/R/class-Collections.R @@ -23,7 +23,7 @@ check_collection <- function(object) { # collection data should all come from the same entity # using the presence of the period to indicate eda services formatted data if (all(grepl(".", names(df), fixed = TRUE))) { - if (uniqueN(veupathUtils::strSplit(names(df)[!names(df) %in% object@ancestorIdColumns], ".", ncol=2, index=1)) > 1) { + if (uniqueN(mbioUtils::strSplit(names(df)[!names(df) %in% object@ancestorIdColumns], ".", ncol=2, index=1)) > 1) { msg <- paste("All columns must belong to the same entity.") errors <- c(errors, msg) } diff --git a/R/class-ComputeResult.R b/R/class-ComputeResult.R index 38c7054..25e9bae 100644 --- a/R/class-ComputeResult.R +++ b/R/class-ComputeResult.R @@ -17,7 +17,7 @@ check_compute_result <- function(object) { # the variable classes are correct. if (!!length(object@computedVariableMetadata)) { variables <- object@computedVariableMetadata - col_names <- stripEntityIdFromColumnHeader(veupathUtils::findAllColNames(variables)) + col_names <- stripEntityIdFromColumnHeader(mbioUtils::findAllColNames(variables)) if (!all(col_names %in% names(object@data))) { msg <- paste("Some specified computed variables are not present in compute result data.frame") @@ -64,7 +64,7 @@ check_compute_result <- function(object) { #' @slot name The name of the compute, ex: 'alphaDiv'. #' @slot recordIdColumn The name of the column containing IDs for the samples. All other columns will be treated as computed values. #' @slot ancestorIdColumns A character vector of column names representing parent entities of the recordIdColumn. -#' @slot computedVariableMetadata veupathUtils::VariableMetadataList detailing the computed variables. +#' @slot computedVariableMetadata mbioUtils::VariableMetadataList detailing the computed variables. #' @slot statistics An optional slot of any values. List or data.frame are recommended. It is not required to have rows or cols map to samples. #' @slot computationDetails An optional message about the computed results. #' @slot parameters A record of the input parameters used to generate the computed results. diff --git a/R/class-Megastudy.R b/R/class-Megastudy.R deleted file mode 100644 index 4ff2df8..0000000 --- a/R/class-Megastudy.R +++ /dev/null @@ -1,108 +0,0 @@ -check_study_vocabulary <- function(object) { - errors <- character() - - # the column names should be in the vocabulary - if (!object@studyIdColumnName %in% names(object@studyVocab)) { - msg <- paste0("Study ID column '", object@getStudyIdColumnName, "' not found in vocabulary.") - errors <- c(errors, msg) - } - - if (!veupathUtils::getColName(object@variableSpec) %in% names(object@studyVocab)) { - msg <- paste0("Variable spec column '", veupathUtils::getColName(object@variableSpec), "' not found in vocabulary.") - errors <- c(errors, msg) - } - - return(if (length(errors) == 0) TRUE else errors) -} - -#' Study Specific Vocabularies By Variable -#' -#' A class to specify expected values per study for some variable -#' of interest. -#' -#' @slot studyIdColumnName A string specifying the name of the column in the vocab data table that contains the study id -#' @slot variableSpecColumnName A string specifying the name of the column in the vocab data table that contains the variable vocabulary values -#' @slot studyVocab A data.table with columns studyIdColumnName and variableSpecColumnName that specifies expected vocabularies for each study -#' @name StudySpecificVocabulariesByVariable-class -#' @rdname StudySpecificVocabulariesByVariable-class -#' @include class-VariableMetadata.R -#' @export -StudySpecificVocabulariesByVariable <- setClass("StudySpecificVocabulariesByVariable", - representation = representation( - studyIdColumnName = 'character', - variableSpec = 'VariableSpec', - studyVocab = 'data.table' - ), - validity = check_study_vocabulary -) - -check_multiple_study_vocabularies_on_same_entity <- function(object) { - errors <- character() - - if (length(unique(unlist(lapply(as.list(object), getStudyIdColumnName)))) != 1) { - errors <- c(errors, paste0("All study vocabularies must be able to be identified by the same study entity. Found the following study entities: ", paste(unique(unlist(lapply(as.list(object), getStudyIdColumnName))), collapse = ", "))) - } - - if (length(unique(unlist(lapply(as.list(object), getEntityId)))) != 1) { - errors <- c(errors, paste0("All study vocabularies must belong to the same entity. Found the following entities: ", paste(unique(unlist(lapply(as.list(object), getEntityId))), collapse = ", "))) - } - - return(if (length(errors) == 0) TRUE else errors) -} - -#' @export -StudySpecificVocabulariesByVariableList <- setClass("StudySpecificVocabulariesByVariableList", - contains = "SimpleList", - prototype = prototype(elementType = "StudySpecificVocabulariesByVariable"), - validity = check_multiple_study_vocabularies_on_same_entity -) - -#this also sets us up for megastudy specific methods in plot.data if it turns out we need them - -check_megastudy <- function(object) { - errors <- character() - df <- object@data - ancestor_id_cols <- object@ancestorIdColumns - - if (!!length(ancestor_id_cols)) { - if (!all(ancestor_id_cols %in% names(df))) { - msg <- paste("Not all ancestor ID columns are present in data.frame") - errors <- c(errors, msg) - } - } else { - msg <- paste("Ancestor ID columns are required but not provided.") - errors <- c(errors, msg) - } - - if (!!length(object@collectionsDT)) { - if (!all(ancestor_id_cols[1:length(ancestor_id_cols)-1] %in% names(object@collectionsDT))) { - msg <- paste("Not all ancestor ID columns are present in collection data.frame") - errors <- c(errors, msg) - } - } - - return(if (length(errors) == 0) TRUE else errors) -} - -#' Megastudy -#' -#' A class to encapsulate everything we need for our special handling -#' of 'megastudies' in EDA. Currently that is imputing zeroes on tall data -#' given that each (sub-)study has different expected vocabularies for -#' that data. -#' -#' @slot data A data.table -#' @slot ancestorIdColumns A character vector of column names representing parent entities of the recordIdColumn. -#' @slot studySpecificVocabularies veupathUtils::StudySpecificVocabulariesByVariableList -#' @slot collectionIds A data.table including collection ids and any variables of interest for the collection entity. -#' If none provided, the collection ids will be inferred from those present in `data`. -#' -#' @name Megastudy-class -#' @rdname Megastudy-class -#' @export -Megastudy <- setClass("Megastudy", representation( - data = 'data.table', - ancestorIdColumns = 'character', - studySpecificVocabularies = 'StudySpecificVocabulariesByVariableList', - collectionsDT = 'data.frame' -), validity = check_megastudy) \ No newline at end of file diff --git a/R/class-Range.R b/R/class-Range.R deleted file mode 100644 index c5db34d..0000000 --- a/R/class-Range.R +++ /dev/null @@ -1,36 +0,0 @@ -check_range <- function(object) { - errors <- character() - min <- object@minimum - max <- object@maximum - - if (class(min) != class(max)) { - msg <- "Provided minimum and maximum values must be of the same base type (numeric, Date, Posix)." - errors <- c(errors, msg) - } - - if (!inherits(min, c('numeric', 'Date', 'Posix'))) { - msg <- "Provided minimum and maximum values must be numeric, Date, or Posix." - errors <- c(errors, msg) - } - - return(if (length(errors) == 0) TRUE else errors) -} - -#' Data Range -#' -#' A class to a data range, represented as minimum and maximum values. -#' The range could be numeric or Date/ Posix -#' -#' @slot minimum A number, Date or Posix value -#' @slot maximum A number, Date or Posix value -#' -#' @name Range-class -#' @rdname Range-class -#' @export -Range <- setClass("Range", representation( - minimum = 'ANY', - maximum = 'ANY' -), prototype = prototype( - minimum = NA_real_, - maximum = NA_real_ -), validity = check_range) \ No newline at end of file diff --git a/R/class-Statistic.R b/R/class-Statistic.R deleted file mode 100644 index 5b5c898..0000000 --- a/R/class-Statistic.R +++ /dev/null @@ -1,103 +0,0 @@ -check_statistic <- function(object) { - errors <- character() - name <- object@name - value <- object@value - pvalue <- object@pvalue - ciMin <- object@confidenceInterval@minimum - ciMax <- object@confidenceInterval@maximum - ciLevel <- object@confidenceLevel - - if (length(name) != 1 || is.na(name)) { - msg <- "The slot `name` must have a single value." - errors <- c(errors, msg) - } - - if (length(value) != 1) { - msg <- "The slot `value` must have a single value." - errors <- c(errors, msg) - } - - if (!is.na(pvalue)) { - if (length(pvalue) != 1) { - msg <- "The slot `pvalue` must have a single value." - errors <- c(errors, msg) - } else { - numericPVal <- suppressWarnings(as.numeric(pvalue)) - if (is.na(numericPVal)) { - if (pvalue != '<0.0001') { - msg <- "Provided p-value is invalid. It must either be coercible to a number or be the string '<0.0001'." - errors <- c(errors, msg) - } - } else if (numericPVal < 0 || numericPVal > 1) { - msg <- "Provided p-value is invalid. It is not between 0 and 1." - errors <- c(errors, msg) - } - } - } - - # false for NaN, while is.na is TRUE for NaN - is.NA <- function(object) { ifelse(is.na(object), ifelse(is.nan(object), FALSE, TRUE), FALSE) } - - if (is.NA(ciMax) || is.NA(ciMin)) { - if (!is.na(ciLevel)) { - msg <- "A confidence level was provided without a confidence interval." - errors <- c(errors, msg) - } - } else { - #just to guarantee the below condition works - if (is.nan(ciMin)) ciMin <- -Inf - if (is.nan(ciMax)) ciMax <- Inf - - if (!is.nan(value)) { - if (value < ciMin || value > ciMax) { - msg <- "Provided value is not within the specified confidence interval." - errors <- c(errors, msg) - } - } - - if (!is.na(ciLevel)) { - if (ciLevel < 0 || ciLevel > 1) { - msg <- "Provided confidence level is invalid." - errors <- c(errors, msg) - } - } else { - msg <- "A confidence interval was provided without a confidence level." - errors <- c(errors, msg) - } - } - - - - return(if (length(errors) == 0) TRUE else errors) -} - -#' Statistic -#' -#' A class to specify a named statistic, its value, confidence interval and p-value. -#' This is primarily to help maintain consistency in meeting the EDA API. -#' -#' @slot name A string specifying what statistic was calculated -#' @slot value A number -#' @slot confidenceInterval An optional Range specifying minimum and maximum values for the confidence interval -#' @slot confidenceLevel An optional decimal number indicating the degree of confidence represented by the confidence interval -#' @slot pvalue An optional string representing the p-value associated with the statistic. The string should either be coercible to a number or be '<0.0001'. -#' -#' @name Statistic-class -#' @rdname Statistic-class -#' @export -Statistic <- setClass("Statistic", representation( - name = 'character', - value = 'numeric', - confidenceInterval = 'Range', - confidenceLevel = 'numeric', - pvalue = 'character' -), prototype = prototype( - confidenceLevel = NA_real_, - pvalue = NA_character_ -), validity = check_statistic) - -#' @export -StatisticList <- setClass("StatisticList", - contains = "SimpleList", - prototype = prototype(elementType = "Statistic") -) diff --git a/R/class-VariableMetadata.R b/R/class-VariableMetadata.R index 2db8f78..ac18e50 100644 --- a/R/class-VariableMetadata.R +++ b/R/class-VariableMetadata.R @@ -179,7 +179,7 @@ check_variable_metadata <- function(object) { errors <- c(errors, "Members must be non-empty for collection variables.") } else { memberEntityIds <- unlist(lapply(as.list(object@members), function(x) {return(x@entityId)})) - memberColNames <- unlist(lapply(as.list(object@members), function(x) {return(veupathUtils::getColName(x))})) + memberColNames <- unlist(lapply(as.list(object@members), function(x) {return(mbioUtils::getColName(x))})) # Require all members to have the same entity if (data.table::uniqueN(memberEntityIds) > 1) { diff --git a/R/data.R b/R/data.R deleted file mode 100644 index 129a79f..0000000 --- a/R/data.R +++ /dev/null @@ -1,61 +0,0 @@ -#' Some Real Life Popbio Megastudy Test Data -#' -#' It is the data used in the subset associated w this saved analysis: -#' https://vectorbase.org/vectorbase/app/workspace/maps/A4nuqcm/import -#' as of 9 Jan 2024. -#' As of the time of writing the filters in this analysis are like this: -#' "filters": [ -#' { -#' "entityId": "EUPATH_0000605", -#' "variableId": "POPBIO_8000215", -#' "stringSet": [ -#' "VBP0000844" -#' ], -#' "type": "stringSet" -#' }, -#' { -#' "entityId": "GAZ_00000448", -#' "variableId": "EUPATH_0000542", -#' "stringSet": [ -#' "Canyon Rim Church" -#' ], -#' "type": "stringSet" -#' }, -#' { -#' "variableId": "EUPATH_0043256", -#' "entityId": "OBI_0000659", -#' "type": "dateRange", -#' "min": "2022-05-01T00:00:00Z", -#' "max": "2022-11-01T00:00:00Z" -#' }, -#' { -#' "type": "numberRange", -#' "variableId": "OBI_0001620", -#' "entityId": "GAZ_00000448", -#' "min": 40.698307645373106, -#' "max": 40.71193901146775 -#' }, -#' { -#' "type": "longitudeRange", -#' "variableId": "OBI_0001621", -#' "entityId": "GAZ_00000448", -#' "left": -111.8212938308716, -#' "right": -111.79749727249147 -#' } -#' ] -#' -#' @format ## `megastudyDataReal` -#' A data frame with 14 rows and 9 columns: -#' \describe{ -#' \item{EUPATH_0000609.Sample_stable_id}{Stable ID for the `Sample` entity} -#' \item{OBI_0000659.ParentOfSample_stable_id}{Stable ID for the `Collections` entity} -#' \item{GAZ_00000448.GeographicLocation_stable_id}{Stable ID for the `Collection sites` entity} -#' \item{EUPATH_0000605.Study_stable_id}{Stable ID for the `Studies` entity} -#' \item{EUPATH_0000609.PATO_0000047}{Sample -> Sex} -#' \item{EUPATH_0000609.POPBIO_8000017}{Sample -> Unbiased Specimen Count} -#' \item{EUPATH_0000609.OBI_0001909}{Sample -> Species} -#' \item{EUPATH_0000609.EUPATH_0043227}{Sample -> Female Insect Feeding Status} -#' \item{EUPATH_0000609.UBERON_0000105}{Sample -> Life Cycle Stage} -#' } -#' @source -"megastudyDataReal" diff --git a/R/internal-utils.R b/R/internal-utils.R index d093885..810ffb5 100644 --- a/R/internal-utils.R +++ b/R/internal-utils.R @@ -80,11 +80,11 @@ clean_names <- function(names, makeUnique = FALSE) { ## cleanColumnNames will clean up the column names to make them valid column names in R, and hopefully improve consistncy of labels as well #' @export getDataFromSource <- function(dataSource, keepIdsAndNumbersOnly = c(TRUE, FALSE), cleanColumnNames = c(FALSE, TRUE)) { - keepIdsAndNumbersOnly <- veupathUtils::matchArg(keepIdsAndNumbersOnly) - cleanColumnNames <- veupathUtils::matchArg(cleanColumnNames) + keepIdsAndNumbersOnly <- mbioUtils::matchArg(keepIdsAndNumbersOnly) + cleanColumnNames <- mbioUtils::matchArg(cleanColumnNames) if (inherits(dataSource, "character")) { - veupathUtils::logWithTime(sprintf("Attempting to read file: %s", dataSource), verbose = TRUE) + mbioUtils::logWithTime(sprintf("Attempting to read file: %s", dataSource), verbose = TRUE) dt <- data.table::fread(dataSource, na.strings=c('')) } else if (inherits(dataSource, "data.frame")) { dt <- data.table::as.data.table(dataSource) diff --git a/R/veupathUtils-package.R b/R/mbioUtils-package.R similarity index 61% rename from R/veupathUtils-package.R rename to R/mbioUtils-package.R index 018367d..369548d 100644 --- a/R/veupathUtils-package.R +++ b/R/mbioUtils-package.R @@ -1,8 +1,9 @@ #' @keywords internal +#' @importFrom methods new slotNames validObject +#' @importFrom stats complete.cases median sd var +#' @importFrom utils tail "_PACKAGE" -#' @importFrom microbenchmark microbenchmark - # The following block is used by usethis to automatically manage # roxygen namespace tags. Modify with care! ## usethis namespace: start diff --git a/R/method-correlation.R b/R/method-correlation.R index 5c19fc1..361d77e 100644 --- a/R/method-correlation.R +++ b/R/method-correlation.R @@ -31,7 +31,7 @@ function( predicateType = c('proportionNonZero', 'variance', 'sd'), threshold = 0.5 ) { - predicateType <- veupathUtils::matchArg(predicateType) + predicateType <- mbioUtils::matchArg(predicateType) if (predicateType == 'proportionNonZero') { if (threshold < 0 | threshold > 1) { @@ -101,9 +101,9 @@ function( verbose = c(TRUE, FALSE) ) { - format <- veupathUtils::matchArg(format) - method <- veupathUtils::matchArg(method) - verbose <- veupathUtils::matchArg(verbose) + format <- mbioUtils::matchArg(format) + method <- mbioUtils::matchArg(method) + verbose <- mbioUtils::matchArg(verbose) # Check that the number of rows match. if (!identical(nrow(data1), nrow(data2))) { @@ -111,17 +111,17 @@ function( } # Check that all values are numeric - if (!identical(veupathUtils::findNumericCols(data1), names(data1))) { + if (!identical(mbioUtils::findNumericCols(data1), names(data1))) { warning("All columns in data1 are not numeric. Only numeric columns will be used.") - keepCols <- veupathUtils::findNumericCols(data1) + keepCols <- mbioUtils::findNumericCols(data1) if (length(keepCols) == 0) { stop("No numeric columns found in data1.") } data1 <- data1[, ..keepCols] } - if (!identical(veupathUtils::findNumericCols(data2), names(data2))) { + if (!identical(mbioUtils::findNumericCols(data2), names(data2))) { warning("All columns in data2 are not numeric. Only numeric columns will be used.") - keepCols <- veupathUtils::findNumericCols(data2) + keepCols <- mbioUtils::findNumericCols(data2) if (length(keepCols) == 0) { stop("No numeric columns found in data2.") } @@ -146,7 +146,7 @@ function( keep.rownames = T ) - veupathUtils::logWithTime(paste0('Completed correlation with method=', method,'. Formatting results.'), verbose) + mbioUtils::logWithTime(paste0('Completed correlation with method=', method,'. Formatting results.'), verbose) ## Format results @@ -184,14 +184,14 @@ function( verbose = c(TRUE, FALSE) ) { - format <- veupathUtils::matchArg(format) - method <- veupathUtils::matchArg(method) - verbose <- veupathUtils::matchArg(verbose) + format <- mbioUtils::matchArg(format) + method <- mbioUtils::matchArg(method) + verbose <- mbioUtils::matchArg(verbose) # Check that all values are numeric - if (!identical(veupathUtils::findNumericCols(data1), names(data1))) { + if (!identical(mbioUtils::findNumericCols(data1), names(data1))) { warning("All columns in data1 are not numeric. Only numeric columns will be used.") - keepCols <- veupathUtils::findNumericCols(data1) + keepCols <- mbioUtils::findNumericCols(data1) if (length(keepCols) == 0) { stop("No numeric columns found in data1.") } @@ -232,7 +232,7 @@ function( } - veupathUtils::logWithTime(paste0('Completed correlation with method=', method,'. Formatting results.'), verbose) + mbioUtils::logWithTime(paste0('Completed correlation with method=', method,'. Formatting results.'), verbose) ## Format results rowAndColNames <- expand.grid(rownames(corrResult), colnames(corrResult)) @@ -270,8 +270,8 @@ buildCorrelationComputeResult <- function( method = c('spearman','pearson','sparcc'), verbose = c(TRUE, FALSE) ) { - method <- veupathUtils::matchArg(method) - verbose <- veupathUtils::matchArg(verbose) + method <- mbioUtils::matchArg(method) + verbose <- mbioUtils::matchArg(verbose) # both AbundanceData and SampleMetadata have these slots recordIdColumn <- ifelse('recordIdColumn' %in% slotNames(data1), data1@recordIdColumn, NA_character_) @@ -293,7 +293,7 @@ buildCorrelationComputeResult <- function( result@parameters <- paste0('method = ', method) validObject(result) - veupathUtils::logWithTime( + mbioUtils::logWithTime( paste( 'Correlation computation completed with parameters recordIdColumn=', recordIdColumn, ', method = ', method @@ -339,9 +339,9 @@ function( verbose = c(TRUE, FALSE) ) { - format <- veupathUtils::matchArg(format) - method <- veupathUtils::matchArg(method) - verbose <- veupathUtils::matchArg(verbose) + format <- mbioUtils::matchArg(format) + method <- mbioUtils::matchArg(method) + verbose <- mbioUtils::matchArg(verbose) correlation(data, NULL, method = method, format = format, verbose = verbose) }) @@ -361,10 +361,10 @@ function( metadataIsFirst = c(FALSE,TRUE) ) { - format <- veupathUtils::matchArg(format) - method <- veupathUtils::matchArg(method) - verbose <- veupathUtils::matchArg(verbose) - metadataIsFirst <- veupathUtils::matchArg(metadataIsFirst) + format <- mbioUtils::matchArg(format) + method <- mbioUtils::matchArg(method) + verbose <- mbioUtils::matchArg(verbose) + metadataIsFirst <- mbioUtils::matchArg(metadataIsFirst) #prefilters applied data1 <- pruneFeatures(data1, predicateFactory('proportionNonZero', proportionNonZeroThreshold), verbose) @@ -390,7 +390,7 @@ function( ) } - veupathUtils::logWithTime( + mbioUtils::logWithTime( paste( "Received df table with", nrow(values), "samples and", @@ -425,9 +425,9 @@ function( stdDevThreshold = 0 ) { - format <- veupathUtils::matchArg(format) - method <- veupathUtils::matchArg(method) - verbose <- veupathUtils::matchArg(verbose) + format <- mbioUtils::matchArg(format) + method <- mbioUtils::matchArg(method) + verbose <- mbioUtils::matchArg(verbose) #prefilters applied data <- pruneFeatures(data, predicateFactory('proportionNonZero', proportionNonZeroThreshold), verbose) @@ -437,7 +437,7 @@ function( values <- getCollectionData(data, variableNames = NULL, ignoreImputeZero = FALSE, includeIds = FALSE, verbose = verbose) corrResult <- correlation(values, NULL, method = method, format = 'data.table', verbose = verbose) - veupathUtils::logWithTime( + mbioUtils::logWithTime( paste( "Received df table with", nrow(values), "samples and", @@ -464,9 +464,9 @@ function( format = c('ComputeResult', 'data.table'), verbose = c(TRUE, FALSE) ) { - format <- veupathUtils::matchArg(format) - method <- veupathUtils::matchArg(method) - verbose <- veupathUtils::matchArg(verbose) + format <- mbioUtils::matchArg(format) + method <- mbioUtils::matchArg(method) + verbose <- mbioUtils::matchArg(verbose) corrResult <- correlation( getSampleMetadata(data, TRUE, FALSE), @@ -476,7 +476,7 @@ function( verbose = verbose ) - veupathUtils::logWithTime( + mbioUtils::logWithTime( paste( "Received df table with", nrow(data), "samples and", @@ -508,9 +508,9 @@ function( stdDevThreshold = 0 ) { - format <- veupathUtils::matchArg(format) - method <- veupathUtils::matchArg(method) - verbose <- veupathUtils::matchArg(verbose) + format <- mbioUtils::matchArg(format) + method <- mbioUtils::matchArg(method) + verbose <- mbioUtils::matchArg(verbose) #prefilters applied data1 <- pruneFeatures(data1, predicateFactory('proportionNonZero', proportionNonZeroThreshold), verbose) @@ -523,7 +523,7 @@ function( values1 <- getCollectionData(data1, variableNames = NULL, ignoreImputeZero = FALSE, includeIds = TRUE, verbose = verbose) values2 <- getCollectionData(data2, variableNames = NULL, ignoreImputeZero = FALSE, includeIds = TRUE, verbose = verbose) - veupathUtils::logWithTime( + mbioUtils::logWithTime( paste( "Received first df table with", nrow(values1), "samples and", @@ -531,7 +531,7 @@ function( ), verbose ) - veupathUtils::logWithTime( + mbioUtils::logWithTime( paste( "Received second df table with", nrow(values2), "samples and", @@ -557,7 +557,7 @@ function( if (length(commonSamples) == 0) { stop('No samples in common between data1 and data2') } else { - veupathUtils::logWithTime( + mbioUtils::logWithTime( paste( "Found", length(commonSamples), "samples in common between data1 and data2. Only these samples will be used." ), @@ -572,7 +572,7 @@ function( values1 <- values1[, -..allIdColumns] values2 <- values2[, -..allIdColumns] - corrResult <- veupathUtils::correlation( + corrResult <- mbioUtils::correlation( values1, values2, method = method, @@ -583,7 +583,7 @@ function( if (format == 'data.table') { return(corrResult) } else { - result <- veupathUtils::buildCorrelationComputeResult(corrResult, data1, data2, method, verbose) + result <- mbioUtils::buildCorrelationComputeResult(corrResult, data1, data2, method, verbose) result@computationDetails <- 'correlation' return(result) } diff --git a/R/methods-CollectionWithMetadata.R b/R/methods-CollectionWithMetadata.R index 32b5dd9..3e6f1eb 100644 --- a/R/methods-CollectionWithMetadata.R +++ b/R/methods-CollectionWithMetadata.R @@ -86,7 +86,7 @@ setMethod("getSampleMetadataIdColumns", "CollectionWithMetadata", function(objec #' @param includeIds boolean indicating whether we should include recordIdColumn and ancestorIdColumns #' @param metadataVariables The metadata variables to include in the sample metadata. If NULL, all metadata variables will be included. #' @return data.table of sample metadata -#' @import veupathUtils +#' @import mbioUtils #' @import data.table #' @rdname getSampleMetadata #' @export @@ -98,8 +98,8 @@ setGeneric("getSampleMetadata", #' @rdname getSampleMetadata #' @aliases getSampleMetadata,CollectionWithMetadata-method setMethod("getSampleMetadata", signature("CollectionWithMetadata"), function(object, asCopy = c(TRUE, FALSE), includeIds = c(TRUE, FALSE), metadataVariables = NULL) { - asCopy <- veupathUtils::matchArg(asCopy) - includeIds <- veupathUtils::matchArg(includeIds) + asCopy <- mbioUtils::matchArg(asCopy) + includeIds <- mbioUtils::matchArg(includeIds) dt <- object@sampleMetadata@data allIdColumns <- getSampleMetadataIdColumns(object) @@ -152,7 +152,7 @@ setGeneric("removeIncompleteRecords", #' @rdname removeIncompleteRecords #' @aliases removeIncompleteRecords,CollectionWithMetadata-method setMethod("removeIncompleteRecords", signature("CollectionWithMetadata"), function(object, colName = character(), verbose = c(TRUE, FALSE)) { - verbose <- veupathUtils::matchArg(verbose) + verbose <- mbioUtils::matchArg(verbose) df <- getCollectionData(object, verbose = verbose) sampleMetadata <- getSampleMetadata(object) # df may have had rows removed due to getCollectionData behavior. Subset sampleMetadata to match @@ -160,7 +160,7 @@ setMethod("removeIncompleteRecords", signature("CollectionWithMetadata"), functi # Remove Records with NA from data and metadata if (any(is.na(sampleMetadata[[colName]]))) { - veupathUtils::logWithTime("Found NAs in specified variable. Removing these records.", verbose) + mbioUtils::logWithTime("Found NAs in specified variable. Removing these records.", verbose) recordsWithData <- which(!is.na(sampleMetadata[[colName]])) # Keep records with data. Recall the CollectionWithMetadata object requires records to be in the same order # in both the data and metadata diff --git a/R/methods-Collections.R b/R/methods-Collections.R index 855b2df..7f51dfc 100644 --- a/R/methods-Collections.R +++ b/R/methods-Collections.R @@ -85,9 +85,9 @@ function( includeIds = c(TRUE, FALSE), verbose = c(TRUE, FALSE) ) { - ignoreImputeZero <- veupathUtils::matchArg(ignoreImputeZero) - includeIds <- veupathUtils::matchArg(includeIds) - verbose <- veupathUtils::matchArg(verbose) + ignoreImputeZero <- mbioUtils::matchArg(ignoreImputeZero) + includeIds <- mbioUtils::matchArg(includeIds) + verbose <- mbioUtils::matchArg(verbose) allIdColumns <- getIdColumns(object) if (is.null(variableNames)) { @@ -108,13 +108,21 @@ function( dt <- dt[rowSums(isNAorZero(dt.noIds)) != ncol(dt.noIds),] numRecordsRemoved <- nrow(dt.noIds) - nrow(dt) if (numRecordsRemoved > 0) { - veupathUtils::logWithTime(paste0("Removed ", numRecordsRemoved, " records with no data."), verbose) + mbioUtils::logWithTime(paste0("Removed ", numRecordsRemoved, " records with no data."), verbose) } } # Replace NA values with 0 if (!ignoreImputeZero && object@imputeZero) { - veupathUtils::setNaToZero(dt) + # Find numeric columns and replace NAs with 0 + # NOTE: This is a simplified inline version of veupathUtils::setNaToZero() + # which also supported column-specific targeting and validation. + # If edge cases arise, see veupathUtils/R/utils-numeric.R for the full + # implementation with setNaToZero(), findNumericCols(), and validateNumericCols(). + numericCols <- names(dt)[sapply(dt, is.numeric)] + if (length(numericCols) > 0) { + data.table::setnafill(dt, fill = 0, cols = numericCols) + } } if (!includeIds) { diff --git a/R/methods-ComputeResult.R b/R/methods-ComputeResult.R index 4f0c0f8..ca474bf 100644 --- a/R/methods-ComputeResult.R +++ b/R/methods-ComputeResult.R @@ -16,9 +16,9 @@ setGeneric("writeMeta", #'@export setMethod("writeMeta", signature("ComputeResult"), function(object, pattern = NULL, verbose = c(TRUE, FALSE)) { - verbose <- veupathUtils::matchArg(verbose) + verbose <- mbioUtils::matchArg(verbose) - outJson <- veupathUtils::toJSON(object@computedVariableMetadata) + outJson <- mbioUtils::toJSON(object@computedVariableMetadata) if (is.null(pattern)) { pattern <- object@name @@ -30,7 +30,7 @@ setMethod("writeMeta", signature("ComputeResult"), function(object, pattern = NU outFileName <- basename(tempfile(pattern = pattern, tmpdir = tempdir(), fileext = ".json")) write(outJson, outFileName) - veupathUtils::logWithTime(paste('New output file written:', outFileName), verbose) + mbioUtils::logWithTime(paste('New output file written:', outFileName), verbose) return(outFileName) }) @@ -52,7 +52,7 @@ setGeneric("writeData", #'@export setMethod("writeData", signature("ComputeResult"), function(object, pattern = NULL, verbose = c(TRUE, FALSE)) { - verbose <- veupathUtils::matchArg(verbose) + verbose <- mbioUtils::matchArg(verbose) if (is.null(pattern)) { pattern <- object@name @@ -64,7 +64,7 @@ setMethod("writeData", signature("ComputeResult"), function(object, pattern = NU outFileName <- basename(tempfile(pattern = pattern, tmpdir = tempdir(), fileext = ".tab")) data.table::fwrite(object@data, outFileName, sep = '\t', quote = FALSE) - veupathUtils::logWithTime(paste('New output file written:', outFileName), verbose) + mbioUtils::logWithTime(paste('New output file written:', outFileName), verbose) return(outFileName) }) @@ -90,7 +90,7 @@ setGeneric("writeStatistics", #'@export setMethod("writeStatistics", signature("ComputeResult"), function(object, pattern = NULL, verbose = c(TRUE, FALSE)) { - verbose <- veupathUtils::matchArg(verbose) + verbose <- mbioUtils::matchArg(verbose) # Convert all to character but maintain structure if (inherits(object@statistics, 'data.frame')) { @@ -111,7 +111,7 @@ setMethod("writeStatistics", signature("ComputeResult"), function(object, patter outFileName <- basename(tempfile(pattern = pattern, tmpdir = tempdir(), fileext = ".json")) write(outJson, outFileName) - veupathUtils::logWithTime(paste('New output file written:', outFileName), verbose) + mbioUtils::logWithTime(paste('New output file written:', outFileName), verbose) return(outFileName) }) diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R deleted file mode 100644 index 06e80c0..0000000 --- a/R/methods-Megastudy.R +++ /dev/null @@ -1,313 +0,0 @@ -#' get a VariableSpec -#' -#' This function returns a string representation of a VariableSpec. By -#' default it assumes the VariableSpec is available in a slot called -#' `variableSpec`. -#' -#' @param object An object containing a veupathUtils::VariableSpec -#' @return character -#' @export -setGeneric("getVariableSpec", - function(object, ...) standardGeneric("getVariableSpec"), - signature = "object" -) - -#' @export -setMethod('getVariableSpec', signature('StudySpecificVocabulariesByVariable'), function(object) { - return(object@variableSpec) -}) - -#' @export -setMethod('getVariableSpec', signature('ANY'), function(object) { - return(object@variableSpec) -}) - -#' @export -setMethod('getVariableSpec', signature('VariableMetadata'), function(object, getCollectionMemberVarSpecs = c("Dynamic", "Never", "Always")) { - getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs) - varSpecs <- list(object@variableSpec) - - #if the variable is a collection, then we want to return the member variable specs - if (object@isCollection && getCollectionMemberVarSpecs %in% c("Dynamic", "Always")) { - varSpecs <- as.list(object@members) - } else if (!object@isCollection && getCollectionMemberVarSpecs == "Always") { - varSpecs <- NULL - } - - return(varSpecs) -}) - -#' @export -setMethod('getVariableSpec', signature('VariableMetadataList'), function(object, getCollectionMemberVarSpecs = c("Dynamic", "Never", "Always")) { - getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs) - - varSpecs <- unlist(lapply(as.list(object), veupathUtils::getVariableSpec, getCollectionMemberVarSpecs)) - if (all(unlist(lapply(varSpecs, is.null)))) { - return(NULL) - } - - return(varSpecs) -}) - -#' StuydIdColName as String -#' -#' This function returns the studyIdColName from an StudySpecificVocabulariesByVariable -#' -#' @param object veupathUtils::StudySpecificVocabulariesByVariable -#' @return character -#' @export -setGeneric("getStudyIdColumnName", - function(object) standardGeneric("getStudyIdColumnName"), - signature = "object" -) - -#' @export -setMethod('getStudyIdColumnName', signature('StudySpecificVocabulariesByVariable'), function(object) { - return(object@studyIdColumnName) -}) - -#' @export -setMethod('getStudyIdColumnName', signature('StudySpecificVocabulariesByVariableList'), function(object) { - #works bc of class validation that theyre all the same - return(veupathUtils::getStudyIdColumnName(object[[1]])) -}) - -#' VarSpecColName as String -#' -#' This function returns the variableSpec from an StudySpecificVocabulariesByVariable -#' -#' @param object veupathUtils::StudySpecificVocabulariesByVariable -#' @return character -#' @export -setGeneric("getVariableSpecColumnName", - function(object) standardGeneric("getVariableSpecColumnName"), - signature = "object" -) - -#' @export -setMethod('getVariableSpecColumnName', signature('StudySpecificVocabulariesByVariable'), function(object) { - #since we validate theyre all the same, can just take the first - return(veupathUtils::getColName(object@variableSpec)) -}) - -#' @export -setMethod('getVariableSpecColumnName', signature('StudySpecificVocabulariesByVariableList'), function(object) { - return(unlist(lapply(as.list(object), veupathUtils::getVariableSpecColumnName))) -}) - -#' @export -setGeneric("getEntityId", - function(object) standardGeneric("getEntityId"), - signature = "object" -) - -#' @export -setMethod('getEntityId', signature('VariableSpec'), function(object) { - return(object@entityId) -}) - -#' @export -setMethod('getEntityId', signature('StudySpecificVocabulariesByVariable'), function(object) { - return(veupathUtils::getEntityId(object@variableSpec)) -}) - -#should this be an s4 method? -findEntityIdColumnNameForVariableSpec <- function(varSpec, entityIdColumns) { - if (!inherits(varSpec, 'VariableSpec')) stop("The first argument must be of the S4 class `VariableSpec`.") - - return(entityIdColumns[grepl(varSpec@entityId, entityIdColumns)]) -} - -findStudyVocabularyByVariableSpec <- function(vocabs, variables, variableSpec) { - if (!inherits(vocabs, 'StudySpecificVocabulariesByVariableList')) stop("The first argument must be of the S4 class `StudySpecificVocabulariesByVariableList`.") - if (!inherits(variables, 'VariableMetadataList')) stop("The second argument must be of the S4 class `VariableMetadataList`.") - if (!inherits(variableSpec, 'VariableSpec')) stop("The third argument must be of the S4 class `VariableSpec`.") - - vocabVariableSpecs <- lapply(as.list(vocabs), veupathUtils::getVariableSpec) - vocabVariableMetadata <- veupathUtils::findVariableMetadataFromVariableSpec(variables, veupathUtils::VariableSpecList(S4Vectors::SimpleList(vocabVariableSpecs))) - vocabVariableSpecsAdjustedForVariableCollectionMembers <- veupathUtils::getVariableSpec(vocabVariableMetadata, "Always") - - # if we have found variable collection members in the VariableMetadata, need to check if the passed varspec was a member - # look through the list that includes the members, and if we match one, get the varspec of the parent/ collection - # use the varspec of the parent/ collection to get the VariableMetadata associated w the entire collection - # remember, individual members dont have their own VariableMetadata - if (length(vocabVariableSpecsAdjustedForVariableCollectionMembers) > 0) { - index <- which(purrr::map(vocabVariableSpecsAdjustedForVariableCollectionMembers, function(x) {veupathUtils::getColName(x)}) == veupathUtils::getColName(variableSpec)) - variableCollectionSpecs <- vocabVariableSpecsAdjustedForVariableCollectionMembers[[index]] - index <- which(purrr::map(vocabVariableMetadata, function(x) {veupathUtils::getColName(variableCollectionSpecs) %in% unlist(veupathUtils::getColName(x@members))}) == TRUE) - } else { - index <- which(purrr::map(vocabVariableSpecs, function(x) {veupathUtils::getColName(x)}) == veupathUtils::getColName(variableSpec)) - } - - return(vocabs[[index]]) -} - - -findVariableSpecsFromStudyVocabulary <- function(vocabs, variables, getCollectionMemberVarSpecs = c("Dynamic", "Never", "Always")) { - if (!inherits(vocabs, 'StudySpecificVocabulariesByVariableList')) stop("The first argument must be of the S4 class `StudySpecificVocabulariesByVariableList`.") - if (!inherits(variables, 'VariableMetadataList')) stop("The second argument must be of the S4 class `VariableMetadataList`.") - getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs) - - varSpecsWithVocabs <- VariableSpecList(S4Vectors::SimpleList(lapply(as.list(vocabs), getVariableSpec))) - - if (getCollectionMemberVarSpecs != "Never") { - varMetadataWithVocabs <- findVariableMetadataFromVariableSpec(variables, varSpecsWithVocabs) - varSpecsWithVocabs <- getVariableSpec(varMetadataWithVocabs, getCollectionMemberVarSpecs) - if (is.null(varSpecsWithVocabs)) { - return(NULL) - } - varSpecsWithVocabs <- VariableSpecList(S4Vectors::SimpleList(varSpecsWithVocabs)) - } - - return(varSpecsWithVocabs) -} - -getVariableColumnNames <- function(variableMetadata) { - if (!inherits(variableMetadata, 'VariableMetadata')) stop("The specified object must be of the S4 class `VariableMetadata`.") - - colNames <- veupathUtils::getColName(variableMetadata@variableSpec) - - if (variableMetadata@isCollection) { - colNames <- unlist(lapply(as.list(variableMetadata@members), veupathUtils::getColName)) - } - - return(colNames) -} - -#' Impute Zeroes (on tall data) -#' -#' This function returns a data.table which has explicit zero values -#' for all expected categories of some variable of interest in a megastudy. -#' -#' @param object veupathUtils::Megastudy -#' @return data.table -#' @include class-VariableMetadata.R -#' @export -setGeneric("getDTWithImputedZeroes", - function(object, variables, verbose = c(TRUE, FALSE)) standardGeneric("getDTWithImputedZeroes"), - signature = c("object", "variables") -) - -#' @importFrom digest digest -#' @export -setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadataList'), function (object, variables, verbose = c(TRUE, FALSE)) { - verbose <- veupathUtils::matchArg(verbose) - veupathUtils::logWithTime("Start imputing zeroes...", verbose) - - weightingVariablesMetadata <- findWeightingVariablesMetadata(variables) - if (is.null(weightingVariablesMetadata)) { - veupathUtils::logWithTime("No weighting variables present in the plot. No imputation will be done.", verbose) - return(object@data) - } - - .dt <- object@data - veupathUtils::logWithTime(paste0("Imputing zeroes for data.table with ", ncol(.dt), " columns and ", nrow(.dt), " rows"), verbose) - allEntityIdColumns <- object@ancestorIdColumns - vocabs <- object@studySpecificVocabularies - collectionsDT <- object@collectionsDT - - # it seems a lot of this validation could belong to some custom obj w both a megastudy and vm slot.. but what is that? a MegastudyPlot? - # plus going that route means using this class in plot.data means an api change for plot.data - # that api change might be worth making in any case, but not doing it now - ## TODO validate that any collections variables are present in collectionsDT - variableMetadataNeedingStudyVocabularies <- findStudyDependentVocabularyVariableMetadata(variables) - variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Never") - variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Always") - variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableSpecsWithStudyVocabs) - if (length(variableMetadataForStudyVocabVariables) < length(variableMetadataNeedingStudyVocabularies)) { - stop("Some provided variables require study vocabularies but dont have one.") - } - if (length(weightingVariablesMetadata) > 1) { - stop("Megastudy class does not yet support imputing zeroes when there is more than one weighting variable present.") - } - weightingVarSpecsForStudyVocabVariables <- findWeightingVariableSpecs(variableMetadataForStudyVocabVariables) - if (length(vocabs) > 1) { - weightingVarColName <- unique(unlist(lapply(weightingVarSpecsForStudyVocabVariables, veupathUtils::getColName))) - if (length(weightingVarColName) > 1) { - stop("All study vocabularies must belong to variables on the same entity using the same weighting variable.") - } - } else { - weightingVarColName <- veupathUtils::getColName(findVariableMetadataFromVariableSpec(variables, veupathUtils::getVariableSpec(vocabs[[1]]))[[1]]@weightingVariableSpec) - } - - veupathUtils::logWithTime("Finding variables with study vocabularies...", verbose) - variableSpecsToImputeZeroesFor <- veupathUtils::getVariableSpec(variableMetadataForStudyVocabVariables) - studyIdColName <- getStudyIdColumnName(vocabs) - varSpecColNames <- unlist(lapply(variableSpecsToImputeZeroesFor, veupathUtils::getColName)) - # this works bc we validate all vocabs must be on the same entity - varSpecEntityIdColName <- findEntityIdColumnNameForVariableSpec(veupathUtils::getVariableSpec(vocabs[[1]]), allEntityIdColumns) - variablesFromEntityOfInterest <- findVariableMetadataFromEntityId(variables, veupathUtils::getVariableSpec(vocabs[[1]])@entityId) - variableSpecsFromEntityOfInterest <- veupathUtils::getVariableSpec(variablesFromEntityOfInterest) - if (any(unlist(getHasStudyDependentVocabulary(variablesFromEntityOfInterest)) & - unlist(lapply(variableSpecsFromEntityOfInterest, identical, weightingVarSpecsForStudyVocabVariables[[1]])))) { - stop("Not all variables on the entity associated with the present study vocabulary have study vocabularies.") - } - veupathUtils::logWithTime("Imputing zeroes request validated.", verbose) - - # !!!! this assumes entity ids are passed in order, from a single branch - # alternative would i guess be to make this class aware of the entity diagram - upstreamEntityIdColNames <- allEntityIdColumns[1:(which(allEntityIdColumns %in% varSpecEntityIdColName)-1)] - if (!all(allEntityIdColumns[!allEntityIdColumns %in% varSpecEntityIdColName] %in% upstreamEntityIdColNames)) { - # if we have downstream entities, it doesnt make sense to do all this work. plot.data will just remove the imputed values. - # if/when the map supports missingness and NA values on downstream entities start to matter, we can revisit. - veupathUtils::logWithTime("Downstream entities present. No imputation will be done (for now... mwahahaha).", verbose) - return(.dt) - } - studyEntityIdColName <- upstreamEntityIdColNames[1] # still working off the assumption theyre ordered - - # variables that are from the upstream entities need to be in collectionsDT - # otherwise we erroneously try to impute values for those variables too, rather than only the weighting variable - upstreamEntities <- veupathUtils::strSplit(upstreamEntityIdColNames, ".", 2, 1) - if (!!length(collectionsDT)) { - upstreamEntityVariableColNames <- findColNamesByPredicate(variables, function(x) { x@variableSpec@entityId %in% upstreamEntities }) - if (!all(upstreamEntityVariableColNames %in% names(collectionsDT))) { - stop("All variables from the upstream entities must be in collectionsDT.") - } - } - - # for upstream entities data - upstreamEntityVariables.dt <- .dt[, -c(weightingVarColName, varSpecColNames), with=FALSE] - upstreamEntityVariables.dt[[varSpecEntityIdColName]] <- NULL - upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt) - veupathUtils::logWithTime(paste("Found", nrow(upstreamEntityVariables.dt), "unique existing upstream variable value combinations."), verbose) - if (!!length(collectionsDT)) { - upstreamEntityVariables.dt <- collectionsDT - } - entityIds.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecEntityIdColName), with=FALSE]) - - # make all possible variable value combinations table - vocabDTs <- lapply(vocabs, function(x) {x@studyVocab}) - if (!!length(collectionsDT)) { - vocabDTs <- lapply(vocabDTs, function(x) { merge(x, collectionsDT[, upstreamEntityIdColNames], by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) }) - } - mergeBy <- studyEntityIdColName - if (!!length(collectionsDT)) mergeBy <- upstreamEntityIdColNames - allCombinations.dt <- purrr::reduce(vocabDTs, merge, by = mergeBy, allow.cartesian=TRUE, all=TRUE) - - # find which ones we need to add - presentCombinations.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE]) - # need upstream entity ids for all combinations in order to properly find and merge missing values - allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, by = mergeBy, all = TRUE, allow.cartesian=TRUE) - # NOTE: we're assuming if a value was explicitly filtered against that its not in the vocab - addCombinations.dt <- allCombinations.dt[!presentCombinations.dt, on=c(upstreamEntityIdColNames, varSpecColNames)] - - if (nrow(addCombinations.dt) == 0) { - veupathUtils::logWithTime("No new combinations to add. Returning existing table.", verbose) - return(.dt) - } else { - veupathUtils::logWithTime(paste("Adding", nrow(addCombinations.dt), "new combinations."), verbose) - } - - # go ahead and add them, first filling in values for all columns - addCombinations.dt[[weightingVarColName]] <- 0 - addCombinations.dt[[varSpecEntityIdColName]] <- stringi::stri_rand_strings(nrow(addCombinations.dt), 10) - # bind them to the existing rows - upstreamVariablesInCollectionsDT <- names(collectionsDT)[!names(collectionsDT) %in% upstreamEntityIdColNames] - if (!!length(collectionsDT) & !all(upstreamVariablesInCollectionsDT %in% names(.dt))) { - .dt <- merge(.dt, upstreamEntityVariables.dt) - } - .dt <- data.table::rbindlist(list(.dt, addCombinations.dt), use.names=TRUE) - veupathUtils::logWithTime("Added imputed values to existing table. Finished imputing zeroes.", verbose) - - return(.dt) -}) diff --git a/R/methods-SampleMetadata.R b/R/methods-SampleMetadata.R index eb81fb5..f76ddf3 100644 --- a/R/methods-SampleMetadata.R +++ b/R/methods-SampleMetadata.R @@ -1,8 +1,8 @@ #' @rdname getSampleMetadata #' @aliases getSampleMetadata,SampleMetadata-method setMethod("getSampleMetadata", signature("SampleMetadata"), function(object, asCopy = c(TRUE, FALSE), includeIds = c(TRUE, FALSE)) { - asCopy <- veupathUtils::matchArg(asCopy) - includeIds <- veupathUtils::matchArg(includeIds) + asCopy <- mbioUtils::matchArg(asCopy) + includeIds <- mbioUtils::matchArg(includeIds) dt <- object@data # Check that incoming dt meets requirements diff --git a/R/methods-Statistic.R b/R/methods-Statistic.R deleted file mode 100644 index 0773843..0000000 --- a/R/methods-Statistic.R +++ /dev/null @@ -1,30 +0,0 @@ - - -#' Convert StatisicList to data.table -#' -#' This function returns a data.table representation of a StatisticList. -#' The content of the slots are stored as JSON. Each entry in the list becomes a named column. -#' @param object A StatisticList object -#' @return data.table -#' @export -setGeneric("getDataTable", - function(object) standardGeneric("getDataTable"), - signature = "object" -) - -#' @export -setMethod("getDataTable", signature("StatisticList"), function(object) { - colNames <- unlist(lapply(as.list(object), function(x) {x@name})) - dt <- data.table::as.data.table(as.list(object)) - data.table::setnames(dt, colNames) - - return(dt) -}) - -#' @export -setMethod("getDataTable", signature("Statistic"), function(object) { - colName <- object@name - dt <- data.table::as.data.table(colName = object) - - return(dt) -}) \ No newline at end of file diff --git a/R/methods-VariableMetadata.R b/R/methods-VariableMetadata.R index e3da533..3d31335 100644 --- a/R/methods-VariableMetadata.R +++ b/R/methods-VariableMetadata.R @@ -14,18 +14,18 @@ setGeneric("merge", #'@export setMethod("merge", signature("VariableMetadataList", "VariableMetadataList"), function(x,y) { - veupathUtils::VariableMetadataList(S4Vectors::SimpleList(c(as.list(x), as.list(y)))) + mbioUtils::VariableMetadataList(S4Vectors::SimpleList(c(as.list(x), as.list(y)))) }) #'@export setMethod("merge", signature("VariableSpecList", "VariableSpecList"), function(x,y) { - veupathUtils::VariableSpecList(S4Vectors::SimpleList(c(as.list(x), as.list(y)))) + mbioUtils::VariableSpecList(S4Vectors::SimpleList(c(as.list(x), as.list(y)))) }) #' R object as JSON string #' #' This function converts an R object to a JSON string. -#' see `methods(veupathUtils::toJSON)` for a list of support classes. +#' see `methods(mbioUtils::toJSON)` for a list of support classes. #' @param object object of a supported S4 class to convert to a JSON string representation #' @param named logical indicating whether the result should be a complete named JSON object or just the value of the object #' @return character vector of length 1 containing JSON string @@ -40,7 +40,7 @@ setGeneric("toJSON", #' @export setMethod("toJSON", signature("Bin"), function(object, named = c(TRUE, FALSE)) { - named <- veupathUtils::matchArg(named) + named <- mbioUtils::matchArg(named) # possible well want to make these optional, rather than null start_json <- jsonlite::toJSON(jsonlite::unbox(object@binStart), na = 'null') @@ -69,7 +69,7 @@ setMethod("toJSON", signature("Bin"), function(object, named = c(TRUE, FALSE)) { #' @export setMethod("toJSON", signature("BinList"), function(object, named = c(TRUE, FALSE)) { - named <- veupathUtils::matchArg(named) + named <- mbioUtils::matchArg(named) tmp <- S4SimpleListToJSON(object, named) if (named) tmp <- paste0('{"bins":', tmp, "}") @@ -77,70 +77,13 @@ setMethod("toJSON", signature("BinList"), function(object, named = c(TRUE, FALSE return(tmp) }) -#' @export -setMethod("toJSON", signature("Range"), function(object, named = c(TRUE, FALSE)) { - named <- veupathUtils::matchArg(named) - - if (all(is.na(c(object@minimum, object@maximum)))) { - range_string <- NA - } else { - range_string <- paste0('(', object@minimum, ' - ', object@maximum, ')') - } - - range_json <- jsonlite::toJSON(jsonlite::unbox(range_string)) - - if (named) { - range_json <- paste0('{"range":', range_json, "}") - } - - return(range_json) -}) - -#' @export -setMethod("toJSON", signature("Statistic"), function(object, named = c(TRUE, FALSE)) { - named <- veupathUtils::matchArg(named) - - value_json <- jsonlite::toJSON(jsonlite::unbox(object@value), na = 'null') - tmp <- paste0('"value":', value_json) - - ci_json <- veupathUtils::toJSON(object@confidenceInterval, FALSE) - tmp <- paste0(tmp, ',"confidenceInterval":', ci_json) - - conf_level_json <- jsonlite::toJSON(jsonlite::unbox(object@confidenceLevel), na = 'null') - tmp <- paste0(tmp, ',"confidenceLevel":', conf_level_json) - - pvalue_json <- jsonlite::toJSON(jsonlite::unbox(object@pvalue)) - tmp <- paste0(tmp, ',"pvalue":', pvalue_json) - - tmp <- paste0("{", tmp, "}") - if (named) { - tmp <- paste0('{"', object@name, '":', tmp, "}") - } - - return(tmp) -}) - -#' @export -setMethod("toJSON", signature("StatisticList"), function(object, named = c(TRUE, FALSE)) { - named <- veupathUtils::matchArg(named) - tmp <- S4SimpleListToJSON(object, TRUE) - - if (named) tmp <- paste0('{"statistics":', tmp, "}") - - return(tmp) -}) - -# these let jsonlite::toJSON work by using the veupathUtils::toJSON methods for our custom S4 classes -asJSONGeneric <- getGeneric("asJSON", package = "jsonlite") -setMethod(asJSONGeneric, "Statistic", function(x, ...) veupathUtils::toJSON(x, FALSE)) -setMethod(asJSONGeneric, "StatisticList", function(x, ...) veupathUtils::toJSON(x, FALSE)) ############################################################################################## #' @export setMethod("toJSON", signature("VariableClass"), function(object, named = c(TRUE, FALSE)) { - named <- veupathUtils::matchArg(named) + named <- mbioUtils::matchArg(named) tmp <- jsonlite::toJSON(jsonlite::unbox(object@value)) if (named) tmp <- paste0('{"variableClass":', tmp, '}') @@ -150,7 +93,7 @@ setMethod("toJSON", signature("VariableClass"), function(object, named = c(TRUE, #' @export setMethod("toJSON", signature("VariableSpec"), function(object, named = c(TRUE, FALSE)) { - named <- veupathUtils::matchArg(named) + named <- mbioUtils::matchArg(named) tmp <- list("variableId" = jsonlite::unbox(object@variableId), "entityId" = jsonlite::unbox(object@entityId)) @@ -161,7 +104,7 @@ setMethod("toJSON", signature("VariableSpec"), function(object, named = c(TRUE, #' @export setMethod("toJSON", signature("PlotReference"), function(object, named = c(TRUE, FALSE)) { - named <- veupathUtils::matchArg(named) + named <- mbioUtils::matchArg(named) tmp <- jsonlite::toJSON(jsonlite::unbox(object@value)) if (named) tmp <- paste0('{"plotReference":', tmp, '}') @@ -171,7 +114,7 @@ setMethod("toJSON", signature("PlotReference"), function(object, named = c(TRUE, #' @export setMethod("toJSON", signature("VariableSpecList"), function(object, named = c(TRUE, FALSE)) { - named <- veupathUtils::matchArg(named) + named <- mbioUtils::matchArg(named) tmp <- S4SimpleListToJSON(object, named) if (named) tmp <- paste0('{"variableSpecs":', tmp, "}") @@ -181,7 +124,7 @@ setMethod("toJSON", signature("VariableSpecList"), function(object, named = c(TR #' @export setMethod("toJSON", signature("DataType"), function(object, named = c(TRUE, FALSE)) { - named <- veupathUtils::matchArg(named) + named <- mbioUtils::matchArg(named) tmp <- jsonlite::unbox(jsonlite::unbox(tolower(object@value))) if (named) tmp <- list("dataType" = tmp) @@ -191,7 +134,7 @@ setMethod("toJSON", signature("DataType"), function(object, named = c(TRUE, FALS #' @export setMethod("toJSON", signature("DataShape"), function(object, named = c(TRUE, FALSE)) { - named <- veupathUtils::matchArg(named) + named <- mbioUtils::matchArg(named) tmp <- jsonlite::unbox(jsonlite::unbox(tolower(object@value))) if (named) tmp <- list("dataShape" = tmp) @@ -201,17 +144,17 @@ setMethod("toJSON", signature("DataShape"), function(object, named = c(TRUE, FAL #' @export setMethod("toJSON", signature("VariableMetadata"), function(object, named = c(TRUE, FALSE)) { - named <- veupathUtils::matchArg(named) + named <- mbioUtils::matchArg(named) tmp <- character() - variable_class_json <- veupathUtils::toJSON(object@variableClass, named = FALSE) - variable_spec_json <- veupathUtils::toJSON(object@variableSpec, named = FALSE) + variable_class_json <- mbioUtils::toJSON(object@variableClass, named = FALSE) + variable_spec_json <- mbioUtils::toJSON(object@variableSpec, named = FALSE) tmp <- paste0('"variableClass":', variable_class_json, ',"variableSpec":', variable_spec_json) if (!is.na(object@plotReference@value)) { - plot_reference_json <- veupathUtils::toJSON(object@plotReference, named = FALSE) + plot_reference_json <- mbioUtils::toJSON(object@plotReference, named = FALSE) tmp <- paste0(tmp, ',"plotReference":', plot_reference_json) } @@ -231,12 +174,12 @@ setMethod("toJSON", signature("VariableMetadata"), function(object, named = c(TR } if (!is.na(object@dataType@value)) { - data_type_json <- veupathUtils::toJSON(object@dataType, named = FALSE) + data_type_json <- mbioUtils::toJSON(object@dataType, named = FALSE) tmp <- paste0(tmp, ',"dataType":', data_type_json) } if (!is.na(object@dataType@value)) { - data_shape_json <- veupathUtils::toJSON(object@dataShape, named = FALSE) + data_shape_json <- mbioUtils::toJSON(object@dataShape, named = FALSE) tmp <- paste0(tmp, ',"dataShape":', data_shape_json) } @@ -249,14 +192,14 @@ setMethod("toJSON", signature("VariableMetadata"), function(object, named = c(TR tmp <- paste0(tmp, ',"imputeZero":', jsonlite::toJSON(jsonlite::unbox(object@imputeZero))) if (!is.na(object@weightingVariableSpec@variableId)) { - weighting_variable_spec_json <- veupathUtils::toJSON(object@weightingVariableSpec, named = FALSE) + weighting_variable_spec_json <- mbioUtils::toJSON(object@weightingVariableSpec, named = FALSE) tmp <- paste0(tmp, ',"weightingVariableSpec":', weighting_variable_spec_json) } tmp <- paste0(tmp, ',"hasStudyDependentVocabulary":', jsonlite::toJSON(jsonlite::unbox(object@hasStudyDependentVocabulary))) if (!!length(object@members)) { - members_json <- veupathUtils::toJSON(object@members, named = FALSE) + members_json <- mbioUtils::toJSON(object@members, named = FALSE) tmp <- paste0(tmp, ',"members":', members_json) } @@ -268,7 +211,7 @@ setMethod("toJSON", signature("VariableMetadata"), function(object, named = c(TR #' @export setMethod("toJSON", signature("VariableMetadataList"), function(object, named = c(TRUE, FALSE)) { - named <- veupathUtils::matchArg(named) + named <- mbioUtils::matchArg(named) tmp <- S4SimpleListToJSON(object, FALSE) if (named) tmp <- paste0('{"variables":', tmp, "}") @@ -316,10 +259,10 @@ setGeneric("findWeightingVariablesMetadata", #' @export setMethod("findWeightingVariablesMetadata", signature("VariableMetadataList"), function(variables) { - weightingVarSpecs <- veupathUtils::findWeightingVariableSpecs(variables) - weightingVarSpecsColumnNames <- unlist(lapply(weightingVarSpecs, veupathUtils::getColName)) + weightingVarSpecs <- mbioUtils::findWeightingVariableSpecs(variables) + weightingVarSpecsColumnNames <- unlist(lapply(weightingVarSpecs, mbioUtils::getColName)) - weightingVarIndex <- which(purrr::map(as.list(variables), function(x) {veupathUtils::getColName(x@variableSpec)}) %in% weightingVarSpecsColumnNames) + weightingVarIndex <- which(purrr::map(as.list(variables), function(x) {mbioUtils::getColName(x@variableSpec)}) %in% weightingVarSpecsColumnNames) if (!length(weightingVarIndex)) return(NULL) return(variables[weightingVarIndex]) @@ -339,7 +282,7 @@ setMethod("findWeightingVariableSpecs", signature("VariableMetadata"), function( #TODO should this return a VariableSpecList? #' @export setMethod("findWeightingVariableSpecs", signature("VariableMetadataList"), function(object) { - return(lapply(as.list(object), veupathUtils::findWeightingVariableSpecs)) + return(lapply(as.list(object), mbioUtils::findWeightingVariableSpecs)) }) #' EDA Variable Metadata with a Study-dependent Vocabulary @@ -378,7 +321,7 @@ setMethod("getHasStudyDependentVocabulary", signature("VariableMetadata"), funct #' @export setMethod("getHasStudyDependentVocabulary", signature("VariableMetadataList"), function(object) { - return(lapply(as.list(object), veupathUtils::getHasStudyDependentVocabulary)) + return(lapply(as.list(object), mbioUtils::getHasStudyDependentVocabulary)) }) #' EDA Variable Metadata which needs weighting @@ -419,7 +362,7 @@ setGeneric("findVariableMetadataFromPlotRef", #' @export setMethod("findVariableMetadataFromPlotRef", signature("VariableMetadataList"), function(variables, plotRef) { - index <- veupathUtils::findIndexFromPlotRef(variables, plotRef) + index <- mbioUtils::findIndexFromPlotRef(variables, plotRef) if (!length(index)) return(NULL) return(variables[[index]]) @@ -454,7 +397,7 @@ setGeneric("findVariableSpecFromPlotRef", #' @export setMethod("findVariableSpecFromPlotRef", signature("VariableMetadataList"), function(variables, plotRef) { - index <- veupathUtils::findIndexFromPlotRef(variables, plotRef) + index <- mbioUtils::findIndexFromPlotRef(variables, plotRef) if (!length(index)) return(NULL) return(variables[[index]]@variableSpec) @@ -496,11 +439,11 @@ setGeneric("findColNamesFromPlotRef", #' @export setMethod("findColNamesFromPlotRef", signature("VariableMetadataList"), function(variables, plotRef) { - colNames <- veupathUtils::findColNamesByPredicate(variables, function(x) {if (!is.na(x@plotReference@value) && x@plotReference@value == plotRef && !x@isCollection) TRUE}) + colNames <- mbioUtils::findColNamesByPredicate(variables, function(x) {if (!is.na(x@plotReference@value) && x@plotReference@value == plotRef && !x@isCollection) TRUE}) if (!length(colNames)) { - collectionVM <- veupathUtils::findCollectionVariableMetadata(variables) + collectionVM <- mbioUtils::findCollectionVariableMetadata(variables) if (!length(collectionVM)) return(NULL) - if (collectionVM@plotReference@value == plotRef) colNames <- unlist(lapply(as.list(collectionVM@members), veupathUtils::getColName)) + if (collectionVM@plotReference@value == plotRef) colNames <- unlist(lapply(as.list(collectionVM@members), mbioUtils::getColName)) } return(colNames) @@ -520,11 +463,11 @@ setGeneric("findAllColNames", #' @export setMethod("findAllColNames", signature("VariableMetadataList"), function(variables) { - colNames <- veupathUtils::findColNamesByPredicate(variables, function(x) {if (!x@isCollection) TRUE}) + colNames <- mbioUtils::findColNamesByPredicate(variables, function(x) {if (!x@isCollection) TRUE}) if (!length(colNames)) { - collectionVM <- veupathUtils::findCollectionVariableMetadata(variables) + collectionVM <- mbioUtils::findCollectionVariableMetadata(variables) if (!length(collectionVM)) return(NULL) - colNames <- unlist(lapply(as.list(collectionVM@members), veupathUtils::getColName)) + colNames <- unlist(lapply(as.list(collectionVM@members), mbioUtils::getColName)) } return(colNames) @@ -549,7 +492,7 @@ setMethod("findDataTypesFromPlotRef", signature("VariableMetadataList"), functio dataTypes <- purrr::map(as.list(variables), function(x) { if(!is.na(x@plotReference@value) && x@plotReference@value == plotRef) { return(x@dataType@value) } }) - return(veupathUtils::toStringOrNull(unlist(dataTypes))) + return(mbioUtils::toStringOrNull(unlist(dataTypes))) }) #' EDA Variable Data Shapes matching a PlotReference @@ -571,7 +514,7 @@ setMethod("findDataShapesFromPlotRef", signature("VariableMetadataList"), functi dataShapes <- purrr::map(as.list(variables), function(x) { if(!is.na(x@plotReference@value) && x@plotReference@value == plotRef) { return(x@dataShape@value) } }) - return(veupathUtils::toStringOrNull(unlist(dataShapes))) + return(mbioUtils::toStringOrNull(unlist(dataShapes))) }) #' EDA Variable Column Name of a VariableSpec @@ -600,12 +543,12 @@ setMethod("getColName", signature("VariableSpec"), function(varSpec) { } if (entityId == '' || is.na(entityId)) return(varSpec@variableId) - return(veupathUtils::toStringOrNull(paste0(entityId, ".", varId))) + return(mbioUtils::toStringOrNull(paste0(entityId, ".", varId))) }) #' @export setMethod("getColName", signature("VariableSpecList"), function(varSpec) { - lapply(as.list(varSpec), veupathUtils::getColName) + lapply(as.list(varSpec), mbioUtils::getColName) }) #' @export @@ -628,7 +571,7 @@ setGeneric("findColNamesByPredicate", #' @export setMethod("findColNamesByPredicate", signature("VariableMetadataList"), function(variables, predicateFunction) { # For each variable in the variable list, return the column name if the predicate is true for that variable - colNames <- purrr::map(as.list(variables), function(x) {if (identical(predicateFunction(x), TRUE)) {return(veupathUtils::getColName(x@variableSpec))}}) + colNames <- purrr::map(as.list(variables), function(x) {if (identical(predicateFunction(x), TRUE)) {return(mbioUtils::getColName(x@variableSpec))}}) colNames <- unlist(colNames) return (colNames) @@ -642,10 +585,10 @@ setGeneric("findVariableMetadataFromVariableSpec", #' @export setMethod("findVariableMetadataFromVariableSpec", signature("VariableMetadataList", "VariableSpecList"), function(variables, object) { - variableSpecs <- unlist(lapply(as.list(variables), veupathUtils::getVariableSpec, "Never")) - colNamesToMatch <- unlist(lapply(as.list(object), veupathUtils::getColName)) + variableSpecs <- unlist(lapply(as.list(variables), mbioUtils::getVariableSpec, "Never")) + colNamesToMatch <- unlist(lapply(as.list(object), mbioUtils::getColName)) - index <- which(purrr::map(variableSpecs, function(x) {veupathUtils::getColName(x)}) %in% colNamesToMatch) + index <- which(purrr::map(variableSpecs, function(x) {mbioUtils::getColName(x)}) %in% colNamesToMatch) if (!length(index)) return(NULL) @@ -654,9 +597,9 @@ setMethod("findVariableMetadataFromVariableSpec", signature("VariableMetadataLis #' @export setMethod("findVariableMetadataFromVariableSpec", signature("VariableMetadataList", "VariableSpec"), function(variables, object) { - variableSpecs <- unlist(lapply(as.list(variables), veupathUtils::getVariableSpec, "Never")) + variableSpecs <- unlist(lapply(as.list(variables), mbioUtils::getVariableSpec, "Never")) - index <- which(purrr::map(variableSpecs, function(x) {veupathUtils::getColName(x)}) == veupathUtils::getColName(object)) + index <- which(purrr::map(variableSpecs, function(x) {mbioUtils::getColName(x)}) == mbioUtils::getColName(object)) if (!length(index)) return(NULL) return(variables[index]) diff --git a/R/utils-classes.R b/R/utils-classes.R index 24cc3ac..045cd9f 100644 --- a/R/utils-classes.R +++ b/R/utils-classes.R @@ -5,7 +5,7 @@ S4SimpleListToJSON <- function(S4SimpleList, named = c(TRUE, FALSE)) { if (!inherits(S4SimpleList, 'SimpleList')) stop("S4SimpleListToJSON only accepts an S4Vectors::SimpleList as input.", class(S4SimpleList), "was provided.") tmp <- as.list(S4SimpleList) - tmp <- lapply(tmp, veupathUtils::toJSON, named) + tmp <- lapply(tmp, mbioUtils::toJSON, named) tmp <- paste(tmp, collapse = ",") if (tmp != "") tmp <- paste0("[", tmp, "]") diff --git a/R/utils-cut.R b/R/utils-cut.R index 4a3bb4f..2573525 100644 --- a/R/utils-cut.R +++ b/R/utils-cut.R @@ -52,7 +52,7 @@ cut_width <- function(x, width, center = NULL, boundary = NULL, closed = c("righ x <- as.numeric(x) width <- as.numeric(width) - closed <- veupathUtils::matchArg(closed) + closed <- mbioUtils::matchArg(closed) x_range <- range(x, na.rm = TRUE, finite = TRUE) if (length(x_range) == 0) { @@ -196,7 +196,7 @@ find_origin <- function(x_range, width, boundary) { } breaks <- function(x, method = c('equalInterval', 'quantile', 'sd'), nbins = NULL, binwidth = NULL) { - method <- veupathUtils::matchArg(method) + method <- mbioUtils::matchArg(method) if (method != 'sd') { if ((!is.null(nbins) && !is.null(binwidth)) || (is.null(nbins) && is.null(binwidth))) { stop("Specify exactly one of n and width for methods `equalInterval` and `quantile`.") diff --git a/R/utils-numeric.R b/R/utils-numeric.R index 9f798e0..c1887e5 100644 --- a/R/utils-numeric.R +++ b/R/utils-numeric.R @@ -1,305 +1,14 @@ -#' Find Bin Ranges for a Continuous Variable -#' -#' This function will find bin start, end and labels for a -#' continuous variable. Optionally, it can return a value/ count -#' per bin. By default returns 10 bins for equalRanges and quantile -#' methods and 6 for sd (standard deviations). -#' @param x Numeric (or Date) vector to find bins for -#' @param method A string indicating which method to use to find bins ('equalRanges', 'quantile', 'sd') -#' @param numBins A number indicating how many bins are desired -#' @param getValue A boolean indicating whether to return the counts per bin -#' @export -getDiscretizedBins <- function(x, method = c('equalInterval', 'quantile', 'sd'), numBins = NULL, getValue = c(TRUE, FALSE)) { - method <- veupathUtils::matchArg(method) - getValue <- veupathUtils::matchArg(getValue) - if (is.null(numBins)) numBins <- 10 - - isDate <- FALSE - if (class(x) == 'Date') { - x <- as.numeric(x) - isDate <- TRUE - } - - binEdges <- unname(breaks(x, method, numBins)) - if (anyDuplicated(binEdges)) { - warning("There is insufficient data to produce the requested number of bins. Returning as many bins as possible.") - binEdges <- unique(binEdges) - } - - if (isDate) { - binEdges <- as.Date(binEdges, origin = "1970-01-01") # Origin matches as.numeric default origin - } - - binStarts <- binEdges[1:(length(binEdges)-1)] - binEnds <- binEdges[2:length(binEdges)] - if (length(binEdges) == 1) binEnds <- binEnds[[2]] - - # only format human-friendly labels. binStarts and binEnds should provide exact values - # must also guarantee that the first binStart and last binEnd encompass the full data range even after formatting - if (isDate) { - # Dates are already human-friendly. - formattedBinStarts <- binStarts - formattedBinEnds <- binEnds - } else { - formattedBinStarts <- formatC(binStarts) - formattedBinEnds <- formatC(binEnds) - } - # think the alternative is to write a recursive fxn to call formatC w more digits until we get a result we like. - # that seems costly, so ill wait to do that until we see how much an issue this really is - if (as.numeric(formattedBinStarts[[1]]) > as.numeric(binStarts[[1]])) formattedBinStarts[[1]] <- as.character(binStarts[[1]]) - if (as.numeric(formattedBinEnds[[length(binEnds)]]) < as.numeric(binEnds[[length(binEnds)]])) formattedBinEnds[[length(binEnds)]] <- as.character(binEnds[[length(binEnds)]]) - - binLabels <- paste0("(",formattedBinStarts,", ", formattedBinEnds, "]") - binLabels[[1]] <- gsub("(","[",binLabels[[1]], fixed=T) - - if (getValue) { - if (length(binEdges) == 1) { - values <- 1 - } else { - values <- c(table(cut(x, binEdges, include.lowest=TRUE))) - } - } else { - values <- rep(NA_real_, length(binStarts)) - } - - # For numeric vars, coerce bin starts and ends to character so as to not lose any precision. - # It's possible we wouldnt lose precision regardless, but that's something we can look into in the future. - # We don't want to do this for dates, because then we loose the date being a date - if (isDate) { - bins <- lapply(1:length(binStarts), FUN = function(x) { Bin(binStart = binStarts[[x]], - binEnd = binEnds[[x]], - binLabel = binLabels[[x]], - value = values[[x]])}) - - } else { - bins <- lapply(1:length(binStarts), FUN = function(x) { Bin(binStart = as.character(binStarts[[x]]), - binEnd = as.character(binEnds[[x]]), - binLabel = binLabels[[x]], - value = values[[x]])}) - } - - return(BinList(S4Vectors::SimpleList(bins))) -} - -#' Non-Zero Rounding -#' -#' This function will recursively attempt to round a value to -#' greater and greater precision until it results in a non-zero -#' value. One consequence of this is that the precision of the -#' output value may not be exactly what was requested. -#' @param x Numeric value to round -#' @param digits Number indicating the desired precision -#' @return number rounded as nearly to the requested precision as -#' possible without returning zero. -#' @export -nonZeroRound <- function(x, digits) { - if (x == 0) { - warning("Input is already zero and cannot be rounded to a non-zero number.") - return(x) - } - if (round(x,digits) == 0) { - Recall(x,digits+1) - } else { - round(x,digits) - } -} - -#' Replace numeric NAs with 0 - update by reference -#' -#' This function replaces NAs in numeric columns with 0. -#' @param x data.table, data.frame, or list -#' @param cols vector of column names for which the NA replacement should occur. -#' Default is all numeric columns. -#' @return x with desired NAs replaced with 0. -#' @export -setNaToZero <- function(df, cols = NULL) { - - # if cols not set, use all numeric cols - if (is.null(cols)) cols <- findNumericCols(df) - cols <- validateNumericCols(df, cols=cols) - - data.table::setnafill(df, fill = 0, cols = cols) -} - - -#' Replace numeric NAs with 0 -#' -#' This function replaces numeric NAs with 0. -#' @param x list, data.frame, array, or vector -#' @param cols Optional. When appropriate, vector of column names -#' for which the NA replacement should occur. -#' Default is all numeric columns. -#' @return object the same type as x, where desired NAs are replaced with 0. -#' @export -naToZero <- function(x, ...) { - UseMethod("naToZero") -} - -#' @export -naToZero.data.table <- function(x, cols = NULL) { - - # if cols not set, use all numeric cols - if (is.null(cols)) cols <- findNumericCols(x) - cols <- validateNumericCols(x, cols) - - x[, cols] <- x[, ..cols][, lapply(.SD, function(y){y[is.na(y)] <- 0; y})] - return(x) -} - -#' @export -naToZero.data.frame <- function(x, cols = NULL) { - - # if cols not set, use all numeric cols - if (is.null(cols)) cols <- findNumericCols(x) - cols <- validateNumericCols(x, cols) - - x[cols][is.na(x[cols])] <- 0 - return(x) -} - -#' @export -naToZero.list <- function(x, cols = NULL) { - - # if cols not set, use all numeric cols - if (is.null(cols)) cols <- findNumericCols(x) - cols <- validateNumericCols(x, cols) - - x[cols] <- lapply(x[cols], function(y) {y[is.na(y)] <- 0; return(y)}) - return(x) -} - -#' @export -naToZero.default <- function(x) { - numericEntries <- purrr::map_lgl(x, is.numeric) - x[numericEntries][is.na(x[numericEntries])] <- 0 - return(x) -} - - #' Find numeric columns #' -#' This function finds all numeric columns in a list -#' @param x list, data.frame, array, or vector +#' This function finds all numeric columns in a data structure +#' @param x list, data.frame, data.table, or vector #' @return vector of numeric column names in x. If no numeric columns found, returns NULL -#' @importFrom purrr map_lgl #' @export findNumericCols <- function(x) { - numericCols <- names(x)[purrr::map_lgl(x, is.numeric)] + numericCols <- names(x)[sapply(x, is.numeric)] # If no numeric cols, return NULL if (!length(numericCols)) return(NULL) - + return(numericCols) } - - -#' Validate numeric columns -#' -#' Given a vector of column names or indices, this function ensures all -#' referenced columns are numeric -#' @param x list, data.frame, array, data.table -#' @param cols vector of column names or column indices. NAs will be removed. -#' @return given column names -#' @importFrom purrr map_lgl -#' @import data.table -#' @export -validateNumericCols <- function(x, cols, ...) { - UseMethod("validateNumericCols") -} - -#' @importFrom purrr map_lgl -#' @export -validateNumericCols.data.table <- function(x, cols) { - if (any(is.na(cols))) {cols <- cols[!is.na(cols)]; warning("validateNumericCols warning: NAs in cols removed")} - if (!length(cols)) {warning("validateNumericCols warning: no numeric columns given"); return(cols)} - if (is.character(cols)) { - if (!all(cols %in% names(x))) stop('validateNumericCols failed: Column name not found in the input') - } else { - if (max(cols) > ncol(x) | min(cols) < 1) stop('validateNumericCols failed: column index does not represent a valid column') - } - if (!all(purrr::map_lgl(x[, ..cols], is.numeric))) stop('validateNumericCols failed: All columns must be numeric') - return(cols) -} - -#' @importFrom purrr map_lgl -#' @export -validateNumericCols.list <- function(x, cols) { - if (any(is.na(cols))) {cols <- cols[!is.na(cols)]; warning("validateNumericCols warning: NAs in cols removed")} - if (!length(cols)) {warning("validateNumericCols warning: no numeric columns given"); return(cols)} - if (is.character(cols)) { - if (!all(cols %in% names(x))) stop('validateNumericCols failed: Column name not found in the input') - } else { - if (max(cols) > length(names(x)) | min(cols) < 1) stop('validateNumericCols failed: column index does not represent a valid column') - } - if (!all(purrr::map_lgl(x[cols], is.numeric))) stop('validateNumericCols failed: All columns must be numeric') - return(cols) -} - - -#' @importFrom purrr map_lgl -#' @export -validateNumericCols.default <- function(x, cols) { - if (any(is.na(cols))) {cols <- cols[!is.na(cols)]; warning("validateNumericCols warning: NAs in cols removed")} - if (!length(cols)) {warning("validateNumericCols warning: no numeric columns given"); return(cols)} - if (is.character(cols)) { - if (!all(cols %in% names(x))) stop('validateNumericCols failed: Column name not found in the input') - } else { - if (max(cols) > ncol(x) | min(cols) < 1) stop('validateNumericCols failed: column index does not represent a valid column') - } - if (!all(purrr::map_lgl(x[cols], is.numeric))) stop('validateNumericCols failed: All columns must be numeric') - return(cols) -} - -# -# For any number, return an absolute delta (numeric) at the last -# significant digit in the number, using the number of digits specified -# -# e.g. assuming 3 significant digits -# -# 1.23 -> 0.01 -# 11.0 -> 0.1 -# 12.3 -> 0.1 -# 101000 -> 1000 -# 1.20e-05 -> 0.01e-05 == 1.0e-07 -# 0.0123e-05 -> 0.0001e-05 == 1.0e-09 -# -2.34e-02 -> 0.01e-02 == 1.0e-04 -# -signifDigitEpsilon <- function(x, digits) { - - # '#' flag ensures trailing zeroes - # take abs() here because we don't care about sign - rounded <- formatC(abs(x), digits = digits, width = 1L, flag = '#') - - # split into vector of single characters - characters <- strsplit(rounded, '') - - result <- c() - seenSignificant <- FALSE - significantCount <- 0 - # walk through string, looking for first non-zero, non decimal point character - for (c in unlist(characters)) { - if (!(c %in% c('0', '.'))) { - seenSignificant <- TRUE - } - if (c == '.') { - result <- c(result, c) - } else if (seenSignificant) { - significantCount <- significantCount + 1 - if (significantCount < digits) { - result <- c(result, '0') - } else if (significantCount == digits) { - result <- c(result, '1') - } else { - # we're out of the significant digits - # we must be in the exponent part (if present) or in trailing zeroes (e.g. in 101000 example) - # so just copy it over - result <- c(result, c) - } - } else { - result <- c(result, '0') - } - } - - # return joined result as a number - as.numeric(paste(result, collapse="")) -} diff --git a/R/utils.R b/R/utils.R index c241271..3ee60b1 100644 --- a/R/utils.R +++ b/R/utils.R @@ -115,7 +115,7 @@ stripEntityIdFromColumnHeader <- function(columnNames) { columnsToFix <- grepl(".", columnNames, fixed=T) if (sum(columnsToFix) > 0) { - columnNames[columnsToFix] <- veupathUtils::strSplit(columnNames[columnsToFix], ".", index=2) + columnNames[columnsToFix] <- mbioUtils::strSplit(columnNames[columnsToFix], ".", index=2) } return(columnNames) diff --git a/README.md b/README.md index 033c1ae..f37f900 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,30 @@ - [![R-CMD-check](https://github.com/microbiomeDB/veupathUtils/workflows/R-CMD-check/badge.svg)](https://github.com/microbiomeDB/veupathUtils/actions) + [![R-CMD-check](https://github.com/microbiomeDB/mbioUtils/workflows/R-CMD-check/badge.svg)](https://github.com/microbiomeDB/mbioUtils/actions) -# veupathUtils +# mbioUtils -veupathUtils is an R package which provides helper functions for solving common problems in the MicrobiomeDB project. +mbioUtils is an R package which provides utility functions and data structures for microbiomeDB R packages. It was forked from veupathUtils 2.6.7 and stripped of VEuPathDB-specific functionality. ## Installation -Use the R package [remotes](https://cran.r-project.org/web/packages/remotes/index.html) to install veupathUtils. From the R command prompt: +Use the R package [remotes](https://cran.r-project.org/web/packages/remotes/index.html) to install mbioUtils. From the R command prompt: ```R -remotes::install_github('microbiomeDB/veupathUtils') +remotes::install_github('microbiomeDB/mbioUtils') ``` ## Usage This package is primarily intended for use as a dependency in other R packages. In order to establish that depedency the developer of the dependent package must follow these steps: -1. add ```veupathUtils``` to the ```Imports``` section of the dependent package's ```DESCRIPTION``` file. +1. add ```mbioUtils``` to the ```Imports``` section of the dependent package's ```DESCRIPTION``` file. 2. add a ```Remotes``` section to the dependent package's ```DESCRIPTION``` file. -3. add ```microbiomeDB/veupathUtils``` to the ```Remotes``` section of the dependent package's ```DESCRIPTION``` file. -4. add ```#' @import veupathUtils``` to the dependent package's package-level documentation file (usually called ```{mypackage}-package.R```). +3. add ```microbiomeDB/mbioUtils``` to the ```Remotes``` section of the dependent package's ```DESCRIPTION``` file. +4. add ```#' @import mbioUtils``` to the dependent package's package-level documentation file (usually called ```{mypackage}-package.R```). 5. run ```devtools::document()```. The developer of the dependent package can either install this package using ```remotes``` as descripted in the "Installation" section above, -or if they mean to also develop veupathUtils simultaneously, can use ```devtools::load_all("{path-to-veupathUtils}")``` to load this package in +or if they mean to also develop mbioUtils simultaneously, can use ```devtools::load_all("{path-to-mbioUtils}")``` to load this package in their R session. ## Contributing diff --git a/data/megastudyDataReal.rda b/data/megastudyDataReal.rda deleted file mode 100644 index 1cdde00..0000000 Binary files a/data/megastudyDataReal.rda and /dev/null differ diff --git a/data/studyVocabsReal.rda b/data/studyVocabsReal.rda deleted file mode 100644 index 7760af5..0000000 Binary files a/data/studyVocabsReal.rda and /dev/null differ diff --git a/man/ComputeResult-class.Rd b/man/ComputeResult-class.Rd index 1930df3..962cad9 100644 --- a/man/ComputeResult-class.Rd +++ b/man/ComputeResult-class.Rd @@ -20,7 +20,7 @@ This includes their representation in R, as JSON and how they are written to fil \item{\code{ancestorIdColumns}}{A character vector of column names representing parent entities of the recordIdColumn.} -\item{\code{computedVariableMetadata}}{veupathUtils::VariableMetadataList detailing the computed variables.} +\item{\code{computedVariableMetadata}}{mbioUtils::VariableMetadataList detailing the computed variables.} \item{\code{statistics}}{An optional slot of any values. List or data.frame are recommended. It is not required to have rows or cols map to samples.} diff --git a/man/Megastudy-class.Rd b/man/Megastudy-class.Rd deleted file mode 100644 index 1527fa1..0000000 --- a/man/Megastudy-class.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/class-Megastudy.R -\docType{class} -\name{Megastudy-class} -\alias{Megastudy-class} -\alias{Megastudy} -\title{Megastudy} -\description{ -A class to encapsulate everything we need for our special handling -of 'megastudies' in EDA. Currently that is imputing zeroes on tall data -given that each (sub-)study has different expected vocabularies for -that data. -} -\section{Slots}{ - -\describe{ -\item{\code{data}}{A data.table} - -\item{\code{ancestorIdColumns}}{A character vector of column names representing parent entities of the recordIdColumn.} - -\item{\code{studySpecificVocabularies}}{veupathUtils::StudySpecificVocabulariesByVariableList} - -\item{\code{collectionIds}}{A data.table including collection ids and any variables of interest for the collection entity. -If none provided, the collection ids will be inferred from those present in \code{data}.} -}} - diff --git a/man/Range-class.Rd b/man/Range-class.Rd deleted file mode 100644 index c4f55e5..0000000 --- a/man/Range-class.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/class-Range.R -\docType{class} -\name{Range-class} -\alias{Range-class} -\alias{Range} -\title{Data Range} -\description{ -A class to a data range, represented as minimum and maximum values. -The range could be numeric or Date/ Posix -} -\section{Slots}{ - -\describe{ -\item{\code{minimum}}{A number, Date or Posix value} - -\item{\code{maximum}}{A number, Date or Posix value} -}} - diff --git a/man/Statistic-class.Rd b/man/Statistic-class.Rd deleted file mode 100644 index 9328a3b..0000000 --- a/man/Statistic-class.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/class-Statistic.R -\docType{class} -\name{Statistic-class} -\alias{Statistic-class} -\alias{Statistic} -\title{Statistic} -\description{ -A class to specify a named statistic, its value, confidence interval and p-value. -This is primarily to help maintain consistency in meeting the EDA API. -} -\section{Slots}{ - -\describe{ -\item{\code{name}}{A string specifying what statistic was calculated} - -\item{\code{value}}{A number} - -\item{\code{confidenceInterval}}{An optional Range specifying minimum and maximum values for the confidence interval} - -\item{\code{confidenceLevel}}{An optional decimal number indicating the degree of confidence represented by the confidence interval} - -\item{\code{pvalue}}{An optional string representing the p-value associated with the statistic. The string should either be coercible to a number or be '<0.0001'.} -}} - diff --git a/man/StudySpecificVocabulariesByVariable-class.Rd b/man/StudySpecificVocabulariesByVariable-class.Rd deleted file mode 100644 index d66571f..0000000 --- a/man/StudySpecificVocabulariesByVariable-class.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/class-Megastudy.R -\docType{class} -\name{StudySpecificVocabulariesByVariable-class} -\alias{StudySpecificVocabulariesByVariable-class} -\alias{StudySpecificVocabulariesByVariable} -\title{Study Specific Vocabularies By Variable} -\description{ -A class to specify expected values per study for some variable -of interest. -} -\section{Slots}{ - -\describe{ -\item{\code{studyIdColumnName}}{A string specifying the name of the column in the vocab data table that contains the study id} - -\item{\code{variableSpecColumnName}}{A string specifying the name of the column in the vocab data table that contains the variable vocabulary values} - -\item{\code{studyVocab}}{A data.table with columns studyIdColumnName and variableSpecColumnName that specifies expected vocabularies for each study} -}} - diff --git a/man/findNumericCols.Rd b/man/findNumericCols.Rd index 42b8e06..ab2b3d2 100644 --- a/man/findNumericCols.Rd +++ b/man/findNumericCols.Rd @@ -7,11 +7,11 @@ findNumericCols(x) } \arguments{ -\item{x}{list, data.frame, array, or vector} +\item{x}{list, data.frame, data.table, or vector} } \value{ vector of numeric column names in x. If no numeric columns found, returns NULL } \description{ -This function finds all numeric columns in a list +This function finds all numeric columns in a data structure } diff --git a/man/getDTWithImputedZeroes.Rd b/man/getDTWithImputedZeroes.Rd deleted file mode 100644 index 4b2dae5..0000000 --- a/man/getDTWithImputedZeroes.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-Megastudy.R -\name{getDTWithImputedZeroes} -\alias{getDTWithImputedZeroes} -\title{Impute Zeroes (on tall data)} -\usage{ -getDTWithImputedZeroes(object, variables, verbose = c(TRUE, FALSE)) -} -\arguments{ -\item{object}{veupathUtils::Megastudy} -} -\value{ -data.table -} -\description{ -This function returns a data.table which has explicit zero values -for all expected categories of some variable of interest in a megastudy. -} diff --git a/man/getDataTable.Rd b/man/getDataTable.Rd deleted file mode 100644 index 2d127f9..0000000 --- a/man/getDataTable.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-Statistic.R -\name{getDataTable} -\alias{getDataTable} -\title{Convert StatisicList to data.table} -\usage{ -getDataTable(object) -} -\arguments{ -\item{object}{A StatisticList object} -} -\value{ -data.table -} -\description{ -This function returns a data.table representation of a StatisticList. -The content of the slots are stored as JSON. Each entry in the list becomes a named column. -} diff --git a/man/getDiscretizedBins.Rd b/man/getDiscretizedBins.Rd deleted file mode 100644 index 5eccf14..0000000 --- a/man/getDiscretizedBins.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-numeric.R -\name{getDiscretizedBins} -\alias{getDiscretizedBins} -\title{Find Bin Ranges for a Continuous Variable} -\usage{ -getDiscretizedBins( - x, - method = c("equalInterval", "quantile", "sd"), - numBins = NULL, - getValue = c(TRUE, FALSE) -) -} -\arguments{ -\item{x}{Numeric (or Date) vector to find bins for} - -\item{method}{A string indicating which method to use to find bins ('equalRanges', 'quantile', 'sd')} - -\item{numBins}{A number indicating how many bins are desired} - -\item{getValue}{A boolean indicating whether to return the counts per bin} -} -\description{ -This function will find bin start, end and labels for a -continuous variable. Optionally, it can return a value/ count -per bin. By default returns 10 bins for equalRanges and quantile -methods and 6 for sd (standard deviations). -} diff --git a/man/getStudyIdColumnName.Rd b/man/getStudyIdColumnName.Rd deleted file mode 100644 index c06d036..0000000 --- a/man/getStudyIdColumnName.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-Megastudy.R -\name{getStudyIdColumnName} -\alias{getStudyIdColumnName} -\title{StuydIdColName as String} -\usage{ -getStudyIdColumnName(object) -} -\arguments{ -\item{object}{veupathUtils::StudySpecificVocabulariesByVariable} -} -\value{ -character -} -\description{ -This function returns the studyIdColName from an StudySpecificVocabulariesByVariable -} diff --git a/man/getVariableSpec.Rd b/man/getVariableSpec.Rd deleted file mode 100644 index a7978f6..0000000 --- a/man/getVariableSpec.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-Megastudy.R -\name{getVariableSpec} -\alias{getVariableSpec} -\title{get a VariableSpec} -\usage{ -getVariableSpec(object, ...) -} -\arguments{ -\item{object}{An object containing a veupathUtils::VariableSpec} -} -\value{ -character -} -\description{ -This function returns a string representation of a VariableSpec. By -default it assumes the VariableSpec is available in a slot called -\code{variableSpec}. -} diff --git a/man/getVariableSpecColumnName.Rd b/man/getVariableSpecColumnName.Rd deleted file mode 100644 index 84bf745..0000000 --- a/man/getVariableSpecColumnName.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-Megastudy.R -\name{getVariableSpecColumnName} -\alias{getVariableSpecColumnName} -\title{VarSpecColName as String} -\usage{ -getVariableSpecColumnName(object) -} -\arguments{ -\item{object}{veupathUtils::StudySpecificVocabulariesByVariable} -} -\value{ -character -} -\description{ -This function returns the variableSpec from an StudySpecificVocabulariesByVariable -} diff --git a/man/mbioUtils-package.Rd b/man/mbioUtils-package.Rd new file mode 100644 index 0000000..7467ed4 --- /dev/null +++ b/man/mbioUtils-package.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mbioUtils-package.R +\docType{package} +\name{mbioUtils-package} +\alias{mbioUtils} +\alias{mbioUtils-package} +\title{mbioUtils: Utility Functions for MicrobiomeDB R Packages} +\description{ +mbioUtils contains utility functions and data structures for microbiomeDB R packages, including microbiomeComputations, MicrobiomeDB, and microbiomeData. Forked from veupathUtils 2.6.7. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://github.com/microbiomeDB/mbioUtils} + \item Report bugs at \url{https://github.com/microbiomeDB/mbioUtils/issues} +} + +} +\author{ +\strong{Maintainer}: Danielle Callan \email{dcallan@upenn.edu} + +Authors: +\itemize{ + \item Ann Blevins \email{annsize@upenn.edu} (Original veupathUtils author) +} + +} +\keyword{internal} diff --git a/man/megastudyDataReal.Rd b/man/megastudyDataReal.Rd deleted file mode 100644 index 3c936d2..0000000 --- a/man/megastudyDataReal.Rd +++ /dev/null @@ -1,75 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{megastudyDataReal} -\alias{megastudyDataReal} -\title{Some Real Life Popbio Megastudy Test Data} -\format{ -\subsection{\code{megastudyDataReal}}{ - -A data frame with 14 rows and 9 columns: -\describe{ -\item{EUPATH_0000609.Sample_stable_id}{Stable ID for the \code{Sample} entity} -\item{OBI_0000659.ParentOfSample_stable_id}{Stable ID for the \code{Collections} entity} -\item{GAZ_00000448.GeographicLocation_stable_id}{Stable ID for the \verb{Collection sites} entity} -\item{EUPATH_0000605.Study_stable_id}{Stable ID for the \code{Studies} entity} -\item{EUPATH_0000609.PATO_0000047}{Sample -> Sex} -\item{EUPATH_0000609.POPBIO_8000017}{Sample -> Unbiased Specimen Count} -\item{EUPATH_0000609.OBI_0001909}{Sample -> Species} -\item{EUPATH_0000609.EUPATH_0043227}{Sample -> Female Insect Feeding Status} -\item{EUPATH_0000609.UBERON_0000105}{Sample -> Life Cycle Stage} -} -} -} -\source{ -\url{https://vectorbase.org/vectorbase/app/workspace/maps/A4nuqcm/import} -} -\usage{ -megastudyDataReal -} -\description{ -It is the data used in the subset associated w this saved analysis: -https://vectorbase.org/vectorbase/app/workspace/maps/A4nuqcm/import -as of 9 Jan 2024. -As of the time of writing the filters in this analysis are like this: -"filters": [ -{ -"entityId": "EUPATH_0000605", -"variableId": "POPBIO_8000215", -"stringSet": \link[= - "VBP0000844" - ]{ "VBP0000844" }, -"type": "stringSet" -}, -{ -"entityId": "GAZ_00000448", -"variableId": "EUPATH_0000542", -"stringSet": \link[= - "Canyon Rim Church" - ]{ "Canyon Rim Church" }, -"type": "stringSet" -}, -{ -"variableId": "EUPATH_0043256", -"entityId": "OBI_0000659", -"type": "dateRange", -"min": "2022-05-01T00:00:00Z", -"max": "2022-11-01T00:00:00Z" -}, -{ -"type": "numberRange", -"variableId": "OBI_0001620", -"entityId": "GAZ_00000448", -"min": 40.698307645373106, -"max": 40.71193901146775 -}, -{ -"type": "longitudeRange", -"variableId": "OBI_0001621", -"entityId": "GAZ_00000448", -"left": -111.8212938308716, -"right": -111.79749727249147 -} -] -} -\keyword{datasets} diff --git a/man/naToZero.Rd b/man/naToZero.Rd deleted file mode 100644 index 8ae2380..0000000 --- a/man/naToZero.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-numeric.R -\name{naToZero} -\alias{naToZero} -\title{Replace numeric NAs with 0} -\usage{ -naToZero(x, ...) -} -\arguments{ -\item{x}{list, data.frame, array, or vector} - -\item{cols}{Optional. When appropriate, vector of column names -for which the NA replacement should occur. -Default is all numeric columns.} -} -\value{ -object the same type as x, where desired NAs are replaced with 0. -} -\description{ -This function replaces numeric NAs with 0. -} diff --git a/man/nonZeroRound.Rd b/man/nonZeroRound.Rd deleted file mode 100644 index 039fafb..0000000 --- a/man/nonZeroRound.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-numeric.R -\name{nonZeroRound} -\alias{nonZeroRound} -\title{Non-Zero Rounding} -\usage{ -nonZeroRound(x, digits) -} -\arguments{ -\item{x}{Numeric value to round} - -\item{digits}{Number indicating the desired precision} -} -\value{ -number rounded as nearly to the requested precision as -possible without returning zero. -} -\description{ -This function will recursively attempt to round a value to -greater and greater precision until it results in a non-zero -value. One consequence of this is that the precision of the -output value may not be exactly what was requested. -} diff --git a/man/setNaToZero.Rd b/man/setNaToZero.Rd deleted file mode 100644 index ef87028..0000000 --- a/man/setNaToZero.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-numeric.R -\name{setNaToZero} -\alias{setNaToZero} -\title{Replace numeric NAs with 0 - update by reference} -\usage{ -setNaToZero(df, cols = NULL) -} -\arguments{ -\item{cols}{vector of column names for which the NA replacement should occur. -Default is all numeric columns.} - -\item{x}{data.table, data.frame, or list} -} -\value{ -x with desired NAs replaced with 0. -} -\description{ -This function replaces NAs in numeric columns with 0. -} diff --git a/man/toJSON.Rd b/man/toJSON.Rd index 9bbfb4d..53cb60b 100644 --- a/man/toJSON.Rd +++ b/man/toJSON.Rd @@ -16,5 +16,5 @@ character vector of length 1 containing JSON string } \description{ This function converts an R object to a JSON string. -see \code{methods(veupathUtils::toJSON)} for a list of support classes. +see \code{methods(mbioUtils::toJSON)} for a list of support classes. } diff --git a/man/validateNumericCols.Rd b/man/validateNumericCols.Rd deleted file mode 100644 index cd79fb6..0000000 --- a/man/validateNumericCols.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-numeric.R -\name{validateNumericCols} -\alias{validateNumericCols} -\title{Validate numeric columns} -\usage{ -validateNumericCols(x, cols, ...) -} -\arguments{ -\item{x}{list, data.frame, array, data.table} - -\item{cols}{vector of column names or column indices. NAs will be removed.} -} -\value{ -given column names -} -\description{ -Given a vector of column names or indices, this function ensures all -referenced columns are numeric -} diff --git a/man/veupathUtils-package.Rd b/man/veupathUtils-package.Rd deleted file mode 100644 index d60a5da..0000000 --- a/man/veupathUtils-package.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/veupathUtils-package.R -\docType{package} -\name{veupathUtils-package} -\alias{veupathUtils} -\alias{veupathUtils-package} -\title{veupathUtils: General Helper Functions for MicrobiomeDB Projects} -\description{ -veupathUtils contains various R helper functions intended to be useful across a variety of projects, including the plot.data and microbiomeComputations packages. -} -\seealso{ -Useful links: -\itemize{ - \item \url{https://github.com/microbiomeDB/veupathUtils} - \item Report bugs at \url{https://github.com/microbiomeDB/veupathUtils/issues} -} - -} -\author{ -\strong{Maintainer}: Ann Blevins \email{annsize@upenn.edu} - -Authors: -\itemize{ - \item Danielle Callan \email{dcallan@upenn.edu} -} - -} -\keyword{internal} diff --git a/tests/testthat.R b/tests/testthat.R index 6b3cd5a..f5b7dbf 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,4 @@ library(testthat) -library(veupathUtils) +library(mbioUtils) -test_check("veupathUtils") +test_check("mbioUtils") diff --git a/tests/testthat/test-ComputeResult.R b/tests/testthat/test-ComputeResult.R index 958c26c..3e60e60 100644 --- a/tests/testthat/test-ComputeResult.R +++ b/tests/testthat/test-ComputeResult.R @@ -8,25 +8,25 @@ test_that('ComputeResult validation works', { names(df) <- stripEntityIdFromColumnHeader(names(df)) df$alphaDiversity <- .1 - computedVariableMetadata <- veupathUtils::VariableMetadata( - variableClass = veupathUtils::VariableClass(value = "computed"), - variableSpec = veupathUtils::VariableSpec(variableId = 'alphaDiversity', entityId = 'entity'), - plotReference = veupathUtils::PlotReference(value = "yAxis"), + computedVariableMetadata <- mbioUtils::VariableMetadata( + variableClass = mbioUtils::VariableClass(value = "computed"), + variableSpec = mbioUtils::VariableSpec(variableId = 'alphaDiversity', entityId = 'entity'), + plotReference = mbioUtils::PlotReference(value = "yAxis"), displayName = "Alpha Diversity", displayRangeMin = 0, displayRangeMax = max(max(df$alphaDiversity, na.rm = TRUE),1), - dataType = veupathUtils::DataType(value = "NUMBER"), - dataShape = veupathUtils::DataShape(value = "CONTINUOUS") + dataType = mbioUtils::DataType(value = "NUMBER"), + dataShape = mbioUtils::DataShape(value = "CONTINUOUS") ) expect_error(new("ComputeResult", recordIdColumn = 'entity.SampleID', - computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), + computedVariableMetadata = mbioUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), data = df)) expect_error(new("ComputeResult", name = c('alphaDiv', 'test'), recordIdColumn = 'entity.SampleID', - computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), + computedVariableMetadata = mbioUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), data = df)) expect_error(new("ComputeResult", name = NULL, @@ -57,25 +57,25 @@ test_that('ComputeResult validation works', { expect_error(new("ComputeResult", name = c('alphaDiv'), recordIdColumn = 'entity.SampleID', - computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), + computedVariableMetadata = mbioUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), data = df)) df$alphaDiversity <- .1 - computedVariableMetadata <- veupathUtils::VariableMetadata( - variableClass = veupathUtils::VariableClass(value = "native"), - variableSpec = veupathUtils::VariableSpec(variableId = 'alphaDiversity', entityId = 'entity'), - plotReference = veupathUtils::PlotReference(value = "yAxis"), + computedVariableMetadata <- mbioUtils::VariableMetadata( + variableClass = mbioUtils::VariableClass(value = "native"), + variableSpec = mbioUtils::VariableSpec(variableId = 'alphaDiversity', entityId = 'entity'), + plotReference = mbioUtils::PlotReference(value = "yAxis"), displayName = "Alpha Diversity", displayRangeMin = 0, displayRangeMax = max(max(df$alphaDiversity, na.rm = TRUE),1), - dataType = veupathUtils::DataType(value = "NUMBER"), - dataShape = veupathUtils::DataShape(value = "CONTINUOUS") + dataType = mbioUtils::DataType(value = "NUMBER"), + dataShape = mbioUtils::DataShape(value = "CONTINUOUS") ) expect_error(new("ComputeResult", name = c('alphaDiv'), recordIdColumn = 'entity.SampleID', - computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), + computedVariableMetadata = mbioUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), data = df)) # TODO test that extra cols in data will err, dot notation of output cols or not, column order @@ -90,24 +90,24 @@ test_that("ComputeResult writeMeta method returns well formatted json", { names(df) <- stripEntityIdFromColumnHeader(names(df)) df$alphaDiversity <- .1 - computedVariableMetadata <- veupathUtils::VariableMetadata( - variableClass = veupathUtils::VariableClass(value = "computed"), - variableSpec = veupathUtils::VariableSpec(variableId = 'alphaDiversity', entityId = 'entity'), - plotReference = veupathUtils::PlotReference(value = "yAxis"), + computedVariableMetadata <- mbioUtils::VariableMetadata( + variableClass = mbioUtils::VariableClass(value = "computed"), + variableSpec = mbioUtils::VariableSpec(variableId = 'alphaDiversity', entityId = 'entity'), + plotReference = mbioUtils::PlotReference(value = "yAxis"), displayName = "Alpha Diversity", displayRangeMin = 0, displayRangeMax = max(max(df$alphaDiversity, na.rm = TRUE),1), - dataType = veupathUtils::DataType(value = "NUMBER"), - dataShape = veupathUtils::DataShape(value = "CONTINUOUS") + dataType = mbioUtils::DataType(value = "NUMBER"), + dataShape = mbioUtils::DataShape(value = "CONTINUOUS") ) result <- new("ComputeResult", name = "alphaDiv", recordIdColumn = 'entity.SampleID', - computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), + computedVariableMetadata = mbioUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), data = df) - jsonlist <- jsonlite::fromJSON(veupathUtils::toJSON(result@computedVariableMetadata)) + jsonlist <- jsonlite::fromJSON(mbioUtils::toJSON(result@computedVariableMetadata)) expect_equal(names(jsonlist), 'variables') expect_equal(names(jsonlist$variables), c("variableClass","variableSpec","plotReference","displayName","displayRangeMin","displayRangeMax","dataType","dataShape","isCollection","imputeZero","hasStudyDependentVocabulary")) diff --git a/tests/testthat/test-class-CollectionWithMetadata.R b/tests/testthat/test-class-CollectionWithMetadata.R index df3a823..a751aac 100644 --- a/tests/testthat/test-class-CollectionWithMetadata.R +++ b/tests/testthat/test-class-CollectionWithMetadata.R @@ -156,7 +156,7 @@ test_that("pruneFeatures works", { ) # pruneFeatures touched SampleMetadata, which this CollectionWithMetadata object has none. it shouldnt fail for that though. - testing <- pruneFeatures(testing, veupathUtils::predicateFactory('proportionNonZero', 0.5)) + testing <- pruneFeatures(testing, mbioUtils::predicateFactory('proportionNonZero', 0.5)) expect_equal(nrow(testing@data), 200) expect_equal(ncol(testing@data), 3) }) diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R deleted file mode 100644 index ffffead..0000000 --- a/tests/testthat/test-class-Megastudy.R +++ /dev/null @@ -1,863 +0,0 @@ -megastudyDT <- data.table('study.id'=c('a','a','a','b','b','b'), - 'study.author'=c('Cool Guy', 'Cool Guy', 'Cool Guy', 'Uncool Guy', 'Uncool Guy', 'Uncool Guy'), - 'collection.id'=c(1,1,2,1,2,2), - 'collection.attractant'=c('A','A','B','C','D','D'), - 'sample.id'=c(1,2,3,4,5,6), - 'sample.species'=c('species1','species2','species1','species1','species1','species2'), - 'sample.sex'=c('male','male','female','male','female','male'), - 'sample.specimen_count'=c(10,20,15,15,10,20), - 'assay.id'=c(11,12,13,14,15,16), - 'assay.pathogen_prevalence'=c(.1,.2,.3,.4,.5,.6), - 'assay.pathogen_presence'=c('Yes','Yes','No','No','Yes','No'), - 'assay.pathogen2_presence'=c('Yes','No','Yes','No','Yes','No'), - 'assay.pathogen3_presence'=c('No','Yes','No','Yes','No','Yes'), - 'assay.weighting_variable'=c(5,10,15,20,25,30)) - -## the collectionsDT needs to include all collections, unless explicitly filtered against -## it also needs to tell us values for any collection variables that are in the plot -## were trying to impute samples, based on collections that dont have samples. -## so the only way to get collection variables values for those samples, is to get them from the collectionsDT -collectionsDT <- data.table( - 'study.id' = c('a', 'a', 'a','b', 'b', 'b', 'b'), - 'collection.id' = c(1, 2, 3, 1, 2, 3, 4), - 'collection.attractant' = c('A', 'B', 'C', 'C', 'D', 'D', 'E')) - -sexVocabs.dt <- data.table::data.table( - 'study.id' = c('a', 'a', 'a', 'a', 'a', 'b', 'b'), - 'sample.sex' = c('female', 'male', 'non-binary', 'other', 'do not wish to specify', 'male', 'female')) - -sexVocabs <- veupathUtils::StudySpecificVocabulariesByVariable( - studyIdColumnName='study.id', - variableSpec=veupathUtils::VariableSpec(entityId='sample',variableId='sex'), - studyVocab=sexVocabs.dt -) - -sexVocabsSMALL.dt <- data.table::data.table( - 'study.id' = c('a', 'a', 'b', 'b'), - 'sample.sex' = c('female', 'male', 'male', 'female')) - -sexVocabsSMALL <- veupathUtils::StudySpecificVocabulariesByVariable( - studyIdColumnName='study.id', - variableSpec=veupathUtils::VariableSpec(entityId='sample',variableId='sex'), - studyVocab=sexVocabsSMALL.dt -) - -sexVocabsSingleStudy.dt <- data.table::data.table( - 'study.id' = c('a', 'a', 'a', 'a', 'a'), - 'sample.sex' = c('female', 'male', 'non-binary', 'other', 'do not wish to specify')) - -sexVocabsSingleStudy <- veupathUtils::StudySpecificVocabulariesByVariable( - studyIdColumnName='study.id', - variableSpec=veupathUtils::VariableSpec(entityId='sample',variableId='sex'), - studyVocab=sexVocabsSingleStudy.dt -) - -speciesVocabs.dt <- data.table::data.table( - 'study.id' = c('a', 'a', 'a', 'b', 'b', 'b'), - 'sample.species' = c('species1', 'species2', 'species3', 'species1', 'species2', 'species5')) - -speciesVocabs <- veupathUtils::StudySpecificVocabulariesByVariable( - studyIdColumnName='study.id', - variableSpec=veupathUtils::VariableSpec(entityId='sample',variableId='species'), - studyVocab=speciesVocabs.dt -) - -speciesVocabsSMALL.dt <- data.table::data.table( - 'study.id' = c('a', 'a', 'b', 'b'), - 'sample.species' = c('species1', 'species2', 'species1', 'species2')) - -speciesVocabsSMALL <- veupathUtils::StudySpecificVocabulariesByVariable( - studyIdColumnName='study.id', - variableSpec=veupathUtils::VariableSpec(entityId='sample',variableId='species'), - studyVocab=speciesVocabsSMALL.dt -) - -pathogenVocabs.dt <- data.table::data.table( - 'study.id' = c('a', 'a', 'b', 'b'), - 'assay.pathogen_presence' = c('Yes', 'No', 'Yes', 'No')) - -pathogenVocabs <- veupathUtils::StudySpecificVocabulariesByVariable( - studyIdColumnName='study.id', - variableSpec=veupathUtils::VariableSpec(entityId='assay',variableId='pathogen_presence'), - studyVocab=pathogenVocabs.dt -) - -test_that("Megastudy and associated validation works", { - # works at all - m <- Megastudy(data=megastudyDT, - ancestorIdColumns=c('study.id', 'collection.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs))) - - expect_equal(slotNames(m), c('data','ancestorIdColumns','studySpecificVocabularies','collectionsDT')) - expect_equal(length(m@studySpecificVocabularies), 1) - expect_equal(data.table::uniqueN(m@studySpecificVocabularies[[1]]@studyVocab[,1]), 2) - expect_equal(slotNames(m@studySpecificVocabularies[[1]]), c("studyIdColumnName","variableSpec","studyVocab")) - - # works w multiple vocab lists - m <- Megastudy(data=megastudyDT, - ancestorIdColumns=c('study.id', 'collection.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs))) - - expect_equal(slotNames(m), c('data','ancestorIdColumns','studySpecificVocabularies','collectionsDT')) - expect_equal(length(m@studySpecificVocabularies), 2) - expect_equal(data.table::uniqueN(m@studySpecificVocabularies[[2]]@studyVocab[,1]), 2) - expect_equal(slotNames(m@studySpecificVocabularies[[2]]), c("studyIdColumnName","variableSpec","studyVocab")) - - # errs if no ancestors/ids? - expect_error(Megastudy(data=megastudyDT, - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)))) - - expect_error(Megastudy(data=megastudyDT, - ancestorIdColumns=NA, - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)))) - - expect_error(Megastudy(data=megastudyDT, - ancestorIdColumns=NULL, - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)))) - - expect_error(Megastudy(data=megastudyDT, - ancestorIdColumns=c(), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)))) - - expect_error(Megastudy(data=megastudyDT, - ancestorIdColumns=character(), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)))) - - expect_error(Megastudy(data=megastudyDT, - ancestorIdColumns=NA_character_, - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)))) - - # errs if extra ancestor ids provided but not in dt - expect_error(Megastudy(data=megastudyDT, - ancestorIdColumns=c('study.id', 'collection.id', 'imNOTan.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)))) - - # errs if special vocabs on different entities - expect_error(Megastudy(data=megastudyDT, - ancestorIdColumns=c('study.id', 'collection.id', 'sample.id','assay.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, pathogenVocabs)))) - - # errs if studyIdColumnName and varSpec col name not in dt - expect_error(StudySpecificVocabulariesByVariable( - studyIdColumnName='imNOTan.id', - variableSpec=VariableSpec(entityId='sample',variableId='species'), - vocabulary=speciesVocabs.dt) - ) - - # errs if no study id provided for study vocab - expect_error(StudySpecificVocabulariesByVariable(variableSpec=VariableSpec(entityId='sample',variableId='species'), studyVocab=speciesVocabs.dt)) - expect_error(StudySpecificVocabulariesByVariable(studyIdColumnName=NA, variableSpec=VariableSpec(entityId='sample',variableId='species'), studyVocab=speciesVocabs.dt)) - expect_error(StudySpecificVocabulariesByVariable(studyIdColumnName=NULL, variableSpec=VariableSpec(entityId='sample',variableId='species'), studyVocab=speciesVocabs.dt)) - expect_error(StudySpecificVocabulariesByVariable(studyIdColumnName=c(), variableSpec=VariableSpec(entityId='sample',variableId='species'), studyVocab=speciesVocabs.dt)) - expect_error(StudySpecificVocabulariesByVariable(studyIdColumnName=character(), variableSpec=VariableSpec(entityId='sample',variableId='species'), studyVocab=speciesVocabs.dt)) - expect_error(StudySpecificVocabulariesByVariable(studyIdColumnName=NA_character_, variableSpec=VariableSpec(entityId='sample',variableId='species'), studyVocab=speciesVocabs.dt)) - - # errs if no var spec provided for study vocab - expect_error(StudySpecificVocabulariesByVariable(studyIdColumnName='study.id', studyVocab=speciesVocabs.dt)) - expect_error(StudySpecificVocabulariesByVariable(studyIdColumnName='study.id', variableSpec=NA, studyVocab=speciesVocabs.dt)) - expect_error(StudySpecificVocabulariesByVariable(studyIdColumnName='study.id', variableSpec=NULL, studyVocab=speciesVocabs.dt)) -}) - -# TODO this could go in its own file maybe -test_that("imputeZeroes method is sane", { - m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE], - ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs))) - - # case where neither study nor collection vars in the plot - variables <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), - # empty plotReference means that it is not plotted - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE) - )) - - imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) - # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes - expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE) - # 5 sexes * 3 species in study A (15) + 2 sexes * 3 species in study B (6) * 2 collections per study = 42 - expect_equal(nrow(imputedDT), 42) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36) - - # case where some collection ids are missing - # in real life, we have some collections where all samples are 0 and so not loaded - # in this case, the collection ids are missing from the data table R gets handed. - # we want to impute zeroes for their samples anyhow. - - # in this version, some collection level variables are in both data and collectionsDT - m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant'), with=FALSE], - ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)), - collectionsDT=collectionsDT) - - variables <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), - # empty plotReference means that it is not plotted - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE) - )) - - imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) - # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes - expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE) - # 5 sexes * 3 species * 3 collections in study A (45) + 2 sexes * 3 species * 4 collections in study B (30) = 69 - expect_equal(nrow(imputedDT), 69) ## TODO check these numbers - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 63) - - # in this version, collection level variables are only in collectionsDT - m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species'), with=FALSE], - ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)), - collectionsDT=collectionsDT) - - variables <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), - # empty plotReference means that it is not plotted - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE) - )) - - imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) - # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes - expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE) - # 5 sexes * 3 species * 3 collections in study A (45) + 2 sexes * 3 species * 4 collections in study B (30) = 69 - expect_equal(nrow(imputedDT), 69) ## TODO check these numbers - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 63) - - ################################################################################################################# - - - # case where one study vocab is missing a study - mDTSexSingleStudy <- megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE] - mDTSexSingleStudy$sample.sex[mDTSexSingleStudy$study.id == 'b'] <- NA_character_ - mDTSexSingleStudy <- unique(mDTSexSingleStudy) - - mSexSingleStudy <- Megastudy(data=mDTSexSingleStudy, - ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabsSingleStudy))) - - imputedDT <- getDTWithImputedZeroes(mSexSingleStudy, variables, FALSE) - # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes - expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE) - # 5 sexes * 3 species in study A (15) + 3 species in study B * 2 collections per study = 42 - expect_equal(nrow(imputedDT), 36) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 30) - - m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE], - ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs))) - - # collection entity var is present - variables <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'), - plotReference = new("PlotReference", value = 'overlay'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), - # empty plotReference means that it is not plotted - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE) - )) - - imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) - expect_equal(all(c("sample.species","sample.specimen_count","collection.attractant") %in% names(imputedDT)), TRUE) - expect_equal(nrow(imputedDT), 42) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36) - - # both collection and study entity vars are present - variables <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'), - plotReference = new("PlotReference", value = 'overlay'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'author', entityId = 'study'), - plotReference = new("PlotReference", value = 'facet1'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), - # empty plotReference means that it is not plotted - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE) - )) - - imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) - expect_equal(all(c("sample.species","sample.specimen_count","collection.attractant","study.author") %in% names(imputedDT)), TRUE) - expect_equal(nrow(imputedDT), 42) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36) - - # all values in vocab already present - megastudyDTSMALL <- rbind(megastudyDT, - data.table::data.table(study.id=c('a','b'), - study.author=c('Cool Guy','Uncool Guy'), - collection.id=c(2,1), - collection.attractant=c('B','C'), - sample.id=c(7,8), - sample.species=c('species2','species2'), - sample.sex=c('female','female'), - sample.specimen_count=c(5,5), - assay.id=c(17,18), - assay.pathogen_prevalence=c(.7,.8), - assay.pathogen_presence=c('No','Yes'), - assay.pathogen2_presence=c('Yes','No'), - assay.pathogen3_presence=c('No','Yes'), - assay.weighting_variable=c(35,40))) - - mCOMPLETE <- Megastudy(data=megastudyDTSMALL[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.species', 'sample.sex', 'collection.attractant', 'study.author'), with=FALSE], - ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabsSMALL, sexVocabsSMALL))) - - variables <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'), - plotReference = new("PlotReference", value = 'overlay'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'author', entityId = 'study'), - plotReference = new("PlotReference", value = 'facet1'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), - # empty plotReference means that it is not plotted - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE) - )) - - imputedDT <- getDTWithImputedZeroes(mCOMPLETE, variables, FALSE) - expect_equal(all(names(imputedDT) %in% names(mCOMPLETE@data)), TRUE) - # 2 species * 2 sexes * 2 collections * 2 studies = 16 - expect_equal(nrow(imputedDT), 16) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 8) - - # no weighting var in plot - variables <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'), - plotReference = new("PlotReference", value = 'overlay'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'author', entityId = 'study'), - plotReference = new("PlotReference", value = 'facet1'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), - # empty plotReference means that it is not plotted - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE) - )) - - imputedDT <- getDTWithImputedZeroes(mCOMPLETE, variables, FALSE) - expect_equal(all(names(imputedDT) %in% names(mCOMPLETE@data)), TRUE) - expect_equal(nrow(imputedDT), nrow(mCOMPLETE@data)) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0) - - # an assay var is present - m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id','assay.id', 'assay.pathogen_prevalence', 'sample.species', 'sample.sex', 'collection.attractant', 'study.author')], - ancestorIdColumns=c('study.id', 'collection.id', 'sample.id','assay.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs))) - - variables <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'pathogen_prevalence', entityId = 'assay'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'), - plotReference = new("PlotReference", value = 'overlay'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'author', entityId = 'study'), - plotReference = new("PlotReference", value = 'facet1'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), - # empty plotReference means that it is not plotted - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE) - )) - - imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) - expect_equal(all(names(imputedDT) %in% names(m@data)), TRUE) - expect_equal(nrow(imputedDT), nrow(m@data)) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0) - - variables <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'), - plotReference = new("PlotReference", value = 'overlay'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'pathogen_presence', entityId = 'assay'), - plotReference = new("PlotReference", value = 'facet1'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), - # empty plotReference means that it is not plotted - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE) - )) - - imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) - expect_equal(all(names(imputedDT) %in% names(m@data)), TRUE) - expect_equal(nrow(imputedDT), nrow(m@data)) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0) - - # multiple special vocabs in same plot, w one shared weighting var - m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.species', 'sample.sex', 'collection.attractant'), with=FALSE], - ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs,sexVocabs))) - - variables <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), - plotReference = new("PlotReference", value = 'overlay'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary=TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'), - plotReference = new("PlotReference", value = 'facet1'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')) - )) - - imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) - expect_equal(all(c("sample.species","sample.specimen_count","sample.sex","collection.attractant") %in% names(imputedDT)), TRUE) - expect_equal(nrow(imputedDT), 42) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36) - - # special vocab on sample, regular weighting var on assay - # phase this in -# m <- Megastudy(data=megastudyDT, -# ancestorIdColumns=c('study.id', 'collection.id', 'sample.id','assay.id'), -# studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs))) -# -# variables <- new("VariableMetadataList", SimpleList( -# new("VariableMetadata", -# variableClass = new("VariableClass", value = 'native'), -# variableSpec = new("VariableSpec", variableId = 'weighting_variable', entityId = 'assay'), -# plotReference = new("PlotReference", value = 'xAxis'), -# dataType = new("DataType", value = 'NUMBER'), -# dataShape = new("DataShape", value = 'CONTINUOUS')), -# new("VariableMetadata", -# variableClass = new("VariableClass", value = 'native'), -# variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'), -# plotReference = new("PlotReference", value = 'yAxis'), -# dataType = new("DataType", value = 'NUMBER'), -# dataShape = new("DataShape", value = 'CONTINUOUS')), -# new("VariableMetadata", -# variableClass = new("VariableClass", value = 'native'), -# variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), -# plotReference = new("PlotReference", value = 'overlay'), -# dataType = new("DataType", value = 'STRING'), -# dataShape = new("DataShape", value = 'CATEGORICAL'), -# weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), -# hasStudyDependentVocabulary=TRUE), -# new("VariableMetadata", -# variableClass = new("VariableClass", value = 'native'), -# variableSpec = new("VariableSpec", variableId = 'pathogen_prevalence', entityId = 'assay'), -# plotReference = new("PlotReference", value = 'facet1'), -# dataType = new("DataType", value = 'STRING'), -# dataShape = new("DataShape", value = 'CATEGORICAL'), -# weightingVariableSpec = VariableSpec(variableId='weighting_variable',entityId='assay'), -# hasStudyDependentVocabulary = FALSE) -# )) -# -# imputedDT <- getDTWithImputedZeroes(m, variables) -# expect_equal(names(imputedDT), c("sample.species","sample.specimen_count","collection.attractant","study.author","study.id","collection.id","sample.id")) -# expect_equal(nrow(imputedDT), 12) -# expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6) -# - # no special vocab or weight present - m <- Megastudy(data=megastudyDT, - ancestorIdColumns=c('study.id', 'collection.id', 'sample.id','assay.id')) - - variables <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'pathogen_prevalence', entityId = 'assay'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'author', entityId = 'study'), - plotReference = new("PlotReference", value = 'overlay'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')) - )) - - imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) - expect_equal(all(names(imputedDT) %in% names(m@data)), TRUE) - expect_equal(nrow(imputedDT), nrow(m@data)) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0) - - # only regular weight var present - m <- Megastudy(data=megastudyDT, - ancestorIdColumns=c('study.id', 'collection.id', 'sample.id','assay.id')) - - variables <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'pathogen_prevalence', entityId = 'assay'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS'), - weightingVariableSpec = VariableSpec(variableId='weighting_variable',entityId='assay')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'author', entityId = 'study'), - plotReference = new("PlotReference", value = 'overlay'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')) - )) - - imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) - expect_equal(all(names(imputedDT) %in% names(m@data)), TRUE) - expect_equal(nrow(imputedDT), nrow(m@data)) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0) - - # sample var without a study vocab present when another sample var has one - m <- Megastudy(data=megastudyDT, - ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs))) - - variables <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), - plotReference = new("PlotReference", value = 'overlay'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'), - plotReference = new("PlotReference", value = 'facet1'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')) - )) - - expect_error(getDTWithImputedZeroes(m, variables, FALSE)) - - # variable collection exists in plot - pathogenVariableCollectionVocabs.dt <- data.table::data.table( - study.id = c('a','a','b','b'), - assay.pathogen_presence_variable_collection = c('Yes','No','Yes','No') - ) - pathogenVariableCollectionVocabs <- StudySpecificVocabulariesByVariable( - studyIdColumnName='study.id', - variableSpec=VariableSpec(entityId='assay',variableId='pathogen_presence_variable_collection'), - studyVocab=pathogenVariableCollectionVocabs.dt - ) - - m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'assay.id', 'sample.specimen_count', 'collection.attractant', 'study.author', 'assay.pathogen_presence', 'assay.pathogen2_presence', 'assay.pathogen3_presence'), with=FALSE], - ancestorIdColumns=c('study.id', 'collection.id', 'sample.id', 'assay.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(pathogenVariableCollectionVocabs))) - - variables <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'pathogen_presence_variable_collection', entityId = 'assay'), - plotReference = new("PlotReference", value = 'overlay'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - isCollection = TRUE, - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE, - members = VariableSpecList(S4Vectors::SimpleList(VariableSpec(variableId='pathogen_presence', entityId='assay'), - VariableSpec(variableId='pathogen2_presence', entityId='assay'), - VariableSpec(variableId='pathogen3_presence', entityId='assay'))) - ), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS')) - )) - - ## Im inclined to commenting this for now. i dont think we have a case like this yet, - ## and im not even completely sure yet what the api would be for when we do... - - #imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) - # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes - #expect_equal(all(c("assay.pathogen_presence","assay.pathogen2_presence","assay.pathogen3_presence","sample.specimen_count") %in% names(imputedDT)), TRUE) - # 2 studies * 2 collections per study * 2 values for each of 3 pathogen variables = 32? - # im not sure this test makes any sense, bc were imputing 0 on a sample for a collection on assay - # im going to comment until we see a real use case - #expect_equal(nrow(imputedDT), 32) - #expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 26) -}) - -test_that("we have reasonable perf w a real-ish use case", { - megastudyVariablesReal <- new("VariableMetadataList", SimpleList( - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'POPBIO_8000017', entityId = 'EUPATH_0000609'), - plotReference = new("PlotReference", value = 'yAxis'), - dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS'), - hasStudyDependentVocabulary = FALSE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'PATO_0000047', entityId = 'EUPATH_0000609'), - plotReference = new("PlotReference", value = 'xAxis'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='POPBIO_8000017',entityId='EUPATH_0000609'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'OBI_0001909', entityId = 'EUPATH_0000609'), - plotReference = new("PlotReference", value = 'overlay'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='POPBIO_8000017',entityId='EUPATH_0000609'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'UBERON_0000105', entityId = 'EUPATH_0000609'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='POPBIO_8000017',entityId='EUPATH_0000609'), - hasStudyDependentVocabulary = TRUE), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'EUPATH_0043227', entityId = 'EUPATH_0000609'), - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='POPBIO_8000017',entityId='EUPATH_0000609'), - hasStudyDependentVocabulary = TRUE) - )) - - ## minimal use case, with real vocabularies. based on a heavily subsetted megastudy. - megastudyReal <- Megastudy( - data=megastudyDataReal, - ancestorIdColumns=c( - 'EUPATH_0000605.Study_stable_id', - 'GAZ_00000448.GeographicLocation_stable_id', - 'OBI_0000659.ParentOfSample_stable_id', - 'EUPATH_0000609.Sample_stable_id' - ), - studySpecificVocabularies=studyVocabsReal - ) - - benchmark <- microbenchmark::microbenchmark(getDTWithImputedZeroes(megastudyReal, megastudyVariablesReal, verbose = FALSE)) - print(benchmark) - expect_true(mean(benchmark$time)/1000000 < 55) ## this is in milliseconds - expect_true(median(benchmark$time)/1000000 < 55) -}) \ No newline at end of file diff --git a/tests/testthat/test-class-Statistic.R b/tests/testthat/test-class-Statistic.R deleted file mode 100644 index b74e928..0000000 --- a/tests/testthat/test-class-Statistic.R +++ /dev/null @@ -1,93 +0,0 @@ -test_that("Statistic validation works", { - # only name and value slot is required, and takes a single value - expect_error(new("Statistic", - "value" = NA_real_)) - expect_error(new("Statistic", - "name" = NA_character_, - "value" = 123)) - expect_error(new("Statistic", - "name" = c("foo", "bar"), - "value" = 123)) - expect_error(new("Statistic", - "name" = "foo", - "value" = c(123, 456))) - - stat <- new("Statistic", - "name" = "PI", - "value" = 3.141592) - - expect_equal(stat@pvalue, NA_character_) - expect_equal(stat@confidenceInterval@minimum, NA_real_) - expect_equal(stat@confidenceLevel, NA_real_) - - #pvalue must have a single value between 0 and 1 - expect_error(new("Statistic", - "name" = "PI", - "value" = 3.141592, - "pvalue" = 1.2)) - expect_error(new("Statistic", - "name" = "PI", - "value" = 3.141592, - "pvalue" = -0.05)) - expect_error(new("Statistic", - "name" = "PI", - "value" = 3.141592, - "pvalue" = c(0.001, 0.0001))) - - #if CI provided, value is within it - expect_error(new("Statistic", - "name" = "PI", - "value" = 3.141592, - "confidenceInterval" = Range("minimum" = 0.001, "maximum" = 0.0001))) - - #CI has CI level - expect_error(new("Statistic", - "name" = "PI", - "value" = 3.141592, - "confidenceInterval" = Range("minimum" = 0.001, "maximum" = 10))) - - #CI level is between 0 and 1 and is accompanied by a CI - expect_error(new("Statistic", - "name" = "PI", - "value" = 3.141592, - "confidenceInterval" = Range("minimum" = 0.001, "maximum" = 0.0001), - "confidenceLevel" = 1.2)) - - expect_error(new("Statistic", - "name" = "PI", - "value" = 3.141592, - "confidenceLevel" = .95)) -}) - -test_that("toJSON result is properly formatted for Statistic", { - stat <- new("Statistic", - "name" = "PI", - "value" = 3.141592, - "confidenceInterval" = Range("minimum" = 0.001, "maximum" = 10), - "confidenceLevel" = .95, - "pvalue" = '<0.0001') - - statjson <- veupathUtils::toJSON(stat) - statlist <- jsonlite::fromJSON(statjson) - - expect_equal(names(statlist), 'PI') - expect_equal(names(statlist$PI), c('value', 'confidenceInterval', 'confidenceLevel', 'pvalue')) - expect_equal(statlist$PI$confidenceInterval, c('(0.001 - 10)')) - - stat2 <- new("Statistic", - "name" = "PI2", - "value" = 3.141592, - "confidenceInterval" = Range("minimum" = 1, "maximum" = 5), - "confidenceLevel" = .95, - "pvalue" = ".47") - stats <- StatisticList(S4Vectors::SimpleList(stat, stat2)) - - statsjson <- veupathUtils::toJSON(stats) - statslist <- jsonlite::fromJSON(statsjson) - - expect_equal(names(statslist), 'statistics') - expect_equal(names(statslist$statistics), c('PI', 'PI2')) - expect_equal(names(statslist$statistics$PI2), c('value', 'confidenceInterval', 'confidenceLevel', 'pvalue')) - expect_equal(statslist$statistics$PI2$confidenceInterval, c(NA, '(1 - 5)')) - expect_equal(class(statslist$statistics$PI2$pvalue[2]), 'character') -}) diff --git a/tests/testthat/test-correlation.R b/tests/testthat/test-correlation.R index 07add20..7986c28 100644 --- a/tests/testthat/test-correlation.R +++ b/tests/testthat/test-correlation.R @@ -203,7 +203,7 @@ test_that("correlation returns an appropriately structured result for metadata v statsData <- result@statistics@statistics expect_s3_class(statsData, 'data.frame') expect_equal(names(statsData), c('data1','data2','correlationCoef','pValue')) - nNumericCols <- length(veupathUtils::findNumericCols(sampleMetadata@data[,2:ncol(sampleMetadata@data)])) + nNumericCols <- length(mbioUtils::findNumericCols(sampleMetadata@data[,2:ncol(sampleMetadata@data)])) expect_equal(nrow(statsData), ((nNumericCols * nNumericCols) - 3)/2) # Should be number of number of numeric vars * number of numeric vars expect_equal(as.character(unique(statsData$data1)), c('entity.contA', 'entity.contB')) expect_equal(as.character(unique(statsData$data2)), c('entity.contB', 'entity.contC')) diff --git a/tests/testthat/test-methods-VariableMetadata.R b/tests/testthat/test-methods-VariableMetadata.R index 2ea6a48..96ee002 100644 --- a/tests/testthat/test-methods-VariableMetadata.R +++ b/tests/testthat/test-methods-VariableMetadata.R @@ -189,7 +189,7 @@ test_that("toJSON result is properly formatted for VariableMetadata", { new("VariableSpec", variableId = 'c', entityId = 'b') )) ) - vmjson <- veupathUtils::toJSON(vm) + vmjson <- mbioUtils::toJSON(vm) vmlist <- jsonlite::fromJSON(vmjson) expect_equal(names(vmlist), 'variableMetadata') @@ -214,7 +214,7 @@ test_that("toJSON result is properly formatted for VariableMetadata", { dataShape = new("DataShape", value = 'BINARY'), vocabulary = c('a', 'b') ) - vmjson <- veupathUtils::toJSON(vm) + vmjson <- mbioUtils::toJSON(vm) vmlist <- jsonlite::fromJSON(vmjson) expect_equal(names(vmlist), 'variableMetadata') diff --git a/tests/testthat/test-utils-numeric.R b/tests/testthat/test-utils-numeric.R deleted file mode 100644 index 5f3416a..0000000 --- a/tests/testthat/test-utils-numeric.R +++ /dev/null @@ -1,251 +0,0 @@ -## Tests for numeric utils functions - -test_that("getDiscretizedBins returns sane results", { - set.seed(1) - x <- rnorm(100) - - # NULL numBins - expect_equal(length(getDiscretizedBins(x, 'equalInterval')), 10) - expect_equal(length(getDiscretizedBins(x, 'quantile')), 10) - expect_equal(length(getDiscretizedBins(x, 'sd')), 6) - - # with and without values - bins <- getDiscretizedBins(x, 'quantile', 10, FALSE) - expect_equal(all(is.na(unlist(lapply(bins, FUN = function(x) {x@value})))), TRUE) - bins <- getDiscretizedBins(x, 'quantile', 10, TRUE) - expect_equal(any(is.na(unlist(lapply(bins, FUN = function(x) {x@value})))), FALSE) - - # non-numeric, NULL or NA for input - expect_error(getDiscretizedBins(c('a','b','c'))) - expect_error(getDiscretizedBins(NA)) - expect_error(getDiscretizedBins(NULL)) - - dates <- as.Date(c('1999-12-11','1999-06-14','1999-02-26','1999-05-24','1999-02-25','1999-09-06', - '1999-07-24','1999-05-29','1999-01-03','1999-06-28','1999-02-13','1999-03-17')) - - dateBins <- getDiscretizedBins(dates) - # Test that the bin labels match the bin starts and ends, as well as that the - # dates of bins make sense - expect_equal(length(dateBins), 10) - expect_equal(strsplit(dateBins[[1]]@binLabel, ', ')[[1]][1], paste0("[",dateBins[[1]]@binStart)) - expect_equal(strsplit(dateBins[[1]]@binLabel, ', ')[[1]][2], paste0(dateBins[[1]]@binEnd, "]")) - expect_equal(dateBins[[1]]@binStart, min(dates)) - expect_equal(dateBins[[10]]@binEnd, max(dates)) - - expect_equal(length(getDiscretizedBins(dates, 'quantile')), 10) - expect_equal(length(getDiscretizedBins(dates, 'sd')), 5) - - ## different types of dates... - dates <- as.Date(c("1969-06-05T00:00:00", "1969-06-06T00:00:00", "1969-06-07T00:00:00","1969-06-08T00:00:00", "1969-06-09T00:00:00", "1969-06-10T00:00:00", - "1969-06-11T00:00:00", "1969-06-12T00:00:00", "1969-06-13T00:00:00", - "1969-06-14T00:00:00", "1969-06-15T00:00:00", "1969-06-16T00:00:00", - "1969-06-17T00:00:00", "1969-06-18T00:00:00", "1969-06-19T00:00:00", - "1969-06-20T00:00:00", "1969-06-21T00:00:00", "1969-06-22T00:00:00", - "1969-06-23T00:00:00", "1969-06-24T00:00:00")) - - dateBins <- getDiscretizedBins(dates) - # Test that the bin labels match the bin starts and ends, as well as that the - # dates of bins make sense - expect_equal(length(dateBins), 10) - expect_equal(strsplit(dateBins[[1]]@binLabel, ', ')[[1]][1], paste0("[",dateBins[[1]]@binStart)) - expect_equal(strsplit(dateBins[[1]]@binLabel, ', ')[[1]][2], paste0(dateBins[[1]]@binEnd, "]")) - expect_equal(dateBins[[1]]@binStart, min(dates)) - expect_equal(dateBins[[10]]@binEnd, max(dates)) - - expect_equal(length(getDiscretizedBins(dates, 'quantile')), 10) - expect_equal(length(getDiscretizedBins(dates, 'sd')), 4) - - ## return as many bins as possible for these cases - # almost no data - expect_equal(length(getDiscretizedBins(1)), 1) - expect_equal(length(getDiscretizedBins(1, 'quantile')), 1) - expect_equal(length(getDiscretizedBins(1, 'sd')), 1) - x <- rnorm(2) - expect_equal(length(getDiscretizedBins(x)), 10) - expect_equal(sum(unlist(lapply(getDiscretizedBins(x), FUN = function(x){0 != x@value}))), 2) - expect_equal(length(getDiscretizedBins(x, 'quantile')), 10) - expect_equal(sum(unlist(lapply(getDiscretizedBins(x, 'quantile'), FUN = function(x){0 != x@value}))), 2) - expect_equal(length(getDiscretizedBins(x, 'sd')), 2) - - # skewed data - x <- rnbinom(100, 10, 0.5) - expect_equal(length(getDiscretizedBins(x)), 10) - expect_equal(length(getDiscretizedBins(x, 'equalInterval', 50)), 50) - expect_equal(length(getDiscretizedBins(x, 'quantile')), 10) - expect_equal(length(getDiscretizedBins(x, 'quantile', 50)) <= 50, TRUE) - expect_equal(length(getDiscretizedBins(x, 'sd')) <= 6, TRUE) -}) - -test_that("nonZeroRound only returns 0 if it receives one", { - expect_equal(nonZeroRound(0),0) - expect_equal(nonZeroRound(123456789.987654321, 4) == 0, FALSE) - expect_equal(nonZeroRound(0.987654321, 4) == 0, FALSE) - expect_equal(nonZeroRound(0.00000019, 4) == 0, FALSE) - expect_equal(nonZeroRound(0.00000019, 4), 0.0000002) -}) - -test_that("setNaToZero replaces intended NAs", { - - df <- iris - - # Add NAs to all columns - nMissing <- 10 - df <- as.data.frame(lapply(df, function(x) {x[sample(1:length(x), size=nMissing)] <- NA; return(x)})) - - # With specified columns - setNaToZero(df, cols=c("Sepal.Length", "Sepal.Width")) - expect_equal(class(df), 'data.frame') - expect_equal(sum(is.na(df)), 3*nMissing) - expect_equal(sum(df[, c('Sepal.Length', 'Sepal.Width')] == 0), 2*nMissing) - - # Err if given a non-numeric column - expect_error(naToZero(df, cols = c("Sepal.Length", "Sepal.Width", "Species"))) - - # With defualt cols and data.table. Should change only numeric columns - dt <- data.table::as.data.table(df) - setNaToZero(dt) - expect_equal(class(dt), c('data.table','data.frame')) - expect_equal(sum(is.na(dt)), nMissing) - expect_equal(sum(dt[, c('Petal.Length', 'Petal.Width')] == 0), 2*nMissing) - -}) - -test_that("naToZero replaces intended NAs", { - - df <- iris - - # Add NAs to all columns - nMissing <- 10 - df <- as.data.frame(lapply(df, function(x) {x[sample(1:length(x), size=nMissing)] <- NA; return(x)})) - - # With specified columns - df <- naToZero(df, cols=c("Sepal.Length", "Sepal.Width")) - expect_equal(class(df), 'data.frame') - expect_equal(sum(is.na(df)), 3*nMissing) - expect_equal(sum(df[, c('Sepal.Length', 'Sepal.Width')] == 0), 2*nMissing) - - # With defualt cols and data.table input. Should change only numeric columns - dt <- data.table::as.data.table(df) - dt <- naToZero(dt) - expect_equal(class(dt), c('data.table','data.frame')) - expect_equal(sum(is.na(dt)), nMissing) - expect_equal(sum(dt[, c('Petal.Length', 'Petal.Width')] == 0), 2*nMissing) - - # Testing additional object types - # Lists - lst <- lapply(iris, function(x) {x[sample(1:length(x), size=nMissing)] <- NA; return(x)}) - lst <- naToZero(lst) - expect_equal(class(lst), 'list') - expect_equal(sum(unlist(lapply(lst, function(x) {sum(is.na(x))}))), nMissing) - expect_equal(sum(unlist(lapply(lst[c('Petal.Length', 'Petal.Width')], function(x) {sum(x==0)}))), 2*nMissing) - - # Err if given a non-numeric column - expect_error(naToZero(lst, cols = c("Sepal.Length", "Sepal.Width", "Species"))) - - - # Vector - vec <- c(1,2,3,NA) - vec <- naToZero(vec) - expect_equal(class(vec), 'numeric') - expect_true(all(!is.na(vec))) - expect_equal(vec, c(1,2,3,0)) - - # Matrix - mat <- matrix(rnorm(36), nrow=6) - mat[sample(1:36, size=nMissing, replace=F)] <- NA - mat <- naToZero(mat) - expect_equal(class(mat), c('matrix', 'array')) - expect_true(any(!is.na(mat))) - expect_equal(sum(mat == 0), nMissing) - - # Do nothing to strings - vec <- c('1','2','3',NA) - vec <- naToZero(vec) - expect_equal(class(vec), 'character') - expect_equal(vec, c('1','2','3',NA)) - -}) - - -test_that("finding and validating numeric columns works", { - - df <- iris - - # With specified columns - numericCols <- findNumericCols(df) - expect_equal(numericCols, c('Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width')) - - # In a data.table - dt <- data.table::as.data.table(df) - numericCols <- findNumericCols(dt) - expect_equal(numericCols, c('Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width')) - - validatedCols <- validateNumericCols(dt, c('Sepal.Length', 'Sepal.Width')) - expect_equal(validatedCols, c('Sepal.Length', 'Sepal.Width')) - - validatedCols <- validateNumericCols(dt, c(1, 2, 3)) - expect_equal(validatedCols, c(1, 2, 3)) - - # In a list - lst <- as.list(iris) - numericCols <- findNumericCols(lst) - expect_equal(numericCols, c('Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width')) - - validatedCols <- validateNumericCols(lst, c('Sepal.Length', 'Sepal.Width')) - expect_equal(validatedCols, c('Sepal.Length', 'Sepal.Width')) - - validatedCols <- validateNumericCols(lst, c(2, 3, 4)) - expect_equal(validatedCols, c(2, 3, 4)) - - # If no numeric cols - dt_string <- dt[, lapply(.SD, as.character)] - numericCols <- findNumericCols(dt_string) - expect_equal(numericCols, NULL) - - lst_string <- lapply(iris, function(x) {x <- as.character(x); return(x)}) - numericCols <- findNumericCols(lst_string) - expect_equal(numericCols, NULL) - - # validateNumericCols should err if given non-numeric column names - expect_error(validateNumericCols(dt, c('Sepal.Length', 'Species'))) - expect_error(validateNumericCols(lst, c('Sepal.Length', 'Species'))) - - # err if column names do not exist - expect_error(validateNumericCols(dt, c('a', 'Species'))) - expect_error(validateNumericCols(lst, c('a', 'Species'))) - - # err if indices too large - expect_error(validateNumericCols(dt, c(1, 2, 100))) - expect_error(validateNumericCols(lst, c(1, 2, 100))) - - # err if indices too small - expect_error(validateNumericCols(dt, c(-1, 2, 100))) - expect_error(validateNumericCols(lst, c(-1, 2, 100))) - - # return NULL for NULL input - validatedCols <- validateNumericCols(dt, NULL) - expect_equal(validatedCols, NULL) - - validatedCols <- validateNumericCols(lst, NULL) - expect_equal(validatedCols, NULL) - - # remove NAs in cols arg - validatedCols <- validateNumericCols(dt, c(1, NA, 4)) - expect_equal(validatedCols, c(1, 4)) - - validatedCols <- validateNumericCols(lst, c(1, NA, 4)) - expect_equal(validatedCols, c(1, 4)) - -}) - -test_that("signifDigitEpsilon returns appropriate results", { - expect_equal(signifDigitEpsilon(1.23, 3), 0.01) - expect_equal(signifDigitEpsilon(11.0, 3), 0.1) - expect_equal(signifDigitEpsilon(12.3, 3), 0.1) - expect_equal(signifDigitEpsilon(101000, 3), 1000) - expect_equal(signifDigitEpsilon(1.20e-05, 3), 1.0e-07) - expect_equal(signifDigitEpsilon(0.0123e-05, 3), 1.0e-09) - expect_equal(signifDigitEpsilon(-2.34e-02, 3), 1.0e-04) - expect_equal(signifDigitEpsilon(1234567, 7), 1) - expect_equal(signifDigitEpsilon(-1234567, 7), 1) -}) \ No newline at end of file