diff --git a/DESCRIPTION b/DESCRIPTION index 7b6bfb9..de6291d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -41,6 +41,6 @@ License: GPL (>= 3) + file LICENSE LazyData: true VignetteBuilder: knitr Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Encoding: UTF-8 URL: https://bhklab.github.io/AnnotationGx/ diff --git a/R/cellosaurus.R b/R/cellosaurus.R index 8fc44da..05dbb97 100644 --- a/R/cellosaurus.R +++ b/R/cellosaurus.R @@ -16,17 +16,39 @@ #' #' @export cellosaurus_fields <- function(common = FALSE, upper = FALSE) { - if(common == TRUE) { - fields <- c("id", "ac", "acas", "sy", "dr", "di", "din", "dio", "ox", "cc", - "sx", "ag", "oi", "hi", "ch", "ca", "dt", "dtc", "dtu", "dtv", "from", "group") - } else{ + if (isTRUE(common)) { + fields <- c( + "id", + "ac", + "acas", + "sy", + "dr", + "di", + "din", + "dio", + "ox", + "cc", + "sx", + "ag", + "oi", + "hi", + "ch", + "ca", + "dt", + "dtc", + "dtu", + "dtv", + "from", + "group" + ) + } else { schema <- .cellosaurus_schema() fields <- schema$components$schemas$Fields$enum } - if(upper == TRUE) { + if (isTRUE(upper)) { fields <- toupper(fields) - }else{ + } else { fields <- tolower(fields) } @@ -62,19 +84,32 @@ cellosaurusAPIVersion <- function() { #' @param query_only Logical indicating whether to return only the query URLs. Default is FALSE. #' @param raw Logical indicating whether to return the raw HTTP responses. Default is FALSE. #' @param parsed Logical indicating whether to parse the response text. Default is TRUE. -#' @param ... Additional arguments to be passed to the underlying functions. +#' @param include_query Logical indicating whether to include the `query*` columns +#' (e.g. `query`, `query:ac`) in the returned result. Default is TRUE. +#' @param ... Currently unused. Reserved for future extensions. #' #' @return A data.table containing the mapped cell line IDs and accession numbers. +#' When `parsed = FALSE`, the returned table also includes Cellosaurus metadata +#' columns that have been renamed to user-friendly titles (for example, `sy` +#' becomes `synonyms`). #' #' @examples #' mapCell2Accession(ids = c("A549", "MCF7")) #' #' @export mapCell2Accession <- function( - ids, numResults = 10000, from = "idsy", sort = "ac", keep_duplicates = FALSE, - fuzzy = FALSE, query_only = FALSE, raw = FALSE, parsed = TRUE, ... + ids, + numResults = 10000, + from = "idsy", + sort = "ac", + keep_duplicates = FALSE, + fuzzy = FALSE, + query_only = FALSE, + raw = FALSE, + parsed = TRUE, + include_query = TRUE, + ... ) { - funContext <- .funContext("mapCell2Accession") # Input validation and coercion @@ -83,7 +118,19 @@ mapCell2Accession <- function( ids <- as.character(ids) } - to = c("ac", "id", "sy", "misspelling", "dr", "cc", "ca", "di", "ag", "sx", "hi") + to = c( + "ac", + "id", + "sy", + "misspelling", + "dr", + "cc", + "ca", + "di", + "ag", + "sx", + "hi" + ) # create query list .info(funContext, "Creating Cellosaurus queries") @@ -93,47 +140,85 @@ mapCell2Accession <- function( .info(funContext, "Building Cellosaurus requests") # build the list of requests - requests <- parallel::mclapply(queries, function(query) { - .build_cellosaurus_request( - query = query, - to = to, - numResults = numResults, - sort = sort, - output = "TXT", - fuzzy = fuzzy, - ... - ) - }) + requests <- parallel::mclapply( + queries, + function(query) { + .build_cellosaurus_request( + query = query, + to = to, + numResults = numResults, + sort = sort, + output = "TXT" + ) + }, + mc.cores = getOption("annotationgx.mc.cores", getOption("mc.cores", 1L)) + ) + + if (query_only) { + return(lapply(requests, function(req) req$url)) + } - if (query_only) return(lapply(requests, function(req) req$url)) - # Submit requests using parallel httr2 since cellosaurus doesnt throttle .info(funContext, "Performing Cellosaurus queries") - responses <- .perform_request_parallel(requests, progress = "Querying Cellosaurus...") + responses <- .perform_request_parallel( + requests, + progress = "Querying Cellosaurus..." + ) names(responses) <- as.character(ids) # in case its an numeric ID like cosmic ids - if (raw) return(responses) + if (raw) { + return(responses) + } # parse the responses .info(funContext, "Parsing Cellosaurus responses") - responses_dt <- parallel::mclapply(ids, function(name) { - resp <- responses[[name]] - - resp <- .parse_cellosaurus_lines(resp) - if(length(resp) == 0L){ - .warn(paste0("No results found for ", name)) - result <- data.table::data.table() - result$query <- name - return(result) - } - response_dt <- .parse_cellosaurus_text(resp, name, parsed, keep_duplicates) - response_dt - }) - + responses_dt <- parallel::mclapply( + ids, + function(name) { + resp <- responses[[name]] + + resp <- .parse_cellosaurus_lines(resp) + if (length(resp) == 0L) { + .warn(paste0("No results found for ", name)) + result <- data.table::data.table() + result$query <- name + return(result) + } + response_dt <- .parse_cellosaurus_text( + resp, + name, + parsed = parsed, + keep_duplicates = keep_duplicates + ) + response_dt + }, + mc.cores = getOption("annotationgx.mc.cores", getOption("mc.cores", 1L)) + ) responses_dt <- data.table::rbindlist(responses_dt, fill = TRUE) - return(responses_dt) + if (!include_query) { + query_cols <- grep("^query", names(responses_dt), value = TRUE) + if (length(query_cols) > 0L) { + responses_dt[, (query_cols) := NULL] + } + } + core_cols <- c("cellLineName", "accession") + existing_core <- core_cols[core_cols %in% names(responses_dt)] + if (length(existing_core) > 0L) { + query_cols <- if (include_query) { + grep("^query", names(responses_dt), value = TRUE) + } else { + character(0) + } + extra_cols <- setdiff(names(responses_dt), c(existing_core, query_cols)) + data.table::setcolorder( + responses_dt, + c(existing_core, extra_cols, query_cols) + ) + } + + return(responses_dt) } @@ -144,56 +229,60 @@ mapCell2Accession <- function( #' #' @param resp The response object containing the cellosaurus data #' @return A list of parsed lines from the cellosaurus data -#' +#' #' @keywords internal #' @noRd -.parse_cellosaurus_lines <- function(resp){ - lines <- httr2::resp_body_string(resp) |> - strsplit("\n") |> - unlist() - +.parse_cellosaurus_lines <- function(resp) { + lines <- httr2::resp_body_string(resp) |> + strsplit("\n") |> + unlist() + Map( f = function(lines, i, j) { - lines[i:(j - 1L)] + lines[i:(j - 1L)] }, i = grep(pattern = "^ID\\s+", x = lines, value = FALSE), j = grep(pattern = "^//$", x = lines, value = FALSE), MoreArgs = list("lines" = lines), USE.NAMES = FALSE ) - } #' parse responses -#' -#' @noRd +#' +#' @noRd #' @keywords internal -.parse_cellosaurus_text <- function(resp, name, parsed = FALSE, keep_duplicates = FALSE){ - +.parse_cellosaurus_text <- function( + resp, + name, + parsed = FALSE, + keep_duplicates = FALSE +) { responses_dt <- lapply( - X = resp, - FUN = .processEntry - ) - tryCatch({ - responses_dt <- data.table::rbindlist(responses_dt, fill = TRUE) - }, error = function(e) { - .err(paste0("Error parsing response for ", name, ": ", e$message)) - }) + X = resp, + FUN = .processEntry + ) + tryCatch( + { + responses_dt <- data.table::rbindlist(responses_dt, fill = TRUE) + }, + error = function(e) { + .err(paste0("Error parsing response for ", name, ": ", e$message)) + } + ) responses_dt <- .formatSynonyms(responses_dt) - if(!parsed) { + if (!parsed) { responses_dt$query <- name - return(responses_dt[, c("cellLineName", "accession", "query")]) + return(responses_dt) } - result <- .find_cellosaurus_matches(responses_dt, name, keep_duplicates) - result$query <- name - result <- result[, c("cellLineName", "accession", "query")] + result$query <- name + result <- result[, c("cellLineName", "accession", "query"), with = FALSE] return(result) - } #' Splits cellosaurus lines into a named list @@ -214,7 +303,7 @@ mapCell2Accession <- function( #' # [1] "Line 1" "Line 2" #' #' @noRd -.split_cellosaurus_lines <- function(lines){ +.split_cellosaurus_lines <- function(lines) { x <- strSplit(lines, split = " ") x <- split(x[, 2L], f = x[, 1L]) x @@ -225,15 +314,15 @@ mapCell2Accession <- function( ## It splits the input string, organizes the data into a nested list, ## handles optional keys, removes discontinued identifiers from the DR field, ## and converts the resulting list into a data table. -.processEntry <- function(x){ +.processEntry <- function(x) { requiredKeys = c("AC", "CA", "DT", "ID") nestedKeys = c("DI", "DR", "HI") optionalKeys = c("AG", "SX", "SY") specialKeys = c("CC") x <- .split_cellosaurus_lines(x) - - if("CC" %in% names(x)){ + + if ("CC" %in% names(x)) { x <- .formatComments(x) } @@ -247,25 +336,25 @@ mapCell2Accession <- function( dt[[name]] <- x[[name]] } for (key in optionalKeys) { - dt[[key]] <- ifelse( - is.null(x[[key]]), - NA_character_, - x[[key]] - ) + if (is.null(x[[key]])) { + dt[[key]] <- NA_character_ + } else { + dt[[key]] <- x[[key]] + } } for (key in nestedKeys) { - dt[[key]] <- ifelse( - is.null(x[[key]]), - NA_character_, - list(.splitNestedCol(x, key, "; ")[[key]]) - ) + if (is.null(x[[key]])) { + dt[[key]] <- NA_character_ + } else { + dt[[key]] <- list(.splitNestedCol(x, key, "; ")[[key]]) + } } for (key in specialKeys) { - dt[[key]] <- ifelse( - is.null(x[[key]]), - NA_character_, - x[key] - ) + if (is.null(x[[key]])) { + dt[[key]] <- NA_character_ + } else { + dt[[key]] <- x[key] + } } ## Filter out discontinued identifiers from DR (e.g. "CVCL_0455"). @@ -275,7 +364,7 @@ mapCell2Accession <- function( fixed = TRUE, value = TRUE ) - if (isTRUE(length(discontinued) > 0L)) { + if (length(discontinued) > 0L) { discontinued <- sub( pattern = "^Discontinued: (.+);.+$", replacement = "\\1", @@ -286,19 +375,126 @@ mapCell2Accession <- function( # create data.table of lists responses_dt <- dt - old_names <- c("AC", "AG", "AS", "CA", "CC", "DI", "DR", "DT", "HI", "ID", - "OI", "OX", "RX", "ST", "SX", "SY", "WW") + old_names <- c( + "AC", + "AG", + "AS", + "CA", + "CC", + "DI", + "DR", + "DT", + "HI", + "ID", + "OI", + "OX", + "RX", + "ST", + "SX", + "SY", + "WW" + ) + + new_names <- c( + "accession", + "ageAtSampling", + "secondaryAccession", + "category", + "comments", + "diseases", + "crossReferences", + "date", + "hierarchy", + "cellLineName", + "originateFromSameIndividual", + "speciesOfOrigin", + "referencesIdentifiers", + "strProfileData", + "sexOfCell", + "synonyms", + "webPages" + ) - new_names <- c("accession", "ageAtSampling", "secondaryAccession", "category", - "comments", "diseases", "crossReferences", "date", "hierarchy", "cellLineName", - "originateFromSameIndividual", "speciesOfOrigin", "referencesIdentifiers", - "strProfileData", "sexOfCell", "synonyms", "webPages") - - data.table::setnames(responses_dt, old = old_names, new = new_names, skip_absent = TRUE) + data.table::setnames( + responses_dt, + old = old_names, + new = new_names, + skip_absent = TRUE + ) responses_dt } +.match_cellosaurus_candidates <- function( + responses_dt, + query, + name, + keep_duplicates +) { + strategies <- list( + function() { + if (any(responses_dt$cellLineName == query)) { + data.table::setkeyv(responses_dt, "cellLineName") + responses_dt[query] + } else { + NULL + } + }, + function() { + matches <- matchNested( + query, + responses_dt, + keep_duplicates = keep_duplicates + ) + if (length(matches) > 0L) { + responses_dt[matches] + } else { + NULL + } + }, + function() { + matches <- matchNested( + name, + responses_dt, + keep_duplicates = keep_duplicates + ) + if (length(matches) > 0L) { + responses_dt[matches] + } else { + NULL + } + }, + function() { + matches <- cleanCharacterStrings(responses_dt$cellLineName) == name + if (any(matches)) { + responses_dt[matches][1] + } else { + NULL + } + }, + function() { + matches <- matchNested( + name, + lapply(responses_dt$synonyms, cleanCharacterStrings) + ) + if (length(matches) > 0L) { + responses_dt[matches] + } else { + NULL + } + } + ) + + for (strategy in strategies) { + candidate <- strategy() + if (!is.null(candidate) && nrow(candidate) > 0L) { + return(candidate) + } + } + + NULL +} + #' Find Cellosaurus Matches #' @@ -321,16 +517,16 @@ mapCell2Accession <- function( #' accession = c("Accession 1", "Accession 2", "Accession 3"), #' synonyms = list(c("Synonym 1", "Synonym 2"), c("Synonym 3"), c("Synonym 4")) #' ) -#' +#' #' .find_cellosaurus_matches(responses_dt, "Cell Line 2") #' #' @noRd #' @keywords internal .find_cellosaurus_matches <- function( - responses_dt, - name, + responses_dt, + name, keep_duplicates = FALSE -){ +) { # save original name query <- name name <- cleanCharacterStrings(name) @@ -338,23 +534,14 @@ mapCell2Accession <- function( # first try for exact match as cellLineName to avoid the case where # the first row is the wrong cellline but the query is in a synonym # but the second row is the correct cellline - # TODO:: REFACTOR THIS TO NOT REPEAT THE CONDITIONAL - if(any(responses_dt$cellLineName == query)){ - data.table::setkeyv(responses_dt, "cellLineName") - result <- responses_dt[query] - } else if(length(matchNested(query, responses_dt, keep_duplicates = keep_duplicates)) > 0){ - matches <- matchNested(query, responses_dt, keep_duplicates = keep_duplicates) - result <- responses_dt[matches] - } else if(length(matchNested(name, responses_dt, keep_duplicates = keep_duplicates)) > 0){ - matches <- matchNested(name, responses_dt, keep_duplicates = keep_duplicates) - result <- responses_dt[matches] - } else if(any(cleanCharacterStrings(responses_dt$cellLineName) == name)){ - matches <- cleanCharacterStrings(responses_dt$cellLineName) == name - result <- responses_dt[matches][1] - } else if(length(matchNested(name, lapply(responses_dt$synonyms, cleanCharacterStrings)))> 0 ){ - matches <- matchNested(name, lapply(responses_dt$synonyms, cleanCharacterStrings)) - result <- responses_dt[matches] - } else{ + result <- .match_cellosaurus_candidates( + responses_dt = responses_dt, + query = query, + name = name, + keep_duplicates = keep_duplicates + ) + + if (is.null(result)) { .warn(paste0("No results found for ", query)) # create an empty data.table with the following columns: # c("cellLineName", "accession", "query") @@ -368,16 +555,15 @@ mapCell2Accession <- function( } - #' Format the `synonyms` column #' #' @note Updated 2023-01-24. #' @noRd .formatSynonyms <- function(responses_dt) { .splitCol( - object = responses_dt, - colName = "synonyms", - split = "; " + object = responses_dt, + colName = "synonyms", + split = "; " ) } @@ -387,11 +573,11 @@ mapCell2Accession <- function( #' @note Updated 2023-09-22. #' @noRd .formatComments <- function(object) { - test_ <- strSplit(object[["CC"]], ": ", n = 2) - test_ <- split(test_[, 2L], f = test_[, 1L]) + test_ <- strSplit(object[["CC"]], ": ", n = 2) + test_ <- split(test_[, 2L], f = test_[, 1L]) - test_ <- sapply(test_, strsplit, split = "; ") + test_ <- sapply(test_, strsplit, split = "; ") - object[["CC"]] <- test_ - object -} \ No newline at end of file + object[["CC"]] <- test_ + object +} diff --git a/R/cellosaurus_annotations.R b/R/cellosaurus_annotations.R index 045e549..8efffde 100644 --- a/R/cellosaurus_annotations.R +++ b/R/cellosaurus_annotations.R @@ -6,7 +6,7 @@ #' @param to A character vector specifying the types of annotations to retrieve. Possible values include "id", "ac", "hi", "sy", "ca", "sx", "ag", "di", "derived-from-site", "misspelling", and "dt". #' @param query_only A logical value indicating whether to only return the query string. #' @param raw A logical value indicating whether to return the raw response. -#' +#' #' @return A data frame containing the annotations for the cell line. #' #' @examples @@ -16,10 +16,22 @@ #' @export annotateCellAccession <- function( accessions, - to = c("id", "ac", "hi", "sy", "ca", "sx", "ag", "di", "derived-from-site", "misspelling", "dt"), - query_only = FALSE, raw = FALSE - ) -{ + to = c( + "id", + "ac", + "hi", + "sy", + "ca", + "sx", + "ag", + "di", + "derived-from-site", + "misspelling", + "dt" + ), + query_only = FALSE, + raw = FALSE +) { funContext <- .funContext("annotateCellAccession") .info(funContext, "Building Cellosaurus requests...") @@ -34,26 +46,28 @@ annotateCellAccession <- function( query_only = FALSE ) }) - + .info(funContext, "Performing Requests...") - responses <- .perform_request_parallel(requests, progress = "Querying Cellosaurus...") + responses <- .perform_request_parallel( + requests, + progress = "Querying Cellosaurus..." + ) names(responses) <- accessions - if(raw) return(responses) + if (raw) { + return(responses) + } .info(funContext, "Parsing Responses...") responses_dt <- parallel::mclapply(accessions, function(name) { resp <- responses[[name]] - .parse_cellosaurus_lines(resp) |> - unlist(recursive = FALSE) |> + .parse_cellosaurus_lines(resp) |> + unlist(recursive = FALSE) |> .processEntry() |> .formatSynonyms() - } - ) + }) names(responses_dt) <- accessions - responses_dt <- data.table::rbindlist(responses_dt, fill = TRUE) - + return(responses_dt) } - diff --git a/R/cellosaurus_helpers.R b/R/cellosaurus_helpers.R index e1210ab..c0ba813 100644 --- a/R/cellosaurus_helpers.R +++ b/R/cellosaurus_helpers.R @@ -33,13 +33,10 @@ if (length(from) != length(ids)) { stop("Length of 'from' must be 1 or the same length as 'ids'") } - sapply(1:length(ids), function(i) { - paste(from[i], ids[i], sep = ":") - }) + paste(from, ids, sep = ":") } - #' Build a Cellosaurus API request #' #' This function builds a Cellosaurus API request based on the provided parameters. @@ -51,7 +48,7 @@ #' @param output A character string specifying the desired output format of the API response. #' @param sort A character string specifying the field to sort the results by. #' @param query_only A logical value indicating whether to return only the constructed URL without making the request. -#' @param ... Additional arguments to be passed to the function. +#' #' #' @return A character string representing the constructed URL for the Cellosaurus API request. #' @@ -65,12 +62,137 @@ #' @keywords internal #' @noRd .build_cellosaurus_request <- function( - query = c("id:HeLa"), to = c("id", "ac", "hi", "ca", "sx", "ag", "di", "derived-from-site", "misspelling"), - numResults = 1, apiResource = "search/cell-line", output = "TSV", sort = "ac", - query_only = FALSE, ...) { - checkmate::assert_character(c(query, output)) - checkmate::assert_choice(apiResource, c("search/cell-line", "cell-line", "release-info")) - checkmate::assert_choice(output, c("TSV", "TXT", "JSON", "XML")) + query = c("id:HeLa"), + to = c( + "id", + "ac", + "hi", + "ca", + "sx", + "ag", + "di", + "derived-from-site", + "misspelling" + ), + numResults = 1, + apiResource = "search/cell-line", + output = "TSV", + sort = "ac", + query_only = FALSE +) { + allowed_resources <- c("search/cell-line", "cell-line", "release-info") + allowed_outputs <- c("TSV", "TXT", "JSON", "XML") + + checkmate::assert_character( + query, + any.missing = FALSE, + min.len = 1, + min.chars = 1 + ) + checkmate::assert_character( + to, + any.missing = FALSE, + min.len = 1, + min.chars = 1 + ) + checkmate::assert_character(apiResource, len = 1, min.chars = 1) + checkmate::assert_character(output, len = 1, min.chars = 1) + checkmate::assert_character(sort, len = 1, null.ok = TRUE, min.chars = 1) + checkmate::assert_logical(query_only, len = 1) + + checkmate::assert_choice( + apiResource, + allowed_resources + ) + checkmate::assert_choice(output, allowed_outputs) + + fallback_fields <- tolower(c( + "id", + "sy", + "idsy", + "ac", + "acas", + "dr", + "ref", + "rx", + "ra", + "rt", + "rl", + "ww", + "genome-ancestry", + "hla", + "registration", + "sequence-variation", + "anecdotal", + "biotechnology", + "breed", + "caution", + "cell-type", + "characteristics", + "donor-info", + "derived-from-site", + "discontinued", + "doubling-time", + "from", + "group", + "karyotype", + "knockout", + "msi", + "miscellaneous", + "misspelling", + "mab-isotype", + "mab-target", + "omics", + "part-of", + "population", + "problematic", + "resistance", + "senescence", + "integrated", + "transformant", + "virology", + "cc", + "str", + "di", + "din", + "dio", + "ox", + "sx", + "ag", + "oi", + "hi", + "ch", + "ca", + "dt", + "dtc", + "dtu", + "dtv" + )) + available_fields <- tryCatch( + tolower(cellosaurus_fields()), + error = function(e) fallback_fields + ) + available_fields <- unique(c(available_fields, fallback_fields)) + + to <- tolower(to) + invalid_fields <- setdiff(to, available_fields) + if (length(invalid_fields) > 0L) { + stop( + "Invalid Cellosaurus field(s): ", + paste(invalid_fields, collapse = ", ") + ) + } + + numResults <- as.integer(numResults) + if (is.na(numResults) || length(numResults) != 1L || numResults < 1L) { + stop("`numResults` must be a single positive integer") + } + + allowed_sort <- unique(c(available_fields, "miss")) + if (!is.null(sort)) { + sort <- tolower(sort) + checkmate::assert_choice(sort, allowed_sort) + } base_url <- "https://api.cellosaurus.org" url <- httr2::url_parse(base_url) @@ -97,7 +219,6 @@ opts$format <- tolower(output) opts$rows <- numResults - url$query <- opts url <- url |> httr2::url_build() if (query_only) { @@ -107,7 +228,6 @@ } - #' Get the Cellosaurus schema #' #' This function retrieves the Cellosaurus schema from the Cellosaurus API. @@ -128,9 +248,6 @@ } - - - #' Internal function to return the list of external resources available in Cellosaurus #' @return A character vector of external resources available in Cellosaurus #' @@ -138,25 +255,113 @@ #' @noRd .cellosaurus_extResources <- function() { c( - "4DN", "Abcam", "ABCD", "ABM", "AddexBio", "ArrayExpress", - "ATCC", "BCGO", "BCRC", "BCRJ", "BEI_Resources", - "BioGRID_ORCS_Cell_line", "BTO", "BioSample", "BioSamples", - "cancercelllines", "CancerTools", "CBA", "CCLV", "CCRID", - "CCTCC", "Cell_Biolabs", "Cell_Model_Passport", "CGH-DB", - "ChEMBL-Cells", "ChEMBL-Targets", "CLDB", "CLO", "CLS", - "ColonAtlas", "Coriell", "Cosmic", "Cosmic-CLP", "dbGAP", - "dbMHC", "DepMap", "DGRC", "DiscoverX", "DSHB", "DSMZ", - "DSMZCellDive", "EBiSC", "ECACC", "EFO", "EGA", "ENCODE", - "ESTDAB", "FCDI", "FCS-free", "FlyBase_Cell_line", "GDSC", - "GeneCopoeia", "GEO", "HipSci", "HIVReagentProgram", "Horizon_Discovery", - "hPSCreg", "IARC_TP53", "IBRC", "ICLC", "ICLDB", "IGRhCellID", - "IGSR", "IHW", "Imanis", "Innoprot", "IPD-IMGT/HLA", "ISCR", - "IZSLER", "JCRB", "KCB", "KCLB", "Kerafast", "KYinno", "LiGeA", - "LIMORE", "LINCS_HMS", "LINCS_LDP", "Lonza", "MCCL", "MeSH", - "MetaboLights", "Millipore", "MMRRC", "NCBI_Iran", "NCI-DTP", "NHCDR", - "NIHhESC", "NISES", "NRFC", "PerkinElmer", "PharmacoDB", "PRIDE", - "Progenetix", "PubChem_Cell_line", "RCB", "Rockland", "RSCB", "SKIP", - "SKY/M-FISH/CGH", "SLKBase", "TKG", "TNGB", "TOKU-E", "Ubigene", - "WiCell", "Wikidata", "Ximbio" + "4DN", + "Abcam", + "ABCD", + "ABM", + "AddexBio", + "ArrayExpress", + "ATCC", + "BCGO", + "BCRC", + "BCRJ", + "BEI_Resources", + "BioGRID_ORCS_Cell_line", + "BTO", + "BioSample", + "BioSamples", + "cancercelllines", + "CancerTools", + "CBA", + "CCLV", + "CCRID", + "CCTCC", + "Cell_Biolabs", + "Cell_Model_Passport", + "CGH-DB", + "ChEMBL-Cells", + "ChEMBL-Targets", + "CLDB", + "CLO", + "CLS", + "ColonAtlas", + "Coriell", + "Cosmic", + "Cosmic-CLP", + "dbGAP", + "dbMHC", + "DepMap", + "DGRC", + "DiscoverX", + "DSHB", + "DSMZ", + "DSMZCellDive", + "EBiSC", + "ECACC", + "EFO", + "EGA", + "ENCODE", + "ESTDAB", + "FCDI", + "FCS-free", + "FlyBase_Cell_line", + "GDSC", + "GeneCopoeia", + "GEO", + "HipSci", + "HIVReagentProgram", + "Horizon_Discovery", + "hPSCreg", + "IARC_TP53", + "IBRC", + "ICLC", + "ICLDB", + "IGRhCellID", + "IGSR", + "IHW", + "Imanis", + "Innoprot", + "IPD-IMGT/HLA", + "ISCR", + "IZSLER", + "JCRB", + "KCB", + "KCLB", + "Kerafast", + "KYinno", + "LiGeA", + "LIMORE", + "LINCS_HMS", + "LINCS_LDP", + "Lonza", + "MCCL", + "MeSH", + "MetaboLights", + "Millipore", + "MMRRC", + "NCBI_Iran", + "NCI-DTP", + "NHCDR", + "NIHhESC", + "NISES", + "NRFC", + "PerkinElmer", + "PharmacoDB", + "PRIDE", + "Progenetix", + "PubChem_Cell_line", + "RCB", + "Rockland", + "RCSB", + "SKIP", + "SKY/M-FISH/CGH", + "SLKBase", + "TKG", + "TNGB", + "TOKU-E", + "Ubigene", + "WiCell", + "Wikidata", + "Ximbio" ) } diff --git a/man/mapCell2Accession.Rd b/man/mapCell2Accession.Rd index deb7d45..9b877e4 100644 --- a/man/mapCell2Accession.Rd +++ b/man/mapCell2Accession.Rd @@ -14,6 +14,7 @@ mapCell2Accession( query_only = FALSE, raw = FALSE, parsed = TRUE, + include_query = TRUE, ... ) } @@ -36,10 +37,16 @@ mapCell2Accession( \item{parsed}{Logical indicating whether to parse the response text. Default is TRUE.} -\item{...}{Additional arguments to be passed to the underlying functions.} +\item{include_query}{Logical indicating whether to include the \verb{query*} columns +(e.g. \code{query}, \code{query:ac}) in the returned result. Default is TRUE.} + +\item{...}{Currently unused. Reserved for future extensions.} } \value{ A data.table containing the mapped cell line IDs and accession numbers. +When \code{parsed = FALSE}, the returned table also includes Cellosaurus metadata +columns that have been renamed to user-friendly titles (for example, \code{sy} +becomes \code{synonyms}). } \description{ This function maps cell line IDs to accession numbers using the Cellosaurus database. diff --git a/tests/testthat/test_cellosaurus.R b/tests/testthat/test_cellosaurus.R index feea320..c93ead1 100644 --- a/tests/testthat/test_cellosaurus.R +++ b/tests/testthat/test_cellosaurus.R @@ -3,8 +3,6 @@ library(testthat) library(checkmate) - - test_that("mapCell2Accession works as expected", { # Test case 1: Test with a valid cell line name cell_line1 <- "Hela" @@ -52,7 +50,6 @@ test_that("mapCell DOR 13 works", { expect_data_table(result2, nrows = 1, ncols = 3) # works expect_data_table(result3, nrows = 2, ncols = 3) # works - expect_equal(result2$accession, "CVCL_6774") expect_equal(result2$cellLineName, "DOV13") @@ -83,7 +80,6 @@ test_that("raw param works", { checkmate::expect_character(lines) expect_true(length(lines) > 2000 & 10000 > length(lines)) - parsed_lines <- Map( f = function(lines, i, j) { @@ -100,7 +96,8 @@ test_that("raw param works", { expect_data_table(result, min.rows = 1, min.cols = 9) expect_true( all( - c("cellLineName", "accession", "comments", "synonyms") %in% colnames(result) + c("cellLineName", "accession", "comments", "synonyms") %in% + colnames(result) ) ) }) @@ -115,3 +112,17 @@ test_that("parsed works", { ) ) }) + +test_that("parsed = FALSE returns user friendly column names", { + result <- mapCell2Accession("Hela", parsed = FALSE) + expect_data_table(result, min.rows = 1) + expect_true(all(c("synonyms", "diseases") %in% colnames(result))) +}) + +test_that("include_query = FALSE removes query columns", { + result1 <- mapCell2Accession("Hela", include_query = FALSE) + result2 <- mapCell2Accession("Hela", parsed = FALSE, include_query = FALSE) + + expect_false(any(grepl("^query", colnames(result1)))) + expect_false(any(grepl("^query", colnames(result2)))) +}) diff --git a/tests/testthat/test_cellosaurus_helpers.R b/tests/testthat/test_cellosaurus_helpers.R index c7df9bd..10aa3eb 100644 --- a/tests/testthat/test_cellosaurus_helpers.R +++ b/tests/testthat/test_cellosaurus_helpers.R @@ -3,15 +3,24 @@ library(testthat) library(checkmate) test_that(".create_cellosaurus_queries is acting as expected", { - queries <- AnnotationGx:::.create_cellosaurus_queries(c("ID1", "ID2", "ID3"), "Accession") + queries <- AnnotationGx:::.create_cellosaurus_queries( + c("ID1", "ID2", "ID3"), + "Accession" + ) expect_character(queries) expect_equal(queries, c("Accession:ID1", "Accession:ID2", "Accession:ID3")) - queries2 <- AnnotationGx:::.create_cellosaurus_queries(c("ID1", "ID2", "ID3"), c("Accession", "Name", "Species")) + queries2 <- AnnotationGx:::.create_cellosaurus_queries( + c("ID1", "ID2", "ID3"), + c("Accession", "Name", "Species") + ) expect_equal(queries2, c("Accession:ID1", "Name:ID2", "Species:ID3")) }) test_that(".cellosaurus_schema is acting as expected", { + skip_if_offline() + skip_on_cran() + schema <- AnnotationGx:::.cellosaurus_schema() expect_list(schema) names_list <- c("openapi", "info", "paths", "components", "tags") @@ -20,30 +29,63 @@ test_that(".cellosaurus_schema is acting as expected", { }) test_that(".build_cellosaurus_request is acting as expected", { + skip_if_offline() + skip_on_cran() + request <- AnnotationGx:::.build_cellosaurus_request() expect_class(request, "httr2_request") expected <- "https://api.cellosaurus.org/search/cell-line?q=id%3AHeLa&sort=ac%20asc&fields=id%2Cac%2Chi%2Cca%2Csx%2Cag%2Cdi%2Cderived-from-site%2Cmisspelling&format=tsv&rows=1" expect_equal(request$url, expected) - response <- AnnotationGx:::.perform_request(request) |> AnnotationGx:::.parse_resp_tsv(show_col_types = FALSE, skip = 14) + response <- AnnotationGx:::.perform_request(request) |> + AnnotationGx:::.parse_resp_tsv(show_col_types = FALSE, skip = 14) expect_class(response, "spec_tbl_df") - expect_equal(nrow(response), 2) + expect_gte(nrow(response), 1) request2 <- AnnotationGx:::.build_cellosaurus_request( query = "id:HeLa", to = c( - "id", "ac", "sy", "acas", "sx", "ag", "di", "dio", "din", "dr", "cell-type", - "derived-from-site", "misspelling", "dt", "dtc", "dtu", "dtv", "genome-ancestry" + "id", + "ac", + "sy", + "acas", + "sx", + "ag", + "di", + "dio", + "din", + "dr", + "cell-type", + "derived-from-site", + "misspelling", + "dt", + "dtc", + "dtu", + "dtv", + "genome-ancestry" ), - numResults = 2, apiResource = "search/cell-line", output = "TSV" + numResults = 2, + apiResource = "search/cell-line", + output = "TSV" ) expect_equal( request2$url, "https://api.cellosaurus.org/search/cell-line?q=id%3AHeLa&sort=ac%20asc&fields=id%2Cac%2Csy%2Cacas%2Csx%2Cag%2Cdi%2Cdio%2Cdin%2Cdr%2Ccell-type%2Cderived-from-site%2Cmisspelling%2Cdt%2Cdtc%2Cdtu%2Cdtv%2Cgenome-ancestry&format=tsv&rows=2" ) - response <- AnnotationGx:::.perform_request(request2) |> AnnotationGx:::.parse_resp_tsv(show_col_types = FALSE, skip = 14) - expect_equal(nrow(response), 3) + response <- AnnotationGx:::.perform_request(request2) |> + AnnotationGx:::.parse_resp_tsv(show_col_types = FALSE, skip = 14) + expect_gte(nrow(response), 1) +}) + +test_that(".build_cellosaurus_request accepts documented sort fields", { + skip_if_offline() + skip_on_cran() + + expect_silent(AnnotationGx:::.build_cellosaurus_request(sort = "ID")) + expect_silent(AnnotationGx:::.build_cellosaurus_request(sort = "idsy")) + expect_silent(AnnotationGx:::.build_cellosaurus_request(sort = "miss")) + expect_silent(AnnotationGx:::.build_cellosaurus_request(sort = "ox")) }) @@ -51,11 +93,30 @@ test_that("common_cellosaurus_fields returns the expected fields", { fields <- AnnotationGx::cellosaurus_fields(common = T, upper = T) expect_character(fields) expect_fields <- c( - "id", "ac", "acas", "sy", "dr", "di", "din", "dio", "ox", "cc", "sx", "ag", "oi", - "hi", "ch", "ca", "dt", "dtc", "dtu", "dtv", "from", "group" + "id", + "ac", + "acas", + "sy", + "dr", + "di", + "din", + "dio", + "ox", + "cc", + "sx", + "ag", + "oi", + "hi", + "ch", + "ca", + "dt", + "dtc", + "dtu", + "dtv", + "from", + "group" ) - expect_equal(fields, toupper(expect_fields)) }) @@ -64,26 +125,114 @@ test_that(".cellosaurus_extResources returns the expected external resources", { expect_character(resources) expected_resources <- c( - "4DN", "Abcam", "ABCD", "ABM", "AddexBio", "ArrayExpress", - "ATCC", "BCGO", "BCRC", "BCRJ", "BEI_Resources", - "BioGRID_ORCS_Cell_line", "BTO", "BioSample", "BioSamples", - "cancercelllines", "CancerTools", "CBA", "CCLV", "CCRID", - "CCTCC", "Cell_Biolabs", "Cell_Model_Passport", "CGH-DB", - "ChEMBL-Cells", "ChEMBL-Targets", "CLDB", "CLO", "CLS", - "ColonAtlas", "Coriell", "Cosmic", "Cosmic-CLP", "dbGAP", - "dbMHC", "DepMap", "DGRC", "DiscoverX", "DSHB", "DSMZ", - "DSMZCellDive", "EBiSC", "ECACC", "EFO", "EGA", "ENCODE", - "ESTDAB", "FCDI", "FCS-free", "FlyBase_Cell_line", "GDSC", - "GeneCopoeia", "GEO", "HipSci", "HIVReagentProgram", "Horizon_Discovery", - "hPSCreg", "IARC_TP53", "IBRC", "ICLC", "ICLDB", "IGRhCellID", - "IGSR", "IHW", "Imanis", "Innoprot", "IPD-IMGT/HLA", "ISCR", - "IZSLER", "JCRB", "KCB", "KCLB", "Kerafast", "KYinno", "LiGeA", - "LIMORE", "LINCS_HMS", "LINCS_LDP", "Lonza", "MCCL", "MeSH", - "MetaboLights", "Millipore", "MMRRC", "NCBI_Iran", "NCI-DTP", "NHCDR", - "NIHhESC", "NISES", "NRFC", "PerkinElmer", "PharmacoDB", "PRIDE", - "Progenetix", "PubChem_Cell_line", "RCB", "Rockland", "RSCB", "SKIP", - "SKY/M-FISH/CGH", "SLKBase", "TKG", "TNGB", "TOKU-E", "Ubigene", - "WiCell", "Wikidata", "Ximbio" + "4DN", + "Abcam", + "ABCD", + "ABM", + "AddexBio", + "ArrayExpress", + "ATCC", + "BCGO", + "BCRC", + "BCRJ", + "BEI_Resources", + "BioGRID_ORCS_Cell_line", + "BTO", + "BioSample", + "BioSamples", + "cancercelllines", + "CancerTools", + "CBA", + "CCLV", + "CCRID", + "CCTCC", + "Cell_Biolabs", + "Cell_Model_Passport", + "CGH-DB", + "ChEMBL-Cells", + "ChEMBL-Targets", + "CLDB", + "CLO", + "CLS", + "ColonAtlas", + "Coriell", + "Cosmic", + "Cosmic-CLP", + "dbGAP", + "dbMHC", + "DepMap", + "DGRC", + "DiscoverX", + "DSHB", + "DSMZ", + "DSMZCellDive", + "EBiSC", + "ECACC", + "EFO", + "EGA", + "ENCODE", + "ESTDAB", + "FCDI", + "FCS-free", + "FlyBase_Cell_line", + "GDSC", + "GeneCopoeia", + "GEO", + "HipSci", + "HIVReagentProgram", + "Horizon_Discovery", + "hPSCreg", + "IARC_TP53", + "IBRC", + "ICLC", + "ICLDB", + "IGRhCellID", + "IGSR", + "IHW", + "Imanis", + "Innoprot", + "IPD-IMGT/HLA", + "ISCR", + "IZSLER", + "JCRB", + "KCB", + "KCLB", + "Kerafast", + "KYinno", + "LiGeA", + "LIMORE", + "LINCS_HMS", + "LINCS_LDP", + "Lonza", + "MCCL", + "MeSH", + "MetaboLights", + "Millipore", + "MMRRC", + "NCBI_Iran", + "NCI-DTP", + "NHCDR", + "NIHhESC", + "NISES", + "NRFC", + "PerkinElmer", + "PharmacoDB", + "PRIDE", + "Progenetix", + "PubChem_Cell_line", + "RCB", + "Rockland", + "RCSB", + "SKIP", + "SKY/M-FISH/CGH", + "SLKBase", + "TKG", + "TNGB", + "TOKU-E", + "Ubigene", + "WiCell", + "Wikidata", + "Ximbio" ) expect_equal(resources, expected_resources)