diff --git a/R/unichem.R b/R/unichem.R index 8bbefde..0f21b66 100644 --- a/R/unichem.R +++ b/R/unichem.R @@ -1,12 +1,11 @@ - # Unichem API documentation: https://www.ebi.ac.uk/unichem/info/webservices #' Get the list of sources in UniChem. -#' +#' #' @param all_columns `boolean` Whether to return all columns. Defaults to FALSE. -#' +#' -#' +#' #' Returns a `data.table` with the following columns: #' - `CompoundCount` (integer): Total of compounds provided by that source #' - `BaseURL` (string): Source Base URL for compounds @@ -21,120 +20,284 @@ #' - `ReleaseNumber` (integer): Release number of the source database data stored in UniChEM #' - `URL` (string): Main URL for the source #' - `UpdateComments` (string): Notes about the update process of that source to UniChEM -#' -#' +#' +#' #' @return A data.table with the list of sources in UniChem. -#' +#' #' @export getUnichemSources <- function(all_columns = FALSE) { - funContext <- .funContext("AnnotationGx::getUnichemSources") - - response <- .build_unichem_query("sources") |> - .build_request() |> - .perform_request() |> - .parse_resp_json() - - if(response$response != "Success"){ - .err(funContext, "Unichem API request failed.") - } + funContext <- .funContext("AnnotationGx::getUnichemSources") - .debug(funContext, sprintf("Unichem sourceCount: %s", response$totalSources)) + response <- .build_unichem_query("sources") |> + .build_request() |> + .perform_request() |> + .parse_resp_json() - sources_dt <- .asDT(response$sources) + if (response$response != "Success") { + .err(funContext, "Unichem API request failed.") + } - old_names <- c( - "UCICount", "baseIdUrl", "description", "lastUpdated", "name", - "nameLabel", "nameLong", "sourceID", "srcDetails", "srcReleaseDate", - "srcReleaseNumber", "srcUrl", "updateComments") + .debug( + funContext, + sprintf("Unichem sourceCount: %s", response$totalSources) + ) - new_names <- c( - "CompoundCount", "BaseURL", "Description", "LastUpdated", "Name", - "NameLabel", "NameLong", "SourceID", "Details", "ReleaseDate", - "ReleaseNumber", "URL", "UpdateComments") - - data.table::setnames(sources_dt, old_names, new_names) - - new_order <- c( - "Name", "NameLabel", "NameLong", "SourceID", "CompoundCount", - "BaseURL", "URL", "Details", - "Description", "ReleaseNumber", "ReleaseDate", "LastUpdated", - "UpdateComments" - ) + sources_dt <- .asDT(response$sources) + + old_names <- c( + "UCICount", + "baseIdUrl", + "description", + "lastUpdated", + "name", + "nameLabel", + "nameLong", + "sourceID", + "srcDetails", + "srcReleaseDate", + "srcReleaseNumber", + "srcUrl", + "updateComments" + ) + + new_names <- c( + "CompoundCount", + "BaseURL", + "Description", + "LastUpdated", + "Name", + "NameLabel", + "NameLong", + "SourceID", + "Details", + "ReleaseDate", + "ReleaseNumber", + "URL", + "UpdateComments" + ) + data.table::setnames(sources_dt, old_names, new_names) - sources_dt <- sources_dt[, new_order, with = FALSE] + new_order <- c( + "Name", + "NameLabel", + "NameLong", + "SourceID", + "CompoundCount", + "BaseURL", + "URL", + "Details", + "Description", + "ReleaseNumber", + "ReleaseDate", + "LastUpdated", + "UpdateComments" + ) - if(all_columns) return(sources_dt) + sources_dt <- sources_dt[, new_order, with = FALSE] - sources_dt[, c("Name", "SourceID")] + if (all_columns) { + return(sources_dt) + } + sources_dt[, c("Name", "SourceID")] } #' Query UniChem for a compound. -#' +#' #' This function queries the UniChem API for a compound based on the provided parameters. -#' -#' @param compound `character` or `integer` The compound identifier to search for. +#' +#' @param compound `character`, `integer`, or a list of such values. When a vector +#' or list is supplied, each element is queried and the results are returned as +#' a named list. #' @param type `character` The type of compound identifier to search for. Valid types are "uci", "inchi", "inchikey", and "sourceID". -#' @param sourceID `integer` The source ID to search for if the type is "sourceID". Defaults to NULL. +#' @param sourceID `integer` The source ID to search for if the type is "sourceID". +#' When querying multiple compounds, this can be a vector the same length as +#' `compound` or a single value recycled to all queries. Defaults to `NA`. #' @param request_only `boolean` Whether to return the request only. Defaults to FALSE. #' @param raw `boolean` Whether to return the raw response. Defaults to FALSE. +#' @param progress `logical` or `character`. Passed through to +#' `.perform_request_parallel()` when multiple compounds are supplied. Use a +#' character string to customise the progress label. Defaults to +#' `"Querying UniChem..."`. #' @param ... Additional arguments. -#' -#' @return A list with the external mappings and the UniChem mappings. -#' +#' +#' @return For a single query, a list with the external mappings and the UniChem +#' mappings. For multiple queries, a named list of such results (one per +#' compound). If `raw = TRUE`, raw responses are returned instead. +#' #' @examples #' queryUnichemCompound(type = "sourceID", compound = "444795", sourceID = 22) -#' +#' #' @export queryUnichemCompound <- function( - compound, type, sourceID = NA_integer_, request_only = FALSE, raw = FALSE, ... -){ - checkmate::assert_string(type) - checkmate::assert_atomic(compound) - checkmate::assert_integerish(sourceID) - checkmate::assertLogical(request_only) - checkmate::assertLogical(raw) - - request <- .build_unichem_compound_req(type, compound, sourceID,...) - if(request_only) return(request) - - response <- request |> - .perform_request() |> - .parse_resp_json() - - if(raw) return(response) - - if(response$response != "Success"){ - msg <- paste( - "Unichem API request failed for compound", compound, " - with type", type, - " . Error:", response$error - ) - .err(.funContext("AnnotationGx::queryUnichemCompound"), msg) + compound, + type, + sourceID = NA_integer_, + request_only = FALSE, + raw = FALSE, + progress = "Querying UniChem...", + ... +) { + checkmate::assert_string(type) + checkmate::assert_flag(request_only) + checkmate::assert_flag(raw) + checkmate::assert( + checkmate::check_flag(progress), + checkmate::check_string(progress, min.chars = 1) + ) + + compounds <- if (is.list(compound)) { + unlist(compound, recursive = TRUE, use.names = TRUE) + } else { + compound + } + checkmate::assert_atomic_vector(compounds, min.len = 1) + + many_queries <- length(compounds) > 1 + + validate_source_ids <- function(src_ids) { + if (type != "sourceID") { + return(rep(NA_integer_, length(compounds))) } - # Mapping names to be consistent with other API calls - mapped_sources_dt <- .asDT(response$compounds$sources) - old_names <- c("compoundId", "shortName", "longName", "id", "url") + checkmate::assert_integerish(src_ids, any.missing = FALSE) - new_names <- c("compoundID", "Name", "NameLong", "sourceID", "sourceURL") - data.table::setnames(mapped_sources_dt, old = old_names, new = new_names) + if (length(src_ids) == 1L) { + src_ids <- rep(src_ids, length(compounds)) + } else if (length(src_ids) != length(compounds)) { + stop( + "`sourceID` must be length 1 or match the number of compounds ", + "when type = 'sourceID'" + ) + } + checkmate::assert_integerish(src_ids, any.missing = FALSE) + + valid_ids <- getUnichemSources()$SourceID + invalid_ids <- setdiff(unique(src_ids), valid_ids) + if (length(invalid_ids) > 0L) { + stop( + "`sourceID` contains value(s) not available in UniChem: ", + paste(invalid_ids, collapse = ", "), + "\nValid source IDs: ", + paste(valid_ids, collapse = ", ") + ) + } + src_ids + } + + source_ids <- validate_source_ids(sourceID) + + build_request <- function(cmp, src) { + .build_unichem_compound_req( + type = type, + compound = cmp, + sourceID = if (is.na(src)) NULL else src, + ... + ) + } + + parse_response <- function(parsed, cmp_label) { + if (parsed$response != "Success") { + msg <- paste( + "Unichem API request failed for compound", + cmp_label, + "with type", + type, + ". Error:", + parsed$error + ) + .err(.funContext("AnnotationGx::queryUnichemCompound"), msg) + } + + mapped_sources_dt <- .asDT(parsed$compounds$sources) + old_names <- c("compoundId", "shortName", "longName", "id", "url") + new_names <- c( + "compoundID", + "Name", + "NameLong", + "sourceID", + "sourceURL" + ) + data.table::setnames( + mapped_sources_dt, + old = old_names, + new = new_names + ) External_Mappings <- mapped_sources_dt[, new_names, with = FALSE] - + UniChem_Mappings <- list( - UniChem.UCI = response$compounds$uci, - UniChem.InchiKey = response$compounds$standardInchiKey, - UniChem.Inchi = response$compounds$inchi$inchi, - UniChem.formula = response$compounds$inchi$formula, - UniChem.connections = response$compounds$inchi$connections, - UniChem.hAtoms = response$compounds$inchi$hAtoms + UniChem.UCI = parsed$compounds$uci, + UniChem.InchiKey = parsed$compounds$standardInchiKey, + UniChem.Inchi = parsed$compounds$inchi$inchi, + UniChem.formula = parsed$compounds$inchi$formula, + UniChem.connections = parsed$compounds$inchi$connections, + UniChem.hAtoms = parsed$compounds$inchi$hAtoms ) list( - External_Mappings = External_Mappings, - UniChem_Mappings = UniChem_Mappings + External_Mappings = External_Mappings, + UniChem_Mappings = UniChem_Mappings + ) + } + + if (many_queries) { + requests <- Map(build_request, compounds, source_ids) + name_candidates <- names(compounds) + if ( + !is.null(name_candidates) && length(name_candidates) == length(compounds) + ) { + names(requests) <- name_candidates + } else { + names(requests) <- as.character(compounds) + } + + if (request_only) { + return(requests) + } + + responses <- .perform_request_parallel(requests, progress = progress) + names(responses) <- names(requests) + + parsed_responses <- Map(.parse_resp_json, responses) + + if (raw) { + names(parsed_responses) <- names(responses) + return(parsed_responses) + } + + results <- Map( + function(parsed, cmp_label) { + tryCatch( + parse_response(parsed, cmp_label), + error = function(e) { + structure( + list(error = conditionMessage(e)), + class = c("unichem_error", "list") + ) + } + ) + }, + parsed_responses, + names(responses) ) -} \ No newline at end of file + names(results) <- names(responses) + return(results) + } + + request <- build_request(compounds[[1L]], source_ids[[1L]]) + if (request_only) { + return(request) + } + + response <- request |> .perform_request() + parsed <- .parse_resp_json(response) + + if (raw) { + return(parsed) + } + + parse_response(parsed, compounds[[1L]]) +} diff --git a/man/queryUnichemCompound.Rd b/man/queryUnichemCompound.Rd index 150e170..9b71347 100644 --- a/man/queryUnichemCompound.Rd +++ b/man/queryUnichemCompound.Rd @@ -10,24 +10,36 @@ queryUnichemCompound( sourceID = NA_integer_, request_only = FALSE, raw = FALSE, + progress = "Querying UniChem...", ... ) } \arguments{ -\item{compound}{\code{character} or \code{integer} The compound identifier to search for.} +\item{compound}{\code{character}, \code{integer}, or a list of such values. When a vector +or list is supplied, each element is queried and the results are returned as +a named list.} \item{type}{\code{character} The type of compound identifier to search for. Valid types are "uci", "inchi", "inchikey", and "sourceID".} -\item{sourceID}{\code{integer} The source ID to search for if the type is "sourceID". Defaults to NULL.} +\item{sourceID}{\code{integer} The source ID to search for if the type is "sourceID". +When querying multiple compounds, this can be a vector the same length as +\code{compound} or a single value recycled to all queries. Defaults to \code{NA}.} \item{request_only}{\code{boolean} Whether to return the request only. Defaults to FALSE.} \item{raw}{\code{boolean} Whether to return the raw response. Defaults to FALSE.} +\item{progress}{\code{logical} or \code{character}. Passed through to +\code{.perform_request_parallel()} when multiple compounds are supplied. Use a +character string to customise the progress label. Defaults to +\code{"Querying UniChem..."}.} + \item{...}{Additional arguments.} } \value{ -A list with the external mappings and the UniChem mappings. +For a single query, a list with the external mappings and the UniChem +mappings. For multiple queries, a named list of such results (one per +compound). If \code{raw = TRUE}, raw responses are returned instead. } \description{ This function queries the UniChem API for a compound based on the provided parameters. diff --git a/tests/testthat/test_unichem.R b/tests/testthat/test_unichem.R index 2d9d985..eb3c2fb 100644 --- a/tests/testthat/test_unichem.R +++ b/tests/testthat/test_unichem.R @@ -4,13 +4,23 @@ library(checkmate) test_that("getUnichemSources returns a data.table with the correct columns", { sources <- getUnichemSources(all_columns = TRUE) - + expected_columns <- c( - "Name", "NameLabel", "NameLong", "SourceID", "CompoundCount", - "BaseURL", "URL", "Details", "Description", "ReleaseNumber", - "ReleaseDate", "LastUpdated", "UpdateComments" + "Name", + "NameLabel", + "NameLong", + "SourceID", + "CompoundCount", + "BaseURL", + "URL", + "Details", + "Description", + "ReleaseNumber", + "ReleaseDate", + "LastUpdated", + "UpdateComments" ) - + expect_data_table( sources, all.missing = FALSE, @@ -19,40 +29,84 @@ test_that("getUnichemSources returns a data.table with the correct columns", { col.names = 'named', info = "The data.table should have the correct columns. The min number of rows and columns may change over time and is set on - from UniChem as of March 2024.", - ) + from UniChem as of March 2024." + ) + + expect_setequal(names(sources), expected_columns) }) test_that("queryUnichemCompound returns the expected results", { # Test case 1 - result1 <- queryUnichemCompound(type = "sourceID", compound = "444795", sourceID = 22) + result1 <- queryUnichemCompound( + type = "sourceID", + compound = "444795", + sourceID = 22 + ) expect_true(is.list(result1)) expect_true("External_Mappings" %in% names(result1)) expect_true("UniChem_Mappings" %in% names(result1)) - + # Test case 2 - expect_error(queryUnichemCompound(type = "inchikey", compound = "InchiKey123")) + expect_error(queryUnichemCompound( + type = "inchikey", + compound = "InchiKey123" + )) +}) +test_that("queryUnichemCompound handles vector inputs", { + fallback <- c( + "444795", + "444796" + ) + + results <- queryUnichemCompound( + type = "sourceID", + compound = fallback, + sourceID = 22, + progress = FALSE + ) + + expect_length(results, length(fallback)) + expect_named(results, fallback) + expect_true(all(vapply(results, is.list, logical(1)))) }) +test_that("queryUnichemCompound handles non source id lists", { + compounds <- c("538323", "538324") + results <- queryUnichemCompound( + compound = compounds, + type = "uci", + progress = FALSE + ) + + expect_named(results, compounds) +}) test_that("queryUnichemCompound returns the expected results 2", { # Test case 1 - result1 <- queryUnichemCompound(type = "inchikey", compound = "BSYNRYMUTXBXSQ-UHFFFAOYSA-N", raw = T) + result1 <- queryUnichemCompound( + type = "inchikey", + compound = "BSYNRYMUTXBXSQ-UHFFFAOYSA-N", + raw = T + ) expect_true(is.list(result1)) - checkmate::expect_names( - names(result1), - subset.of=c("compounds", "notFound", "response", "totalCompounds")) + names(result1), + subset.of = c("compounds", "notFound", "response", "totalCompounds") + ) checkmate::expect_names( names(result1$compounds), - subset.of=c("inchi", "sources", "standardInchiKey", "uci") + subset.of = c("inchi", "sources", "standardInchiKey", "uci") ) - result2 <- queryUnichemCompound(type = "inchikey", compound = "BSYNRYMUTXBXSQ-UHFFFAOYSA-N", raw = F) + result2 <- queryUnichemCompound( + type = "inchikey", + compound = "BSYNRYMUTXBXSQ-UHFFFAOYSA-N", + raw = F + ) expect_true(is.list(result2)) @@ -64,10 +118,12 @@ test_that("queryUnichemCompound returns the expected results 2", { checkmate::expect_names( names(result2$UniChem_Mappings), subset.of = c( - "UniChem.UCI", "UniChem.InchiKey", 'UniChem.Inchi', - 'UniChem.formula','UniChem.connections','UniChem.hAtoms' + "UniChem.UCI", + "UniChem.InchiKey", + 'UniChem.Inchi', + 'UniChem.formula', + 'UniChem.connections', + 'UniChem.hAtoms' ) ) - - })