From 276cf3f011ea197b67fd5224fc7106d47a88753b Mon Sep 17 00:00:00 2001 From: Michael Tran Date: Tue, 21 Oct 2025 10:47:09 -0400 Subject: [PATCH 1/7] feat: enable parallel querying --- R/unichem.R | 292 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 216 insertions(+), 76 deletions(-) diff --git a/R/unichem.R b/R/unichem.R index 8bbefde..2b84f7a 100644 --- a/R/unichem.R +++ b/R/unichem.R @@ -1,12 +1,11 @@ - # Unichem API documentation: https://www.ebi.ac.uk/unichem/info/webservices #' Get the list of sources in UniChem. -#' +#' #' @param all_columns `boolean` Whether to return all columns. Defaults to FALSE. -#' +#' -#' +#' #' Returns a `data.table` with the following columns: #' - `CompoundCount` (integer): Total of compounds provided by that source #' - `BaseURL` (string): Source Base URL for compounds @@ -21,10 +20,10 @@ #' - `ReleaseNumber` (integer): Release number of the source database data stored in UniChEM #' - `URL` (string): Main URL for the source #' - `UpdateComments` (string): Notes about the update process of that source to UniChEM -#' -#' +#' +#' #' @return A data.table with the list of sources in UniChem. -#' +#' #' @export getUnichemSources <- function(all_columns = FALSE) { funContext <- .funContext("AnnotationGx::getUnichemSources") @@ -32,109 +31,250 @@ getUnichemSources <- function(all_columns = FALSE) { response <- .build_unichem_query("sources") |> .build_request() |> .perform_request() |> - .parse_resp_json() - - if(response$response != "Success"){ + .parse_resp_json() + + if (response$response != "Success") { .err(funContext, "Unichem API request failed.") } - .debug(funContext, sprintf("Unichem sourceCount: %s", response$totalSources)) + .debug( + funContext, + sprintf("Unichem sourceCount: %s", response$totalSources) + ) sources_dt <- .asDT(response$sources) old_names <- c( - "UCICount", "baseIdUrl", "description", "lastUpdated", "name", - "nameLabel", "nameLong", "sourceID", "srcDetails", "srcReleaseDate", - "srcReleaseNumber", "srcUrl", "updateComments") + "UCICount", + "baseIdUrl", + "description", + "lastUpdated", + "name", + "nameLabel", + "nameLong", + "sourceID", + "srcDetails", + "srcReleaseDate", + "srcReleaseNumber", + "srcUrl", + "updateComments" + ) new_names <- c( - "CompoundCount", "BaseURL", "Description", "LastUpdated", "Name", - "NameLabel", "NameLong", "SourceID", "Details", "ReleaseDate", - "ReleaseNumber", "URL", "UpdateComments") - + "CompoundCount", + "BaseURL", + "Description", + "LastUpdated", + "Name", + "NameLabel", + "NameLong", + "SourceID", + "Details", + "ReleaseDate", + "ReleaseNumber", + "URL", + "UpdateComments" + ) + data.table::setnames(sources_dt, old_names, new_names) new_order <- c( - "Name", "NameLabel", "NameLong", "SourceID", "CompoundCount", - "BaseURL", "URL", "Details", - "Description", "ReleaseNumber", "ReleaseDate", "LastUpdated", + "Name", + "NameLabel", + "NameLong", + "SourceID", + "CompoundCount", + "BaseURL", + "URL", + "Details", + "Description", + "ReleaseNumber", + "ReleaseDate", + "LastUpdated", "UpdateComments" ) - sources_dt <- sources_dt[, new_order, with = FALSE] - if(all_columns) return(sources_dt) + if (all_columns) { + return(sources_dt) + } sources_dt[, c("Name", "SourceID")] - } #' Query UniChem for a compound. -#' +#' #' This function queries the UniChem API for a compound based on the provided parameters. -#' -#' @param compound `character` or `integer` The compound identifier to search for. +#' +#' @param compound `character`, `integer`, or a list of such values. When a vector +#' or list is supplied, each element is queried and the results are returned as +#' a named list. #' @param type `character` The type of compound identifier to search for. Valid types are "uci", "inchi", "inchikey", and "sourceID". -#' @param sourceID `integer` The source ID to search for if the type is "sourceID". Defaults to NULL. +#' @param sourceID `integer` The source ID to search for if the type is "sourceID". +#' When querying multiple compounds, this can be a vector the same length as +#' `compound` or a single value recycled to all queries. Defaults to `NA`. #' @param request_only `boolean` Whether to return the request only. Defaults to FALSE. #' @param raw `boolean` Whether to return the raw response. Defaults to FALSE. +#' @param progress `logical` or `character`. Passed through to +#' `.perform_request_parallel()` when multiple compounds are supplied. Use a +#' character string to customise the progress label. Defaults to +#' `"Querying UniChem..."`. #' @param ... Additional arguments. -#' -#' @return A list with the external mappings and the UniChem mappings. -#' +#' +#' @return For a single query, a list with the external mappings and the UniChem +#' mappings. For multiple queries, a named list of such results (one per +#' compound). If `raw = TRUE`, raw responses are returned instead. +#' #' @examples #' queryUnichemCompound(type = "sourceID", compound = "444795", sourceID = 22) -#' +#' #' @export queryUnichemCompound <- function( - compound, type, sourceID = NA_integer_, request_only = FALSE, raw = FALSE, ... -){ + compound, + type, + sourceID = NA_integer_, + request_only = FALSE, + raw = FALSE, + progress = "Querying UniChem...", + ... +) { checkmate::assert_string(type) - checkmate::assert_atomic(compound) - checkmate::assert_integerish(sourceID) - checkmate::assertLogical(request_only) - checkmate::assertLogical(raw) - - request <- .build_unichem_compound_req(type, compound, sourceID,...) - if(request_only) return(request) - - response <- request |> - .perform_request() |> - .parse_resp_json() - - if(raw) return(response) - - if(response$response != "Success"){ - msg <- paste( - "Unichem API request failed for compound", compound, " - with type", type, - " . Error:", response$error + checkmate::assert_flag(request_only) + checkmate::assert_flag(raw) + checkmate::assert( + checkmate::check_flag(progress), + checkmate::check_string(progress, min.chars = 1) + ) + + compounds <- if (is.list(compound)) { + unlist(compound, recursive = TRUE, use.names = FALSE) + } else { + compound + } + checkmate::assert_atomic_vector(compounds, min.len = 1) + + many_queries <- length(compounds) > 1 + + validate_source_ids <- function(src_ids) { + if (type != "sourceID") { + return(rep(NA_integer_, length(compounds))) + } + + checkmate::assert_integerish(src_ids, any.missing = FALSE) + total_sources <- max(getUnichemSources()$SourceID) + + if (length(src_ids) == 1L) { + src_ids <- rep(src_ids, length(compounds)) + } else if (length(src_ids) != length(compounds)) { + stop( + "`sourceID` must be length 1 or match the number of compounds ", + "when type = 'sourceID'" + ) + } + checkmate::assert_integerish(src_ids, lower = 1, upper = total_sources) + src_ids + } + + source_ids <- validate_source_ids(sourceID) + + build_request <- function(cmp, src) { + .build_unichem_compound_req( + type = type, + compound = cmp, + sourceID = if (is.na(src)) NULL else src, + ... ) - .err(.funContext("AnnotationGx::queryUnichemCompound"), msg) } - # Mapping names to be consistent with other API calls - mapped_sources_dt <- .asDT(response$compounds$sources) - old_names <- c("compoundId", "shortName", "longName", "id", "url") - - new_names <- c("compoundID", "Name", "NameLong", "sourceID", "sourceURL") - data.table::setnames(mapped_sources_dt, old = old_names, new = new_names) - - External_Mappings <- mapped_sources_dt[, new_names, with = FALSE] - - UniChem_Mappings <- list( - UniChem.UCI = response$compounds$uci, - UniChem.InchiKey = response$compounds$standardInchiKey, - UniChem.Inchi = response$compounds$inchi$inchi, - UniChem.formula = response$compounds$inchi$formula, - UniChem.connections = response$compounds$inchi$connections, - UniChem.hAtoms = response$compounds$inchi$hAtoms - ) + parse_response <- function(parsed, cmp_label) { + if (parsed$response != "Success") { + msg <- paste( + "Unichem API request failed for compound", + cmp_label, + "with type", + type, + ". Error:", + parsed$error + ) + .err(.funContext("AnnotationGx::queryUnichemCompound"), msg) + } - list( - External_Mappings = External_Mappings, - UniChem_Mappings = UniChem_Mappings - ) + mapped_sources_dt <- .asDT(parsed$compounds$sources) + old_names <- c("compoundId", "shortName", "longName", "id", "url") + new_names <- c( + "compoundID", + "Name", + "NameLong", + "sourceID", + "sourceURL" + ) + data.table::setnames( + mapped_sources_dt, + old = old_names, + new = new_names + ) + + External_Mappings <- mapped_sources_dt[, new_names, with = FALSE] + + UniChem_Mappings <- list( + UniChem.UCI = parsed$compounds$uci, + UniChem.InchiKey = parsed$compounds$standardInchiKey, + UniChem.Inchi = parsed$compounds$inchi$inchi, + UniChem.formula = parsed$compounds$inchi$formula, + UniChem.connections = parsed$compounds$inchi$connections, + UniChem.hAtoms = parsed$compounds$inchi$hAtoms + ) + + list( + External_Mappings = External_Mappings, + UniChem_Mappings = UniChem_Mappings + ) + } + + if (many_queries) { + requests <- Map(build_request, compounds, source_ids) + names(requests) <- if (!is.null(names(compound))) { + names(compound) + } else { + as.character(compounds) + } + + if (request_only) { + return(requests) + } + + responses <- .perform_request_parallel(requests, progress = progress) + names(responses) <- names(requests) + + parsed_responses <- Map(.parse_resp_json, responses) -} \ No newline at end of file + if (raw) { + names(parsed_responses) <- names(responses) + return(parsed_responses) + } + + results <- Map( + parse_response, + parsed_responses, + names(responses) + ) + + names(results) <- names(responses) + return(results) + } + + request <- build_request(compounds[[1L]], source_ids[[1L]]) + if (request_only) { + return(request) + } + + response <- request |> .perform_request() + parsed <- .parse_resp_json(response) + + if (raw) { + return(parsed) + } + + parse_response(parsed, compounds[[1L]]) +} From 45063a0f2c00f2aa448b180656e4251e7bec5208 Mon Sep 17 00:00:00 2001 From: Michael Tran Date: Tue, 21 Oct 2025 10:47:37 -0400 Subject: [PATCH 2/7] test: add unichem tests for multi-input --- tests/testthat/test_unichem.R | 92 +++++++++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 19 deletions(-) diff --git a/tests/testthat/test_unichem.R b/tests/testthat/test_unichem.R index 2d9d985..f036f85 100644 --- a/tests/testthat/test_unichem.R +++ b/tests/testthat/test_unichem.R @@ -4,13 +4,23 @@ library(checkmate) test_that("getUnichemSources returns a data.table with the correct columns", { sources <- getUnichemSources(all_columns = TRUE) - + expected_columns <- c( - "Name", "NameLabel", "NameLong", "SourceID", "CompoundCount", - "BaseURL", "URL", "Details", "Description", "ReleaseNumber", - "ReleaseDate", "LastUpdated", "UpdateComments" + "Name", + "NameLabel", + "NameLong", + "SourceID", + "CompoundCount", + "BaseURL", + "URL", + "Details", + "Description", + "ReleaseNumber", + "ReleaseDate", + "LastUpdated", + "UpdateComments" ) - + expect_data_table( sources, all.missing = FALSE, @@ -20,39 +30,81 @@ test_that("getUnichemSources returns a data.table with the correct columns", { info = "The data.table should have the correct columns. The min number of rows and columns may change over time and is set on from UniChem as of March 2024.", - ) + ) }) test_that("queryUnichemCompound returns the expected results", { # Test case 1 - result1 <- queryUnichemCompound(type = "sourceID", compound = "444795", sourceID = 22) + result1 <- queryUnichemCompound( + type = "sourceID", + compound = "444795", + sourceID = 22 + ) expect_true(is.list(result1)) expect_true("External_Mappings" %in% names(result1)) expect_true("UniChem_Mappings" %in% names(result1)) - + # Test case 2 - expect_error(queryUnichemCompound(type = "inchikey", compound = "InchiKey123")) + expect_error(queryUnichemCompound( + type = "inchikey", + compound = "InchiKey123" + )) +}) + +test_that("queryUnichemCompound handles vector inputs", { + fallback <- c( + "444795", + "444796" + ) + + results <- queryUnichemCompound( + type = "sourceID", + compound = fallback, + sourceID = 22, + progress = FALSE + ) + expect_length(results, length(fallback)) + expect_named(results, fallback) + expect_true(all(vapply(results, is.list, logical(1)))) }) +test_that("queryUnichemCompound handles non source id lists", { + compounds <- c("538323", "538324") + results <- queryUnichemCompound( + compound = compounds, + type = "uci", + progress = FALSE + ) + + expect_named(results, compounds) +}) test_that("queryUnichemCompound returns the expected results 2", { # Test case 1 - result1 <- queryUnichemCompound(type = "inchikey", compound = "BSYNRYMUTXBXSQ-UHFFFAOYSA-N", raw = T) + result1 <- queryUnichemCompound( + type = "inchikey", + compound = "BSYNRYMUTXBXSQ-UHFFFAOYSA-N", + raw = T + ) expect_true(is.list(result1)) - checkmate::expect_names( - names(result1), - subset.of=c("compounds", "notFound", "response", "totalCompounds")) + names(result1), + subset.of = c("compounds", "notFound", "response", "totalCompounds") + ) checkmate::expect_names( names(result1$compounds), - subset.of=c("inchi", "sources", "standardInchiKey", "uci") + subset.of = c("inchi", "sources", "standardInchiKey", "uci") ) - result2 <- queryUnichemCompound(type = "inchikey", compound = "BSYNRYMUTXBXSQ-UHFFFAOYSA-N", raw = F) + result2 <- queryUnichemCompound( + type = "inchikey", + compound = "BSYNRYMUTXBXSQ-UHFFFAOYSA-N", + raw = F + ) expect_true(is.list(result2)) @@ -64,10 +116,12 @@ test_that("queryUnichemCompound returns the expected results 2", { checkmate::expect_names( names(result2$UniChem_Mappings), subset.of = c( - "UniChem.UCI", "UniChem.InchiKey", 'UniChem.Inchi', - 'UniChem.formula','UniChem.connections','UniChem.hAtoms' + "UniChem.UCI", + "UniChem.InchiKey", + 'UniChem.Inchi', + 'UniChem.formula', + 'UniChem.connections', + 'UniChem.hAtoms' ) ) - - }) From 3c028edbadb71985096722f712699513494f8b25 Mon Sep 17 00:00:00 2001 From: Michael Tran Date: Tue, 21 Oct 2025 10:48:59 -0400 Subject: [PATCH 3/7] docs: roxygenize --- man/queryUnichemCompound.Rd | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/man/queryUnichemCompound.Rd b/man/queryUnichemCompound.Rd index 150e170..9b71347 100644 --- a/man/queryUnichemCompound.Rd +++ b/man/queryUnichemCompound.Rd @@ -10,24 +10,36 @@ queryUnichemCompound( sourceID = NA_integer_, request_only = FALSE, raw = FALSE, + progress = "Querying UniChem...", ... ) } \arguments{ -\item{compound}{\code{character} or \code{integer} The compound identifier to search for.} +\item{compound}{\code{character}, \code{integer}, or a list of such values. When a vector +or list is supplied, each element is queried and the results are returned as +a named list.} \item{type}{\code{character} The type of compound identifier to search for. Valid types are "uci", "inchi", "inchikey", and "sourceID".} -\item{sourceID}{\code{integer} The source ID to search for if the type is "sourceID". Defaults to NULL.} +\item{sourceID}{\code{integer} The source ID to search for if the type is "sourceID". +When querying multiple compounds, this can be a vector the same length as +\code{compound} or a single value recycled to all queries. Defaults to \code{NA}.} \item{request_only}{\code{boolean} Whether to return the request only. Defaults to FALSE.} \item{raw}{\code{boolean} Whether to return the raw response. Defaults to FALSE.} +\item{progress}{\code{logical} or \code{character}. Passed through to +\code{.perform_request_parallel()} when multiple compounds are supplied. Use a +character string to customise the progress label. Defaults to +\code{"Querying UniChem..."}.} + \item{...}{Additional arguments.} } \value{ -A list with the external mappings and the UniChem mappings. +For a single query, a list with the external mappings and the UniChem +mappings. For multiple queries, a named list of such results (one per +compound). If \code{raw = TRUE}, raw responses are returned instead. } \description{ This function queries the UniChem API for a compound based on the provided parameters. From cd0818f8e80cca48dc1a87e79612867f28448e06 Mon Sep 17 00:00:00 2001 From: Michael Tran Date: Tue, 21 Oct 2025 11:54:38 -0400 Subject: [PATCH 4/7] refactor: retain names for parallel unichem queries --- R/unichem.R | 407 ++++++++++++++++++++++++++-------------------------- 1 file changed, 205 insertions(+), 202 deletions(-) diff --git a/R/unichem.R b/R/unichem.R index 2b84f7a..ef1e4b1 100644 --- a/R/unichem.R +++ b/R/unichem.R @@ -26,81 +26,81 @@ #' #' @export getUnichemSources <- function(all_columns = FALSE) { - funContext <- .funContext("AnnotationGx::getUnichemSources") - - response <- .build_unichem_query("sources") |> - .build_request() |> - .perform_request() |> - .parse_resp_json() - - if (response$response != "Success") { - .err(funContext, "Unichem API request failed.") - } - - .debug( - funContext, - sprintf("Unichem sourceCount: %s", response$totalSources) - ) - - sources_dt <- .asDT(response$sources) - - old_names <- c( - "UCICount", - "baseIdUrl", - "description", - "lastUpdated", - "name", - "nameLabel", - "nameLong", - "sourceID", - "srcDetails", - "srcReleaseDate", - "srcReleaseNumber", - "srcUrl", - "updateComments" - ) - - new_names <- c( - "CompoundCount", - "BaseURL", - "Description", - "LastUpdated", - "Name", - "NameLabel", - "NameLong", - "SourceID", - "Details", - "ReleaseDate", - "ReleaseNumber", - "URL", - "UpdateComments" - ) - - data.table::setnames(sources_dt, old_names, new_names) - - new_order <- c( - "Name", - "NameLabel", - "NameLong", - "SourceID", - "CompoundCount", - "BaseURL", - "URL", - "Details", - "Description", - "ReleaseNumber", - "ReleaseDate", - "LastUpdated", - "UpdateComments" - ) - - sources_dt <- sources_dt[, new_order, with = FALSE] - - if (all_columns) { - return(sources_dt) - } - - sources_dt[, c("Name", "SourceID")] + funContext <- .funContext("AnnotationGx::getUnichemSources") + + response <- .build_unichem_query("sources") |> + .build_request() |> + .perform_request() |> + .parse_resp_json() + + if (response$response != "Success") { + .err(funContext, "Unichem API request failed.") + } + + .debug( + funContext, + sprintf("Unichem sourceCount: %s", response$totalSources) + ) + + sources_dt <- .asDT(response$sources) + + old_names <- c( + "UCICount", + "baseIdUrl", + "description", + "lastUpdated", + "name", + "nameLabel", + "nameLong", + "sourceID", + "srcDetails", + "srcReleaseDate", + "srcReleaseNumber", + "srcUrl", + "updateComments" + ) + + new_names <- c( + "CompoundCount", + "BaseURL", + "Description", + "LastUpdated", + "Name", + "NameLabel", + "NameLong", + "SourceID", + "Details", + "ReleaseDate", + "ReleaseNumber", + "URL", + "UpdateComments" + ) + + data.table::setnames(sources_dt, old_names, new_names) + + new_order <- c( + "Name", + "NameLabel", + "NameLong", + "SourceID", + "CompoundCount", + "BaseURL", + "URL", + "Details", + "Description", + "ReleaseNumber", + "ReleaseDate", + "LastUpdated", + "UpdateComments" + ) + + sources_dt <- sources_dt[, new_order, with = FALSE] + + if (all_columns) { + return(sources_dt) + } + + sources_dt[, c("Name", "SourceID")] } #' Query UniChem for a compound. @@ -131,150 +131,153 @@ getUnichemSources <- function(all_columns = FALSE) { #' #' @export queryUnichemCompound <- function( - compound, - type, - sourceID = NA_integer_, - request_only = FALSE, - raw = FALSE, - progress = "Querying UniChem...", - ... + compound, + type, + sourceID = NA_integer_, + request_only = FALSE, + raw = FALSE, + progress = "Querying UniChem...", + ... ) { - checkmate::assert_string(type) - checkmate::assert_flag(request_only) - checkmate::assert_flag(raw) - checkmate::assert( - checkmate::check_flag(progress), - checkmate::check_string(progress, min.chars = 1) - ) - - compounds <- if (is.list(compound)) { - unlist(compound, recursive = TRUE, use.names = FALSE) - } else { - compound - } - checkmate::assert_atomic_vector(compounds, min.len = 1) - - many_queries <- length(compounds) > 1 - - validate_source_ids <- function(src_ids) { - if (type != "sourceID") { - return(rep(NA_integer_, length(compounds))) - } - - checkmate::assert_integerish(src_ids, any.missing = FALSE) - total_sources <- max(getUnichemSources()$SourceID) - - if (length(src_ids) == 1L) { - src_ids <- rep(src_ids, length(compounds)) - } else if (length(src_ids) != length(compounds)) { - stop( - "`sourceID` must be length 1 or match the number of compounds ", - "when type = 'sourceID'" - ) - } - checkmate::assert_integerish(src_ids, lower = 1, upper = total_sources) - src_ids + checkmate::assert_string(type) + checkmate::assert_flag(request_only) + checkmate::assert_flag(raw) + checkmate::assert( + checkmate::check_flag(progress), + checkmate::check_string(progress, min.chars = 1) + ) + + compounds <- if (is.list(compound)) { + unlist(compound, recursive = TRUE, use.names = TRUE) + } else { + compound + } + checkmate::assert_atomic_vector(compounds, min.len = 1) + + many_queries <- length(compounds) > 1 + + validate_source_ids <- function(src_ids) { + if (type != "sourceID") { + return(rep(NA_integer_, length(compounds))) } - source_ids <- validate_source_ids(sourceID) + checkmate::assert_integerish(src_ids, any.missing = FALSE) + total_sources <- max(getUnichemSources()$SourceID) - build_request <- function(cmp, src) { - .build_unichem_compound_req( - type = type, - compound = cmp, - sourceID = if (is.na(src)) NULL else src, - ... - ) + if (length(src_ids) == 1L) { + src_ids <- rep(src_ids, length(compounds)) + } else if (length(src_ids) != length(compounds)) { + stop( + "`sourceID` must be length 1 or match the number of compounds ", + "when type = 'sourceID'" + ) } - - parse_response <- function(parsed, cmp_label) { - if (parsed$response != "Success") { - msg <- paste( - "Unichem API request failed for compound", - cmp_label, - "with type", - type, - ". Error:", - parsed$error - ) - .err(.funContext("AnnotationGx::queryUnichemCompound"), msg) - } - - mapped_sources_dt <- .asDT(parsed$compounds$sources) - old_names <- c("compoundId", "shortName", "longName", "id", "url") - new_names <- c( - "compoundID", - "Name", - "NameLong", - "sourceID", - "sourceURL" - ) - data.table::setnames( - mapped_sources_dt, - old = old_names, - new = new_names - ) - - External_Mappings <- mapped_sources_dt[, new_names, with = FALSE] - - UniChem_Mappings <- list( - UniChem.UCI = parsed$compounds$uci, - UniChem.InchiKey = parsed$compounds$standardInchiKey, - UniChem.Inchi = parsed$compounds$inchi$inchi, - UniChem.formula = parsed$compounds$inchi$formula, - UniChem.connections = parsed$compounds$inchi$connections, - UniChem.hAtoms = parsed$compounds$inchi$hAtoms - ) - - list( - External_Mappings = External_Mappings, - UniChem_Mappings = UniChem_Mappings - ) + checkmate::assert_integerish(src_ids, lower = 1, upper = total_sources) + src_ids + } + + source_ids <- validate_source_ids(sourceID) + + build_request <- function(cmp, src) { + .build_unichem_compound_req( + type = type, + compound = cmp, + sourceID = if (is.na(src)) NULL else src, + ... + ) + } + + parse_response <- function(parsed, cmp_label) { + if (parsed$response != "Success") { + msg <- paste( + "Unichem API request failed for compound", + cmp_label, + "with type", + type, + ". Error:", + parsed$error + ) + .err(.funContext("AnnotationGx::queryUnichemCompound"), msg) } - if (many_queries) { - requests <- Map(build_request, compounds, source_ids) - names(requests) <- if (!is.null(names(compound))) { - names(compound) - } else { - as.character(compounds) - } - - if (request_only) { - return(requests) - } - - responses <- .perform_request_parallel(requests, progress = progress) - names(responses) <- names(requests) - - parsed_responses <- Map(.parse_resp_json, responses) + mapped_sources_dt <- .asDT(parsed$compounds$sources) + old_names <- c("compoundId", "shortName", "longName", "id", "url") + new_names <- c( + "compoundID", + "Name", + "NameLong", + "sourceID", + "sourceURL" + ) + data.table::setnames( + mapped_sources_dt, + old = old_names, + new = new_names + ) - if (raw) { - names(parsed_responses) <- names(responses) - return(parsed_responses) - } + External_Mappings <- mapped_sources_dt[, new_names, with = FALSE] - results <- Map( - parse_response, - parsed_responses, - names(responses) - ) + UniChem_Mappings <- list( + UniChem.UCI = parsed$compounds$uci, + UniChem.InchiKey = parsed$compounds$standardInchiKey, + UniChem.Inchi = parsed$compounds$inchi$inchi, + UniChem.formula = parsed$compounds$inchi$formula, + UniChem.connections = parsed$compounds$inchi$connections, + UniChem.hAtoms = parsed$compounds$inchi$hAtoms + ) - names(results) <- names(responses) - return(results) + list( + External_Mappings = External_Mappings, + UniChem_Mappings = UniChem_Mappings + ) + } + + if (many_queries) { + requests <- Map(build_request, compounds, source_ids) + name_candidates <- names(compounds) + if ( + !is.null(name_candidates) && length(name_candidates) == length(compounds) + ) { + names(requests) <- name_candidates + } else { + names(requests) <- as.character(compounds) } - request <- build_request(compounds[[1L]], source_ids[[1L]]) if (request_only) { - return(request) + return(requests) } - response <- request |> .perform_request() - parsed <- .parse_resp_json(response) + responses <- .perform_request_parallel(requests, progress = progress) + names(responses) <- names(requests) + + parsed_responses <- Map(.parse_resp_json, responses) if (raw) { - return(parsed) + names(parsed_responses) <- names(responses) + return(parsed_responses) } - parse_response(parsed, compounds[[1L]]) + results <- Map( + parse_response, + parsed_responses, + names(responses) + ) + + names(results) <- names(responses) + return(results) + } + + request <- build_request(compounds[[1L]], source_ids[[1L]]) + if (request_only) { + return(request) + } + + response <- request |> .perform_request() + parsed <- .parse_resp_json(response) + + if (raw) { + return(parsed) + } + + parse_response(parsed, compounds[[1L]]) } From 1345c035e4d7a060fcc999da99a3ae439c7ac8a8 Mon Sep 17 00:00:00 2001 From: Michael Tran Date: Tue, 21 Oct 2025 11:56:00 -0400 Subject: [PATCH 5/7] fix: validate unichem source ids against catalog --- R/unichem.R | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/R/unichem.R b/R/unichem.R index ef1e4b1..d327543 100644 --- a/R/unichem.R +++ b/R/unichem.R @@ -162,7 +162,6 @@ queryUnichemCompound <- function( } checkmate::assert_integerish(src_ids, any.missing = FALSE) - total_sources <- max(getUnichemSources()$SourceID) if (length(src_ids) == 1L) { src_ids <- rep(src_ids, length(compounds)) @@ -172,7 +171,18 @@ queryUnichemCompound <- function( "when type = 'sourceID'" ) } - checkmate::assert_integerish(src_ids, lower = 1, upper = total_sources) + checkmate::assert_integerish(src_ids, any.missing = FALSE) + + valid_ids <- getUnichemSources()$SourceID + invalid_ids <- setdiff(unique(src_ids), valid_ids) + if (length(invalid_ids) > 0L) { + stop( + "`sourceID` contains value(s) not available in UniChem: ", + paste(invalid_ids, collapse = ", "), + "\nValid source IDs: ", + paste(valid_ids, collapse = ", ") + ) + } src_ids } From 0869a84b05c5509ecdb58a2c320adba78cf7a722 Mon Sep 17 00:00:00 2001 From: Michael Tran Date: Tue, 21 Oct 2025 11:58:04 -0400 Subject: [PATCH 6/7] test: assert exact unichem source columns --- tests/testthat/test_unichem.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test_unichem.R b/tests/testthat/test_unichem.R index f036f85..eb3c2fb 100644 --- a/tests/testthat/test_unichem.R +++ b/tests/testthat/test_unichem.R @@ -29,8 +29,10 @@ test_that("getUnichemSources returns a data.table with the correct columns", { col.names = 'named', info = "The data.table should have the correct columns. The min number of rows and columns may change over time and is set on - from UniChem as of March 2024.", + from UniChem as of March 2024." ) + + expect_setequal(names(sources), expected_columns) }) From aaa1c43b4270eb38218abe8cfd0cbfc6e9bf1638 Mon Sep 17 00:00:00 2001 From: Michael Tran Date: Tue, 21 Oct 2025 15:44:53 -0400 Subject: [PATCH 7/7] fix: preserve UniChem results when a query fails --- R/unichem.R | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/R/unichem.R b/R/unichem.R index d327543..0f21b66 100644 --- a/R/unichem.R +++ b/R/unichem.R @@ -268,7 +268,17 @@ queryUnichemCompound <- function( } results <- Map( - parse_response, + function(parsed, cmp_label) { + tryCatch( + parse_response(parsed, cmp_label), + error = function(e) { + structure( + list(error = conditionMessage(e)), + class = c("unichem_error", "list") + ) + } + ) + }, parsed_responses, names(responses) )