From f0662c3231acef20ab4c09f0ae57c56054bf8f16 Mon Sep 17 00:00:00 2001 From: dsammour Date: Tue, 26 Feb 2019 15:18:41 +0100 Subject: [PATCH 1/6] On-Disk manipulation of imzML datasets * importImzML function now supports MassSpectrumOnDisk-class of MALDIquant and attaches to the binary ibd files without loading it. * Documentation edited accordingly. --- .Rbuildignore | 2 ++ .gitignore | 1 + DESCRIPTION | 5 ++-- NAMESPACE | 2 ++ R/import-functions.R | 19 +++++++++++---- R/importImzMl-functions.R | 46 +++++++++++++++++++++++++++++------- man/importImzMl-functions.Rd | 19 +++++++++++---- 7 files changed, 75 insertions(+), 19 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 5f87fbb..5466049 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -3,3 +3,5 @@ ^\.git$ ^\.gitignore$ ^\.travis.yml$ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/.gitignore b/.gitignore index 1377554..076c102 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ *.swp +.Rproj.user diff --git a/DESCRIPTION b/DESCRIPTION index 0499f22..210a441 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: MALDIquantForeign -Version: 0.12 +Version: 0.12.50 Date: 2019-01-30 Title: Import/Export Routines for 'MALDIquant' Authors@R: c(person("Sebastian", "Gibb", role=c("aut", "cre"), @@ -7,9 +7,10 @@ Authors@R: c(person("Sebastian", "Gibb", role=c("aut", "cre"), comment=c(ORCID="0000-0001-7406-4443")), person("Pietro", "Franceschi", role=c("ctb"), email="pietro.franceschi@fmach.it")) +biocViews: Depends: R (>= 3.2.2), methods, MALDIquant (>= 1.16.4) Imports: base64enc, digest, readBrukerFlexData (>= 1.7), readMzXmlData - (>= 2.7), XML + (>= 2.7), XML, matter (>= 1.8.0) Suggests: knitr, testthat (>= 0.8), RNetCDF (>= 1.6.1) Description: Functions for reading (tab, csv, Bruker fid, Ciphergen XML, mzXML, mzML, imzML, Analyze 7.5, CDF, mMass MSD) and diff --git a/NAMESPACE b/NAMESPACE index 8804c4c..97c2d4c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -41,3 +41,5 @@ 
importFrom(utils,type.convert) importFrom(utils,untar) importFrom(utils,unzip) importFrom(utils,write.table) +importFrom(matter,matter_vec) +importFrom(matter,matter_fc) diff --git a/R/import-functions.R b/R/import-functions.R index e530f6b..84e7de2 100644 --- a/R/import-functions.R +++ b/R/import-functions.R @@ -354,7 +354,8 @@ importMzMl <- function(path, ...) { #' Import imzML files #' #' This function imports files in imzML file format -#' into \code{\link[MALDIquant]{MassSpectrum-class}} or +#' into \code{\link[MALDIquant]{MassSpectrum-class}}, +#' \code{\link[MALDIquant]{MassSpectrumOnDisk-class}} or #' \code{\link[MALDIquant]{MassPeaks-class}} objects. #' #' @param path \code{character}, path to directory or file which should be read @@ -362,6 +363,11 @@ importMzMl <- function(path, ...) { #' @param coordinates \code{matrix}, 2 column matrix that contains the x- and #' y-coordinates for spectra that should be imported. Other spectra would be #' ignored. +#' @param attachOnly logical (defaults to \code{FALSE}), whether to attach the dataset via the \code{matter} +#' package without loading it into memory. See \code{\link[MALDIquant]{MassSpectrumOnDisk-class}}. +#' @param duplicateFile logical, when \code{TRUE} (default), creates a temporary copy of the binary \code{ibd} +#' file in the \code{tempdir} and attaches the \code{\link[MALDIquant]{MassSpectrumOnDisk}} objects to it so +#' as not to affect the original \code{ibd} file. #' @param \ldots arguments to be passed to #' \code{\link[MALDIquantForeign]{import}}. #' @@ -372,9 +378,11 @@ importMzMl <- function(path, ...) 
{ #' \code{\link[MALDIquant]{MassSpectrum-class}}, #' \code{\link[MALDIquant]{MassPeaks-class}} #' @author Sebastian Gibb -#' @references \url{http://strimmerlab.org/software/maldiquant/}, \cr +#' @references \url{http://strimmerlab.org/software/maldiquant/}, \cr\cr #' Definition of \code{imzML} format: -#' \url{http://www.imzml.org/} +#' \url{http://www.imzml.org/}\cr\cr +#' \code{"matter"}: Kylie A. Bemis (2018). matter: A framework for rapid prototyping with binary data on disk. R +#' package version 1.8.0. \url{https://github.com/kuwisdelu/matter}. #' @examples #' #' library("MALDIquant") @@ -391,9 +399,10 @@ importMzMl <- function(path, ...) { #' coordinates = cbind(1:2, c(1, 1))) #' #' @rdname importImzMl-functions +#' #' @export -importImzMl <- function(path, coordinates=NULL, ...) { - import(path=path, type="imzml", coordinates=coordinates, ...) +importImzMl <- function(path, coordinates=NULL, attachOnly=FALSE, duplicateFile=TRUE, ...) { + import(path=path, type="imzml", coordinates=coordinates, attachOnly=attachOnly, duplicateFile=duplicateFile, ...) } #' Import Ciphergen XML files diff --git a/R/importImzMl-functions.R b/R/importImzMl-functions.R index e3e02a6..51c9edd 100644 --- a/R/importImzMl-functions.R +++ b/R/importImzMl-functions.R @@ -17,8 +17,8 @@ ## along with MALDIquantForeign. 
If not, see .importImzMl <- function(file, centroided=FALSE, massRange=c(0, Inf), - minIntensity=0, coordinates=NULL, - verbose=FALSE) { + minIntensity=0, coordinates=NULL, attachOnly=FALSE, + duplicateFile=TRUE, verbose=FALSE) { .msg(verbose, "Reading spectrum from ", sQuote(file), " ...") @@ -31,6 +31,15 @@ if (!file.exists(ibdFilename)) { stop("File ", sQuote(ibdFilename), " doesn't exists!") } + + if (attachOnly) { # attach rather than load + if (duplicateFile) { # duplicate the ibd file to the temp dir in order to keep the original ibd intact + tf <- paste0(tempfile(), "_", basename(ibdFilename)) + if (!file.copy(from=ibdFilename, to=tf)) stop("Could not copy ", sQuote(ibdFilename), " to ", sQuote(tf), "!") + ibdFilename <- tf + + } + } s <- .parseMzMl(file=file, verbose=verbose) @@ -89,7 +98,14 @@ } n <- x[column, "length"] e <- x[column, "encodedLength"] - readBin(file, double(), n=n, size=e/n, signed=TRUE, endian="little") + + if(attachOnly){ + matter::matter_vec(datamode="double", paths=unname(summary(ibd)[[1]]), filemode="rb+", + offset=x[column, "offset"], extent=n) + }else{ + readBin(file, double(), n=n, size=e/n, signed=TRUE, endian="little") + } + } n <- length(sel) @@ -97,6 +113,13 @@ isProcessed <- s$ims$type == "processed" isSeekNeeded <- length(s$ims$ibd) > length(sel) + + if(isProcessed && attachOnly){ + message("The imzML file is of type 'processed'. The 'attachOnly' option is only available ", + "for 'continuous' type and therefore will be overridden. 
In-memory MassPeaks objects will be created.") + attachOnly <- FALSE + } + if (!isProcessed) { mass <- .readValues(ibd, s$ims$ibd[[sel[1L]]], "mass", isSeekNeeded) @@ -113,11 +136,18 @@ mass <- .readValues(ibd, s$ims$ibd[[sel[i]]], "mass", isSeekNeeded) } intensity <- .readValues(ibd, s$ims$ibd[[sel[i]]], "intensity", isSeekNeeded) - spectra[[i]] <- .createMassObject(mass=mass, intensity=intensity, - metaData=m, centroided=centroided, - massRange=massRange, - minIntensity=minIntensity, - verbose=verbose) + + if(attachOnly){ + spectra[[i]] <- new("MassSpectrumOnDisk", mass=mass, intensity=intensity, + metaData=m, path=unname(summary(ibd)[[1]])) + }else{ + spectra[[i]] <- .createMassObject(mass=mass, intensity=intensity, + metaData=m, centroided=centroided, + massRange=massRange, + minIntensity=minIntensity, + verbose=verbose) + } + } spectra } diff --git a/man/importImzMl-functions.Rd b/man/importImzMl-functions.Rd index 60dad34..45a6d23 100644 --- a/man/importImzMl-functions.Rd +++ b/man/importImzMl-functions.Rd @@ -4,7 +4,8 @@ \alias{importImzMl} \title{Import imzML files} \usage{ -importImzMl(path, coordinates = NULL, ...) +importImzMl(path, coordinates = NULL, attachOnly = FALSE, + duplicateFile = TRUE, ...) } \arguments{ \item{path}{\code{character}, path to directory or file which should be read @@ -14,6 +15,13 @@ in.} y-coordinates for spectra that should be imported. Other spectra would be ignored.} +\item{attachOnly}{logical (defaults to \code{FALSE}), whether to attach the dataset via the \code{matter} +package without loading it into memory. 
See \code{\link[MALDIquant]{MassSpectrumOnDisk-class}}.} + +\item{duplicateFile}{logical, when \code{TRUE} (default), creates a temporary copy of the binary \code{ibd} +file in the \code{tempdir} and attaches the \code{\link[MALDIquant]{MassSpectrumOnDisk}} objects to it so +as not to affect the original \code{ibd} file.} + \item{\ldots}{arguments to be passed to \code{\link[MALDIquantForeign]{import}}.} } @@ -24,7 +32,8 @@ a \code{list} of \code{\link[MALDIquant]{MassSpectrum-class}} or } \description{ This function imports files in imzML file format -into \code{\link[MALDIquant]{MassSpectrum-class}} or +into \code{\link[MALDIquant]{MassSpectrum-class}}, +\code{\link[MALDIquant]{MassSpectrumOnDisk-class}} or \code{\link[MALDIquant]{MassPeaks-class}} objects. } \examples{ @@ -44,9 +53,11 @@ s <- importImzMl(file.path(exampleDirectory, "tiny_continuous.imzML"), } \references{ -\url{http://strimmerlab.org/software/maldiquant/}, \cr +\url{http://strimmerlab.org/software/maldiquant/}, \cr\cr Definition of \code{imzML} format: -\url{http://www.imzml.org/} +\url{http://www.imzml.org/}\cr\cr +\code{"matter"}: Kylie A. Bemis (2018). matter: A framework for rapid prototyping with binary data on disk. R +package version 1.8.0. \url{https://github.com/kuwisdelu/matter}. 
} \seealso{ \code{\link[MALDIquant]{MassSpectrum-class}}, From 3f1fc909abe62b498b33a23423cd8491154d1412 Mon Sep 17 00:00:00 2001 From: dsammour Date: Mon, 15 Apr 2019 15:18:08 +0200 Subject: [PATCH 2/6] Removal of matter and added support for OnDiskVector --- DESCRIPTION | 2 +- MALDIquantForeign.Rproj | 17 +++++++++++++++++ NAMESPACE | 2 -- R/import-functions.R | 4 ++-- R/importImzMl-functions.R | 7 +++++-- 5 files changed, 25 insertions(+), 7 deletions(-) create mode 100644 MALDIquantForeign.Rproj diff --git a/DESCRIPTION b/DESCRIPTION index 210a441..7637ee5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: MALDIquantForeign -Version: 0.12.50 +Version: 0.12.75 Date: 2019-01-30 Title: Import/Export Routines for 'MALDIquant' Authors@R: c(person("Sebastian", "Gibb", role=c("aut", "cre"), diff --git a/MALDIquantForeign.Rproj b/MALDIquantForeign.Rproj new file mode 100644 index 0000000..7e331bc --- /dev/null +++ b/MALDIquantForeign.Rproj @@ -0,0 +1,17 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 7 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source diff --git a/NAMESPACE b/NAMESPACE index 97c2d4c..8804c4c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -41,5 +41,3 @@ importFrom(utils,type.convert) importFrom(utils,untar) importFrom(utils,unzip) importFrom(utils,write.table) -importFrom(matter,matter_vec) -importFrom(matter,matter_fc) diff --git a/R/import-functions.R b/R/import-functions.R index 84e7de2..2930211 100644 --- a/R/import-functions.R +++ b/R/import-functions.R @@ -363,8 +363,8 @@ importMzMl <- function(path, ...) { #' @param coordinates \code{matrix}, 2 column matrix that contains the x- and #' y-coordinates for spectra that should be imported. Other spectra would be #' ignored. 
-#' @param attachOnly logical (defaults to \code{FALSE}), whether to attach the dataset via the \code{matter} -#' package without loading it into memory. See \code{\link[MALDIquant]{MassSpectrumOnDisk-class}}. +#' @param attachOnly logical (defaults to \code{FALSE}), whether to attach the dataset via the +#' \code{OnDiskVector} class without loading it into memory. See \code{\link[MALDIquant]{MassSpectrumOnDisk-class}}. #' @param duplicateFile logical, when \code{TRUE} (default), creates a temporary copy of the binary \code{ibd} #' file in the \code{tempdir} and attaches the \code{\link[MALDIquant]{MassSpectrumOnDisk}} objects to it so #' as not to affect the original \code{ibd} file. diff --git a/R/importImzMl-functions.R b/R/importImzMl-functions.R index 51c9edd..d40fba8 100644 --- a/R/importImzMl-functions.R +++ b/R/importImzMl-functions.R @@ -101,8 +101,11 @@ if(attachOnly){ matter::matter_vec(datamode="double", paths=unname(summary(ibd)[[1]]), filemode="rb+", - offset=x[column, "offset"], extent=n) + offset=x[column, "offset"], extent=n) + OnDiskVector(path=unname(summary(ibd)[[1]]), n=n, offset=x[column, "offset"], size=8L) + }else{ + readBin(file, double(), n=n, size=e/n, signed=TRUE, endian="little") } @@ -139,7 +142,7 @@ if(attachOnly){ spectra[[i]] <- new("MassSpectrumOnDisk", mass=mass, intensity=intensity, - metaData=m, path=unname(summary(ibd)[[1]])) + metaData=m) }else{ spectra[[i]] <- .createMassObject(mass=mass, intensity=intensity, metaData=m, centroided=centroided, From 1d9cb42808768d51482908dd327781a053949c52 Mon Sep 17 00:00:00 2001 From: dsammour Date: Mon, 15 Apr 2019 16:49:38 +0200 Subject: [PATCH 3/6] big fix --- R/importImzMl-functions.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/importImzMl-functions.R b/R/importImzMl-functions.R index d40fba8..5def951 100644 --- a/R/importImzMl-functions.R +++ b/R/importImzMl-functions.R @@ -100,8 +100,7 @@ e <- x[column, "encodedLength"] if(attachOnly){ - 
matter::matter_vec(datamode="double", paths=unname(summary(ibd)[[1]]), filemode="rb+", - offset=x[column, "offset"], extent=n) + OnDiskVector(path=unname(summary(ibd)[[1]]), n=n, offset=x[column, "offset"], size=8L) }else{ From 67162c2c3a189a27ce0ebc8264f607245afe05ab Mon Sep 17 00:00:00 2001 From: dsammour Date: Tue, 16 Apr 2019 17:37:37 +0200 Subject: [PATCH 4/6] warning MassSpectrumOnDisk --- R/importImzMl-functions.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/R/importImzMl-functions.R b/R/importImzMl-functions.R index 5def951..a03f3f3 100644 --- a/R/importImzMl-functions.R +++ b/R/importImzMl-functions.R @@ -151,5 +151,18 @@ } } + + .msg(verbose, "Done. ") + + if(attachOnly) + { + if(duplicateFile) + warning("imzML dataset was loaded via attacheOnly option and a duplicate file was generate. ", + "Any changes made to the spectra are directly written to the duplicate file.\n ") + else + warning("imzML dataset was loaded via attacheOnly option to the ORIGINAL FILE. 
", + "Any changes made to the spectra are directly written to the imzML file.\n ") + } + spectra } From e3ce24060db752ca2ec96686fc8c7f889025f971 Mon Sep 17 00:00:00 2001 From: dsammour Date: Wed, 24 Apr 2019 13:18:28 +0200 Subject: [PATCH 5/6] added parallel support for importImzML --- DESCRIPTION | 4 +-- R/import-functions.R | 8 ++++-- R/importImzMl-functions.R | 59 ++++++++++++++++++++++----------------- 3 files changed, 42 insertions(+), 29 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7637ee5..a9dd182 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,9 +8,9 @@ Authors@R: c(person("Sebastian", "Gibb", role=c("aut", "cre"), "Franceschi", role=c("ctb"), email="pietro.franceschi@fmach.it")) biocViews: -Depends: R (>= 3.2.2), methods, MALDIquant (>= 1.16.4) +Depends: R (>= 3.2.2), methods, MALDIquant (>= 1.19.15) Imports: base64enc, digest, readBrukerFlexData (>= 1.7), readMzXmlData - (>= 2.7), XML, matter (>= 1.8.0) + (>= 2.7), XML, parallel Suggests: knitr, testthat (>= 0.8), RNetCDF (>= 1.6.1) Description: Functions for reading (tab, csv, Bruker fid, Ciphergen XML, mzXML, mzML, imzML, Analyze 7.5, CDF, mMass MSD) and diff --git a/R/import-functions.R b/R/import-functions.R index 2930211..b064e5b 100644 --- a/R/import-functions.R +++ b/R/import-functions.R @@ -368,6 +368,8 @@ importMzMl <- function(path, ...) { #' @param duplicateFile logical, when \code{TRUE} (default), creates a temporary copy of the binary \code{ibd} #' file in the \code{tempdir} and attaches the \code{\link[MALDIquant]{MassSpectrumOnDisk}} objects to it so #' as not to affect the original \code{ibd} file. +#' @param mc.cores integer, specifying number of cores for parallel evaluation through \code{parallel::mclapply}. +#' Falls back to \code{mc.cores = 1} is Windows. #' @param \ldots arguments to be passed to #' \code{\link[MALDIquantForeign]{import}}. #' @@ -401,8 +403,10 @@ importMzMl <- function(path, ...) 
{ #' @rdname importImzMl-functions #' #' @export -importImzMl <- function(path, coordinates=NULL, attachOnly=FALSE, duplicateFile=TRUE, ...) { - import(path=path, type="imzml", coordinates=coordinates, attachOnly=attachOnly, duplicateFile=duplicateFile, ...) +importImzMl <- function(path, coordinates=NULL, attachOnly=FALSE, duplicateFile=TRUE, + mc.cores = 1L, ...) { + import(path=path, type="imzml", coordinates=coordinates, attachOnly=attachOnly, + duplicateFile=duplicateFile, mc.cores = mc.cores, ...) } #' Import Ciphergen XML files diff --git a/R/importImzMl-functions.R b/R/importImzMl-functions.R index a03f3f3..3975e5d 100644 --- a/R/importImzMl-functions.R +++ b/R/importImzMl-functions.R @@ -18,7 +18,7 @@ .importImzMl <- function(file, centroided=FALSE, massRange=c(0, Inf), minIntensity=0, coordinates=NULL, attachOnly=FALSE, - duplicateFile=TRUE, verbose=FALSE) { + duplicateFile=TRUE, mc.cores = 1L, verbose=FALSE) { .msg(verbose, "Reading spectrum from ", sQuote(file), " ...") @@ -127,30 +127,39 @@ mass <- .readValues(ibd, s$ims$ibd[[sel[1L]]], "mass", isSeekNeeded) } - ## read mass and intensity values - for (i in seq(along=sel)) { - .msg(verbose, "Reading binary data for spectrum ", i, "/", n, " ...") - - m <- modifyList(s$metaData, s$spectra[[sel[i]]]$metaData) - m$file <- file - - if (isProcessed) { - mass <- .readValues(ibd, s$ims$ibd[[sel[i]]], "mass", isSeekNeeded) - } - intensity <- .readValues(ibd, s$ims$ibd[[sel[i]]], "intensity", isSeekNeeded) - - if(attachOnly){ - spectra[[i]] <- new("MassSpectrumOnDisk", mass=mass, intensity=intensity, - metaData=m) - }else{ - spectra[[i]] <- .createMassObject(mass=mass, intensity=intensity, - metaData=m, centroided=centroided, - massRange=massRange, - minIntensity=minIntensity, - verbose=verbose) - } - - } + ## read mass and intensity values - in parallel only for attachOnly (forked workers would share the read position of the single ibd connection) + mc.cores <- if (.Platform$OS.type == "windows" || !attachOnly) 1L else mc.cores + + spectra <- parallel::mclapply(X = seq_along(sel), + mc.cores = mc.cores, + FUN = 
function(i) { + + .msg(verbose, "Reading binary data for spectrum ", i, "/", n, " ...") + + m <- modifyList(s$metaData, s$spectra[[sel[i]]]$metaData) + m$file <- file + + if (isProcessed) { + mass <- .readValues(ibd, s$ims$ibd[[sel[i]]], "mass", isSeekNeeded) + } + intensity <- .readValues(ibd, s$ims$ibd[[sel[i]]], "intensity", isSeekNeeded) + + if(attachOnly){ + tmpSpectrum <- new("MassSpectrumOnDisk", mass=mass, intensity=intensity, + metaData=m) + }else{ + tmpSpectrum <- .createMassObject(mass=mass, intensity=intensity, + metaData=m, centroided=centroided, + massRange=massRange, + minIntensity=minIntensity, + verbose=verbose) + } + + tmpSpectrum + }) + + + .msg(verbose, "Done. ") From 2c70268f6333e4e06ac7340c34bd7a4f141e4ff3 Mon Sep 17 00:00:00 2001 From: Denis Abu-Sammour Date: Tue, 12 Nov 2019 11:25:22 +0100 Subject: [PATCH 6/6] minor changes --- R/importImzMl-functions.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/importImzMl-functions.R b/R/importImzMl-functions.R index 3975e5d..6e2cf9d 100644 --- a/R/importImzMl-functions.R +++ b/R/importImzMl-functions.R @@ -166,10 +166,10 @@ if(attachOnly) { if(duplicateFile) - warning("imzML dataset was loaded via attacheOnly option and a duplicate file was generate. ", + message("\nNOTE: imzML dataset was loaded via attachOnly option and a duplicate file was generated. ", "Any changes made to the spectra are directly written to the duplicate file.\n ") else - warning("imzML dataset was loaded via attacheOnly option to the ORIGINAL FILE. ", + message("\nNOTE: imzML dataset was loaded via attachOnly option to the ORIGINAL FILE. ", "Any changes made to the spectra are directly written to the imzML file.\n ") }