diff --git a/DESCRIPTION b/DESCRIPTION index 7264e5c..146f511 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -34,7 +34,7 @@ Description: Macpie is an R package for the integrative analysis of transcriptom License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index b7a43c4..131faf9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -90,6 +90,7 @@ importFrom(ggplot2,theme) importFrom(ggplot2,theme_minimal) importFrom(limma,contrasts.fit) importFrom(limma,eBayes) +importFrom(limma,lmFit) importFrom(limma,makeContrasts) importFrom(limma,plotMDS) importFrom(limma,topTable) diff --git a/R/compute_multi_screen_profile.R b/R/compute_multi_screen_profile.R index 7502a22..b899ebb 100644 --- a/R/compute_multi_screen_profile.R +++ b/R/compute_multi_screen_profile.R @@ -69,7 +69,13 @@ compute_multi_screen_profile <- function(data = NULL, (direction == "up" & .data$log2FC > 0) | (direction == "down" & .data$log2FC < 0) ) %>% - arrange(if (direction == "both") desc(abs(.data$log2FC)) else desc(.data$log2FC)) %>% + arrange( + dplyr::case_when( + direction == "both" ~ -abs(.data$log2FC), + direction == "up" ~ -.data$log2FC, + direction == "down" ~ .data$log2FC + ) + ) %>% slice(seq_len(n_genes_profile)) %>% pull(.data$gene) ) %>% setNames(target) diff --git a/R/compute_normalised_counts.R b/R/compute_normalised_counts.R index acb4bc4..238f12b 100644 --- a/R/compute_normalised_counts.R +++ b/R/compute_normalised_counts.R @@ -5,18 +5,20 @@ #' @param data A tidyseurat object merged with metadata. Must contain columns #' "Well_ID", "Row", "Column" #' @param method One of "raw", "logNorm", "cpm", "clr", "SCT", "DESeq2", -#' "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom" +#' "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "zinb" #' @param batch Either empty, a single value, or a vector corresponding to the #' number of samples -#' @param k Parameter k for RUVSeq methods, check RUVSeq tutorial +#' @param k Parameter k for RUVSeq and zinb methods #' @param spikes List of genes to use as spike controls #' @param max_counts Maximum count for a gene across all samples +#' @param num_cores Number of cores #' @importFrom limma voom #' @importFrom Seurat as.SingleCellExperiment #' @import DESeq2 #' @import RUVSeq #' @importFrom Biobase pData #' @importFrom stats model.matrix residuals +#' @importFrom limma lmFit #' @importFrom parallel makeCluster stopCluster #' #' @returns Data frame of normalised counts @@ -30,7 +32,8 @@ compute_normalised_counts <- function(data = NULL, batch = NULL, k = NULL, spikes = NULL, - max_counts = NULL) { + max_counts = NULL, + num_cores = NULL) { req_pkgs <- c("SingleCellExperiment", "EDASeq", "BiocParallel", "doParallel","zinbwave") missing <- req_pkgs[!vapply(req_pkgs, requireNamespace, logical(1), quietly = TRUE)] if (length(missing)) { @@ -41,7 +44,7 @@ compute_normalised_counts <- function(data = NULL, ) } # Helper function to validate input data - validate_inputs <- function(data, method, batch, k, max_counts) { + validate_inputs <- function(data, method, batch, k, max_counts, num_cores) { if (!inherits(data, "Seurat")) { stop("argument 'data' must be a Seurat or TidySeurat object.") } @@ -50,13 +53,14 @@ compute_normalised_counts <- function(data = NULL, "cpm", "clr", "SCT", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", - "limma_voom", "zinb")) { + "limma_voom", "limma_trend", "zinb")) { stop("Your normalization method is not available.") } batch <- if (is.null(batch)) "1" else as.character(batch) k <- if (is.null(k)) 2 else k max_counts <- if (is.null(max_counts)) 100 else as.numeric(max_counts) - list(data = data, batch = batch, k = k, method = method) + num_cores <- if (is.null(num_cores)) 1 else num_cores + list(data = data, batch = batch, k = k, method = method, num_cores=num_cores) } #create an unique identifier based on combined annotation @@ -124,10 +128,13 @@ compute_normalised_counts <- function(data = NULL, if (ncol(data) > 100) { message("EdgeR with over 100 samples takes a long time. Consider reducing the number of samples or genes.") } + cl <- makeCluster(num_cores) + doParallel::registerDoParallel(cl) + p <- BiocParallel::DoparParam() dge <- DGEList(counts = data@assays$RNA$counts, samples = coldata$condition, group = coldata$condition) dge <- calcNormFactors(dge, methods = "TMM") design <- model_matrix - dge <- estimateDisp(dge, design) + dge <- estimateDisp(dge, design, BPPARAM = p) cpm(dge, log = FALSE) } @@ -139,13 +146,41 @@ compute_normalised_counts <- function(data = NULL, dge$E } + normalize_limma_trend <- function(data, batch) { + dge <- DGEList(counts = data@assays$RNA$counts, samples = coldata$condition, group = coldata$condition) + dge <- calcNormFactors(dge, methods = "TMMwsp") + design <- model_matrix + logCPM <- cpm(dge, log=TRUE, prior.count=3) + fit <- lmFit(logCPM, design) + fit <- eBayes(fit, trend=TRUE) + dge <- fit + logCPM + } + normalize_ruvg <- function(data, batch, spikes, k) { counts <- data@assays$RNA$counts if (length(spikes) == 0) { - stop("List of control genes not provided for RUVg.") + warning("List of control genes not provided for RUVg, using default human housekeeping genes.") + spikes <- c( + "ACTB", + "GAPDH", + "RPLP0", + "B2M", + "HPRT1", + "PGK1", + "TBP", + "UBC", + "YWHAZ", + "PPIA", + "RPL19", + "EEF1A1", + "RPS18", + "TFRC" + ) } if (!all(spikes %in% row.names(data@assays$RNA$counts))) { - stop("Some or all of your control genes are not present in the dataset.") + warning("Some or all of your control genes are not present in the dataset.") + spikes <- intersect(spikes, rownames(counts(set))) } #k defines number of sources of variation, two have been chosen for row and column set <- EDASeq::newSeqExpressionSet(counts = as.matrix(counts), @@ -164,7 +199,17 @@ compute_normalised_counts <- function(data = NULL, phenoData = data.frame(condition = coldata$condition, row.names = colnames(counts))) differences <- model_matrix - set <- RUVs(set, cIdx = genes, k = k, scIdx = differences) + differences <- lapply(seq_len(ncol(model_matrix)), function(j) { + which(model_matrix[, j] == 1) + }) + max_len <- max(lengths(differences)) + scIdx <- matrix(NA, nrow = max_len, ncol = length(differences)) + for (i in seq_along(differences)) { + scIdx[seq_along(differences[[i]]), i] <- differences[[i]] + } + scIdx<-t(scIdx) + colnames(scIdx) <- colnames(model_matrix) + set <- RUVs(set, cIdx = genes, k = k, scIdx = scIdx) EDASeq::normCounts(set) } @@ -193,37 +238,50 @@ compute_normalised_counts <- function(data = NULL, normalize_zinb <- function(data, batch) { - message("zinb mode takes a couple of minutes. Please allow extra time.") - message("Default uses genes with max_counts > 100 reads across all treatments.") - message("reducing the parameter max_counts may increase the compute time and memory required.") - + message("Please allow extra time for zinb mode.") if (ncol(data) > 50) { message("zinb with over 50 samples takes a long time. Consider reducing the number of samples or genes.") } data_sce <- as.SingleCellExperiment(data) - filtered_sce <- subset(data_sce, rowSums(as.data.frame(counts(data_sce))) > 10) - num_cores <- 8 # Change this based on your system + filtered_sce <- subset(data_sce, rowSums(as.data.frame(counts(data_sce))) > 0) cl <- makeCluster(num_cores) doParallel::registerDoParallel(cl) p <- BiocParallel::DoparParam() - system.time(zinb <- zinbwave::zinbwave(filtered_sce, K = 2, - epsilon=1000, + system.time(zinb <- zinbwave::zinbwave(filtered_sce, K = k, + epsilon=12000, BPPARAM = p, - normalizedValues=TRUE, - residuals = TRUE)) - normalised_values <- zinb@assays@data$normalizedValues + observationalWeights = TRUE)) + counts <- zinb@assays@data$counts + weights <- zinb@assays@data$weights + + # approximate denoised counts (downweighting dropouts) + #dge <- DGEList(counts = data@assays$RNA$counts, samples = coldata$condition, group = coldata$condition) + #dge <- calcNormFactors(dge, methods = "TMM") + #design <- model_matrix + #dge <- estimateDisp(dge, design, BPPARAM = p) + #dge$weights <- assay(zinb, "weights") + #fit <- glmQLFit(dge, design, BPPARAM = p) + #norm_counts <- fitted(fit) + + dge <- DGEList(counts = counts(filtered_sce), samples = coldata$condition, group = coldata$condition) + dge <- calcNormFactors(dge, methods = "TMMwsp") + design <- model_matrix + dge <- voom(dge, design, weights = weights) + normalised_values <- dge$E + + #normalised_values <- zinb@assays@data$normalizedValues stopCluster(cl) doParallel::registerDoParallel() return(normalised_values) } # Main function logic - validated <- validate_inputs(data, method, batch, k, max_counts) + validated <- validate_inputs(data, method, batch, k, max_counts, num_cores) data <- validated$data batch <- validated$batch k <- validated$k method <- validated$method - + num_cores <- validated$num_cores # Select the appropriate normalization method norm_data <- switch( method, @@ -235,6 +293,7 @@ compute_normalised_counts <- function(data = NULL, DESeq2 = normalize_deseq2(data, batch), edgeR = normalize_edger(data, batch), limma_voom = normalize_limma_voom(data, batch), + limma_trend = normalize_limma_trend(data, batch), RUVg = normalize_ruvg(data, batch, spikes, k), RUVs = normalize_ruvs(data, batch, k), RUVr = normalize_ruvr(data, batch, k), diff --git a/R/compute_single_de.R b/R/compute_single_de.R index 4712665..ea1f600 100644 --- a/R/compute_single_de.R +++ b/R/compute_single_de.R @@ -7,7 +7,7 @@ utils::globalVariables(c("model_matrix")) #' "Well_ID", "Row", "Column" #' @param treatment_samples Value in the column "combined_id" representing replicates of treatment samples in the data #' @param control_samples Value in the column "combined_id" representing replicates of control samples in the data -#' @param method One of "Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom" +#' @param method One of "Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend" #' @param batch Either empty, a single value, or a vector corresponding to the #' number of samples #' @param k Parameter k for RUVSeq methods, check RUVSeq tutorial @@ -48,7 +48,7 @@ compute_single_de <- function(data = NULL, method <- if (is.null(method)) "limma_voom" else method if (!method %in% c("Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", - "limma_voom")) { + "limma_voom", "limma_trend")) { stop("Your normalization method is not available.") } if (is.null(treatment_samples) || is.null(control_samples)) { @@ -102,6 +102,31 @@ compute_single_de <- function(data = NULL, rownames_to_column("gene") return(as.data.frame(top_table)) } + + de_limma_trend <- function(data, pheno_data, treatment_samples, control_samples) { + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + dge <- DGEList(counts = data@assays$RNA$counts, + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- estimateDisp(dge, model_matrix) + dge <- calcNormFactors(dge, method = "TMMwsp") + logCPM <- cpm(dge, log=TRUE, prior.count=3) + fit <- lmFit(logCPM, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + tmp <- contrasts.fit(fit, contrasts) + tmp <- eBayes(tmp, trend = TRUE) + top_table <- topTable(tmp, number = Inf, sort.by = "P") %>% + select("logFC", "t", "P.Value", "adj.P.Val") %>% + rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val") %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } de_edger <- function(data, pheno_data, treatment_samples, control_samples) { combined_id <- data$combined_id @@ -343,6 +368,7 @@ compute_single_de <- function(data = NULL, de_data <- switch( method, limma_voom = de_limma_voom(data, pheno_data, treatment_samples, control_samples), + limma_trend = de_limma_trend(data, pheno_data, treatment_samples, control_samples), edgeR = de_edger(data, pheno_data, treatment_samples, control_samples), DESeq2 = de_deseq2(data, pheno_data, treatment_samples, control_samples), Seurat_wilcox = de_seurat(data, pheno_data, treatment_samples, control_samples), diff --git a/R/plot_rle.R b/R/plot_rle.R index 6308e48..301a438 100644 --- a/R/plot_rle.R +++ b/R/plot_rle.R @@ -15,8 +15,10 @@ #' @param batch Either empty, a single value, or a vector corresponding to the #' number of samples. #' @param normalisation One of "raw", "logNorm", "cpm", "clr", "SCT", "DESeq2", -#' "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "zinb". If empty, defaults to raw. +#' "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "zinb". If empty, defaults to raw. #' @param spikes List of genes to use as spike controls in RUVg +#' @param num_cores Number of cores for edgeR and zinb calculations + #' #' @details #' The function performs the following steps: @@ -43,7 +45,8 @@ plot_rle <- function(data, barcodes = NULL, log = TRUE, batch = NULL, normalisation = NULL, - spikes = NULL) { + spikes = NULL, + num_cores = NULL) { req_pkgs <- c("colorspace", "grDevices") missing <- req_pkgs[!vapply(req_pkgs, requireNamespace, logical(1), quietly = TRUE)] @@ -56,7 +59,8 @@ plot_rle <- function(data, barcodes = NULL, } # Helper function to validate input data - validate_inputs <- function(data, barcodes, label_column, labels, log, batch, normalisation) { + validate_inputs <- function(data, barcodes, label_column, labels, log, batch, + normalisation, num_cores) { if (!inherits(data, "Seurat")) { stop("'data' must be a Seurat or TidySeurat object.") } @@ -78,13 +82,15 @@ plot_rle <- function(data, barcodes = NULL, } batch <- if (is.null(batch)) "1" else as.character(batch) normalisation <- if (is.null(normalisation)) "raw" else normalisation - + num_cores <- if (is.null(num_cores)) 1 else num_cores + list(barcodes = as.factor(barcodes), labels = as.factor(labels), log = ifelse(inherits(log, "function"), TRUE, log), batch = batch, normalisation = normalisation, - spikes = spikes) + spikes = spikes, + num_cores = num_cores) } # Helper function to fetch and log-transform count matrix @@ -176,28 +182,30 @@ plot_rle <- function(data, barcodes = NULL, } # Validate inputs - validated <- validate_inputs(data, barcodes, label_column, labels, log, batch, normalisation) + validated <- validate_inputs(data, barcodes, label_column, labels, log, batch, normalisation, num_cores) barcodes <- validated$barcodes labels <- validated$labels log <- validated$log batch <- validated$batch normalisation <- validated$normalisation + num_cores <- validated$num_cores # Fetch and transform count matrix count_matrix <- switch( normalisation, raw = fetch_count_matrix(data, log), - logNorm = compute_normalised_counts(data, method = "logNorm", batch = batch), - cpm = compute_normalised_counts(data, method = "cpm", batch = batch), - clr = compute_normalised_counts(data, method = "clr", batch = batch), - SCT = compute_normalised_counts(data, method = "SCT", batch = batch), - DESeq2 = compute_normalised_counts(data, method = "DESeq2", batch = batch), - edgeR = compute_normalised_counts(data, method = "edgeR", batch = batch), - limma_voom = compute_normalised_counts(data, method = "limma_voom", batch = batch), - RUVg = compute_normalised_counts(data, method = "RUVg", batch = batch, spikes = spikes), - RUVs = compute_normalised_counts(data, method = "RUVs", batch = batch), - RUVr = compute_normalised_counts(data, method = "RUVr", batch = batch), - zinb = compute_normalised_counts(data, method = "zinb", batch = batch), + logNorm = compute_normalised_counts(data, method = "logNorm", batch = batch, num_cores = num_cores), + cpm = compute_normalised_counts(data, method = "cpm", batch = batch, num_cores = num_cores), + clr = compute_normalised_counts(data, method = "clr", batch = batch, num_cores = num_cores), + SCT = compute_normalised_counts(data, method = "SCT", batch = batch, num_cores = num_cores), + DESeq2 = compute_normalised_counts(data, method = "DESeq2", batch = batch, num_cores = num_cores), + edgeR = compute_normalised_counts(data, method = "edgeR", batch = batch, num_cores = num_cores), + limma_voom = compute_normalised_counts(data, method = "limma_voom", batch = batch, num_cores = num_cores), + limma_trend = compute_normalised_counts(data, method = "limma_trend", batch = batch, num_cores = num_cores), + RUVg = compute_normalised_counts(data, method = "RUVg", batch = batch, spikes = spikes, num_cores = num_cores), + RUVs = compute_normalised_counts(data, method = "RUVs", batch = batch, num_cores = num_cores), + RUVr = compute_normalised_counts(data, method = "RUVr", batch = batch, num_cores = num_cores), + zinb = compute_normalised_counts(data, method = "zinb", batch = batch, num_cores = num_cores), stop("Unsupported normalization method.") ) @@ -213,5 +221,7 @@ plot_rle <- function(data, barcodes = NULL, rledf <- compute_rle_df(count_matrix, labels) # Create and return plot - create_rle_plot(rledf, normalisation) + out <- create_rle_plot(rledf, normalisation) + out$SE <- compute_cv(count_matrix) + return(out) } diff --git a/man/compute_normalised_counts.Rd b/man/compute_normalised_counts.Rd index 7722423..0487b81 100644 --- a/man/compute_normalised_counts.Rd +++ b/man/compute_normalised_counts.Rd @@ -10,7 +10,8 @@ compute_normalised_counts( batch = NULL, k = NULL, spikes = NULL, - max_counts = NULL + max_counts = NULL, + num_cores = NULL ) } \arguments{ @@ -18,16 +19,18 @@ compute_normalised_counts( "Well_ID", "Row", "Column"} \item{method}{One of "raw", "logNorm", "cpm", "clr", "SCT", "DESeq2", -"edgeR", "RUVg", "RUVs", "RUVr", "limma_voom"} +"edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "zinb"} \item{batch}{Either empty, a single value, or a vector corresponding to the number of samples} -\item{k}{Parameter k for RUVSeq methods, check RUVSeq tutorial} +\item{k}{Parameter k for RUVSeq and zinb methods} \item{spikes}{List of genes to use as spike controls} \item{max_counts}{Maximum count for a gene across all samples} + +\item{num_cores}{Number of cores} } \value{ Data frame of normalised counts diff --git a/man/compute_single_de.Rd b/man/compute_single_de.Rd index 328bfe0..202e2a0 100644 --- a/man/compute_single_de.Rd +++ b/man/compute_single_de.Rd @@ -22,7 +22,7 @@ compute_single_de( \item{control_samples}{Value in the column "combined_id" representing replicates of control samples in the data} -\item{method}{One of "Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom"} +\item{method}{One of "Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend"} \item{batch}{Either empty, a single value, or a vector corresponding to the number of samples} diff --git a/man/plot_rle.Rd b/man/plot_rle.Rd index 75a4dcf..c720efd 100644 --- a/man/plot_rle.Rd +++ b/man/plot_rle.Rd @@ -12,7 +12,8 @@ plot_rle( log = TRUE, batch = NULL, normalisation = NULL, - spikes = NULL + spikes = NULL, + num_cores = NULL ) } \arguments{ @@ -34,9 +35,11 @@ Defaults to \code{TRUE}.} number of samples.} \item{normalisation}{One of "raw", "logNorm", "cpm", "clr", "SCT", "DESeq2", -"edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "zinb". If empty, defaults to raw.} +"edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "zinb". If empty, defaults to raw.} \item{spikes}{List of genes to use as spike controls in RUVg} + +\item{num_cores}{Number of cores for edgeR and zinb calculations} } \value{ A ggplot object representing the RLE plot.