PMCC-BioinformaticsCore · nbartonicek · Oct 18, 2025 · Aug 22, 2025 · Oct 16, 2025 · Oct 16, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -34,7 +34,7 @@ Description: Macpie is an R package for the integrative analysis of transcriptom
 License: MIT + file LICENSE
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
 Suggests: 
     knitr,
     rmarkdown,

diff --git a/NAMESPACE b/NAMESPACE
@@ -90,6 +90,7 @@ importFrom(ggplot2,theme)
 importFrom(ggplot2,theme_minimal)
 importFrom(limma,contrasts.fit)
 importFrom(limma,eBayes)
+importFrom(limma,lmFit)
 importFrom(limma,makeContrasts)
 importFrom(limma,plotMDS)
 importFrom(limma,topTable)

diff --git a/R/compute_multi_screen_profile.R b/R/compute_multi_screen_profile.R
@@ -69,7 +69,13 @@ compute_multi_screen_profile <- function(data = NULL,
             (direction == "up" & .data$log2FC > 0) |
             (direction == "down" & .data$log2FC < 0)
         ) %>%
-        arrange(if (direction == "both") desc(abs(.data$log2FC)) else desc(.data$log2FC)) %>%
+        arrange(
+          dplyr::case_when(
+            direction == "both" ~ -abs(.data$log2FC),
+            direction == "up"   ~ -.data$log2FC,
+            direction == "down" ~  .data$log2FC
+          )
+        ) %>%
         slice(seq_len(n_genes_profile)) %>%
         pull(.data$gene)
     ) %>% setNames(target)

diff --git a/R/compute_normalised_counts.R b/R/compute_normalised_counts.R
@@ -5,18 +5,20 @@
 #' @param data A tidyseurat object merged with metadata. Must contain columns
 #'   "Well_ID", "Row", "Column"
 #' @param method One of "raw", "logNorm", "cpm", "clr", "SCT", "DESeq2",
-#'   "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom"
+#'   "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "zinb"
 #' @param batch Either empty, a single value, or a vector corresponding to the
 #'   number of samples
-#' @param k Parameter k for RUVSeq methods, check RUVSeq tutorial
+#' @param k Parameter k for RUVSeq and zinb methods
 #' @param spikes List of genes to use as spike controls
 #' @param max_counts Maximum count for a gene across all samples
+#' @param num_cores Number of cores
 #' @importFrom limma voom
 #' @importFrom Seurat as.SingleCellExperiment
 #' @import DESeq2
 #' @import RUVSeq
 #' @importFrom Biobase pData
 #' @importFrom stats model.matrix residuals
+#' @importFrom limma lmFit
 #' @importFrom parallel makeCluster stopCluster
 #'
 #' @returns Data frame of normalised counts
@@ -30,7 +32,8 @@ compute_normalised_counts <- function(data = NULL,
                                     batch = NULL,
                                     k = NULL,
                                     spikes = NULL,
-                                    max_counts = NULL) {
+                                    max_counts = NULL,
+                                    num_cores = NULL) {
   req_pkgs <- c("SingleCellExperiment", "EDASeq", "BiocParallel", "doParallel","zinbwave")
   missing <- req_pkgs[!vapply(req_pkgs, requireNamespace, logical(1), quietly = TRUE)]
   if (length(missing)) {
@@ -41,7 +44,7 @@ compute_normalised_counts <- function(data = NULL,
     )
   }
   # Helper function to validate input data
-  validate_inputs <- function(data, method, batch, k, max_counts) {
+  validate_inputs <- function(data, method, batch, k, max_counts, num_cores) {
     if (!inherits(data, "Seurat")) {
       stop("argument 'data' must be a Seurat or TidySeurat object.")
     }
@@ -50,13 +53,14 @@ compute_normalised_counts <- function(data = NULL,
                        "cpm", "clr", "SCT",
                        "DESeq2", "edgeR",
                        "RUVg", "RUVs", "RUVr",
-                       "limma_voom", "zinb")) {
+                       "limma_voom", "limma_trend", "zinb")) {
       stop("Your normalization method is not available.")
     }
     batch <- if (is.null(batch)) "1" else as.character(batch)
     k <- if (is.null(k)) 2 else k
     max_counts <- if (is.null(max_counts)) 100 else as.numeric(max_counts)
-    list(data = data, batch = batch, k = k, method = method)
+    num_cores <- if (is.null(num_cores)) 1 else num_cores
+    list(data = data, batch = batch, k = k, method = method, num_cores=num_cores)
   }
 
   #create an unique identifier based on combined annotation
@@ -124,10 +128,13 @@ compute_normalised_counts <- function(data = NULL,
     if (ncol(data) > 100) {
       message("EdgeR with over 100 samples takes a long time. Consider reducing the number of samples or genes.")
     }
+    cl <- makeCluster(num_cores)
+    doParallel::registerDoParallel(cl)
+    p <- BiocParallel::DoparParam()
     dge <- DGEList(counts = data@assays$RNA$counts, samples = coldata$condition, group = coldata$condition)
     dge <- calcNormFactors(dge, methods = "TMM")
     design <- model_matrix
-    dge <- estimateDisp(dge, design)
+    dge <- estimateDisp(dge, design, BPPARAM = p)
     cpm(dge, log = FALSE)
   }
 
@@ -139,13 +146,41 @@ compute_normalised_counts <- function(data = NULL,
     dge$E
   }
 
+  normalize_limma_trend <- function(data, batch) {
+    dge <- DGEList(counts = data@assays$RNA$counts, samples = coldata$condition, group = coldata$condition)
+    dge <- calcNormFactors(dge, methods = "TMMwsp")
+    design <- model_matrix
+    logCPM <- cpm(dge, log=TRUE, prior.count=3)
+    fit <- lmFit(logCPM, design)
+    fit <- eBayes(fit, trend=TRUE)
+    dge <- fit
+    logCPM
+  }
+
   normalize_ruvg <- function(data, batch, spikes, k) {
     counts <- data@assays$RNA$counts
     if (length(spikes) == 0) {
-      stop("List of control genes not provided for RUVg.")
+      warning("List of control genes not provided for RUVg, using default human housekeeping genes.")
+      spikes <- c(
+        "ACTB",
+        "GAPDH",
+        "RPLP0",
+        "B2M",
+        "HPRT1",
+        "PGK1",
+        "TBP",
+        "UBC",
+        "YWHAZ",
+        "PPIA",
+        "RPL19",
+        "EEF1A1",
+        "RPS18",
+        "TFRC"
+      )
     }
     if (!all(spikes %in% row.names(data@assays$RNA$counts))) {
-      stop("Some or all of your control genes are not present in the dataset.")
+      warning("Some or all of your control genes are not present in the dataset.")
+      spikes <- intersect(spikes, rownames(counts(set)))
     }
     #k defines number of sources of variation, two have been chosen for row and column
     set <- EDASeq::newSeqExpressionSet(counts = as.matrix(counts),
@@ -164,7 +199,17 @@ compute_normalised_counts <- function(data = NULL,
                                phenoData = data.frame(condition = coldata$condition,
                                                       row.names = colnames(counts)))
     differences <- model_matrix
-    set <- RUVs(set, cIdx = genes, k = k, scIdx = differences)
+    differences <- lapply(seq_len(ncol(model_matrix)), function(j) {
+      which(model_matrix[, j] == 1)
+    })
+    max_len <- max(lengths(differences))
+    scIdx <- matrix(NA, nrow = max_len, ncol = length(differences))
+    for (i in seq_along(differences)) {
+      scIdx[seq_along(differences[[i]]), i] <- differences[[i]]
+    }
+    scIdx<-t(scIdx)
+    colnames(scIdx) <- colnames(model_matrix)
+    set <- RUVs(set, cIdx = genes, k = k, scIdx = scIdx)
 
     EDASeq::normCounts(set)
   }
@@ -193,37 +238,50 @@ compute_normalised_counts <- function(data = NULL,
 
   normalize_zinb <- function(data, batch) {
 
-    message("zinb mode takes a couple of minutes. Please allow extra time.")
-    message("Default uses genes with max_counts > 100 reads across all treatments.")
-    message("reducing the parameter max_counts may increase the compute time and memory required.")
-
+    message("Please allow extra time for zinb mode.")
     if (ncol(data) > 50) {
       message("zinb with over 50 samples takes a long time. Consider reducing the number of samples or genes.")
     }
     data_sce <- as.SingleCellExperiment(data)
-    filtered_sce <- subset(data_sce, rowSums(as.data.frame(counts(data_sce))) > 10)
-    num_cores <- 8 # Change this based on your system
+    filtered_sce <- subset(data_sce, rowSums(as.data.frame(counts(data_sce))) > 0)
     cl <- makeCluster(num_cores)
     doParallel::registerDoParallel(cl)
     p <- BiocParallel::DoparParam()
-    system.time(zinb <- zinbwave::zinbwave(filtered_sce, K = 2,
-                                 epsilon=1000,
+    system.time(zinb <- zinbwave::zinbwave(filtered_sce, K = k,
+                                 epsilon=12000,
                                  BPPARAM = p,
-                                 normalizedValues=TRUE,
-                                 residuals = TRUE))
-    normalised_values <- zinb@assays@data$normalizedValues
+                                 observationalWeights = TRUE))
+    counts <- zinb@assays@data$counts
+    weights <- zinb@assays@data$weights 
+
+    # approximate denoised counts (downweighting dropouts)
+    #dge <- DGEList(counts = data@assays$RNA$counts, samples = coldata$condition, group = coldata$condition)
+    #dge <- calcNormFactors(dge, methods = "TMM")
+    #design <- model_matrix
+    #dge <- estimateDisp(dge, design, BPPARAM = p)
+    #dge$weights <- assay(zinb, "weights")
+    #fit <- glmQLFit(dge, design, BPPARAM = p)
+    #norm_counts <- fitted(fit)
+
+    dge <- DGEList(counts = counts(filtered_sce), samples = coldata$condition, group = coldata$condition)
+    dge <- calcNormFactors(dge, methods = "TMMwsp")
+    design <- model_matrix
+    dge <- voom(dge, design, weights = weights)
+    normalised_values <- dge$E
+
+    #normalised_values <- zinb@assays@data$normalizedValues
     stopCluster(cl)
     doParallel::registerDoParallel()
     return(normalised_values)
   }
 
   # Main function logic
-  validated <- validate_inputs(data, method, batch, k, max_counts)
+  validated <- validate_inputs(data, method, batch, k, max_counts, num_cores)
   data <- validated$data
   batch <- validated$batch
   k <- validated$k
   method <- validated$method
-
+  num_cores <- validated$num_cores
   # Select the appropriate normalization method
   norm_data <- switch(
     method,
@@ -235,6 +293,7 @@ compute_normalised_counts <- function(data = NULL,
     DESeq2 = normalize_deseq2(data, batch),
     edgeR = normalize_edger(data, batch),
     limma_voom = normalize_limma_voom(data, batch),
+    limma_trend = normalize_limma_trend(data, batch),
     RUVg = normalize_ruvg(data, batch, spikes, k),
     RUVs = normalize_ruvs(data, batch, k),
     RUVr = normalize_ruvr(data, batch, k),

diff --git a/R/compute_single_de.R b/R/compute_single_de.R
@@ -7,7 +7,7 @@ utils::globalVariables(c("model_matrix"))
 #'   "Well_ID", "Row", "Column"
 #' @param treatment_samples Value in the column "combined_id" representing replicates of treatment samples in the data
 #' @param control_samples Value in the column "combined_id"  representing replicates of control samples in the data
-#' @param method One of "Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom"
+#' @param method One of "Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend"
 #' @param batch Either empty, a single value, or a vector corresponding to the
 #'   number of samples
 #' @param k Parameter k for RUVSeq methods, check RUVSeq tutorial
@@ -48,7 +48,7 @@ compute_single_de <- function(data = NULL,
     method <- if (is.null(method)) "limma_voom" else method
     if (!method %in% c("Seurat_wilcox", "DESeq2", "edgeR",
                        "RUVg", "RUVs", "RUVr",
-                       "limma_voom")) {
+                       "limma_voom", "limma_trend")) {
       stop("Your normalization method is not available.")
     }
     if (is.null(treatment_samples) || is.null(control_samples)) {
@@ -102,6 +102,31 @@ compute_single_de <- function(data = NULL,
       rownames_to_column("gene")
     return(as.data.frame(top_table))
   }
+
+  de_limma_trend <- function(data, pheno_data, treatment_samples, control_samples) {
+    combined_id <- data$combined_id
+    model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else
+      model.matrix(~0 + combined_id + batch)
+    dge <- DGEList(counts = data@assays$RNA$counts,
+                   samples = pheno_data$condition,
+                   group = pheno_data$condition)
+    dge <- estimateDisp(dge, model_matrix)
+    dge <- calcNormFactors(dge, method = "TMMwsp")
+    logCPM <- cpm(dge, log=TRUE, prior.count=3)
+    fit <- lmFit(logCPM, model_matrix)
+    myargs <- list(paste0("combined_id",
+                          treatment_samples, "-",
+                          paste0("combined_id", control_samples)),
+                   levels = model_matrix)
+    contrasts <- do.call(makeContrasts, myargs)
+    tmp <- contrasts.fit(fit, contrasts)
+    tmp <- eBayes(tmp, trend = TRUE)
+    top_table <- topTable(tmp, number = Inf, sort.by = "P") %>%
+      select("logFC", "t", "P.Value", "adj.P.Val") %>%
+      rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val") %>%
+      rownames_to_column("gene")
+    return(as.data.frame(top_table))
+  }
 
   de_edger <- function(data, pheno_data, treatment_samples, control_samples) {
     combined_id <- data$combined_id
@@ -343,6 +368,7 @@ compute_single_de <- function(data = NULL,
   de_data <- switch(
     method,
     limma_voom = de_limma_voom(data, pheno_data, treatment_samples, control_samples),
+    limma_trend = de_limma_trend(data, pheno_data, treatment_samples, control_samples),
     edgeR = de_edger(data, pheno_data, treatment_samples, control_samples),
     DESeq2 = de_deseq2(data, pheno_data, treatment_samples, control_samples),
     Seurat_wilcox = de_seurat(data, pheno_data, treatment_samples, control_samples),