Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ Description: Macpie is an R package for the integrative analysis of transcriptom
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
RoxygenNote: 7.3.3
Suggests:
knitr,
rmarkdown,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ importFrom(ggplot2,theme)
importFrom(ggplot2,theme_minimal)
importFrom(limma,contrasts.fit)
importFrom(limma,eBayes)
importFrom(limma,lmFit)
importFrom(limma,makeContrasts)
importFrom(limma,plotMDS)
importFrom(limma,topTable)
Expand Down
8 changes: 7 additions & 1 deletion R/compute_multi_screen_profile.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,13 @@ compute_multi_screen_profile <- function(data = NULL,
(direction == "up" & .data$log2FC > 0) |
(direction == "down" & .data$log2FC < 0)
) %>%
arrange(if (direction == "both") desc(abs(.data$log2FC)) else desc(.data$log2FC)) %>%
arrange(
dplyr::case_when(
direction == "both" ~ -abs(.data$log2FC),
direction == "up" ~ -.data$log2FC,
direction == "down" ~ .data$log2FC
)
) %>%
slice(seq_len(n_genes_profile)) %>%
pull(.data$gene)
) %>% setNames(target)
Expand Down
105 changes: 82 additions & 23 deletions R/compute_normalised_counts.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,20 @@
#' @param data A tidyseurat object merged with metadata. Must contain columns
#' "Well_ID", "Row", "Column"
#' @param method One of "raw", "logNorm", "cpm", "clr", "SCT", "DESeq2",
#' "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom"
#' "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "zinb"
#' @param batch Either empty, a single value, or a vector corresponding to the
#' number of samples
#' @param k Parameter k for RUVSeq methods, check RUVSeq tutorial
#' @param k Parameter k for RUVSeq and zinb methods
#' @param spikes List of genes to use as spike controls
#' @param max_counts Maximum count for a gene across all samples
#' @param num_cores Number of cores
#' @importFrom limma voom
#' @importFrom Seurat as.SingleCellExperiment
#' @import DESeq2
#' @import RUVSeq
#' @importFrom Biobase pData
#' @importFrom stats model.matrix residuals
#' @importFrom limma lmFit
#' @importFrom parallel makeCluster stopCluster
#'
#' @returns Data frame of normalised counts
Expand All @@ -30,7 +32,8 @@ compute_normalised_counts <- function(data = NULL,
batch = NULL,
k = NULL,
spikes = NULL,
max_counts = NULL) {
max_counts = NULL,
num_cores = NULL) {
req_pkgs <- c("SingleCellExperiment", "EDASeq", "BiocParallel", "doParallel","zinbwave")
missing <- req_pkgs[!vapply(req_pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(missing)) {
Expand All @@ -41,7 +44,7 @@ compute_normalised_counts <- function(data = NULL,
)
}
# Helper function to validate input data
validate_inputs <- function(data, method, batch, k, max_counts) {
validate_inputs <- function(data, method, batch, k, max_counts, num_cores) {
if (!inherits(data, "Seurat")) {
stop("argument 'data' must be a Seurat or TidySeurat object.")
}
Expand All @@ -50,13 +53,14 @@ compute_normalised_counts <- function(data = NULL,
"cpm", "clr", "SCT",
"DESeq2", "edgeR",
"RUVg", "RUVs", "RUVr",
"limma_voom", "zinb")) {
"limma_voom", "limma_trend", "zinb")) {
stop("Your normalization method is not available.")
}
batch <- if (is.null(batch)) "1" else as.character(batch)
k <- if (is.null(k)) 2 else k
max_counts <- if (is.null(max_counts)) 100 else as.numeric(max_counts)
list(data = data, batch = batch, k = k, method = method)
num_cores <- if (is.null(num_cores)) 1 else num_cores
list(data = data, batch = batch, k = k, method = method, num_cores=num_cores)
}

#create an unique identifier based on combined annotation
Expand Down Expand Up @@ -124,10 +128,13 @@ compute_normalised_counts <- function(data = NULL,
if (ncol(data) > 100) {
message("EdgeR with over 100 samples takes a long time. Consider reducing the number of samples or genes.")
}
cl <- makeCluster(num_cores)
doParallel::registerDoParallel(cl)
p <- BiocParallel::DoparParam()
dge <- DGEList(counts = data@assays$RNA$counts, samples = coldata$condition, group = coldata$condition)
dge <- calcNormFactors(dge, methods = "TMM")
design <- model_matrix
dge <- estimateDisp(dge, design)
dge <- estimateDisp(dge, design, BPPARAM = p)
cpm(dge, log = FALSE)
}

Expand All @@ -139,13 +146,41 @@ compute_normalised_counts <- function(data = NULL,
dge$E
}

normalize_limma_trend <- function(data, batch) {
dge <- DGEList(counts = data@assays$RNA$counts, samples = coldata$condition, group = coldata$condition)
dge <- calcNormFactors(dge, methods = "TMMwsp")
design <- model_matrix
logCPM <- cpm(dge, log=TRUE, prior.count=3)
fit <- lmFit(logCPM, design)
fit <- eBayes(fit, trend=TRUE)
dge <- fit
logCPM
}

normalize_ruvg <- function(data, batch, spikes, k) {
counts <- data@assays$RNA$counts
if (length(spikes) == 0) {
stop("List of control genes not provided for RUVg.")
warning("List of control genes not provided for RUVg, using default human housekeeping genes.")
spikes <- c(
"ACTB",
"GAPDH",
"RPLP0",
"B2M",
"HPRT1",
"PGK1",
"TBP",
"UBC",
"YWHAZ",
"PPIA",
"RPL19",
"EEF1A1",
"RPS18",
"TFRC"
)
}
if (!all(spikes %in% row.names(data@assays$RNA$counts))) {
stop("Some or all of your control genes are not present in the dataset.")
warning("Some or all of your control genes are not present in the dataset.")
spikes <- intersect(spikes, rownames(counts(set)))
}
#k defines number of sources of variation, two have been chosen for row and column
set <- EDASeq::newSeqExpressionSet(counts = as.matrix(counts),
Expand All @@ -164,7 +199,17 @@ compute_normalised_counts <- function(data = NULL,
phenoData = data.frame(condition = coldata$condition,
row.names = colnames(counts)))
differences <- model_matrix
set <- RUVs(set, cIdx = genes, k = k, scIdx = differences)
differences <- lapply(seq_len(ncol(model_matrix)), function(j) {
which(model_matrix[, j] == 1)
})
max_len <- max(lengths(differences))
scIdx <- matrix(NA, nrow = max_len, ncol = length(differences))
for (i in seq_along(differences)) {
scIdx[seq_along(differences[[i]]), i] <- differences[[i]]
}
scIdx<-t(scIdx)
colnames(scIdx) <- colnames(model_matrix)
set <- RUVs(set, cIdx = genes, k = k, scIdx = scIdx)

EDASeq::normCounts(set)
}
Expand Down Expand Up @@ -193,37 +238,50 @@ compute_normalised_counts <- function(data = NULL,

normalize_zinb <- function(data, batch) {

message("zinb mode takes a couple of minutes. Please allow extra time.")
message("Default uses genes with max_counts > 100 reads across all treatments.")
message("reducing the parameter max_counts may increase the compute time and memory required.")

message("Please allow extra time for zinb mode.")
if (ncol(data) > 50) {
message("zinb with over 50 samples takes a long time. Consider reducing the number of samples or genes.")
}
data_sce <- as.SingleCellExperiment(data)
filtered_sce <- subset(data_sce, rowSums(as.data.frame(counts(data_sce))) > 10)
num_cores <- 8 # Change this based on your system
filtered_sce <- subset(data_sce, rowSums(as.data.frame(counts(data_sce))) > 0)
cl <- makeCluster(num_cores)
doParallel::registerDoParallel(cl)
p <- BiocParallel::DoparParam()
system.time(zinb <- zinbwave::zinbwave(filtered_sce, K = 2,
epsilon=1000,
system.time(zinb <- zinbwave::zinbwave(filtered_sce, K = k,
epsilon=12000,
BPPARAM = p,
normalizedValues=TRUE,
residuals = TRUE))
normalised_values <- zinb@assays@data$normalizedValues
observationalWeights = TRUE))
counts <- zinb@assays@data$counts
weights <- zinb@assays@data$weights

# approximate denoised counts (downweighting dropouts)
#dge <- DGEList(counts = data@assays$RNA$counts, samples = coldata$condition, group = coldata$condition)
#dge <- calcNormFactors(dge, methods = "TMM")
#design <- model_matrix
#dge <- estimateDisp(dge, design, BPPARAM = p)
#dge$weights <- assay(zinb, "weights")
#fit <- glmQLFit(dge, design, BPPARAM = p)
#norm_counts <- fitted(fit)

dge <- DGEList(counts = counts(filtered_sce), samples = coldata$condition, group = coldata$condition)
dge <- calcNormFactors(dge, methods = "TMMwsp")
design <- model_matrix
dge <- voom(dge, design, weights = weights)
normalised_values <- dge$E

#normalised_values <- zinb@assays@data$normalizedValues
stopCluster(cl)
doParallel::registerDoParallel()
return(normalised_values)
}

# Main function logic
validated <- validate_inputs(data, method, batch, k, max_counts)
validated <- validate_inputs(data, method, batch, k, max_counts, num_cores)
data <- validated$data
batch <- validated$batch
k <- validated$k
method <- validated$method

num_cores <- validated$num_cores
# Select the appropriate normalization method
norm_data <- switch(
method,
Expand All @@ -235,6 +293,7 @@ compute_normalised_counts <- function(data = NULL,
DESeq2 = normalize_deseq2(data, batch),
edgeR = normalize_edger(data, batch),
limma_voom = normalize_limma_voom(data, batch),
limma_trend = normalize_limma_trend(data, batch),
RUVg = normalize_ruvg(data, batch, spikes, k),
RUVs = normalize_ruvs(data, batch, k),
RUVr = normalize_ruvr(data, batch, k),
Expand Down
30 changes: 28 additions & 2 deletions R/compute_single_de.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ utils::globalVariables(c("model_matrix"))
#' "Well_ID", "Row", "Column"
#' @param treatment_samples Value in the column "combined_id" representing replicates of treatment samples in the data
#' @param control_samples Value in the column "combined_id" representing replicates of control samples in the data
#' @param method One of "Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom"
#' @param method One of "Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend"
#' @param batch Either empty, a single value, or a vector corresponding to the
#' number of samples
#' @param k Parameter k for RUVSeq methods, check RUVSeq tutorial
Expand Down Expand Up @@ -48,7 +48,7 @@ compute_single_de <- function(data = NULL,
method <- if (is.null(method)) "limma_voom" else method
if (!method %in% c("Seurat_wilcox", "DESeq2", "edgeR",
"RUVg", "RUVs", "RUVr",
"limma_voom")) {
"limma_voom", "limma_trend")) {
stop("Your normalization method is not available.")
}
if (is.null(treatment_samples) || is.null(control_samples)) {
Expand Down Expand Up @@ -102,6 +102,31 @@ compute_single_de <- function(data = NULL,
rownames_to_column("gene")
return(as.data.frame(top_table))
}

de_limma_trend <- function(data, pheno_data, treatment_samples, control_samples) {
combined_id <- data$combined_id
model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else
model.matrix(~0 + combined_id + batch)
dge <- DGEList(counts = data@assays$RNA$counts,
samples = pheno_data$condition,
group = pheno_data$condition)
dge <- estimateDisp(dge, model_matrix)
dge <- calcNormFactors(dge, method = "TMMwsp")
logCPM <- cpm(dge, log=TRUE, prior.count=3)
fit <- lmFit(logCPM, model_matrix)
myargs <- list(paste0("combined_id",
treatment_samples, "-",
paste0("combined_id", control_samples)),
levels = model_matrix)
contrasts <- do.call(makeContrasts, myargs)
tmp <- contrasts.fit(fit, contrasts)
tmp <- eBayes(tmp, trend = TRUE)
top_table <- topTable(tmp, number = Inf, sort.by = "P") %>%
select("logFC", "t", "P.Value", "adj.P.Val") %>%
rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val") %>%
rownames_to_column("gene")
return(as.data.frame(top_table))
}

de_edger <- function(data, pheno_data, treatment_samples, control_samples) {
combined_id <- data$combined_id
Expand Down Expand Up @@ -343,6 +368,7 @@ compute_single_de <- function(data = NULL,
de_data <- switch(
method,
limma_voom = de_limma_voom(data, pheno_data, treatment_samples, control_samples),
limma_trend = de_limma_trend(data, pheno_data, treatment_samples, control_samples),
edgeR = de_edger(data, pheno_data, treatment_samples, control_samples),
DESeq2 = de_deseq2(data, pheno_data, treatment_samples, control_samples),
Seurat_wilcox = de_seurat(data, pheno_data, treatment_samples, control_samples),
Expand Down
Loading