From 0fb34cf11589b6387f48732e454fe787a6b92da8 Mon Sep 17 00:00:00 2001 From: "C. Navarro" Date: Mon, 12 Oct 2020 15:31:47 +0200 Subject: [PATCH 1/2] plot_bw_bins_diff_scatter added --- elsasserlib/DESCRIPTION | 2 +- elsasserlib/NAMESPACE | 1 + elsasserlib/R/bwplot.R | 96 ++++++++++++++++++++ elsasserlib/R/bwstats.R | 27 ++++-- elsasserlib/man/bw_bed_diff_analysis.Rd | 7 +- elsasserlib/man/bw_bins_diff_analysis.Rd | 5 +- elsasserlib/man/bw_granges_diff_analysis.Rd | 7 +- elsasserlib/man/plot_bw_bins_diff_scatter.Rd | 58 ++++++++++++ elsasserlib/tests/testthat/test_bwstats.R | 12 ++- 9 files changed, 201 insertions(+), 14 deletions(-) create mode 100644 elsasserlib/man/plot_bw_bins_diff_scatter.Rd diff --git a/elsasserlib/DESCRIPTION b/elsasserlib/DESCRIPTION index 14a7132..96b7d9f 100644 --- a/elsasserlib/DESCRIPTION +++ b/elsasserlib/DESCRIPTION @@ -1,7 +1,7 @@ Package: elsasserlib Type: Package Title: General utilities used within Elsasser lab -Version: 1.1.1 +Version: 1.1.2 Authors@R: c( person("Carmen", "Navarro", email = "carmen.navarro@scilifelab.se", diff --git a/elsasserlib/NAMESPACE b/elsasserlib/NAMESPACE index 29cc0a7..afbb507 100644 --- a/elsasserlib/NAMESPACE +++ b/elsasserlib/NAMESPACE @@ -10,6 +10,7 @@ export(bw_profile) export(mean_ratio_norm) export(palette_categorical) export(plot_bw_bed_summary_heatmap) +export(plot_bw_bins_diff_scatter) export(plot_bw_bins_scatter) export(plot_bw_bins_violin) export(plot_bw_profile) diff --git a/elsasserlib/R/bwplot.R b/elsasserlib/R/bwplot.R index e70773c..a66baa3 100644 --- a/elsasserlib/R/bwplot.R +++ b/elsasserlib/R/bwplot.R @@ -327,6 +327,102 @@ plot_bw_profile <- function(bwfiles, } +#' Scatter plot with significance testing. +#' +#' Make a scatterplot where bins that pass a specific statistical significance +#' threshold across replicates are highlighted. +#' +#' @param bwfiles_c1 Condition 1 bigwig files. +#' @param bwfiles_c2 Condition 2 bigwig files. +#' @param label_c1 Label for condition 1. +#' @param label_c2 Label for condition 2. +#' @param bg_bwfiles_c1 Optional condition 1 bigwig input (background) files. +#' @param bg_bwfiles_c2 Optional condition 2 bigwig input (background) files. +#' @param bin_size Bin size. +#' @param genome Genome used. Must be one of supported. +#' @param estimate_size_factors If True, the estimateSizeFactors step from +#' DESeq2 is not skipped. +#' @param pval_cutoff P-value cutoff. +#' @param logfc_cutoff Log fold change cutoff. If logfc_cutoff is negative, +#' bins under logfc_cutoff will be highlighted. +#' @inheritParams plot_bw_bins_scatter +#' @return ggplot object +#' @export +plot_bw_bins_diff_scatter <- function(bwfiles_c1, + bwfiles_c2, + label_c1, + label_c2, + bg_bwfiles_c1 = NULL, + bg_bwfiles_c2 = NULL, + bin_size = 10000, + genome = "mm9", + estimate_size_factors = FALSE, + pval_cutoff = NULL, + logfc_cutoff = NULL, + norm_func_x = identity, + norm_func_y = identity) { + + stats <- bw_bins_diff_analysis(bwfiles_c1, + bwfiles_c2, + label_c1, + label_c2, + bin_size = bin_size, + genome = genome, + estimate_size_factors = estimate_size_factors, + keep_values = TRUE + ) + + # Mean replicates + stats$mean_c1 <- rowMeans(data.frame(stats[, basename(bwfiles_c1)])) + stats$mean_c2 <- rowMeans(data.frame(stats[, basename(bwfiles_c2)])) + + if (! is.null(bg_bwfiles_c1) ) { + bg_values_x <- bw_bins(bg_bwfiles_c1, + bin_size = bin_size, + genome = genome) + + bg_values_x$mean <- rowMeans(data.frame(bg_values_x)[, basename(bg_bwfiles_c1)]) + + stats$mean_c1 <- norm_func_x(stats$mean_c1 / bg_values_x$mean) + } + + if (! is.null(bg_bwfiles_c2) ) { + bg_values_y <- bw_bins(bg_bwfiles_c2, + bin_size = bin_size, + genome = genome) + bg_values_y$mean <- rowMeans(data.frame(bg_values_y)[, basename(bg_bwfiles_c2)]) + + stats$mean_c2 <- norm_func_x(stats$mean_c2 / bg_values_y$mean) + } + + # Replace NAs with 1s so rows are not removed from matrix + stats_filtered <- stats + stats_filtered$padj <- ifelse(is.na(stats$padj), 1, stats$padj) + + if (!is.null(pval_cutoff)) { + stats_filtered <- stats_filtered[stats_filtered$padj < pval_cutoff, ] + } + if (!is.null(logfc_cutoff)) { + if (logfc_cutoff > 0) { + stats_filtered <- stats_filtered[stats_filtered$log2FoldChange > logfc_cutoff, ] + } + else { + stats_filtered <- stats_filtered[stats_filtered$log2FoldChange < logfc_cutoff, ] + } + } + + stats_df <- data.frame(stats) + stats_filtered_df <- data.frame(stats_filtered) + + ggplot(stats_df, aes_string(x = "mean_c1", y = "mean_c2")) + + geom_point(color = "#cccccc", alpha = 0.6) + + theme_elsasserlab_screen(base_size = 18) + + geom_point(data = stats_filtered_df, aes_string(x = "mean_c1", y = "mean_c2"), color = "#F08080") + + xlab(label_c1) + + ylab(label_c2) + + ggtitle(paste("Bin coverage (bin_size = ", bin_size, ")", sep = "")) +} + #' Plot a pretty heatmap using pheatmap library #' #' This function ignores NA values to calculate min and max values. diff --git a/elsasserlib/R/bwstats.R b/elsasserlib/R/bwstats.R index b88adb7..6ba085f 100644 --- a/elsasserlib/R/bwstats.R +++ b/elsasserlib/R/bwstats.R @@ -9,6 +9,7 @@ #' @param bwfiles_c2 Path or array of paths to the bigWig files for second condition. #' @param genome Genome. Available choices are mm9, hg38. #' @param bin_size Bin size. +#' @param keep_values Keep individually calculated values. #' @inheritParams bw_granges_diff_analysis #' @return a DESeqResults object as returned by DESeq2::results function #' @export @@ -18,13 +19,15 @@ bw_bins_diff_analysis <- function(bwfiles_c1, label_c2, bin_size = 10000, genome = "mm9", - estimate_size_factors = FALSE) { + estimate_size_factors = FALSE, + keep_values = FALSE) { bins_c1 <- bw_bins(bwfiles_c1, genome = genome, bin_size = bin_size) bins_c2 <- bw_bins(bwfiles_c2, genome = genome, bin_size = bin_size) bw_granges_diff_analysis(bins_c1, bins_c2, label_c1, label_c2, - estimate_size_factors = estimate_size_factors) + estimate_size_factors = estimate_size_factors, + keep_values = keep_values) } #' Run DESeq2 analysis on bed file @@ -45,13 +48,15 @@ bw_bed_diff_analysis <- function(bwfiles_c1, bedfile, label_c1, label_c2, - estimate_size_factors = FALSE) { + estimate_size_factors = FALSE, + keep_values = FALSE) { loci_c1 <- bw_bed(bwfiles_c1, bedfile = bedfile) loci_c2 <- bw_bed(bwfiles_c2, bedfile = bedfile) bw_granges_diff_analysis(loci_c1, loci_c2, label_c1, label_c2, - estimate_size_factors = estimate_size_factors) + estimate_size_factors = estimate_size_factors, + keep_values = keep_values) } @@ -69,6 +74,9 @@ bw_bed_diff_analysis <- function(bwfiles_c1, #' @param label_c2 Condition name for condition 2. #' @param estimate_size_factors If TRUE, normal DESeq2 procedure is done. Set it #' to true to analyze non-MINUTE data. +#' @param keep_values If true, results also include the separate bin values. +#' This is used to avoid recalculation in plotting functions, but set to +#' false by default. #' @importFrom DESeq2 DESeqDataSetFromMatrix estimateDispersions nbinomWaldTest `sizeFactors<-` results estimateSizeFactors #' @return a DESeqResults object as returned by DESeq2::results function #' @export @@ -76,7 +84,8 @@ bw_granges_diff_analysis <- function(granges_c1, granges_c2, label_c1, label_c2, - estimate_size_factors = FALSE) { + estimate_size_factors = FALSE, + keep_values = FALSE) { # Bind first, get numbers after (drop complete cases separately could cause error) granges_c1 <- sortSeqlevels(granges_c1) @@ -111,7 +120,13 @@ bw_granges_diff_analysis <- function(granges_c1, dds <- estimateDispersions(dds) dds <- nbinomWaldTest(dds) - results(dds) + stats_results <- results(dds) + + if (keep_values == TRUE) { + stats_results[, colnames(cts_df)] <- cts_df + } + + stats_results } diff --git a/elsasserlib/man/bw_bed_diff_analysis.Rd b/elsasserlib/man/bw_bed_diff_analysis.Rd index abbe338..3135898 100644 --- a/elsasserlib/man/bw_bed_diff_analysis.Rd +++ b/elsasserlib/man/bw_bed_diff_analysis.Rd @@ -10,7 +10,8 @@ bw_bed_diff_analysis( bedfile, label_c1, label_c2, - estimate_size_factors = FALSE + estimate_size_factors = FALSE, + keep_values = FALSE ) } \arguments{ @@ -26,6 +27,10 @@ bw_bed_diff_analysis( \item{estimate_size_factors}{If TRUE, normal DESeq2 procedure is done. Set it to true to analyze non-MINUTE data.} + +\item{keep_values}{If true, results also include the separate bin values. +This is used to avoid recalculation in plotting functions, but set to +false by default.} } \value{ a DESeqResults object as returned by DESeq2::results function diff --git a/elsasserlib/man/bw_bins_diff_analysis.Rd b/elsasserlib/man/bw_bins_diff_analysis.Rd index 7fab21d..f74795f 100644 --- a/elsasserlib/man/bw_bins_diff_analysis.Rd +++ b/elsasserlib/man/bw_bins_diff_analysis.Rd @@ -11,7 +11,8 @@ bw_bins_diff_analysis( label_c2, bin_size = 10000, genome = "mm9", - estimate_size_factors = FALSE + estimate_size_factors = FALSE, + keep_values = FALSE ) } \arguments{ @@ -29,6 +30,8 @@ bw_bins_diff_analysis( \item{estimate_size_factors}{If TRUE, normal DESeq2 procedure is done. Set it to true to analyze non-MINUTE data.} + +\item{keep_values}{Keep individually calculated values.} } \value{ a DESeqResults object as returned by DESeq2::results function diff --git a/elsasserlib/man/bw_granges_diff_analysis.Rd b/elsasserlib/man/bw_granges_diff_analysis.Rd index 0134e52..71ef9a9 100644 --- a/elsasserlib/man/bw_granges_diff_analysis.Rd +++ b/elsasserlib/man/bw_granges_diff_analysis.Rd @@ -9,7 +9,8 @@ bw_granges_diff_analysis( granges_c2, label_c1, label_c2, - estimate_size_factors = FALSE + estimate_size_factors = FALSE, + keep_values = FALSE ) } \arguments{ @@ -24,6 +25,10 @@ Note that these objects must correspond to the same loci.} \item{estimate_size_factors}{If TRUE, normal DESeq2 procedure is done. Set it to true to analyze non-MINUTE data.} + +\item{keep_values}{If true, results also include the separate bin values. +This is used to avoid recalculation in plotting functions, but set to +false by default.} } \value{ a DESeqResults object as returned by DESeq2::results function diff --git a/elsasserlib/man/plot_bw_bins_diff_scatter.Rd b/elsasserlib/man/plot_bw_bins_diff_scatter.Rd new file mode 100644 index 0000000..6068e65 --- /dev/null +++ b/elsasserlib/man/plot_bw_bins_diff_scatter.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bwplot.R +\name{plot_bw_bins_diff_scatter} +\alias{plot_bw_bins_diff_scatter} +\title{Scatter plot with significance testing.} +\usage{ +plot_bw_bins_diff_scatter( + bwfiles_c1, + bwfiles_c2, + label_c1, + label_c2, + bg_bwfiles_c1 = NULL, + bg_bwfiles_c2 = NULL, + bin_size = 10000, + genome = "mm9", + estimate_size_factors = FALSE, + pval_cutoff = NULL, + logfc_cutoff = NULL, + norm_func_x = identity, + norm_func_y = identity +) +} +\arguments{ +\item{bwfiles_c1}{Condition 1 bigwig files.} + +\item{bwfiles_c2}{Condition 2 bigwig files.} + +\item{label_c1}{Label for condition 1.} + +\item{label_c2}{Label for condition 2.} + +\item{bg_bwfiles_c1}{Optional condition 1 bigwig input (background) files.} + +\item{bg_bwfiles_c2}{Optional condition 2 bigwig input (background) files.} + +\item{bin_size}{Bin size.} + +\item{genome}{Genome used. Must be one of supported.} + +\item{estimate_size_factors}{If True, the estimateSizeFactors step from +DESeq2 is not skipped.} + +\item{pval_cutoff}{P-value cutoff.} + +\item{logfc_cutoff}{Log fold change cutoff. If logfc_cutoff is negative, +bins under logfc_cutoff will be highlighted.} + +\item{norm_func_x}{Function to use after x / x_bg.} + +\item{norm_func_y}{Function to use after y / y_bg.} +} +\value{ +ggplot object +} +\description{ +Make a scatterplot where bins that pass a specific statistical significance +threshold across replicates are highlighted. +} diff --git a/elsasserlib/tests/testthat/test_bwstats.R b/elsasserlib/tests/testthat/test_bwstats.R index b13ac9e..d1967d9 100644 --- a/elsasserlib/tests/testthat/test_bwstats.R +++ b/elsasserlib/tests/testthat/test_bwstats.R @@ -38,7 +38,8 @@ test_that("bw_bins_diff_analysis passes on parameters", { bins_c2, label_c1, label_c2, - estimate_size_factors = estimate_size_factors) + estimate_size_factors = estimate_size_factors, + keep_values = keep_values) ) expect_call(m_bins, 1, @@ -54,7 +55,8 @@ test_that("bw_bins_diff_analysis passes on parameters", { granges_c2 = reduced_bg_bins, label_c1 = "treated", label_c2 = "untreated", - estimate_size_factors = FALSE) + estimate_size_factors = FALSE, + keep_values = FALSE) expect_args(m_bins, 1, c(bw1, bw2), @@ -81,7 +83,8 @@ test_that("bw_bed_diff_analysis passes on parameters", { loci_c2, label_c1, label_c2, - estimate_size_factors = estimate_size_factors) + estimate_size_factors = estimate_size_factors, + keep_values = keep_values) ) expect_call(m_bed, 1, @@ -97,7 +100,8 @@ test_that("bw_bed_diff_analysis passes on parameters", { granges_c2 = reduced_bg_bins, label_c1 = "treated", label_c2 = "untreated", - estimate_size_factors = FALSE) + estimate_size_factors = FALSE, + keep_values = FALSE) expect_args(m_bed, 1, c(bw1, bw2), From 5ac4f5ff2ff9ffa331f7dcf35da893e8aebe095d Mon Sep 17 00:00:00 2001 From: "C. Navarro" Date: Mon, 12 Oct 2020 16:02:58 +0200 Subject: [PATCH 2/2] volcano plot --- elsasserlib/NAMESPACE | 1 + elsasserlib/R/bwplot.R | 72 ++++++++++++++++++++ elsasserlib/man/plot_bw_bins_diff_volcano.Rd | 46 +++++++++++++ 3 files changed, 119 insertions(+) create mode 100644 elsasserlib/man/plot_bw_bins_diff_volcano.Rd diff --git a/elsasserlib/NAMESPACE b/elsasserlib/NAMESPACE index afbb507..30b3b8d 100644 --- a/elsasserlib/NAMESPACE +++ b/elsasserlib/NAMESPACE @@ -11,6 +11,7 @@ export(mean_ratio_norm) export(palette_categorical) export(plot_bw_bed_summary_heatmap) export(plot_bw_bins_diff_scatter) +export(plot_bw_bins_diff_volcano) export(plot_bw_bins_scatter) export(plot_bw_bins_violin) export(plot_bw_profile) diff --git a/elsasserlib/R/bwplot.R b/elsasserlib/R/bwplot.R index a66baa3..6749c4a 100644 --- a/elsasserlib/R/bwplot.R +++ b/elsasserlib/R/bwplot.R @@ -423,6 +423,78 @@ plot_bw_bins_diff_scatter <- function(bwfiles_c1, ggtitle(paste("Bin coverage (bin_size = ", bin_size, ")", sep = "")) } + +#' Volcano plot for differential analysis of genome-wide bins +#' +#' Make a volcano plot where bins that pass a specific statistical significance +#' threshold across replicates are highlighted. +#' +#' @param bwfiles_c1 Condition 1 bigwig files. +#' @param bwfiles_c2 Condition 2 bigwig files. +#' @param label_c1 Label for condition 1. +#' @param label_c2 Label for condition 2. +#' @param bin_size Bin size. +#' @param genome Genome used. Must be one of supported. +#' @param estimate_size_factors If True, the estimateSizeFactors step from +#' DESeq2 is not skipped. +#' @param pval_cutoff P-value cutoff. +#' @param logfc_cutoff Log fold change cutoff. In volcano plot this cutoff is +#' used as absolute value (-logfc cutoff and + logfc cutoff are selected). +#' @return ggplot object +#' @export +plot_bw_bins_diff_volcano <- function(bwfiles_c1, + bwfiles_c2, + label_c1, + label_c2, + bin_size = 10000, + genome = "mm9", + estimate_size_factors = FALSE, + pval_cutoff = NULL, + logfc_cutoff = NULL) { + + stats <- bw_bins_diff_analysis(bwfiles_c1, + bwfiles_c2, + label_c1, + label_c2, + bin_size = bin_size, + genome = genome, + estimate_size_factors = estimate_size_factors, + keep_values = TRUE + ) + + # Replace NAs with 1s so rows are not removed from matrix + stats$padj <- ifelse(is.na(stats$padj), 1, stats$padj) + stats$log10padj <- -log10(stats$padj) + + stats_filtered <- stats + extra_lines_pval <- NULL + extra_lines_fc <- NULL + + if (!is.null(pval_cutoff)) { + stats_filtered <- stats_filtered[stats_filtered$padj < pval_cutoff, ] + extra_lines_pval <- geom_hline(yintercept = -log10(pval_cutoff), linetype = "dashed") + } + if (!is.null(logfc_cutoff)) { + stats_filtered <- stats_filtered[abs(stats_filtered$log2FoldChange) > logfc_cutoff, ] + extra_lines_fc <- geom_vline(xintercept = c(-logfc_cutoff, logfc_cutoff), linetype = "dashed") + } + + stats_df <- data.frame(stats) + stats_filtered_df <- data.frame(stats_filtered) + + ggplot(stats_df, aes_string(x = "log2FoldChange", y = "log10padj")) + + geom_point(color = "#cccccc", alpha = 0.6) + + theme_elsasserlab_screen(base_size = 18) + + geom_point(data = stats_filtered_df, aes_string(x = "log2FoldChange", y = "log10padj"), color = "#F08080") + + xlab("log2 fold change") + + ylab("-log10 p-value") + + geom_vline(xintercept = 0, linetype = "dashed") + + ggtitle(paste("Volcano plot - ", label_c1, "vs", label_c2, "(bin_size = ", bin_size, ")", sep = "")) + + extra_lines_pval + extra_lines_fc + +} + + #' Plot a pretty heatmap using pheatmap library #' #' This function ignores NA values to calculate min and max values. diff --git a/elsasserlib/man/plot_bw_bins_diff_volcano.Rd b/elsasserlib/man/plot_bw_bins_diff_volcano.Rd new file mode 100644 index 0000000..26dc501 --- /dev/null +++ b/elsasserlib/man/plot_bw_bins_diff_volcano.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bwplot.R +\name{plot_bw_bins_diff_volcano} +\alias{plot_bw_bins_diff_volcano} +\title{Volcano plot for differential analysis of genome-wide bins} +\usage{ +plot_bw_bins_diff_volcano( + bwfiles_c1, + bwfiles_c2, + label_c1, + label_c2, + bin_size = 10000, + genome = "mm9", + estimate_size_factors = FALSE, + pval_cutoff = NULL, + logfc_cutoff = NULL +) +} +\arguments{ +\item{bwfiles_c1}{Condition 1 bigwig files.} + +\item{bwfiles_c2}{Condition 2 bigwig files.} + +\item{label_c1}{Label for condition 1.} + +\item{label_c2}{Label for condition 2.} + +\item{bin_size}{Bin size.} + +\item{genome}{Genome used. Must be one of supported.} + +\item{estimate_size_factors}{If True, the estimateSizeFactors step from +DESeq2 is not skipped.} + +\item{pval_cutoff}{P-value cutoff.} + +\item{logfc_cutoff}{Log fold change cutoff. In volcano plot this cutoff is +used as absolute value (-logfc cutoff and + logfc cutoff are selected).} +} +\value{ +ggplot object +} +\description{ +Make a volcano plot where bins that pass a specific statistical significance +threshold across replicates are highlighted. +}