From ae928748446a196d88bc3812d0ad819d23c36a50 Mon Sep 17 00:00:00 2001 From: aviezerl Date: Tue, 9 Dec 2025 10:13:29 +0200 Subject: [PATCH] Added `gtrack.export_bw`, `gtrack.export_bed` and `gintervals.export_bed` functions --- DESCRIPTION | 4 + NAMESPACE | 3 + NEWS.md | 2 + R/export-utils.R | 8 + R/intervals-export.R | 155 ++++ R/track-export.R | 416 ++++++++++ _pkgdown.yml | 3 + man/gintervals.export_bed.Rd | 72 ++ man/gtrack.export_bed.Rd | 107 +++ man/gtrack.export_bw.Rd | 99 +++ tests/testthat/test-export-readr.R | 68 ++ tests/testthat/test-gintervals.export_bed.R | 322 ++++++++ tests/testthat/test-gtrack.export_bed.R | 440 +++++++++++ tests/testthat/test-gtrack.export_bw.R | 792 ++++++++++++++++++++ 14 files changed, 2491 insertions(+) create mode 100644 R/export-utils.R create mode 100644 R/intervals-export.R create mode 100644 R/track-export.R create mode 100644 man/gintervals.export_bed.Rd create mode 100644 man/gtrack.export_bed.Rd create mode 100644 man/gtrack.export_bw.Rd create mode 100644 tests/testthat/test-export-readr.R create mode 100644 tests/testthat/test-gintervals.export_bed.R create mode 100644 tests/testthat/test-gtrack.export_bed.R create mode 100644 tests/testthat/test-gtrack.export_bw.R diff --git a/DESCRIPTION b/DESCRIPTION index 1d7e9c3d..694401c0 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,10 +29,14 @@ Imports: Suggests: data.table, dplyr, + GenomeInfoDb, + GenomicRanges, glue, + IRanges, knitr, readr, rmarkdown, + rtracklayer, spelling, stats, stringr, diff --git a/NAMESPACE b/NAMESPACE index 947d8643..2875ced1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -38,6 +38,7 @@ export(gintervals.coverage_fraction) export(gintervals.covered_bp) export(gintervals.diff) export(gintervals.exists) +export(gintervals.export_bed) export(gintervals.force_range) export(gintervals.import_genes) export(gintervals.intersect) @@ -98,6 +99,8 @@ export(gtrack.create_dirs) export(gtrack.create_pwm_energy) export(gtrack.create_sparse) export(gtrack.exists) +export(gtrack.export_bed) +export(gtrack.export_bw) export(gtrack.import) export(gtrack.import_mappedseq) export(gtrack.import_set) diff --git a/NEWS.md b/NEWS.md index d7b2f8cd..0a978344 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,6 +22,8 @@ * Added `sshift`, `eshift` and `filter` parameters to `gvtrack.create`. * Added `gintervals.path()` and `gtrack.path()` functions that return the actual file system paths for interval sets and tracks. * Added `masked.count` and `masked.frac` virtual track functions that count and fraction masked base pairs (lowercase letters) in the current iterator interval. +* Added `gtrack.export_bw` function that exports a track to a BigWig file. Requires `rtracklayer` package. +* Added `gtrack.export_bed` and `gintervals.export_bed` functions that export a track or intervals set to a BED file. # misha 5.2.2 diff --git a/R/export-utils.R b/R/export-utils.R new file mode 100644 index 00000000..3a215d93 --- /dev/null +++ b/R/export-utils.R @@ -0,0 +1,8 @@ +# Internal helpers for export functions +.misha_readr_available <- function() { + tryCatch(requireNamespace("readr", quietly = TRUE), error = function(...) FALSE) +} + +.misha_write_with_readr <- function(...) { + readr::write_tsv(...) +} diff --git a/R/intervals-export.R b/R/intervals-export.R new file mode 100644 index 00000000..0c7f20aa --- /dev/null +++ b/R/intervals-export.R @@ -0,0 +1,155 @@ +#' Export Intervals to BED Format +#' +#' @description +#' Exports a misha intervals set to BED format. Automatically detects whether to +#' export as BED3 (basic) or BED6 (with strand) based on available columns. +#' +#' @param intervals Either a data.frame with genomic intervals (must have chrom, start, end columns) +#' or a character string naming a saved intervals set (bigset) to load and export. +#' @param file Output BED file path. Should end in .bed. +#' @param track_name Optional track name for BED header line. If provided, a track definition +#' line will be added at the start of the file. +#' @param description Optional description for BED header line. Only used if \code{track_name} +#' is also provided. +#' +#' @return Invisibly returns the file path (useful for piping). +#' +#' @details +#' This function exports genomic intervals (without track values) to BED format. The output +#' format is automatically determined: +#' \itemize{ +#' \item BED3: If only chrom, start, end are available (3 columns) +#' \item BED6: If strand column is present (6 columns: chrom, start, end, name, score, strand) +#' } +#' +#' For BED6 format, the name field is set to "." and score to 0 as placeholders, since +#' intervals sets don't have these values. +#' +#' **Coordinate System:** Both misha and BED format use 0-based half-open intervals [start, end), +#' so coordinates are written directly without conversion. +#' +#' If \code{intervals} is a character string, the function will attempt to load it as a +#' saved intervals set using \code{\link{gintervals.load}}. +#' +#' @examples +#' \dontrun{ +#' # Export intervals data.frame to BED3 +#' my_intervals <- gintervals(c(1, 1, 2), c(1000, 5000, 2000), c(2000, 6000, 3000)) +#' gintervals.export_bed(my_intervals, "output.bed") +#' +#' # Export with track header +#' gintervals.export_bed(my_intervals, "output.bed", +#' track_name = "MyRegions", +#' description = "Interesting genomic regions" +#' ) +#' +#' # Export intervals with strand information (BED6) +#' stranded_intervals <- gintervals(c(1, 1, 2), c(1000, 5000, 2000), c(2000, 6000, 3000)) +#' stranded_intervals$strand <- c(1, -1, 1) +#' gintervals.export_bed(stranded_intervals, "stranded.bed") +#' +#' # Export saved intervals set by name +#' gintervals.save("my_regions", my_intervals) +#' gintervals.export_bed("my_regions", "regions.bed") +#' } +#' +#' @seealso \code{\link{gtrack.import}} for importing BED files as tracks, +#' \code{\link{gintervals.load}} for loading saved intervals sets, +#' \code{\link{gintervals.save}} for saving intervals sets +#' +#' @export +gintervals.export_bed <- function(intervals, file, track_name = NULL, description = NULL) { + if (!is.character(file) || length(file) != 1) { + stop("'file' must be a single character string", call. = FALSE) + } + + file_dir <- dirname(file) + if (!dir.exists(file_dir)) { + stop("Directory does not exist: ", file_dir, call. = FALSE) + } + + if (is.character(intervals) && length(intervals) == 1) { + if (gintervals.exists(intervals)) { + intervals <- gintervals.load(intervals) + } else { + stop("Intervals set '", intervals, "' does not exist", call. = FALSE) + } + } + + if (!is.data.frame(intervals)) { + stop("'intervals' must be a data.frame or a saved intervals set name", call. = FALSE) + } + + if ("chrom1" %in% names(intervals) || "chrom2" %in% names(intervals)) { + stop("Cannot export 2D intervals to BED format", call. = FALSE) + } + + required_cols <- c("chrom", "start", "end") + if (!all(required_cols %in% names(intervals))) { + missing <- setdiff(required_cols, names(intervals)) + stop("Intervals must have chrom, start, and end columns. Missing: ", + paste(missing, collapse = ", "), + call. = FALSE + ) + } + + if (nrow(intervals) == 0) { + stop("Cannot export empty intervals set", call. = FALSE) + } + + # BED uses 0-based coordinates (same as misha), no conversion needed + bed_data <- data.frame( + chrom = as.character(intervals$chrom), + start = intervals$start, + end = intervals$end, + stringsAsFactors = FALSE + ) + + if ("strand" %in% names(intervals)) { + # BED6 requires name and score columns before strand + bed_data$name <- "." + bed_data$score <- 0 + bed_data$strand <- ifelse(intervals$strand == 1, "+", ifelse(intervals$strand == -1, "-", ".")) + } + + if (!is.null(track_name)) { + if (!is.character(track_name) || length(track_name) != 1) { + stop("'track_name' must be a single character string", call. = FALSE) + } + + header <- sprintf('track name="%s"', track_name) + + if (!is.null(description)) { + if (!is.character(description) || length(description) != 1) { + stop("'description' must be a single character string", call. = FALSE) + } + header <- sprintf('%s description="%s"', header, description) + } + + writeLines(header, file) + append <- TRUE + } else { + append <- FALSE + } + + if (isTRUE(getOption("misha.use_readr", TRUE)) && .misha_readr_available()) { + .misha_write_with_readr( + bed_data, + file, + append = append, + col_names = FALSE, + na = "." + ) + } else { + write.table(bed_data, + file, + sep = "\t", + quote = FALSE, + row.names = FALSE, + col.names = FALSE, + append = append + ) + } + + invisible(file) +} diff --git a/R/track-export.R b/R/track-export.R new file mode 100644 index 00000000..e612b9c7 --- /dev/null +++ b/R/track-export.R @@ -0,0 +1,416 @@ +#' Export Track to BigWig Format +#' +#' @description +#' Exports a misha track to BigWig format. Works with dense tracks, sparse tracks, +#' virtual tracks, and track expressions. +#' +#' @param track Track name or track expression to export. Can be a regular track name, +#' a virtual track (e.g., "gc", "pwm.max"), or a track expression (e.g., "track1 + track2"). +#' @param file Output BigWig file path. Should end in .bw or .bigWig. +#' @param intervals Genomic intervals to export. Default is ALLGENOME (entire genome). +#' @param iterator Iterator for computing track values. Can be: +#' \itemize{ +#' \item Numeric: bin size in base pairs (e.g., \code{iterator = 1000} for 1kb bins) +#' \item Character: track name to use as iterator intervals +#' \item data.frame: explicit intervals to use as iterator +#' \item NULL (default): use track's native resolution +#' } +#' @param chrom_sizes Named numeric vector of chromosome sizes. If NULL (default), +#' chromosome sizes are automatically extracted from ALLGENOME. +#' @param na_value How to handle NaN/NA values (BigWig format doesn't support them): +#' \itemize{ +#' \item NULL (default): Remove intervals with NaN/NA values +#' \item numeric: Replace NaN/NA values with this value (e.g., \code{na_value = 0}) +#' } +#' @param fixed_summaries Logical. If TRUE, compute summaries at fixed Ensembl-style zoom +#' levels (30X, 65X, 130X, 260X, 450X, 648X, 950X, 1296X, 4800X, 19200X). If FALSE (default), +#' zoom levels are dynamically determined based on data characteristics. Fixed summaries +#' may be preferable for consistent visualization across different genome browsers. +#' +#' @return Invisibly returns the file path (useful for piping). +#' +#' @details +#' BigWig format does not support missing values (NaN/NA). By default (\code{na_value=NULL}), +#' intervals with NaN values are removed from the export. Alternatively, set \code{na_value} +#' to a numeric value (e.g., 0) to replace NaN with that value instead of removing the intervals. +#' +#' This function requires the \pkg{rtracklayer} package from Bioconductor. Install it with: +#' \code{BiocManager::install("rtracklayer")} +#' +#' The function uses \code{\link{gextract}} internally, which provides a unified interface +#' for extracting data from all track types (dense, sparse, virtual, expressions). The +#' \code{iterator} parameter is particularly useful for: +#' \itemize{ +#' \item Reducing output file size by binning high-resolution tracks +#' \item Computing track expressions in fixed-size windows +#' \item Exporting virtual tracks (which require an iterator to be evaluated) +#' } +#' +#' @examples +#' \dontrun{ +#' # Export entire track at native resolution +#' gtrack.export_bw("my.track", "output.bw") +#' +#' # Export track binned at 1kb resolution +#' gtrack.export_bw("my.track", "output_1kb.bw", iterator = 1000) +#' +#' # Export virtual track (GC content in 500bp windows) +#' gtrack.export_bw("gc", "gc_content.bw", iterator = 500) +#' +#' # Export track expression +#' gtrack.export_bw("track1 + track2", "combined.bw", iterator = 1000) +#' +#' # Export specific chromosomes +#' gtrack.export_bw("my.track", "chr1.bw", intervals = gintervals(1)) +#' +#' # Replace NaN values with 0 instead of removing them +#' gtrack.export_bw("coverage.track", "coverage.bw", na_value = 0, iterator = 100) +#' +#' # Use fixed Ensembl-style zoom levels for consistent browser visualization +#' gtrack.export_bw("my.track", "output.bw", fixed_summaries = TRUE) +#' } +#' +#' @seealso \code{\link{gtrack.import}} for importing tracks including BigWig files, +#' \code{\link{gextract}} for extracting track data +#' +#' @export +gtrack.export_bw <- function(track, file, intervals = get("ALLGENOME", envir = .misha), iterator = NULL, + chrom_sizes = NULL, na_value = NULL, fixed_summaries = FALSE) { + required_pkgs <- c("rtracklayer", "GenomicRanges", "IRanges", "GenomeInfoDb") + missing_pkgs <- required_pkgs[!sapply(required_pkgs, requireNamespace, quietly = TRUE)] + + if (length(missing_pkgs) > 0) { + stop("The following Bioconductor packages are required for BigWig export:\n", + paste(missing_pkgs, collapse = ", "), "\n", + "Install them with: BiocManager::install(c('", + paste(missing_pkgs, collapse = "', '"), "'))", + call. = FALSE + ) + } + + if (!is.null(na_value) && (!is.numeric(na_value) || length(na_value) != 1)) { + stop("Parameter 'na_value' must be NULL or a single numeric value", call. = FALSE) + } + + if (!is.logical(fixed_summaries) || length(fixed_summaries) != 1) { + stop("Parameter 'fixed_summaries' must be TRUE or FALSE", call. = FALSE) + } + + file_dir <- dirname(file) + if (!dir.exists(file_dir)) { + stop("Directory does not exist: ", file_dir, call. = FALSE) + } + + if (!is.null(chrom_sizes)) { + if (!is.numeric(chrom_sizes) || is.null(names(chrom_sizes))) { + stop("Parameter 'chrom_sizes' must be a named numeric vector", call. = FALSE) + } + } else { + allgenome <- get("ALLGENOME", envir = .misha) + chrom_sizes <- setNames(allgenome$end, allgenome$chrom) + } + + # Extract track data using gextract - handles all track types (dense, sparse, virtual, expressions) and iterator + data <- tryCatch( + { + gextract(track, intervals = intervals, iterator = iterator) + }, + error = function(e) { + stop("Failed to extract track data: ", e$message, call. = FALSE) + } + ) + + if (is.null(data) || nrow(data) == 0) { + stop("No data extracted from track. Check track name and intervals.", call. = FALSE) + } + + if ("chrom1" %in% names(data) || "chrom2" %in% names(data)) { + stop("Cannot export 2D tracks to BigWig format (use gextract with 1D projection)", call. = FALSE) + } + + if (!all(c("chrom", "start", "end") %in% names(data))) { + stop("Extracted data is missing required columns (chrom, start, end)", call. = FALSE) + } + + standard_cols <- c("chrom", "start", "end", "intervalID") + value_cols <- setdiff(names(data), standard_cols) + + if (length(value_cols) == 0) { + stop("No value column found in extracted data", call. = FALSE) + } + + value_col <- value_cols[1] + + values <- data[[value_col]] + na_mask <- is.na(values) + n_na <- sum(na_mask) + + if (n_na > 0) { + if (is.null(na_value)) { + pct_na <- 100 * n_na / length(values) + if (pct_na > 10) { + warning(sprintf( + "Removing %d intervals with NaN values (%.1f%% of data). Consider setting na_value=0 to replace NaN instead.", + n_na, pct_na + ), call. = FALSE) + } + data <- data[!na_mask, ] + } else { + data[[value_col]][na_mask] <- na_value + } + values <- data[[value_col]] + } + + if (nrow(data) == 0) { + if (n_na > 0 && is.null(na_value)) { + stop("All values are NaN. Set na_value= to replace NaN or check your track/expression.", call. = FALSE) + } else { + stop("No valid (non-NaN) values to export. Consider setting na_value=0 to replace NaN instead of removing.", call. = FALSE) + } + } + + # Coordinate conversion: misha uses 0-based half-open [start, end), GRanges uses 1-based closed [start, end] + # Conversion: GRanges_start = misha_start + 1, GRanges_end = misha_end + granges <- tryCatch( + { + data_chroms <- unique(as.character(data$chrom)) + + match_idx <- match(data_chroms, names(chrom_sizes)) + seq_lengths <- ifelse(!is.na(match_idx), chrom_sizes[match_idx], NA_real_) + names(seq_lengths) <- data_chroms + + if (any(is.na(seq_lengths))) { + max_ends <- aggregate(data$end, by = list(chrom = as.character(data$chrom)), FUN = max) + max_ends_vec <- setNames(max_ends$x, max_ends$chrom) + seq_lengths[is.na(seq_lengths)] <- max_ends_vec[names(seq_lengths)[is.na(seq_lengths)]] + } + + seqinfo <- GenomeInfoDb::Seqinfo( + seqnames = data_chroms, + seqlengths = as.integer(seq_lengths) + ) + + gr <- GenomicRanges::GRanges( + seqnames = as.character(data$chrom), + ranges = IRanges::IRanges( + start = data$start + 1, # Convert from 0-based to 1-based + end = data$end + ), + score = values, + seqinfo = seqinfo + ) + + gr + }, + error = function(e) { + stop("Failed to create GRanges object: ", e$message, call. = FALSE) + } + ) + + tryCatch( + { + rtracklayer::export.bw(granges, file, fixedSummaries = fixed_summaries) + }, + error = function(e) { + stop("Failed to write BigWig file: ", e$message, call. = FALSE) + } + ) + + if (!file.exists(file)) { + stop("BigWig file was not created: ", file, call. = FALSE) + } + + invisible(file) +} + +#' Export Track to BED Format +#' +#' @description +#' Exports a misha track to BED format. Works with dense tracks, sparse tracks, +#' virtual tracks, and track expressions. Output format (BED3/BED5/BED6) is +#' automatically detected based on available data. +#' +#' @param track Track name or track expression to export. Can be a regular track name, +#' a virtual track (e.g., "gc", "kmer.count"), or a track expression (e.g., "track1 + track2"). +#' @param file Output BED file path. Should end in .bed. +#' @param intervals Genomic intervals to export. Default is ALLGENOME (entire genome). +#' @param iterator Iterator for computing track values. Can be: +#' \itemize{ +#' \item Numeric: bin size in base pairs (e.g., \code{iterator = 1000} for 1kb bins) +#' \item Character: track name to use as iterator intervals +#' \item data.frame: explicit intervals to use as iterator +#' \item NULL (default): use track's native resolution +#' } +#' @param na_value How to handle NaN/NA values in the score column: +#' \itemize{ +#' \item NULL (default): Write "." for NaN/NA values (standard BED convention) +#' \item numeric: Replace NaN/NA values with this value (e.g., \code{na_value = 0}) +#' } +#' @param track_name Optional track name for BED header line. If provided, a track definition +#' line will be added at the start of the file. +#' @param description Optional description for BED header line. Only used if \code{track_name} +#' is also provided. +#' +#' @return Invisibly returns the file path (useful for piping). +#' +#' @details +#' The output format is automatically determined based on the extracted data: +#' \itemize{ +#' \item BED3: Only chrom, start, end (no track values extracted) +#' \item BED5: Track values present → chrom, start, end, name, score +#' \item BED6: Track values and strand present → chrom, start, end, name, score, strand +#' } +#' +#' For BED5/BED6 formats, the name field is auto-generated as "interval_N". +#' +#' **Coordinate System:** Both misha and BED format use 0-based half-open intervals [start, end), +#' so coordinates are written directly without conversion. This is different from BigWig export +#' which requires start+1 conversion. +#' +#' The function uses \code{\link{gextract}} internally, which provides a unified interface +#' for extracting data from all track types. The \code{iterator} parameter is particularly +#' useful for: +#' \itemize{ +#' \item Binning high-resolution tracks into fixed-size windows +#' \item Computing track expressions in specific genomic regions +#' \item Evaluating virtual tracks (which require an iterator) +#' } +#' +#' @examples +#' \dontrun{ +#' # Export sparse track at native resolution +#' gtrack.export_bed("my.track", "output.bed") +#' +#' # Export track binned at 1kb resolution +#' gtrack.export_bed("my.track", "output_1kb.bed", iterator = 1000) +#' +#' # Export virtual track (GC content in 500bp windows) +#' gtrack.export_bed("gc", "gc_content.bed", iterator = 500) +#' +#' # Export track expression +#' gtrack.export_bed("track1 + track2", "combined.bed", iterator = 1000) +#' +#' # Export specific chromosomes with track header +#' gtrack.export_bed("my.track", "chr1.bed", +#' intervals = gintervals(1), +#' track_name = "MyTrack", +#' description = "Chromosome 1 data" +#' ) +#' +#' # Replace NaN values with 0 +#' gtrack.export_bed("coverage.track", "coverage.bed", na_value = 0, iterator = 100) +#' } +#' +#' @seealso \code{\link{gtrack.export_bw}} for BigWig export, +#' \code{\link{gtrack.import}} for importing tracks including BED files, +#' \code{\link{gextract}} for extracting track data, +#' \code{\link{gintervals.export_bed}} for exporting intervals without track values +#' +#' @export +gtrack.export_bed <- function(track, file, intervals = get("ALLGENOME", envir = .misha), + iterator = NULL, na_value = NULL, + track_name = NULL, description = NULL) { + if (!is.character(file) || length(file) != 1) { + stop("'file' must be a single character string", call. = FALSE) + } + + file_dir <- dirname(file) + if (!dir.exists(file_dir)) { + stop("Directory does not exist: ", file_dir, call. = FALSE) + } + + if (!is.null(na_value)) { + if (!is.numeric(na_value) || length(na_value) != 1 || is.na(na_value)) { + stop("'na_value' must be a single numeric value", call. = FALSE) + } + } + + if (!is.null(track_name) && (!is.character(track_name) || length(track_name) != 1)) { + stop("'track_name' must be a single character string", call. = FALSE) + } + + if (!is.null(description) && (!is.character(description) || length(description) != 1)) { + stop("'description' must be a single character string", call. = FALSE) + } + + data <- tryCatch( + { + gextract(track, intervals = intervals, iterator = iterator) + }, + error = function(e) { + stop("Failed to extract track data: ", e$message, call. = FALSE) + } + ) + + if (is.null(data) || nrow(data) == 0) { + stop("No data extracted from track", call. = FALSE) + } + + if ("chrom1" %in% names(data) || "chrom2" %in% names(data)) { + stop("Cannot export 2D tracks to BED format", call. = FALSE) + } + + standard_cols <- c("chrom", "start", "end", "intervalID") + value_cols <- setdiff(names(data), standard_cols) + + # BED uses 0-based coordinates (same as misha), no conversion needed + bed_data <- data.frame( + chrom = as.character(data$chrom), + start = data$start, + end = data$end, + stringsAsFactors = FALSE + ) + + if (length(value_cols) > 0) { + values <- data[[value_cols[1]]] + + if (!is.null(na_value)) { + values[is.na(values)] <- na_value + } + + bed_data$name <- paste0("interval_", seq_len(nrow(data))) + bed_data$score <- values + + if ("strand" %in% names(data)) { + bed_data$strand <- ifelse(data$strand == 1, "+", + ifelse(data$strand == -1, "-", ".") + ) + } + } + + if (!is.null(track_name)) { + header <- sprintf('track name="%s"', track_name) + + if (!is.null(description)) { + header <- sprintf('%s description="%s"', header, description) + } + + writeLines(header, file) + append <- TRUE + } else { + append <- FALSE + } + + if (isTRUE(getOption("misha.use_readr", TRUE)) && .misha_readr_available()) { + .misha_write_with_readr( + bed_data, + file, + append = append, + col_names = FALSE, + na = "." + ) + } else { + # Use na = "." to write NaN/NA as "." (BED standard) + write.table(bed_data, + file, + sep = "\t", + quote = FALSE, + row.names = FALSE, + col.names = FALSE, + append = append, + na = "." + ) + } + + invisible(file) +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 831cebb7..7b7fce64 100755 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -20,6 +20,8 @@ reference: - gtrack.create_dense - gtrack.create_dirs - gtrack.exists + - gtrack.export_bed + - gtrack.export_bw - gtrack.import - gtrack.import_mappedseq - gtrack.import_set @@ -83,6 +85,7 @@ reference: - gintervals.random - gintervals.rbind - gintervals.rm + - gintervals.export_bed - gintervals.save - gintervals.summary - gintervals.union diff --git a/man/gintervals.export_bed.Rd b/man/gintervals.export_bed.Rd new file mode 100644 index 00000000..f04ab65d --- /dev/null +++ b/man/gintervals.export_bed.Rd @@ -0,0 +1,72 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/intervals-export.R +\name{gintervals.export_bed} +\alias{gintervals.export_bed} +\title{Export Intervals to BED Format} +\usage{ +gintervals.export_bed(intervals, file, track_name = NULL, description = NULL) +} +\arguments{ +\item{intervals}{Either a data.frame with genomic intervals (must have chrom, start, end columns) +or a character string naming a saved intervals set (bigset) to load and export.} + +\item{file}{Output BED file path. Should end in .bed.} + +\item{track_name}{Optional track name for BED header line. If provided, a track definition +line will be added at the start of the file.} + +\item{description}{Optional description for BED header line. Only used if \code{track_name} +is also provided.} +} +\value{ +Invisibly returns the file path (useful for piping). +} +\description{ +Exports a misha intervals set to BED format. Automatically detects whether to +export as BED3 (basic) or BED6 (with strand) based on available columns. +} +\details{ +This function exports genomic intervals (without track values) to BED format. The output +format is automatically determined: +\itemize{ + \item BED3: If only chrom, start, end are available (3 columns) + \item BED6: If strand column is present (6 columns: chrom, start, end, name, score, strand) +} + +For BED6 format, the name field is set to "." and score to 0 as placeholders, since +intervals sets don't have these values. + +**Coordinate System:** Both misha and BED format use 0-based half-open intervals [start, end), +so coordinates are written directly without conversion. + +If \code{intervals} is a character string, the function will attempt to load it as a +saved intervals set using \code{\link{gintervals.load}}. +} +\examples{ +\dontrun{ +# Export intervals data.frame to BED3 +my_intervals <- gintervals(c(1, 1, 2), c(1000, 5000, 2000), c(2000, 6000, 3000)) +gintervals.export_bed(my_intervals, "output.bed") + +# Export with track header +gintervals.export_bed(my_intervals, "output.bed", + track_name = "MyRegions", + description = "Interesting genomic regions" +) + +# Export intervals with strand information (BED6) +stranded_intervals <- gintervals(c(1, 1, 2), c(1000, 5000, 2000), c(2000, 6000, 3000)) +stranded_intervals$strand <- c(1, -1, 1) +gintervals.export_bed(stranded_intervals, "stranded.bed") + +# Export saved intervals set by name +gintervals.save("my_regions", my_intervals) +gintervals.export_bed("my_regions", "regions.bed") +} + +} +\seealso{ +\code{\link{gtrack.import}} for importing BED files as tracks, + \code{\link{gintervals.load}} for loading saved intervals sets, + \code{\link{gintervals.save}} for saving intervals sets +} diff --git a/man/gtrack.export_bed.Rd b/man/gtrack.export_bed.Rd new file mode 100644 index 00000000..0274babd --- /dev/null +++ b/man/gtrack.export_bed.Rd @@ -0,0 +1,107 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/track-export.R +\name{gtrack.export_bed} +\alias{gtrack.export_bed} +\title{Export Track to BED Format} +\usage{ +gtrack.export_bed( + track, + file, + intervals = get("ALLGENOME", envir = .misha), + iterator = NULL, + na_value = NULL, + track_name = NULL, + description = NULL +) +} +\arguments{ +\item{track}{Track name or track expression to export. Can be a regular track name, +a virtual track (e.g., "gc", "kmer.count"), or a track expression (e.g., "track1 + track2").} + +\item{file}{Output BED file path. Should end in .bed.} + +\item{intervals}{Genomic intervals to export. Default is ALLGENOME (entire genome).} + +\item{iterator}{Iterator for computing track values. Can be: +\itemize{ + \item Numeric: bin size in base pairs (e.g., \code{iterator = 1000} for 1kb bins) + \item Character: track name to use as iterator intervals + \item data.frame: explicit intervals to use as iterator + \item NULL (default): use track's native resolution +}} + +\item{na_value}{How to handle NaN/NA values in the score column: +\itemize{ + \item NULL (default): Write "." for NaN/NA values (standard BED convention) + \item numeric: Replace NaN/NA values with this value (e.g., \code{na_value = 0}) +}} + +\item{track_name}{Optional track name for BED header line. If provided, a track definition +line will be added at the start of the file.} + +\item{description}{Optional description for BED header line. Only used if \code{track_name} +is also provided.} +} +\value{ +Invisibly returns the file path (useful for piping). +} +\description{ +Exports a misha track to BED format. Works with dense tracks, sparse tracks, +virtual tracks, and track expressions. Output format (BED3/BED5/BED6) is +automatically detected based on available data. +} +\details{ +The output format is automatically determined based on the extracted data: +\itemize{ + \item BED3: Only chrom, start, end (no track values extracted) + \item BED5: Track values present → chrom, start, end, name, score + \item BED6: Track values and strand present → chrom, start, end, name, score, strand +} + +For BED5/BED6 formats, the name field is auto-generated as "interval_N". + +**Coordinate System:** Both misha and BED format use 0-based half-open intervals [start, end), +so coordinates are written directly without conversion. This is different from BigWig export +which requires start+1 conversion. + +The function uses \code{\link{gextract}} internally, which provides a unified interface +for extracting data from all track types. The \code{iterator} parameter is particularly +useful for: +\itemize{ + \item Binning high-resolution tracks into fixed-size windows + \item Computing track expressions in specific genomic regions + \item Evaluating virtual tracks (which require an iterator) +} +} +\examples{ +\dontrun{ +# Export sparse track at native resolution +gtrack.export_bed("my.track", "output.bed") + +# Export track binned at 1kb resolution +gtrack.export_bed("my.track", "output_1kb.bed", iterator = 1000) + +# Export virtual track (GC content in 500bp windows) +gtrack.export_bed("gc", "gc_content.bed", iterator = 500) + +# Export track expression +gtrack.export_bed("track1 + track2", "combined.bed", iterator = 1000) + +# Export specific chromosomes with track header +gtrack.export_bed("my.track", "chr1.bed", + intervals = gintervals(1), + track_name = "MyTrack", + description = "Chromosome 1 data" +) + +# Replace NaN values with 0 +gtrack.export_bed("coverage.track", "coverage.bed", na_value = 0, iterator = 100) +} + +} +\seealso{ +\code{\link{gtrack.export_bw}} for BigWig export, + \code{\link{gtrack.import}} for importing tracks including BED files, + \code{\link{gextract}} for extracting track data, + \code{\link{gintervals.export_bed}} for exporting intervals without track values +} diff --git a/man/gtrack.export_bw.Rd b/man/gtrack.export_bw.Rd new file mode 100644 index 00000000..daf2d0b3 --- /dev/null +++ b/man/gtrack.export_bw.Rd @@ -0,0 +1,99 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/track-export.R +\name{gtrack.export_bw} +\alias{gtrack.export_bw} +\title{Export Track to BigWig Format} +\usage{ +gtrack.export_bw( + track, + file, + intervals = get("ALLGENOME", envir = .misha), + iterator = NULL, + chrom_sizes = NULL, + na_value = NULL, + fixed_summaries = FALSE +) +} +\arguments{ +\item{track}{Track name or track expression to export. Can be a regular track name, +a virtual track (e.g., "gc", "pwm.max"), or a track expression (e.g., "track1 + track2").} + +\item{file}{Output BigWig file path. Should end in .bw or .bigWig.} + +\item{intervals}{Genomic intervals to export. Default is ALLGENOME (entire genome).} + +\item{iterator}{Iterator for computing track values. Can be: +\itemize{ + \item Numeric: bin size in base pairs (e.g., \code{iterator = 1000} for 1kb bins) + \item Character: track name to use as iterator intervals + \item data.frame: explicit intervals to use as iterator + \item NULL (default): use track's native resolution +}} + +\item{chrom_sizes}{Named numeric vector of chromosome sizes. If NULL (default), +chromosome sizes are automatically extracted from ALLGENOME.} + +\item{na_value}{How to handle NaN/NA values (BigWig format doesn't support them): +\itemize{ + \item NULL (default): Remove intervals with NaN/NA values + \item numeric: Replace NaN/NA values with this value (e.g., \code{na_value = 0}) +}} + +\item{fixed_summaries}{Logical. If TRUE, compute summaries at fixed Ensembl-style zoom +levels (30X, 65X, 130X, 260X, 450X, 648X, 950X, 1296X, 4800X, 19200X). If FALSE (default), +zoom levels are dynamically determined based on data characteristics. Fixed summaries +may be preferable for consistent visualization across different genome browsers.} +} +\value{ +Invisibly returns the file path (useful for piping). +} +\description{ +Exports a misha track to BigWig format. Works with dense tracks, sparse tracks, +virtual tracks, and track expressions. +} +\details{ +BigWig format does not support missing values (NaN/NA). By default (\code{na_value=NULL}), +intervals with NaN values are removed from the export. Alternatively, set \code{na_value} +to a numeric value (e.g., 0) to replace NaN with that value instead of removing the intervals. + +This function requires the \pkg{rtracklayer} package from Bioconductor. Install it with: +\code{BiocManager::install("rtracklayer")} + +The function uses \code{\link{gextract}} internally, which provides a unified interface +for extracting data from all track types (dense, sparse, virtual, expressions). The +\code{iterator} parameter is particularly useful for: +\itemize{ + \item Reducing output file size by binning high-resolution tracks + \item Computing track expressions in fixed-size windows + \item Exporting virtual tracks (which require an iterator to be evaluated) +} +} +\examples{ +\dontrun{ +# Export entire track at native resolution +gtrack.export_bw("my.track", "output.bw") + +# Export track binned at 1kb resolution +gtrack.export_bw("my.track", "output_1kb.bw", iterator = 1000) + +# Export virtual track (GC content in 500bp windows) +gtrack.export_bw("gc", "gc_content.bw", iterator = 500) + +# Export track expression +gtrack.export_bw("track1 + track2", "combined.bw", iterator = 1000) + +# Export specific chromosomes +gtrack.export_bw("my.track", "chr1.bw", intervals = gintervals(1)) + +# Replace NaN values with 0 instead of removing them +gtrack.export_bw("coverage.track", "coverage.bw", na_value = 0, iterator = 100) + +# Use fixed Ensembl-style zoom levels for consistent browser visualization +gtrack.export_bw("my.track", "output.bw", fixed_summaries = TRUE) +} + +} +\seealso{ +\code{\link{gtrack.import}} for importing tracks including BigWig files, + \code{\link{gextract}} for extracting track data +} diff --git a/tests/testthat/test-export-readr.R b/tests/testthat/test-export-readr.R new file mode 100644 index 00000000..a318aa31 --- /dev/null +++ b/tests/testthat/test-export-readr.R @@ -0,0 +1,68 @@ +test_that("gintervals.export_bed uses readr when available and option enabled", { + tmp <- withr::local_tempfile(fileext = ".bed") + intervals <- data.frame(chrom = "chr1", start = 0L, end = 10L) + used_readr <- FALSE + + withr::local_options(misha.use_readr = TRUE) + with_mocked_bindings( + .misha_readr_available = function() TRUE, + .misha_write_with_readr = function(bed_data, file, append, col_names, na) { + used_readr <<- TRUE + lines <- apply(bed_data, 1, paste, collapse = "\t") + con <- file(file, if (isTRUE(append)) "a" else "w") + on.exit(close(con), add = TRUE) + writeLines(lines, con, sep = "\n", useBytes = TRUE) + }, + .env = asNamespace("misha"), + gintervals.export_bed(intervals, tmp) + ) + + expect_true(used_readr) + expect_identical(readLines(tmp), "chr1\t0\t10") +}) + +test_that("gintervals.export_bed falls back to write.table when readr is unavailable", { + tmp <- withr::local_tempfile(fileext = ".bed") + intervals <- data.frame(chrom = "chr1", start = 0L, end = 10L) + withr::local_options(misha.use_readr = FALSE) + + gintervals.export_bed(intervals, tmp) + + expect_identical(readLines(tmp), "chr1\t0\t10") +}) + +test_that("gtrack.export_bed uses readr path and NA handling when available", { + tmp <- withr::local_tempfile(fileext = ".bed") + intervals <- gintervals(c(1, 1), c(0, 10), c(10, 20)) + values <- c(NA_real_, 5) + track <- paste0("track_readr_", sample(1:1e9, 1)) + gtrack.create_sparse(track, "tmp", intervals, values) + withr::defer(gtrack.rm(track, force = TRUE)) + + used_readr <- FALSE + na_arg <- NULL + + withr::local_options(misha.use_readr = TRUE) + with_mocked_bindings( + .misha_readr_available = function() TRUE, + .misha_write_with_readr = function(bed_data, file, append, col_names, na) { + used_readr <<- TRUE + na_arg <<- na + bed_out <- bed_data + bed_out[is.na(bed_out)] <- na + lines <- vapply(seq_len(nrow(bed_out)), function(i) paste(bed_out[i, ], collapse = "\t"), character(1)) + con <- file(file, if (isTRUE(append)) "a" else "w") + on.exit(close(con), add = TRUE) + writeLines(lines, con, sep = "\n", useBytes = TRUE) + }, + .env = asNamespace("misha"), + gtrack.export_bed(track, tmp, intervals = intervals) + ) + + expect_true(used_readr) + expect_identical(na_arg, ".") + bed_lines <- readLines(tmp) + expect_length(bed_lines, 2) + expect_match(bed_lines[1], "^(chr)?1\\t0\\t10\\tinterval_1\\t\\.$") + expect_match(bed_lines[2], "^(chr)?1\\t10\\t20\\tinterval_2\\t5$") +}) diff --git a/tests/testthat/test-gintervals.export_bed.R b/tests/testthat/test-gintervals.export_bed.R new file mode 100644 index 00000000..5dbb17c2 --- /dev/null +++ b/tests/testthat/test-gintervals.export_bed.R @@ -0,0 +1,322 @@ +load_test_db() + +test_that("gintervals.export_bed exports basic BED3 format", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create simple intervals + test_intervals <- gintervals(c(1, 1, 2), c(1000, 5000, 2000), c(2000, 6000, 3000)) + + # Export + result <- gintervals.export_bed(test_intervals, tmpfile) + + # Should return file path invisibly + expect_equal(result, tmpfile) + expect_true(file.exists(tmpfile)) + + # Read and verify content + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE, stringsAsFactors = FALSE) + expect_equal(nrow(bed_content), 3) + expect_equal(ncol(bed_content), 3) # BED3 + + # Verify coordinates (no conversion - 0-based preserved) + expect_equal(bed_content[, 1], as.character(test_intervals$chrom)) + expect_equal(bed_content[, 2], test_intervals$start) + expect_equal(bed_content[, 3], test_intervals$end) +}) + +test_that("gintervals.export_bed exports BED6 format with strand", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create intervals with strand + test_intervals <- gintervals(c(1, 1, 2), c(1000, 5000, 2000), c(2000, 6000, 3000)) + test_intervals$strand <- c(1, -1, 1) + + # Export + gintervals.export_bed(test_intervals, tmpfile) + + # Read and verify content + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE, stringsAsFactors = FALSE) + expect_equal(nrow(bed_content), 3) + expect_equal(ncol(bed_content), 6) # BED6 + + # Verify strand conversion (1 -> "+", -1 -> "-") + expect_equal(bed_content[, 6], c("+", "-", "+")) + + # Verify name and score placeholders + expect_true(all(bed_content[, 4] == ".")) + expect_true(all(bed_content[, 5] == 0)) +}) + +test_that("gintervals.export_bed adds track header when requested", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + test_intervals <- gintervals(1, c(1000, 2000), c(2000, 3000)) + + # Export with track name only + gintervals.export_bed(test_intervals, tmpfile, track_name = "MyRegions") + + lines <- readLines(tmpfile) + expect_match(lines[1], 'track name="MyRegions"') + expect_equal(length(lines), 3) # header + 2 data lines + + # Export with both track name and description + tmpfile2 <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile2)) + + gintervals.export_bed(test_intervals, tmpfile2, + track_name = "MyRegions", + description = "Test regions" + ) + + lines2 <- readLines(tmpfile2) + expect_match(lines2[1], 'track name="MyRegions" description="Test regions"') +}) + +test_that("gintervals.export_bed works with saved intervals set", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create and save intervals set + test_intervals <- gintervals(c(1, 2), c(1000, 2000), c(2000, 3000)) + test_set_name <- paste0("test.export_bed_", sample(1:1e9, 1)) + + gintervals.save(test_set_name, test_intervals) + withr::defer(gintervals.rm(test_set_name, force = TRUE)) + + # Export by name + gintervals.export_bed(test_set_name, tmpfile) + + # Verify export + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE, stringsAsFactors = FALSE) + expect_equal(nrow(bed_content), 2) + expect_equal(bed_content[, 2], test_intervals$start) + expect_equal(bed_content[, 3], test_intervals$end) +}) + +test_that("gintervals.export_bed validates intervals parameter", { + tmpfile <- tempfile(fileext = ".bed") + + # Missing required columns + bad_intervals <- data.frame(chrom = 1, start = 100) + expect_error( + gintervals.export_bed(bad_intervals, tmpfile), + "must have chrom, start, and end" + ) + + # Not a data.frame or valid name + expect_error( + gintervals.export_bed(list(chrom = 1, start = 100, end = 200), tmpfile), + "must be a data.frame" + ) + + # Non-existent intervals set + expect_error( + gintervals.export_bed("nonexistent_set_12345", tmpfile), + "does not exist" + ) +}) + +test_that("gintervals.export_bed validates file parameter", { + test_intervals <- gintervals(1, 1000, 2000) + + # Non-character file + expect_error( + gintervals.export_bed(test_intervals, 123), + "must be a single character string" + ) + + # Multiple file paths + expect_error( + gintervals.export_bed(test_intervals, c("file1.bed", "file2.bed")), + "must be a single character string" + ) + + # Non-existent directory + expect_error( + gintervals.export_bed(test_intervals, "/nonexistent_dir_12345/file.bed"), + "Directory does not exist" + ) +}) + +test_that("gintervals.export_bed validates track_name and description", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + test_intervals <- gintervals(1, 1000, 2000) + + # Invalid track_name type + expect_error( + gintervals.export_bed(test_intervals, tmpfile, track_name = 123), + "must be a single character string" + ) + + # Invalid description type + expect_error( + gintervals.export_bed(test_intervals, tmpfile, track_name = "test", description = 123), + "must be a single character string" + ) +}) + +test_that("gintervals.export_bed rejects empty intervals", { + tmpfile <- tempfile(fileext = ".bed") + + # gintervals with empty vectors returns NULL, which triggers data.frame check + empty_intervals <- gintervals(integer(0), integer(0), integer(0)) + + expect_error( + gintervals.export_bed(empty_intervals, tmpfile), + "must be a data.frame" + ) + + # Also test with an actual empty data.frame + empty_df <- data.frame(chrom = integer(0), start = integer(0), end = integer(0)) + expect_error( + gintervals.export_bed(empty_df, tmpfile), + "Cannot export empty intervals" + ) +}) + +test_that("gintervals.export_bed rejects 2D intervals", { + tmpfile <- tempfile(fileext = ".bed") + + # Create 2D intervals + intervals_2d <- gintervals.2d(1, 1000, 2000, 1, 3000, 4000) + + expect_error( + gintervals.export_bed(intervals_2d, tmpfile), + "Cannot export 2D intervals" + ) +}) + +test_that("gintervals.export_bed preserves coordinate system", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create intervals with specific coordinates to verify no conversion + test_intervals <- gintervals( + c(1, 1, 2), + c(0, 100, 1000), # Including 0 to test edge case + c(50, 200, 2000) + ) + + gintervals.export_bed(test_intervals, tmpfile) + + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE) + + # Coordinates should be exactly as input (no +1 conversion like BigWig) + expect_equal(bed_content[, 2], c(0, 100, 1000)) + expect_equal(bed_content[, 3], c(50, 200, 2000)) +}) + +test_that("gintervals.export_bed handles multiple chromosomes", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create intervals across multiple chromosomes + test_intervals <- gintervals( + c(1, 1, 2, 2, 3), + c(1000, 5000, 2000, 6000, 1000), + c(2000, 6000, 3000, 7000, 2000) + ) + + gintervals.export_bed(test_intervals, tmpfile) + + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE, stringsAsFactors = FALSE) + expect_equal(nrow(bed_content), 5) + + # Verify chromosomes are preserved + expect_equal(bed_content[, 1], c("chr1", "chr1", "chr2", "chr2", "chr3")) +}) + +test_that("gintervals.export_bed handles file overwrite", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + test_intervals1 <- gintervals(1, 1000, 2000) + test_intervals2 <- gintervals(c(1, 2), c(3000, 4000), c(4000, 5000)) + + # Write first file + gintervals.export_bed(test_intervals1, tmpfile) + content1 <- readLines(tmpfile) + expect_equal(length(content1), 1) + + # Overwrite with second file + gintervals.export_bed(test_intervals2, tmpfile) + content2 <- readLines(tmpfile) + expect_equal(length(content2), 2) +}) + +test_that("gintervals.export_bed round-trip preserves data", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create test intervals + original_intervals <- gintervals( + c(1, 1, 2), + c(1000, 5000, 2000), + c(2000, 6000, 3000) + ) + + # Export to BED + gintervals.export_bed(original_intervals, tmpfile) + + # Import back using gtrack.import + tmptrack <- paste0("test.reimport_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + gtrack.import(tmptrack, "", tmpfile, binsize = 0) + + # Extract reimported data + reimported <- gextract(tmptrack, gintervals.all()) + + # Compare coordinates (should match exactly - no conversion) + expect_equal(nrow(reimported), nrow(original_intervals)) + expect_equal(reimported$start, original_intervals$start) + expect_equal(reimported$end, original_intervals$end) + expect_equal(as.character(reimported$chrom), as.character(original_intervals$chrom)) +}) + +test_that("gintervals.export_bed handles large intervals sets", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create large intervals set (1000 intervals) + n <- 1000 + large_intervals <- gintervals( + rep(1, n), + seq(0, (n - 1) * 1000, 1000), + seq(500, n * 1000 - 500, 1000) + ) + + # Should complete without error + expect_no_error(gintervals.export_bed(large_intervals, tmpfile)) + + # Verify file size + expect_true(file.exists(tmpfile)) + expect_gt(file.size(tmpfile), 0) + + # Verify row count + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE) + expect_equal(nrow(bed_content), n) +}) + +test_that("gintervals.export_bed handles special chromosome names", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create intervals with various chromosome representations + test_intervals <- data.frame( + chrom = c("chr1", "chr2", "chrX"), + start = c(1000, 2000, 3000), + end = c(2000, 3000, 4000), + stringsAsFactors = FALSE + ) + + gintervals.export_bed(test_intervals, tmpfile) + + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE, stringsAsFactors = FALSE) + expect_equal(bed_content[, 1], c("chr1", "chr2", "chrX")) +}) diff --git a/tests/testthat/test-gtrack.export_bed.R b/tests/testthat/test-gtrack.export_bed.R new file mode 100644 index 00000000..9fb57bac --- /dev/null +++ b/tests/testthat/test-gtrack.export_bed.R @@ -0,0 +1,440 @@ +load_test_db() + +test_that("gtrack.export_bed exports sparse track to BED5 format", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create sparse track + test_intervals <- gintervals(c(1, 1, 2), c(1000, 5000, 2000), c(2000, 6000, 3000)) + test_values <- c(10.5, 20.3, 15.7) + + tmptrack <- paste0("test.bed_export_", sample(1:1e9, 1)) + gtrack.create_sparse(tmptrack, "Test sparse track", test_intervals, test_values) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + # Export + result <- gtrack.export_bed(tmptrack, tmpfile) + + # Should return file path invisibly + expect_equal(result, tmpfile) + expect_true(file.exists(tmpfile)) + + # Read and verify BED5 format (chrom, start, end, name, score) + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE, stringsAsFactors = FALSE) + expect_equal(nrow(bed_content), 3) + expect_equal(ncol(bed_content), 5) # BED5 + + # Verify coordinates (no conversion - 0-based preserved) + expect_equal(bed_content[, 2], test_intervals$start) + expect_equal(bed_content[, 3], test_intervals$end) + + # Verify scores (use tolerance for float precision) + expect_equal(bed_content[, 5], test_values, tolerance = 1e-6) +}) + +test_that("gtrack.export_bed exports dense track to BED5 format", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Export existing dense track + test_intervals <- gintervals(1, 0, 10000) + gtrack.export_bed("test.fixedbin", tmpfile, intervals = test_intervals) + + expect_true(file.exists(tmpfile)) + + # Read and verify BED5 format + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE, stringsAsFactors = FALSE) + expect_gt(nrow(bed_content), 0) + expect_equal(ncol(bed_content), 5) # BED5 (has track values) +}) + +test_that("gtrack.export_bed works with track expressions and iterator", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Export track expression with iterator + test_intervals <- gintervals(1, 0, 10000) + gtrack.export_bed("test.fixedbin * 2", tmpfile, + intervals = test_intervals, + iterator = 1000 + ) + + expect_true(file.exists(tmpfile)) + + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE) + expect_gt(nrow(bed_content), 0) + expect_equal(ncol(bed_content), 5) # BED5 +}) + +test_that("gtrack.export_bed handles NaN values with na_value parameter", { + tmpfile1 <- tempfile(fileext = ".bed") + tmpfile2 <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile1)) + withr::defer(unlink(tmpfile2)) + + # Create track with NaN values + test_intervals <- gintervals(1, c(1000, 2000, 3000), c(2000, 3000, 4000)) + test_values <- c(10.5, NaN, 15.7) + + tmptrack <- paste0("test.nan_", sample(1:1e9, 1)) + gtrack.create_sparse(tmptrack, "Test NaN", test_intervals, test_values) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + # Export with default (NaN written as ".") + gtrack.export_bed(tmptrack, tmpfile1) + bed1 <- readLines(tmpfile1) + expect_match(bed1[2], "\\.$") # Second line should end with "." + + # Export with na_value replacement + gtrack.export_bed(tmptrack, tmpfile2, na_value = -999) + bed2 <- read.table(tmpfile2, sep = "\t", header = FALSE) + expect_equal(bed2[2, 5], -999) +}) + +test_that("gtrack.export_bed adds track header when requested", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + test_intervals <- gintervals(1, 0, 5000) + + # Export with track name only + gtrack.export_bed("test.fixedbin", tmpfile, + intervals = test_intervals, + track_name = "MyTrack" + ) + + lines <- readLines(tmpfile) + expect_match(lines[1], 'track name="MyTrack"') + + # Export with both track name and description + tmpfile2 <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile2)) + + gtrack.export_bed("test.fixedbin", tmpfile2, + intervals = test_intervals, + track_name = "MyTrack", + description = "Test data" + ) + + lines2 <- readLines(tmpfile2) + expect_match(lines2[1], 'track name="MyTrack" description="Test data"') +}) + +test_that("gtrack.export_bed round-trip preserves data", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create test track + test_intervals <- gintervals(c(1, 1, 2), c(1000, 5000, 2000), c(2000, 6000, 3000)) + test_values <- c(10.5, 20.3, 15.7) + + tmptrack1 <- paste0("test.roundtrip1_", sample(1:1e9, 1)) + gtrack.create_sparse(tmptrack1, "Test track", test_intervals, test_values) + withr::defer(gtrack.rm(tmptrack1, force = TRUE)) + + # Export to BED + gtrack.export_bed(tmptrack1, tmpfile) + + # Import back + tmptrack2 <- paste0("test.roundtrip2_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(tmptrack2, force = TRUE)) + gtrack.import(tmptrack2, "", tmpfile, binsize = 0) + + # Compare original and reimported + original <- gextract(tmptrack1, gintervals.all()) + reimported <- gextract(tmptrack2, gintervals.all()) + + expect_equal(nrow(original), nrow(reimported)) + expect_equal(original$start, reimported$start) + expect_equal(original$end, reimported$end) + expect_equal(original[[5]], reimported[[5]], tolerance = 1e-6) # Values +}) + +test_that("gtrack.export_bed preserves coordinate system (no conversion)", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create track with specific coordinates including edge cases + test_intervals <- gintervals(c(1, 1, 2), c(0, 100, 1000), c(50, 200, 2000)) + test_values <- c(5.0, 10.0, 15.0) + + tmptrack <- paste0("test.coords_", sample(1:1e9, 1)) + gtrack.create_sparse(tmptrack, "Test coords", test_intervals, test_values) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + # Export + gtrack.export_bed(tmptrack, tmpfile) + + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE) + + # Coordinates should be exactly as input (no +1 conversion like BigWig) + expect_equal(bed_content[, 2], c(0, 100, 1000)) + expect_equal(bed_content[, 3], c(50, 200, 2000)) +}) + +test_that("gtrack.export_bed handles multiple chromosomes", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create track with multiple chromosomes + test_intervals <- gintervals( + c(1, 1, 2, 2, 3), + c(1000, 5000, 2000, 6000, 1000), + c(2000, 6000, 3000, 7000, 2000) + ) + test_values <- c(10, 20, 30, 40, 50) + + tmptrack <- paste0("test.multichrom_", sample(1:1e9, 1)) + gtrack.create_sparse(tmptrack, "Test multi", test_intervals, test_values) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + # Export + gtrack.export_bed(tmptrack, tmpfile) + + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE, stringsAsFactors = FALSE) + expect_equal(nrow(bed_content), 5) + + # Verify chromosomes preserved + expect_equal(bed_content[, 1], c("chr1", "chr1", "chr2", "chr2", "chr3")) +}) + +test_that("gtrack.export_bed validates file parameter", { + # Non-character file + expect_error( + gtrack.export_bed("test.fixedbin", 123), + "must be a single character string" + ) + + # Multiple file paths + expect_error( + gtrack.export_bed("test.fixedbin", c("file1.bed", "file2.bed")), + "must be a single character string" + ) + + # Non-existent directory + expect_error( + gtrack.export_bed("test.fixedbin", "/nonexistent_dir_12345/file.bed"), + "Directory does not exist" + ) +}) + +test_that("gtrack.export_bed validates na_value parameter", { + tmpfile <- tempfile(fileext = ".bed") + + # Non-numeric na_value + expect_error( + gtrack.export_bed("test.fixedbin", tmpfile, na_value = "invalid"), + "must be a single numeric value" + ) + + # Multiple values + expect_error( + gtrack.export_bed("test.fixedbin", tmpfile, na_value = c(0, 1)), + "must be a single numeric value" + ) + + # NA value + expect_error( + gtrack.export_bed("test.fixedbin", tmpfile, na_value = NA), + "must be a single numeric value" + ) +}) + +test_that("gtrack.export_bed validates track_name and description", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Invalid track_name type + expect_error( + gtrack.export_bed("test.fixedbin", tmpfile, track_name = 123), + "must be a single character string" + ) + + # Invalid description type + expect_error( + gtrack.export_bed("test.fixedbin", tmpfile, + track_name = "test", + description = 123 + ), + "must be a single character string" + ) +}) + +test_that("gtrack.export_bed handles invalid track names", { + tmpfile <- tempfile(fileext = ".bed") + + # Non-existent track + expect_error( + gtrack.export_bed("nonexistent.track.12345", tmpfile), + "Failed to extract track data" + ) +}) + +test_that("gtrack.export_bed handles file overwrite", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + test_intervals <- gintervals(1, 0, 5000) + + # Write first file + gtrack.export_bed("test.fixedbin", tmpfile, intervals = test_intervals, iterator = 1000) + content1 <- readLines(tmpfile) + size1 <- length(content1) + + # Overwrite with different iterator (different data) + test_intervals2 <- gintervals(1, 0, 10000) + gtrack.export_bed("test.fixedbin", tmpfile, intervals = test_intervals2, iterator = 500) + content2 <- readLines(tmpfile) + size2 <- length(content2) + + # Second export should have more lines (smaller iterator = more bins) + expect_gt(size2, size1) +}) + +test_that("gtrack.export_bed works with custom intervals", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Export only specific region + custom_intervals <- gintervals(1, 5000, 15000) + gtrack.export_bed("test.fixedbin", tmpfile, + intervals = custom_intervals, + iterator = 1000 + ) + + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE) + + # All intervals should be within custom range + expect_true(all(bed_content[, 2] >= 5000)) + expect_true(all(bed_content[, 3] <= 15000)) +}) + +test_that("gtrack.export_bed works with virtual tracks", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Export virtual track with iterator (using masked.frac which exists) + test_intervals <- gintervals(1, 1000, 5000) + + # Create virtual track first + gvtrack.create("test_masked", NULL, "masked.frac") + withr::defer(remove_all_vtracks()) + + expect_no_error( + gtrack.export_bed("test_masked", tmpfile, + intervals = test_intervals, + iterator = 500 + ) + ) + + expect_true(file.exists(tmpfile)) + + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE) + expect_gt(nrow(bed_content), 0) + expect_equal(ncol(bed_content), 5) # BED5 +}) + +test_that("gtrack.export_bed handles negative values", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create track with negative values + test_intervals <- gintervals(1, c(1000, 2000, 3000), c(2000, 3000, 4000)) + test_values <- c(-10.5, -20.3, -5.7) + + tmptrack <- paste0("test.negative_", sample(1:1e9, 1)) + gtrack.create_sparse(tmptrack, "Test negative", test_intervals, test_values) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + # Export + gtrack.export_bed(tmptrack, tmpfile) + + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE) + + # Verify negative values preserved (use tolerance for float precision) + expect_true(all(bed_content[, 5] < 0)) + expect_equal(bed_content[, 5], test_values, tolerance = 1e-6) +}) + +test_that("gtrack.export_bed produces reasonable file sizes", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + test_intervals <- gintervals(1, 0, 100000) + + # Export with different iterator sizes + gtrack.export_bed("test.fixedbin", tmpfile, intervals = test_intervals, iterator = 1000) + size1 <- file.size(tmpfile) + + gtrack.export_bed("test.fixedbin", tmpfile, intervals = test_intervals, iterator = 10000) + size2 <- file.size(tmpfile) + + # All files should have reasonable sizes + expect_gt(size1, 100) # Not empty + expect_gt(size2, 100) # Not empty + + # Smaller iterator should produce larger file (more bins) + expect_gt(size1, size2) +}) + +test_that("gtrack.export_bed handles regions with no track coverage", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Export with intervals that may have sparse/no track data + # (gextract will still return bins, but values may be NaN) + test_intervals <- gintervals(1, 50000000, 50010000) + + # Should complete without error (creates file with NaN values as ".") + expect_no_error( + gtrack.export_bed("test.fixedbin", tmpfile, + intervals = test_intervals, + iterator = 1000 + ) + ) + + # File should be created + expect_true(file.exists(tmpfile)) +}) + +test_that("gtrack.export_bed exports numeric precision correctly", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create track with precise decimal values + test_intervals <- gintervals(1, c(1000, 2000, 3000), c(2000, 3000, 4000)) + test_values <- c(0.123456789, 9.876543210, 1.23456789) + + tmptrack <- paste0("test.precision_", sample(1:1e9, 1)) + gtrack.create_sparse(tmptrack, "Test precision", test_intervals, test_values) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + # Export + gtrack.export_bed(tmptrack, tmpfile) + + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE) + + # Values should be preserved with reasonable precision + expect_equal(bed_content[, 5], test_values, tolerance = 1e-6) +}) + +test_that("gtrack.export_bed auto-generates interval names", { + tmpfile <- tempfile(fileext = ".bed") + withr::defer(unlink(tmpfile)) + + # Create small track + test_intervals <- gintervals(1, c(1000, 2000, 3000), c(2000, 3000, 4000)) + test_values <- c(10, 20, 30) + + tmptrack <- paste0("test.names_", sample(1:1e9, 1)) + gtrack.create_sparse(tmptrack, "Test names", test_intervals, test_values) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + # Export + gtrack.export_bed(tmptrack, tmpfile) + + bed_content <- read.table(tmpfile, sep = "\t", header = FALSE, stringsAsFactors = FALSE) + + # Verify auto-generated names + expect_equal(bed_content[, 4], c("interval_1", "interval_2", "interval_3")) +}) diff --git a/tests/testthat/test-gtrack.export_bw.R b/tests/testthat/test-gtrack.export_bw.R new file mode 100644 index 00000000..d98926a1 --- /dev/null +++ b/tests/testthat/test-gtrack.export_bw.R @@ -0,0 +1,792 @@ +create_isolated_test_db() + +test_that("gtrack.export_bw works with dense track at native resolution", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Export a known dense track + result <- gtrack.export_bw("test.fixedbin", tmpfile) + + # Verify function returns file path invisibly + expect_equal(result, tmpfile) + + # Verify file exists and has content + expect_true(file.exists(tmpfile)) + expect_gt(file.size(tmpfile), 0) + + # Import back and verify we can read it + imported <- rtracklayer::import.bw(tmpfile) + expect_s4_class(imported, "GRanges") + expect_gt(length(imported), 0) +}) + +test_that("gtrack.export_bw works with sparse track", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + gtrack.export_bw("test.sparse", tmpfile) + + expect_true(file.exists(tmpfile)) + expect_gt(file.size(tmpfile), 0) + + # Verify we can import it + imported <- rtracklayer::import.bw(tmpfile) + expect_s4_class(imported, "GRanges") +}) + +test_that("gtrack.export_bw handles NaN values correctly - removal (default)", { + skip_if_not_installed("rtracklayer") + + # Create track with NaN values + tmptrack <- paste0("test.tmptrack_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + intervals <- gintervals(1, c(1000, 2000, 3000), c(1100, 2100, 3100)) + values <- c(1.0, NaN, 2.0) + gtrack.create_sparse(tmptrack, "", intervals, values) + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # With na_value=NULL (default), should remove NaN intervals + # Suppress warning since we're testing removal behavior, not the warning + expect_no_error(suppressWarnings(gtrack.export_bw(tmptrack, tmpfile))) + + # Verify NaN was filtered + imported <- rtracklayer::import.bw(tmpfile) + expect_equal(length(imported), 2) # Only 2 intervals, NaN filtered + expect_equal(as.numeric(imported$score), c(1.0, 2.0)) +}) + +test_that("gtrack.export_bw handles NaN values correctly - replacement", { + skip_if_not_installed("rtracklayer") + + # Create track with NaN values + tmptrack <- paste0("test.tmptrack_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + intervals <- gintervals(1, c(1000, 2000, 3000), c(1100, 2100, 3100)) + values <- c(1.0, NaN, 2.0) + gtrack.create_sparse(tmptrack, "", intervals, values) + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # With na_value=0, should replace NaN with 0 + expect_no_error(gtrack.export_bw(tmptrack, tmpfile, na_value = 0)) + + # Verify NaN was replaced with 0 + imported <- rtracklayer::import.bw(tmpfile) + expect_equal(length(imported), 3) # All 3 intervals present + expect_equal(as.numeric(imported$score), c(1.0, 0.0, 2.0)) +}) + +test_that("gtrack.export_bw warns when removing many NaN values", { + skip_if_not_installed("rtracklayer") + + # Create track with many NaN values (>10%) + tmptrack <- paste0("test.tmptrack_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + # 6 out of 10 values are NaN (60%) + intervals <- gintervals(1, seq(1000, 10000, 1000), seq(1100, 10100, 1000)) + values <- c(1, NaN, NaN, 2, NaN, 3, NaN, NaN, 4, NaN) + gtrack.create_sparse(tmptrack, "", intervals, values) + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Should warn about removing many NaN values + expect_warning( + gtrack.export_bw(tmptrack, tmpfile), + "Removing.*NaN" + ) +}) + +test_that("gtrack.export_bw errors when all values are NaN with na_value=NULL", { + skip_if_not_installed("rtracklayer") + + # Create track with all NaN values + tmptrack <- paste0("test.tmptrack_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + intervals <- gintervals(1, c(1000, 2000), c(1100, 2100)) + values <- c(NaN, NaN) + gtrack.create_sparse(tmptrack, "", intervals, values) + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Should error when all values are NaN + # Suppress warning that occurs before the error + expect_error( + suppressWarnings(gtrack.export_bw(tmptrack, tmpfile)), + "All values are NaN" + ) +}) + +test_that("gtrack.export_bw rejects 2D tracks", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # test.rects is a 2D track + expect_error( + gtrack.export_bw("test.rects", tmpfile), + "2D track" + ) +}) + +test_that("gtrack.export_bw validates inputs", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + + # Invalid file path - directory doesn't exist + expect_error( + gtrack.export_bw("test.fixedbin", "/invalid/path/that/does/not/exist/file.bw"), + "Directory does not exist" + ) + + # Invalid na_value (not numeric or NULL) - character + expect_error( + gtrack.export_bw("test.fixedbin", tmpfile, na_value = "invalid"), + "na_value.*numeric" + ) + + # Invalid na_value - vector instead of single value + expect_error( + gtrack.export_bw("test.fixedbin", tmpfile, na_value = c(0, 1)), + "na_value.*single" + ) +}) + +test_that("gtrack.export_bw works with numeric iterator (binning)", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Export with 1kb bins + gtrack.export_bw("test.fixedbin", tmpfile, intervals = gintervals(1, 0, 100000), iterator = 1000) + + expect_true(file.exists(tmpfile)) + + # Verify bins are 1kb + imported <- rtracklayer::import.bw(tmpfile) + widths <- IRanges::width(imported) + # Most bins should be 1000bp (some at chromosome ends may be smaller) + expect_equal(median(widths), 1000) +}) + +test_that("gtrack.export_bw works with track expressions", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Export sum of track and constant + gtrack.export_bw("test.fixedbin + 1", tmpfile, + intervals = gintervals(1, 0, 100000), + iterator = 1000 + ) + + expect_true(file.exists(tmpfile)) + + # Verify we can import it + imported <- rtracklayer::import.bw(tmpfile) + expect_s4_class(imported, "GRanges") +}) + +test_that("gtrack.export_bw works with custom intervals", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Export only chromosome 1 + custom_intervals <- gintervals(1) + gtrack.export_bw("test.fixedbin", tmpfile, intervals = custom_intervals) + + expect_true(file.exists(tmpfile)) + + # Verify only chr1 was exported + imported <- rtracklayer::import.bw(tmpfile) + chrom_names <- as.character(GenomicRanges::seqnames(imported)) + expect_true(all(chrom_names == "chr1")) +}) + +test_that("gtrack.export_bw returns file path invisibly", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + result <- gtrack.export_bw("test.fixedbin", tmpfile, + intervals = gintervals(1, 0, 10000) + ) + expect_equal(result, tmpfile) +}) + +test_that("gtrack.export_bw handles coordinate conversion correctly", { + skip_if_not_installed("rtracklayer") + + # Create a sparse track with known intervals + tmptrack <- paste0("test.tmptrack_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + # Misha uses 0-based half-open intervals [start, end) + # These intervals: [1000, 2000), [3000, 4000) + intervals <- gintervals(1, c(1000, 3000), c(2000, 4000)) + values <- c(10, 20) + gtrack.create_sparse(tmptrack, "", intervals, values) + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + gtrack.export_bw(tmptrack, tmpfile) + + # Import and check coordinates + # GRanges uses 1-based closed intervals [start, end] + # So [1000, 2000) should become [1001, 2000] + imported <- rtracklayer::import.bw(tmpfile) + + expect_equal(length(imported), 2) + expect_equal(as.integer(GenomicRanges::start(imported)), c(1001L, 3001L)) + expect_equal(as.integer(GenomicRanges::end(imported)), c(2000L, 4000L)) + expect_equal(as.numeric(imported$score), c(10, 20)) +}) + +test_that("gtrack.export_bw round-trip preserves values", { + skip_if_not_installed("rtracklayer") + + # Create a track, export to BigWig, import back, and compare + tmptrack <- paste0("test.tmptrack_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + intervals <- gintervals(1, seq(1000, 10000, 1000), seq(2000, 11000, 1000)) + values <- seq(1, 10) + gtrack.create_sparse(tmptrack, "", intervals, values) + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Export + gtrack.export_bw(tmptrack, tmpfile) + + # Import back with rtracklayer + imported <- rtracklayer::import.bw(tmpfile) + + # Check values match (accounting for coordinate conversion) + expect_equal(as.numeric(imported$score), values) + expect_equal(length(imported), length(values)) +}) + +test_that("gtrack.export_bw handles empty extraction result", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Try to extract from non-overlapping intervals + # This should fail during gextract, not in our function + expect_error( + gtrack.export_bw("test.fixedbin", tmpfile, + intervals = gintervals(999, 0, 1000) + ), # Non-existent chromosome + "extract" + ) +}) + +test_that("gtrack.export_bw works with multiple chromosomes", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Export chromosomes 1 and 2 + custom_intervals <- gintervals(c(1, 2)) + gtrack.export_bw("test.fixedbin", tmpfile, intervals = custom_intervals) + + expect_true(file.exists(tmpfile)) + + # Verify both chromosomes are present + imported <- rtracklayer::import.bw(tmpfile) + chrom_names <- unique(as.character(GenomicRanges::seqnames(imported))) + expect_true(all(c("chr1", "chr2") %in% chrom_names)) +}) + +test_that("gtrack.export_bw with iterator produces correct bin sizes", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Export with 5kb bins + bin_size <- 5000 + gtrack.export_bw("test.fixedbin", tmpfile, + intervals = gintervals(1, 0, 100000), + iterator = bin_size + ) + + imported <- rtracklayer::import.bw(tmpfile) + widths <- IRanges::width(imported) + + # Most bins should be exactly bin_size (last one might be smaller) + expect_true(median(widths) == bin_size) + expect_true(max(widths) == bin_size) +}) + +# Tests using gtrack.import to verify round-trip +test_that("gtrack.export_bw -> gtrack.import round-trip works with sparse track", { + skip_if_not_installed("rtracklayer") + + # Create a sparse track with known values + tmptrack <- paste0("test.tmptrack_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + intervals <- gintervals(1, c(1000, 5000, 10000), c(2000, 6000, 11000)) + values <- c(10.5, 20.3, 30.7) + gtrack.create_sparse(tmptrack, "", intervals, values) + + # Export to BigWig + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + gtrack.export_bw(tmptrack, tmpfile) + + # Import back using gtrack.import (binsize = 0 for sparse tracks) + imported_track <- paste0("test.imported_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(imported_track, force = TRUE)) + gtrack.import(imported_track, "", tmpfile, binsize = 0) + + # Extract data from both tracks + original_data <- gextract(tmptrack, intervals = intervals, iterator = intervals) + imported_data <- gextract(imported_track, intervals = intervals, iterator = intervals) + + # Compare values (allowing small floating point differences) + expect_equal(imported_data[[imported_track]], original_data[[tmptrack]], tolerance = 1e-6) + expect_equal(nrow(imported_data), nrow(original_data)) +}) + +test_that("gtrack.export_bw -> gtrack.import round-trip works with binned dense track", { + skip_if_not_installed("rtracklayer") + + # Export dense track with 1kb bins + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + test_intervals <- gintervals(1, 0, 50000) + bin_size <- 1000 + + # Get expected values by extracting with iterator + expected_data <- gextract("test.fixedbin", intervals = test_intervals, iterator = bin_size) + + # Export to BigWig + gtrack.export_bw("test.fixedbin", tmpfile, intervals = test_intervals, iterator = bin_size) + + # Import back (binsize = bin_size for dense tracks) + imported_track <- paste0("test.imported_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(imported_track, force = TRUE)) + gtrack.import(imported_track, "", tmpfile, binsize = bin_size) + + # Extract from imported track + imported_data <- gextract(imported_track, intervals = test_intervals, iterator = bin_size) + + # Compare values + expect_equal(nrow(imported_data), nrow(expected_data)) + expect_equal(imported_data[[imported_track]], expected_data$test.fixedbin, tolerance = 1e-6) +}) + +test_that("gtrack.export_bw -> gtrack.import round-trip preserves NaN replacement", { + skip_if_not_installed("rtracklayer") + + # Create track with NaN values + tmptrack <- paste0("test.tmptrack_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + intervals <- gintervals(1, c(1000, 2000, 3000, 4000), c(1100, 2100, 3100, 4100)) + values <- c(1.5, NaN, 2.5, NaN) + gtrack.create_sparse(tmptrack, "", intervals, values) + + # Export with na_value=0 + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + gtrack.export_bw(tmptrack, tmpfile, na_value = 0) + + # Import back (binsize = 0 for sparse tracks) + imported_track <- paste0("test.imported_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(imported_track, force = TRUE)) + gtrack.import(imported_track, "", tmpfile, binsize = 0) + + # Extract from imported track + imported_data <- gextract(imported_track, intervals = intervals, iterator = intervals) + + # NaN values should have been replaced with 0 + expect_equal(nrow(imported_data), 4) + expect_equal(imported_data[[imported_track]], c(1.5, 0.0, 2.5, 0.0), tolerance = 1e-6) +}) + +test_that("gtrack.export_bw -> gtrack.import round-trip handles NaN removal", { + skip_if_not_installed("rtracklayer") + + # Create track with NaN values + tmptrack <- paste0("test.tmptrack_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(tmptrack, force = TRUE)) + + intervals <- gintervals(1, c(1000, 2000, 3000, 4000), c(1100, 2100, 3100, 4100)) + values <- c(1.5, NaN, 2.5, NaN) + gtrack.create_sparse(tmptrack, "", intervals, values) + + # Export with na_value=NULL (default, removes NaN intervals) + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + expect_no_error(suppressWarnings(gtrack.export_bw(tmptrack, tmpfile))) + + # Import back (binsize = 0 for sparse tracks) + imported_track <- paste0("test.imported_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(imported_track, force = TRUE)) + gtrack.import(imported_track, "", tmpfile, binsize = 0) + + # Extract all intervals from imported track + imported_data <- gextract(imported_track, intervals = gintervals(1)) + + # Should only have 2 intervals (NaN intervals were removed) + expect_equal(nrow(imported_data), 2) + + # Check the non-NaN intervals are preserved + non_nan_intervals <- intervals[!is.na(values), ] + extracted_values <- gextract(imported_track, intervals = non_nan_intervals, iterator = non_nan_intervals) + expect_equal(extracted_values[[imported_track]], c(1.5, 2.5), tolerance = 1e-6) +}) + +test_that("gtrack.export_bw -> gtrack.import round-trip works with track expression", { + skip_if_not_installed("rtracklayer") + + # Export a track expression + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + test_intervals <- gintervals(1, 0, 50000) + bin_size <- 1000 + + # Get expected values from expression + expected_data <- gextract("test.fixedbin + 10", intervals = test_intervals, iterator = bin_size, colnames = "expr") + + # Export expression + gtrack.export_bw("test.fixedbin + 10", tmpfile, intervals = test_intervals, iterator = bin_size) + + # Import back (binsize = bin_size for dense tracks) + imported_track <- paste0("test.imported_", sample(1:1e9, 1)) + withr::defer(gtrack.rm(imported_track, force = TRUE)) + gtrack.import(imported_track, "", tmpfile, binsize = bin_size) + + # Extract from imported track + imported_data <- gextract(imported_track, intervals = test_intervals, iterator = bin_size) + + # Compare values + expect_equal(nrow(imported_data), nrow(expected_data)) + expect_equal(imported_data[[imported_track]], expected_data$expr, tolerance = 1e-6) +}) + +test_that("gtrack.export_bw works with fixed_summaries parameter", { + skip_if_not_installed("rtracklayer") + + tmpfile1 <- tempfile(fileext = ".bw") + tmpfile2 <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile1)) + withr::defer(unlink(tmpfile2)) + + test_intervals <- gintervals(1, 0, 100000) + + # Export with dynamic summaries (default) + gtrack.export_bw("test.fixedbin", tmpfile1, intervals = test_intervals, iterator = 1000, fixed_summaries = FALSE) + + # Export with fixed summaries + gtrack.export_bw("test.fixedbin", tmpfile2, intervals = test_intervals, iterator = 1000, fixed_summaries = TRUE) + + # Both files should exist + expect_true(file.exists(tmpfile1)) + expect_true(file.exists(tmpfile2)) + + # Both should have content (file sizes may differ due to different summary levels) + expect_gt(file.size(tmpfile1), 0) + expect_gt(file.size(tmpfile2), 0) + + # Both should be importable and contain the same data + imported1 <- rtracklayer::import.bw(tmpfile1) + imported2 <- rtracklayer::import.bw(tmpfile2) + + expect_equal(length(imported1), length(imported2)) + expect_equal(as.numeric(imported1$score), as.numeric(imported2$score), tolerance = 1e-6) +}) + +test_that("gtrack.export_bw validates fixed_summaries parameter", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + + # Invalid fixed_summaries - not logical + expect_error( + gtrack.export_bw("test.fixedbin", tmpfile, fixed_summaries = "invalid"), + "fixed_summaries.*TRUE or FALSE" + ) + + # Invalid fixed_summaries - vector instead of single value + expect_error( + gtrack.export_bw("test.fixedbin", tmpfile, fixed_summaries = c(TRUE, FALSE)), + "fixed_summaries.*TRUE or FALSE" + ) +}) + +# ===== COMPREHENSIVE EDGE CASE TESTS ===== + +test_that("gtrack.export_bw handles invalid track names gracefully", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + + # Non-existent track + expect_error( + gtrack.export_bw("nonexistent.track", tmpfile), + "does not contain any tracks|Cannot find|does not exist" + ) + + # Empty string + expect_error( + gtrack.export_bw("", tmpfile) + ) + + # NULL track name + expect_error( + gtrack.export_bw(NULL, tmpfile) + ) +}) + +test_that("gtrack.export_bw handles file overwrite behavior correctly", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + test_intervals <- gintervals(1, 0, 10000) + + # Create initial file + gtrack.export_bw("test.fixedbin", tmpfile, intervals = test_intervals, iterator = 1000) + expect_true(file.exists(tmpfile)) + + initial_size <- file.size(tmpfile) + + # Export again to same file (should overwrite) + gtrack.export_bw("test.fixedbin", tmpfile, intervals = test_intervals, iterator = 1000) + expect_true(file.exists(tmpfile)) + + # File should still be valid + imported <- rtracklayer::import.bw(tmpfile) + expect_gt(length(imported), 0) +}) + +test_that("gtrack.export_bw handles infinity and extreme values", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Create track with extreme values + test_intervals <- gintervals(1, c(1000, 2000, 3000), c(2000, 3000, 4000)) + extreme_vals <- c(1e308, -1e308, 0) + + gtrack.create_sparse("test.extreme", "Test extreme values", test_intervals, extreme_vals) + withr::defer(gtrack.rm("test.extreme", force = TRUE)) + + # Export - Inf values should be converted to NaN and then to na_value + gtrack.export_bw("test.extreme", tmpfile, na_value = -999) + + imported <- rtracklayer::import.bw(tmpfile) + expect_gt(length(imported), 0) +}) + +test_that("gtrack.export_bw validates iterator parameter edge cases", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + test_intervals <- gintervals(1, 0, 1000) + + # Zero iterator should error + expect_error( + gtrack.export_bw("test.fixedbin", tmpfile, intervals = test_intervals, iterator = 0), + "iterator" + ) + + # Negative iterator should error + expect_error( + gtrack.export_bw("test.fixedbin", tmpfile, intervals = test_intervals, iterator = -100), + "iterator" + ) + + # Non-numeric iterator should error + expect_error( + gtrack.export_bw("test.fixedbin", tmpfile, intervals = test_intervals, iterator = "invalid"), + "iterator" + ) +}) + +test_that("gtrack.export_bw preserves numeric precision", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Create track with precise values + test_intervals <- gintervals(1, c(1000, 2000, 3000, 4000), c(2000, 3000, 4000, 5000)) + precise_vals <- c(0.123456789, 9.876543210, 1.111111111, 2.222222222) + + gtrack.create_sparse("test.precise", "Test precision", test_intervals, precise_vals) + withr::defer(gtrack.rm("test.precise", force = TRUE)) + + # Export and reimport + gtrack.export_bw("test.precise", tmpfile) + imported <- rtracklayer::import.bw(tmpfile) + + # Values should match within reasonable tolerance (BigWig uses float32) + original_sorted <- sort(precise_vals) + imported_sorted <- sort(as.numeric(imported$score)) + expect_equal(imported_sorted, original_sorted, tolerance = 1e-6) +}) + +test_that("gtrack.export_bw handles single-value tracks", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Create track with constant value + test_intervals <- gintervals(1, c(1000, 2000, 3000), c(2000, 3000, 4000)) + constant_val <- c(42.0, 42.0, 42.0) + + gtrack.create_sparse("test.constant", "Test constant", test_intervals, constant_val) + withr::defer(gtrack.rm("test.constant", force = TRUE)) + + # Export + gtrack.export_bw("test.constant", tmpfile) + imported <- rtracklayer::import.bw(tmpfile) + + # All values should be 42 + expect_true(all(abs(as.numeric(imported$score) - 42.0) < 1e-6)) +}) + +test_that("gtrack.export_bw handles negative values correctly", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Create track with negative values + test_intervals <- gintervals(1, c(1000, 2000, 3000, 4000), c(2000, 3000, 4000, 5000)) + neg_vals <- c(-10.5, -20.3, -0.1, -999.9) + + gtrack.create_sparse("test.negative", "Test negative", test_intervals, neg_vals) + withr::defer(gtrack.rm("test.negative", force = TRUE)) + + # Export + gtrack.export_bw("test.negative", tmpfile) + imported <- rtracklayer::import.bw(tmpfile) + + # All imported values should be negative + expect_true(all(as.numeric(imported$score) < 0)) + + # Check specific values + imported_sorted <- sort(as.numeric(imported$score)) + original_sorted <- sort(neg_vals) + expect_equal(imported_sorted, original_sorted, tolerance = 1e-6) +}) + +test_that("gtrack.export_bw handles empty data after filtering", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + # Export with intervals that don't overlap with track data + non_overlapping <- gintervals(1, 50000000, 60000000) + + # This should either create empty file or error gracefully + result <- tryCatch( + { + gtrack.export_bw("test.fixedbin", tmpfile, intervals = non_overlapping, iterator = 1000) + "success" + }, + error = function(e) "error" + ) + + # Either way is acceptable - just shouldn't crash + expect_true(result %in% c("success", "error")) +}) + +test_that("gtrack.export_bw validates multiple na_value scenarios", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + test_intervals <- gintervals(1, 0, 10000) + + # na_value as zero + expect_no_error( + gtrack.export_bw("test.fixedbin", tmpfile, intervals = test_intervals, iterator = 1000, na_value = 0) + ) + + # na_value as negative + expect_no_error( + gtrack.export_bw("test.fixedbin", tmpfile, intervals = test_intervals, iterator = 1000, na_value = -999) + ) + + # na_value as positive + expect_no_error( + gtrack.export_bw("test.fixedbin", tmpfile, intervals = test_intervals, iterator = 1000, na_value = 999) + ) + + # Invalid na_value (non-numeric) + expect_error( + gtrack.export_bw("test.fixedbin", tmpfile, intervals = test_intervals, iterator = 1000, na_value = "invalid"), + "na_value.*numeric" + ) + + # Invalid na_value (multiple values) + expect_error( + gtrack.export_bw("test.fixedbin", tmpfile, intervals = test_intervals, iterator = 1000, na_value = c(0, 1)), + "na_value.*single" + ) +}) + +test_that("gtrack.export_bw produces valid file sizes", { + skip_if_not_installed("rtracklayer") + + tmpfile <- tempfile(fileext = ".bw") + withr::defer(unlink(tmpfile)) + + test_intervals <- gintervals(1, 0, 100000) + + # Export with different iterator sizes + gtrack.export_bw("test.fixedbin", tmpfile, intervals = test_intervals, iterator = 100) + size_100 <- file.size(tmpfile) + + gtrack.export_bw("test.fixedbin", tmpfile, intervals = test_intervals, iterator = 1000) + size_1000 <- file.size(tmpfile) + + gtrack.export_bw("test.fixedbin", tmpfile, intervals = test_intervals, iterator = 10000) + size_10000 <- file.size(tmpfile) + + # All files should have reasonable sizes + expect_gt(size_100, 0) + expect_gt(size_1000, 0) + expect_gt(size_10000, 0) + + # Smaller iterators (more bins) should generally produce larger files + # (though compression can affect this, so we just check they're all valid) + expect_true(all(c(size_100, size_1000, size_10000) > 100)) # At least 100 bytes +})