From 9bbf68f0300c7483e0c4ca1f21373a88de0b387b Mon Sep 17 00:00:00 2001 From: Jeremy Date: Mon, 24 Nov 2025 14:37:29 -0800 Subject: [PATCH 1/7] All modularized cNF code added/updated --- 02_normalize_batchcorrect_omics.R | 768 +++++++++++++++++++++ 02_run_normalize_omics.Rmd | 165 +++++ 03_analyze_modality_correlations.R | 279 ++++++++ 04_analyze_modality_and_pathway_enrich.Rmd | 114 +++ 04_leapr_biomarker.R | 467 +++++++++++++ cNF_helper_code.R | 1 + 6 files changed, 1794 insertions(+) create mode 100644 02_normalize_batchcorrect_omics.R create mode 100644 02_run_normalize_omics.Rmd create mode 100644 03_analyze_modality_correlations.R create mode 100644 04_analyze_modality_and_pathway_enrich.Rmd create mode 100644 04_leapr_biomarker.R diff --git a/02_normalize_batchcorrect_omics.R b/02_normalize_batchcorrect_omics.R new file mode 100644 index 0000000..a5dad15 --- /dev/null +++ b/02_normalize_batchcorrect_omics.R @@ -0,0 +1,768 @@ +# ============================================================================= +# normalize_omics_pipeline.R +# Arguments: +# - write_outputs: master on/off for writing CSVs, PDFs, and uploading +# - save_basename: override base name used in filenames +# - do_batch_correct: skip ComBat when FALSE; use combined matrix instead +# ============================================================================= + +suppressPackageStartupMessages({ + library(dplyr) + library(tidyr) + library(stringr) + library(SummarizedExperiment) + library(ggplot2) + library(readr) + library(rlang) +}) + +# Small helpers + +modified_zscore <- function(x, na.rm = TRUE) { + m <- suppressWarnings(stats::median(x, na.rm = na.rm)) + md <- suppressWarnings(stats::mad(x, constant = 1, na.rm = na.rm)) + if (is.na(md) || md == 0) return(rep(0, length(x))) + 0.6745 * (x - m) / md +} + +filter_by_missingness <- function(mat) { + keep <- apply(mat, 1, function(r) mean(is.na(r)) <= 0.5) + mat[keep, , drop = FALSE] +} + +union_rows_fill_NA <- function(mats) { + all_feats <- Reduce(union, lapply(mats, rownames)) + lapply(mats, function(m) { + mm <- matrix(NA_real_, nrow = length(all_feats), ncol = ncol(m), + dimnames = list(all_feats, colnames(m))) + mm[rownames(m), colnames(m)] <- m + mm + }) +} + +collapse_duplicate_features <- function(mat) { + if (!any(duplicated(rownames(mat)))) return(mat) + grp <- split(seq_len(nrow(mat)), rownames(mat)) + collapsed <- do.call(rbind, lapply(grp, function(ix) colSums(mat[ix, , drop = FALSE], na.rm = TRUE))) + rownames(collapsed) <- names(grp) + collapsed +} + +make_dropper <- function(substrings) { + if (is.null(substrings) || length(substrings) == 0) return(function(x) rep(FALSE, length(x))) + pattern <- paste0(substrings, collapse = "|") + function(x) grepl(pattern, x, fixed = FALSE) +} + +# Functions to clean up irregular names + +basename_only <- function(x) sub("^.*[\\\\/]", "", x) +basename_no_ext <- function(x) sub("\\.[^.]+$", "", basename_only(x)) + +normalize_specimen_like <- function(x) { + y <- tolower(x) + y <- gsub("\\s+", "", y) + y <- gsub("\\.", "-", y) + y <- gsub("_", "-", y) + y <- gsub("organoids?$", "organoid", y) + y <- gsub("-organoids?-", "-organoid-", y) + y <- gsub("skin$", "skin", y) + y <- gsub("tissues?$", "tissue", y) + y <- gsub("--+", "-", y) + y <- gsub("^-|-$", "", y) + y +} + +parse_rna_header_triplet <- function(fnames) { + toks_list <- strsplit(fnames, "\\.") + out <- lapply(seq_along(toks_list), function(i) { + toks <- toks_list[[i]] + sample_id <- if (length(toks) >= 1) toks[[1]] else NA_character_ + + tumor <- NA_character_ + condition_raw <- NA_character_ + + if (length(toks) >= 2) { + if (grepl("^T\\d+$", toks[[2]], ignore.case = TRUE)) { + tumor <- toupper(toks[[2]]) + cond_tokens <- toks[-c(1,2)] + condition_raw <- if (length(cond_tokens)) paste(cond_tokens, collapse = ".") else NA_character_ + } else { + cond_tokens <- toks[-1] + condition_raw <- if (length(cond_tokens)) paste(cond_tokens, collapse = ".") else NA_character_ + } + } + + condition_norm <- condition_raw + if (!is.na(condition_norm)) { + low <- tolower(condition_norm) + if (grepl("^organoids?$", low)) condition_norm <- "organoid" + else if (grepl("^tissues?$", low)) condition_norm <- "tissue" + else if (grepl("^skin$", low)) condition_norm <- "skin" + } + + data.frame( + fname = fnames[i], + sample_id = sample_id, + tumor = ifelse(is.na(tumor), NA_character_, toupper(tumor)), + condition_raw = condition_raw, + condition_norm= condition_norm, + stringsAsFactors = FALSE + ) + }) + df <- do.call(rbind, out) + df$specimen_norm <- with(df, { + sid <- sample_id + tmr <- ifelse(is.na(tumor) | tumor == "", "", paste0("_", tumor)) + cnd <- ifelse(is.na(condition_norm) | condition_norm == "", "", paste0("_", condition_norm)) + paste0(sid, tmr, cnd) + }) + df +} + +# Functions to get data from Synapse + +read_wide_from_synapse <- function(syn, syn_id) { + message(" Reading Synapse file: ", syn_id) + df <- read.table( + syn$get(syn_id)$path, + sep = "\t", header = TRUE, quote = '"', + fill = TRUE, check.names = FALSE + ) + message(sprintf(" - Read %d rows × %d cols; first cols: %s", + nrow(df), ncol(df), paste(head(colnames(df), 8), collapse = ", "))) + df +} + +detect_value_start_col <- function(wide_df, fallback = 5) { + nms <- colnames(wide_df) + is_pathy <- grepl("\\.(raw|mzml)$", nms, ignore.case = TRUE) | + grepl("[/\\\\]", nms) | + grepl("^[A-Za-z]:\\\\", nms) + if (any(is_pathy)) { + i <- which(is_pathy)[1] + message(" Auto-detected first sample column at index ", i, " to '", nms[i], "'") + return(i) + } + message(" Did not detect path/RAW headers; using fallback value_start_col=", fallback) + fallback +} + +parse_fnames <- function(fnames, aliquot_field_index, cohort) { + message("Parsing filenames to (fname, aliquot, cohort)") + rows <- lapply(fnames, function(fname) { + toks <- strsplit(fname, "_", fixed = TRUE)[[1]] + aliq <- NA_real_ + + if (!is.null(aliquot_field_index) && + aliquot_field_index >= 1 && + aliquot_field_index <= length(toks)) { + aliq_try <- suppressWarnings(as.double(toks[[aliquot_field_index]])) + if (!is.na(aliq_try)) aliq <- aliq_try + } + + if (is.na(aliq)) { + num_tokens <- suppressWarnings(as.double(toks)) + if (any(!is.na(num_tokens))) aliq <- tail(num_tokens[!is.na(num_tokens)], 1) + } + + data.frame( + fname = fname, + aliquot = aliq, + cohort = cohort, + stringsAsFactors = FALSE + ) + }) + out <- do.call(rbind, rows) + message(sprintf(" - Parsed %d samples (aliquot NA: %d)", + nrow(out), sum(is.na(out$aliquot)))) + out +} + +# -------------------------- Feature ID builders ------------------------------ + +build_phospho_ids <- function(df) { + lsite <- tolower(df$Residue) + paste0(df$`Gene.Names`, "-", df$Residue, df$Site, lsite) +} +build_global_ids <- function(df) as.character(df$Genes) +build_rna_ids <- function(df) { + cand <- c("gene_id","Gene","gene","gene_name","Symbol","symbol","ENSEMBL","Ensembl","ensembl_gene_id") + hit <- cand[cand %in% names(df)] + if (length(hit) == 0) stop("RNA feature-id column not found.") + if ("gene_id" %in% hit) return(as.character(df[["gene_id"]])) + if ("gene_name" %in% hit) return(as.character(df[["gene_name"]])) + as.character(df[[hit[[1]]]]) +} +pick_builder <- function(modality) { + m <- tolower(modality) + if (m == "phospho") return(build_phospho_ids) + if (m == "global") return(build_global_ids) + if (m == "rna") return(build_rna_ids) + stop("Unknown modality: ", modality) +} + +# Functions to Normalize Data. Uses SummarizedExperiment + +coldata_tbl <- function(se) { + cd <- as.data.frame(SummarizedExperiment::colData(se), stringsAsFactors = FALSE) + if ("fname" %in% names(cd)) names(cd)[names(cd) == "fname"] <- ".coldata_fname" + names(cd) <- make.unique(names(cd), sep = "_") + cd$fname <- rownames(cd) + cd$fname_base <- basename_only(cd$fname) + cd$fname_stem <- basename_no_ext(cd$fname) + cd <- cd[, c("fname","fname_base","fname_stem", setdiff(names(cd), c("fname","fname_base","fname_stem"))), drop = FALSE] + cd +} + +looks_like_sample_header <- function(x) { + grepl("\\.(raw|mzml)$", x, ignore.case = TRUE) | + grepl("[/\\\\]", x) | + grepl("^[A-Za-z]:\\\\", x) +} + +make_se <- function(wide_df, value_start_col, feature_ids, fnames_df, meta, drop_name, modality) { + all_candidate <- colnames(wide_df)[value_start_col:ncol(wide_df)] + has_pathy <- any(looks_like_sample_header(all_candidate)) + if (has_pathy) { + sample_cols <- all_candidate[looks_like_sample_header(all_candidate)] + sample_cols <- setdiff(sample_cols, c("Site", "Sequence")) + } else { + sample_cols <- setdiff(all_candidate, c("Site", "Sequence")) + } + + message(" Candidate sample columns (first 6):") + print(utils::head(sample_cols, 6)) + + message(" Casting measurement block to numeric") + raw_block <- wide_df[, sample_cols, drop = FALSE] + clean_block <- as.data.frame( + lapply(raw_block, function(col) { + if (is.factor(col)) col <- as.character(col) + col[col %in% c("", "NA", "NaN", "na", "n/a", "NULL")] <- NA + suppressWarnings(as.numeric(col)) + }), + check.names = FALSE + ) + + keep <- !drop_name(colnames(clean_block)) + if (any(!keep)) { + message(" Dropping unwanted sample columns by pattern: ", sum(!keep)) + clean_block <- clean_block[, keep, drop = FALSE] + sample_cols <- sample_cols[keep] + } + + mat <- as.matrix(clean_block) + rownames(mat) <- feature_ids + colnames(mat) <- sample_cols + + fnames_df <- fnames_df %>% dplyr::semi_join(data.frame(fname = sample_cols), by = "fname") + + parsed_df <- if (tolower(modality) == "rna") parse_rna_header_triplet(fnames_df$fname) else + data.frame(fname = fnames_df$fname, stringsAsFactors = FALSE) + + message(" Joining sample meta (by aliquot & cohort)") + meta_join <- meta + if ("fname" %in% names(meta_join)) { + message(" ! Debug: Dropping 'fname' column from 'meta' to prevent duplication") + meta_join <- dplyr::select(meta_join, -fname) + } + + cdata <- fnames_df %>% + dplyr::left_join(meta_join, by = c("aliquot","cohort")) %>% + dplyr::left_join(parsed_df, by = "fname") %>% + dplyr::mutate(cohort = as.factor(cohort), + cohort_key = as.character(cohort)) + + if (!"aliquot" %in% names(cdata)) cdata$aliquot <- NA_real_ + if ("aliquot.x" %in% names(cdata) || "aliquot.y" %in% names(cdata)) { + cdata$aliquot <- dplyr::coalesce(cdata$aliquot, cdata$aliquot.x, cdata$aliquot.y) + cdata <- dplyr::select(cdata, -dplyr::any_of(c("aliquot.x","aliquot.y"))) + } + + if (tolower(modality) == "rna") { + meta_norm <- meta %>% + dplyr::mutate( + Specimen_norm = normalize_specimen_like(Specimen), + cohort_key = as.character(cohort) + ) %>% + dplyr::select(Specimen, Specimen_norm, Patient, Tumor, cohort_key) + + cdata2 <- cdata %>% + dplyr::mutate(fname_norm = normalize_specimen_like(ifelse( + is.na(specimen_norm) | specimen_norm == "", fname, specimen_norm + ))) %>% + dplyr::left_join(meta_norm, by = c("cohort_key", "fname_norm" = "Specimen_norm")) + + for (nm in c("Specimen","Patient","Tumor")) { + if (!nm %in% names(cdata)) cdata[[nm]] <- NA + cdata[[nm]] <- dplyr::coalesce(cdata[[nm]], cdata2[[nm]]) + } + + if (!"Specimen" %in% names(cdata)) cdata$Specimen <- NA_character_ + if (!"Patient" %in% names(cdata)) cdata$Patient <- NA_character_ + if (!"Tumor" %in% names(cdata)) cdata$Tumor <- NA_character_ + + cdata$Patient <- dplyr::coalesce(cdata$Patient, cdata$sample_id) + cdata$Tumor <- dplyr::coalesce(cdata$Tumor, cdata$tumor) + + make_specimen <- function(pid, tmr, cond) { + pid_clean <- pid + tmr_clean <- ifelse(is.na(tmr) | tmr == "", "", paste0("_", tmr)) + cond_clean <- ifelse(is.na(cond) | cond == "", "", paste0("_", cond)) + paste0(pid_clean, tmr_clean, cond_clean) + } + need_spec <- is.na(cdata$Specimen) | cdata$Specimen == "" + if (any(need_spec)) { + cdata$Specimen[need_spec] <- make_specimen( + pid = cdata$Patient[need_spec], + tmr = cdata$Tumor[need_spec], + cond = cdata$condition_norm[need_spec] + ) + } + } + + message(" - Example cdata rows:") + print(utils::head(cdata[, intersect(c( + "fname","aliquot","cohort","Specimen","Patient","Tumor", + "sample_id","tumor","condition_raw","condition_norm","specimen_norm" + ), names(cdata)), drop = FALSE], 10)) + + rn <- cdata$fname + cdata_nofname <- dplyr::select(cdata, -fname) + + rd <- S4Vectors::DataFrame(feature_id = rownames(mat)); rownames(rd) <- rd$feature_id + + se <- SummarizedExperiment::SummarizedExperiment( + assays = S4Vectors::SimpleList(values = mat), + rowData = rd, + colData = S4Vectors::DataFrame(cdata_nofname, row.names = rn) + ) + + message(sprintf(" - SE assay dims: %d feats × %d samples", nrow(se), ncol(se))) + message(sprintf(" - colData names: %s", paste(names(SummarizedExperiment::colData(se)), collapse = ", "))) + se +} + +scale_columns_modified_z <- function(m) { + out <- m + for (j in seq_len(ncol(m))) out[, j] <- modified_zscore(m[, j]) + out +} + +normalize_by_modality <- function(se, modality) { + mtype <- tolower(modality) + mat0 <- as.matrix(SummarizedExperiment::assay(se, "values")) + c0 <- colnames(mat0) + + message(" Normalizing modality = ", modality) + if (mtype == "phospho") { + mat0[mat0 == 0] <- NA_real_ + mat1 <- filter_by_missingness(mat0) + mlog <- log2(mat1 + 0.01) + mat2 <- scale_columns_modified_z(mlog) + } else if (mtype == "global") { + mlog <- log2(mat0) + mat2 <- scale_columns_modified_z(mlog) + } else if (mtype == "rna") { + mat1 <- filter_by_missingness(mat0) + mlog <- log2(mat1 + 1) + mat2 <- scale_columns_modified_z(mlog) + } else { + abort(paste0("Unknown modality: ", modality)) + } + + mat3 <- collapse_duplicate_features(mat2) + stopifnot(identical(colnames(mat3), c0)) + + rd <- S4Vectors::DataFrame(feature_id = rownames(mat3)); rownames(rd) <- rd$feature_id + se_out <- SummarizedExperiment::SummarizedExperiment( + assays = S4Vectors::SimpleList(values = mat3), + rowData = rd, + colData = SummarizedExperiment::colData(se)[colnames(mat3), , drop = FALSE] + ) + message(sprintf(" - Normalized assay dims: %d feats × %d samples", nrow(se_out), ncol(se_out))) + se_out +} + +# Function to Combine Data (batches) + +combine_batches_intersection <- function(se_list) { + message("Combining batches (intersection of features, then cbind samples)") + mats <- lapply(se_list, function(se) as.matrix(SummarizedExperiment::assay(se, "values"))) + feats <- Reduce(intersect, lapply(mats, rownames)) + feats <- feats[!is.na(feats) & feats != ""] + message(" - Intersection feature count: ", length(feats)) + if (length(feats) == 0) stop("No common features across batches after cleaning feature IDs.") + matsI <- lapply(seq_along(mats), function(i) { + m <- mats[[i]][feats, , drop = FALSE] + message(sprintf(" Batch %d: %d feats × %d samples after intersect", i, nrow(m), ncol(m))) + m + }) + matCB <- do.call(cbind, matsI) + + cd <- do.call(S4Vectors::rbind, lapply(se_list, SummarizedExperiment::colData)) + rd <- S4Vectors::DataFrame(feature_id = rownames(matCB)); rownames(rd) <- rd$feature_id + + se <- SummarizedExperiment( + assays = S4Vectors::SimpleList(values = matCB), + rowData = rd, + colData = cd + ) + message(sprintf(" - Combined assay dims: %d feats × %d samples", nrow(se), ncol(se))) + message(" - Head(sample names) in combined assay:") + print(utils::head(colnames(SummarizedExperiment::assay(se, "values")), 6)) + message(" - Head(rownames) in combined colData (should match):") + print(utils::head(rownames(SummarizedExperiment::colData(se)), 6)) + invisible(se) +} + +# Combat Function. (Lots of messages to help debug) + +combat_by_cohort <- function(se) { + message("Running ComBat by cohort (batch-only; mean.only = FALSE)") + suppressPackageStartupMessages(library(sva)) + + mat <- as.matrix(SummarizedExperiment::assay(se, "values")) + message(sprintf("Matrix dims before ComBat: %d features × %d samples", nrow(mat), ncol(mat))) + + n_bad <- sum(!is.finite(mat)) + if (n_bad > 0) message(" Replacing ", n_bad, " non-finite values with 0.") + mat[!is.finite(mat)] <- 0 + + cd <- as.data.frame(SummarizedExperiment::colData(se)) + cd <- cd[colnames(mat), , drop = FALSE] + if (!"cohort" %in% names(cd)) stop("colData must contain 'cohort' for ComBat batching.") + + batch <- droplevels(as.factor(cd$cohort)) + message(" Batch table (pre-drop):"); print(table(batch, useNA = "ifany")) + + keep <- !is.na(batch) + if (any(!keep)) { + message(" Dropping ", sum(!keep), " samples with NA cohort before ComBat.") + mat <- mat[, keep, drop = FALSE] + batch <- droplevels(batch[keep]) + cd <- cd[keep, , drop = FALSE] + } + + message(" Final check — ncol(mat)=", ncol(mat), "; length(batch)=", length(batch)) + message(" Batch table (final):"); print(table(batch, useNA = "ifany")) + + pre_by_cohort <- tapply(colMeans(mat), batch, sd) + message(" Pre-ComBat: SD of column means by cohort:"); print(pre_by_cohort) + + cb <- sva::ComBat(dat = mat, batch = batch, mean.only = FALSE, par.prior = TRUE) + + post_by_cohort <- tapply(colMeans(cb), batch, sd) + message(" Post-ComBat: SD of column means by cohort:"); print(post_by_cohort) + + SummarizedExperiment::assay(se, "values") <- cb + message(" Matrix dims after ComBat: ", nrow(cb), " × ", ncol(cb)) + invisible(se) +} + +# Plot Functions (PCA) + more debug messages + +se_to_long <- function(se, modality) { + feature_col <- if (tolower(modality) == "global") "Gene" else "feature_id" + + avals <- as.data.frame(SummarizedExperiment::assay(se, "values"), check.names = FALSE) + avals[[feature_col]] <- rownames(avals) + + long <- avals |> + tidyr::pivot_longer(cols = -all_of(feature_col), names_to = "fname", values_to = "correctedAbundance") + + cd <- coldata_tbl(se) + + if (!"aliquot" %in% names(cd)) cd$aliquot <- NA_real_ + if ("aliquot.x" %in% names(cd) || "aliquot.y" %in% names(cd)) { + cd$aliquot <- dplyr::coalesce(cd$aliquot, cd$aliquot.x, cd$aliquot.y) + } + + want <- c("fname","aliquot","cohort","Specimen","Patient","Tumor", + "fname_base","fname_stem", + "sample_id","tumor","condition_raw","condition_norm","specimen_norm") + have <- intersect(want, names(cd)) + long1 <- dplyr::left_join(long, cd[, have, drop = FALSE], by = "fname") + + message(" - se_to_long(): non-NA counts to Patient=", + sum(!is.na(long1$Patient)), "; Tumor=", sum(!is.na(long1$Tumor)), + "; Specimen=", sum(!is.na(long1$Specimen)), "; cohort=", sum(!is.na(long1$cohort))) + distinct(long1) +} + +pca_df_present_in_all <- function(se) { + message("Preparing PCA (features present in ALL samples)") + mat <- as.matrix(SummarizedExperiment::assay(se, "values")) + + keep_rows <- apply(mat, 1, function(r) all(is.finite(r))) + n_keep <- sum(keep_rows); n_all <- nrow(mat) + message(" - Kept ", n_keep, " / ", n_all, " features with complete data for PCA") + if (n_keep < 2) stop("Too few complete features for PCA after intersection filter.") + + pcs <- prcomp(t(mat[keep_rows, , drop = FALSE])) + + cd <- coldata_tbl(se) + df <- as.data.frame(pcs$x[, 1:2, drop = FALSE]) + df$fname <- rownames(df) + + df1 <- dplyr::left_join(df, cd, by = "fname") + + message(" - colData columns present: ", paste(setdiff(names(cd), c("fname","fname_base","fname_stem")), collapse = ", ")) + message(" - Non-NA counts in colData: Patient=", sum(!is.na(cd$Patient)), + "; Tumor=", sum(!is.na(cd$Tumor)), "; Specimen=", sum(!is.na(cd$Specimen)), + "; cohort=", sum(!is.na(cd$cohort))) + message(" - After join: n rows = ", nrow(df1)) + message(" - Non-NA counts after join: Patient=", sum(!is.na(df1$Patient)), + "; Tumor=", sum(!is.na(df1$Tumor)), "; Specimen=", sum(!is.na(df1$Specimen)), + "; cohort=", sum(!is.na(df1$cohort))) + + if (sum(!is.na(df1$Patient)) == 0) { + message(" ! Warning: Patient is NA for all samples after join. Will color/shape by cohort.") + df1$Patient_fallback <- as.character(df1$cohort) + df1$Tumor_fallback <- as.character(df1$cohort) + message(" - DEBUG: head(df1$fname):"); print(utils::head(df1$fname, 6)) + message(" - DEBUG: head(cd$fname):"); print(utils::head(cd$fname, 6)) + } + + if ("Specimen" %in% names(df1)) { + df1$Tumor <- stringr::str_remove(stringr::str_extract(df1$Specimen, "_T\\d+"), "^_") + } + + df1 +} + +plot_pca <- function(pc_df, title_text, pcols = NULL) { + color_col <- if ("Patient" %in% names(pc_df) && any(!is.na(pc_df$Patient))) { + "Patient" + } else if ("condition_norm" %in% names(pc_df) && any(!is.na(pc_df$condition_norm))) { + "condition_norm" + } else if ("tumor" %in% names(pc_df) && any(!is.na(pc_df$tumor))) { + "tumor" + } else { + "Patient_fallback" + } + + shape_col <- if ("Tumor" %in% names(pc_df) && any(!is.na(pc_df$Tumor))) { + "Tumor" + } else if ("tumor" %in% names(pc_df) && any(!is.na(pc_df$tumor))) { + "tumor" + } else { + "Tumor_fallback" + } + + g <- ggplot(pc_df, aes(PC1, PC2, col = .data[[color_col]])) + + geom_point(aes(shape = .data[[shape_col]]), size = 3) + + labs(title = title_text, color = color_col, shape = shape_col) + + theme_bw() + if (!is.null(pcols) && color_col == "Patient") g <- g + scale_color_manual(values = pcols) + print(g) + g +} + +plot_hist <- function(se, title_text) { + cd <- coldata_tbl(se) + df <- as.data.frame(SummarizedExperiment::assay(se, "values")) |> + tidyr::pivot_longer(everything(), names_to = "fname", values_to = "val") |> + dplyr::left_join(cd[, c("fname","cohort")], by = "fname") + g <- ggplot(df, aes(x = val, fill = as.factor(cohort))) + + geom_histogram(bins = 60, alpha = 0.9) + + labs(title = title_text, x = "Value", fill = "Cohort") + + theme_bw() + print(g) + g +} + +# Upload function + +perform_uploads <- function(paths, syn, parent_id) { + if (is.null(parent_id) || length(paths) == 0) return(invisible(NULL)) + message("All steps succeeded — uploading ", length(paths), " file(s) to Synapse…") + for (p in paths) { + fullp <- normalizePath(p, winslash = "/", mustWork = FALSE) + message(" Uploading: ", basename(p), " (", fullp, ")") + f <- syn$store(synapser::File(p, parentId = parent_id)) + message(" Uploaded: ", basename(p), " (local: ", fullp, ") to Synapse ID: ", f$properties$id) + } + message("Uploads complete.") +} + + + +# ------------------------------- Main entry ---------------------------------- +# This is how we call the function / pipeline + +run_modality <- function( + modality, + batches, + meta, + syn, + drop_name_substrings = NULL, + out_dir = ".", + out_prefix = NULL, + upload_parent_id = NULL, + pcols = NULL, + write_outputs = TRUE, # master toggle to write CSV/PDF & upload + save_basename = NULL, # override base name used in output files + do_batch_correct = TRUE # if FALSE, skip ComBat and use combined matrix +) { + message("==================================================") + message("Starting run_modality(): ", modality) + message("Output directory: ", normalizePath(out_dir, winslash = "/", mustWork = FALSE)) + if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE) + if (is.null(out_prefix)) out_prefix <- gsub("[^A-Za-z0-9]+", "_", tolower(modality)) + file_stem <- if (!is.null(save_basename) && nzchar(save_basename)) save_basename else out_prefix + + upload_queue <- character(0) + results <- NULL + + tryCatch({ + drop_name <- make_dropper(drop_name_substrings) + build_ids <- pick_builder(modality) + + # ---- Per-batch normalization ------------------------------------------------ + se_list <- vector("list", length(batches)) + for (i in seq_along(batches)) { + message("--------------------------------------------------") + message("Batch ", i, " of ", length(batches)) + b <- batches[[i]] + + wide <- read_wide_from_synapse(syn, b$syn_id) + + if (tolower(modality) == "phospho") { + message(" Preprocessing phospho table: drop blank Gene.Names; build site ids") + wide <- wide %>% dplyr::filter(!is.na(.data$Gene.Names), .data$Gene.Names != "") + } else if (tolower(modality) == "global") { + message(" Preprocessing global table: split multi-symbol rows (Genes by ';')") + wide <- tidyr::separate_rows(wide, Genes, sep = ";") |> + dplyr::mutate(Genes = trimws(Genes)) |> + dplyr::filter(!is.na(Genes) & Genes != "") + } else if (tolower(modality) == "rna") { + message(" Preprocessing RNA table: (using merged Salmon matrix; samples start after gene_id/gene_name)") + } + + feats <- build_ids(wide) + ok <- !is.na(feats) & feats != "" + if (!all(ok)) { + message(" Dropping ", sum(!ok), " empty/NA feature IDs before SE construction.") + wide <- wide[ok, , drop = FALSE] + feats <- feats[ok] + } + + fallback_col <- if (tolower(modality) == "rna") 3 else 5 + first_col <- if (!is.null(b$value_start_col)) b$value_start_col else detect_value_start_col(wide, fallback = fallback_col) + + fnmap <- parse_fnames(colnames(wide)[first_col:ncol(wide)], b$fname_aliquot_index, b$cohort) + if (tolower(modality) == "rna" && any(is.na(fnmap$aliquot))) { + message(" ! Note: ", sum(is.na(fnmap$aliquot)), " RNA sample(s) with NA aliquot after parsing (expected for RNA headers).") + } + + se0 <- make_se(wide, value_start_col = first_col, feature_ids = feats, + fnames_df = fnmap, meta = meta, drop_name = drop_name, modality = modality) + se_n <- normalize_by_modality(se0, modality) + se_list[[i]] <- se_n + + if (write_outputs) { + message(" Writing per-batch normalized long table (pre-ComBat)") + batch_long <- se_to_long(se_n, modality) + batch_tag <- paste0("batch", b$cohort) + batch_path <- file.path(out_dir, paste0(file_stem, "_", batch_tag, "_normalized_long.csv")) + readr::write_csv(batch_long, batch_path) + upload_queue <- c(upload_queue, batch_path) + } + } + + # ---- Combine (INTERSECTION) & pre-QC --------------------------------------- + message("--------------------------------------------------") + se_combined <- combine_batches_intersection(se_list) + + message(" Pre-QC plots (PCA & histogram) on combined (pre-ComBat)") + pre_pc_df <- pca_df_present_in_all(se_combined) + pre_pca <- plot_pca(pre_pc_df, paste0(modality, " samples"), pcols = pcols) + pre_hist <- plot_hist(se_combined, paste0(modality, ": value distribution (pre-ComBat)")) + if (write_outputs) { + pre_pca_pdf <- file.path(out_dir, paste0(file_stem, "_preComBat_PCA.pdf")) + pre_hist_pdf <- file.path(out_dir, paste0(file_stem, "_preComBat_Hist.pdf")) + ggsave(pre_pca_pdf, pre_pca, width = 7, height = 4.5, device = cairo_pdf) + ggsave(pre_hist_pdf, pre_hist, width = 7, height = 4.5, device = cairo_pdf) + upload_queue <- c(upload_queue, pre_pca_pdf, pre_hist_pdf) + } + + # ---- Optional ComBat (batch-only) ------------------------------------------ + if (isTRUE(do_batch_correct)) { + message("--------------------------------------------------") + se_post <- combat_by_cohort(se_combined) + post_suffix <- "_batchCorrected" + post_title <- paste0("Batch-corrected ", modality, " samples") + } else { + message("--------------------------------------------------") + message("Skipping ComBat per do_batch_correct=FALSE; using combined matrix as 'post'.") + se_post <- se_combined + post_suffix <- "_noBatchCorrect" + post_title <- paste0("Combined ", modality, " samples (no ComBat)") + } + + # ---- Exports ---------------------------------------------------------------- + message(" Building long tables") + long_pre <- se_to_long(se_combined, modality) |> + dplyr::filter(is.finite(correctedAbundance)) + long_post <- se_to_long(se_post, modality) + + if (write_outputs) { + path_pre <- file.path(out_dir, paste0(file_stem, "_preComBat_long.csv")) + path_post <- file.path(out_dir, paste0(file_stem, post_suffix, ".csv")) + write_csv(long_pre, path_pre) + write_csv(long_post, path_post) + upload_queue <- c(upload_queue, path_pre, path_post) + } + + # ---- Post (either ComBat or not) QC ---------------------------------------- + message(" Post-QC plots (PCA & histogram)") + pc_df <- pca_df_present_in_all(se_post) + gpca <- plot_pca(pc_df, post_title, pcols = pcols) + ghist <- plot_hist(se_post, paste0(modality, ": value distribution", ifelse(isTRUE(do_batch_correct), "", " (no ComBat)"))) + if (write_outputs) { + post_pca_pdf <- file.path(out_dir, paste0(file_stem, ifelse(isTRUE(do_batch_correct), "_PCA.pdf", "_PCA_noComBat.pdf"))) + post_hist_pdf <- file.path(out_dir, paste0(file_stem, ifelse(isTRUE(do_batch_correct), "_Hist.pdf", "_Hist_noComBat.pdf"))) + ggsave(post_pca_pdf, gpca, width = 7, height = 4.5, device = cairo_pdf) + ggsave(post_hist_pdf, ghist, width = 7, height = 4.5, device = cairo_pdf) + upload_queue <- c(upload_queue, post_pca_pdf, post_hist_pdf) + } + + # ---- Pack results for return ------------------------------------------------ + results <- list( + se_batches = se_list, + se_combined = se_combined, + se_corrected = if (isTRUE(do_batch_correct)) se_post else NULL, + se_post = se_post, # always populated (corrected or not) + did_combat = isTRUE(do_batch_correct), + long_pre = long_pre, + long_post = long_post, + pca_df_pre = pre_pc_df, + pca_df_post = pc_df, + plots = list(pre_pca = pre_pca, pre_hist = pre_hist, pca = gpca, hist = ghist), + files = if (write_outputs) list(queued = upload_queue) else list() + ) + + # ---- FINAL STEP: Uploads ---------------------------------------------------- + if (write_outputs && !is.null(upload_parent_id)) { + perform_uploads(upload_queue, syn, upload_parent_id) + } else if (!write_outputs) { + message("write_outputs=FALSE — skipping all writes/uploads.") + } else { + message("No upload_parent_id provided — skipping uploads.") + } + + }, error = function(e) { + message("ERROR: run_modality() failed for ", modality, ". No uploads were attempted.") + message(" ", conditionMessage(e)) + message("------- DEBUG SNAPSHOT -------") + message(" traceback:"); print(sys.calls()) + message(" sessionInfo():"); print(utils::sessionInfo()) + message("------- END DEBUG -----------") + stop(e) + }) + + message("run_modality() finished successfully for: ", modality) + results +} diff --git a/02_run_normalize_omics.Rmd b/02_run_normalize_omics.Rmd new file mode 100644 index 0000000..5c852ec --- /dev/null +++ b/02_run_normalize_omics.Rmd @@ -0,0 +1,165 @@ +--- +title: "run_normalize_omics" +author: "JJ" +date: "2025-11-04" +output: html_document +--- + +# Get Helper Scripts +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +library(synapser) +synLogin() + +syn <- list(get = synapser::synGet, store = synapser::synStore) + + +# Load helper metadata (your cNF_helper_code.R defines 'meta' and 'pcols') +source("cNF_helper_code.R") + +# Source the pipeline +source("02_normalize_batchcorrect_omics.R") + +``` + + +# Run batch correction / normalization across phospho, global, and rna samples. + +```{r} + +# --------------------------------------------------------------------------- +# run_modality() — quick reference for args & expected batch structure +# --------------------------------------------------------------------------- +# Args: +# modality : Character. One of "Global", "Phospho", or "rna". +# Controls feature-ID parsing and normalization rules. +# +# batches : List of per-cohort lists describing each input table. +# See “Batch spec” below. +# +# meta : Data frame with sample metadata. Must contain at least: +# - cohort (int or factor; matches batches[[i]]$cohort) +# - Specimen, Patient, Tumor (optional but used if present) +# For RNA, if aliquot joins fail, headers are matched to +# meta$Specimen using a normalized form. Generated by cNF_helper_code.R +# +# syn : An initialized synapser client (e.g., syn <- synapser::synLogin()). +# +# drop_name_substrings: Character vector of substrings. Any sample column whose +# name contains one of these (regex OR) is dropped. +# Use NULL or character(0) to keep all samples. +# +# out_dir : Directory where outputs/QC are written. Created if missing. +# +# out_prefix : String used in filenames (e.g., "global", "phospho", "rna"). +# If save_basename is NULL, this is also used as the basename. +# +# upload_parent_id : Synapse folder/entity ID to upload written files. +# If NULL, no uploads occur. +# +# pcols : Optional named color vector for plotting (e.g., by patient). +# If NULL, ggplot defaults are used. +# +# write_outputs : Logical. If TRUE, write CSVs/PDFs to out_dir (and upload +# when upload_parent_id is set). If FALSE, nothing is written/ +# uploaded; results are returned in-memory only. +# +# save_basename : Optional string to control the root of output filenames. +# If NULL, falls back to out_prefix. Useful when you want the +# directory name (out_dir) and the file basename to differ. +# +# do_batch_correct : Logical. If TRUE, applies ComBat across cohorts (batch-only). +# If FALSE, skips ComBat; filenames will include “_noBatchCorrect”. +# +# Returns: +# A list with: +# - se_batches : List of per-batch SummarizedExperiment objects (normalized). +# - se_combined : Combined SE after feature intersection. +# - se_corrected : Post-ComBat SE (or the same as combined if do_batch_correct=FALSE). +# - long_pre : Long-format data frame pre-ComBat (finite values only). +# - long_post : Long-format data frame post-ComBat (or pre if no ComBat). +# - pca_df_pre / pca_df_post : Data frames used for PCA scatter plots. +# - plots : ggplot objects for PCA & histograms. +# - files : Paths of any written files (if write_outputs=TRUE). +# +# +# --------------------------------------------------------------------------- + + +# Substrings to drop (These were the protocol optimization samples) +drop_subs <- c( + "cNF_organoid_DIA_G_02_11Feb25", + "cNF_organoid_DIA_G_05_11Feb25", + "cNF_organoid_DIA_G_06_11Feb25", + "cNF_organoid_DIA_P_02_29Jan25", + "cNF_organoid_DIA_P_05_11Feb25", + "cNF_organoid_DIA_P_06_11Feb25" +) + +# PHOSPHO BATCHES +phospho_batches <- list( + list(syn_id = "syn69963552", cohort = 1, value_start_col = 5, fname_aliquot_index = 8), + list(syn_id = "syn69947351", cohort = 2, value_start_col = 5, fname_aliquot_index = 9) +) + +# Run +phospho <- run_modality( + modality = "Phospho", + batches = phospho_batches, + meta = meta, + syn = syn, + drop_name_substrings = drop_subs, + out_dir = "phospho_test", + out_prefix = "phospho", + upload_parent_id = "syn70078365", + pcols = pcols, + write_outputs = FALSE, + save_basename = "Phospho_batch_corrected", + do_batch_correct = TRUE +) + +# GLOBAL BATCHES +global_batches <- list( + list(syn_id = "syn69947355", cohort = 1, value_start_col = 5, fname_aliquot_index = 6), + list(syn_id = "syn69947352", cohort = 2, value_start_col = 5, fname_aliquot_index = 7) +) + +global <- run_modality( + modality = "Global", + batches = global_batches, + meta = meta, + syn = syn, + drop_name_substrings = drop_subs, + out_dir = "global_test", + out_prefix = "global", + upload_parent_id = "syn70078365", + pcols = pcols, + write_outputs = FALSE, + save_basename = "Global_batch_corrected", + do_batch_correct = TRUE +) + + +# RNA BATCHES +rna_batches <- list( + list(syn_id = "syn66352931", cohort = 1, value_start_col = 5, fname_aliquot_index = 6), + list(syn_id = "syn70765053", cohort = 2, value_start_col = 5, fname_aliquot_index = 7) +) + +rna <- run_modality( + modality = "rna", + batches = rna_batches, + meta = meta, + syn = syn, + drop_name_substrings = drop_subs, + out_dir = "rna_test", + out_prefix = "rna", + upload_parent_id = "syn71099587", + pcols = pcols, + write_outputs = FALSE, + save_basename = "RNA_Matrix_no_batch_correct", + do_batch_correct = FALSE #Note this is set to false right now. Not needed for RNA +) + + +``` diff --git a/03_analyze_modality_correlations.R b/03_analyze_modality_correlations.R new file mode 100644 index 0000000..10fa56e --- /dev/null +++ b/03_analyze_modality_correlations.R @@ -0,0 +1,279 @@ +# analyze_modality.R +suppressPackageStartupMessages({ + library(dplyr) + library(tidyr) + library(tibble) + library(ggplot2) + library(pheatmap) +}) + +dir.create("figs", showWarnings = FALSE) + + +# Helpers + +make_feature_matrix <- function(df_long, shared_ids, sample_col, feature_col, value_col) { + # Strict cleaning: trim to character, drop NA/blank IDs, then pivot + df <- df_long %>% + ungroup() %>% + # keep only shared sample IDs first (as before) + dplyr::filter(.data[[sample_col]] %in% shared_ids) %>% + # coerce and trim IDs + mutate( + !!sample_col := trimws(as.character(.data[[sample_col]])), + !!feature_col := trimws(as.character(.data[[feature_col]])) + ) + + # Drop rows with NA/blank sample/feature IDs + bad_sample <- is.na(df[[sample_col]]) | df[[sample_col]] == "" + bad_feature <- is.na(df[[feature_col]]) | df[[feature_col]] == "" + if (any(bad_sample)) message("[make_feature_matrix] Dropping ", sum(bad_sample), " rows with NA/blank ", sample_col) + if (any(bad_feature)) message("[make_feature_matrix] Dropping ", sum(bad_feature), " rows with NA/blank ", feature_col) + df <- df[!(bad_sample | bad_feature), , drop = FALSE] + + if (!nrow(df)) { + warning("[make_feature_matrix] No rows left after cleaning; returning empty frame.") + out <- data.frame(check.names = FALSE) + return(out) + } + + # Pivot to wide + wide <- df %>% + dplyr::select(all_of(c(sample_col, feature_col, value_col))) %>% + tidyr::pivot_wider( + names_from = all_of(feature_col), + values_from = all_of(value_col), + values_fill = 0, + values_fn = mean + ) %>% + as.data.frame(check.names = FALSE) + + # Guard against NA/blank rownames after pivot + rn <- wide[[sample_col]] + bad_rn <- is.na(rn) | rn == "" + if (any(bad_rn)) { + message("[make_feature_matrix] Removing ", sum(bad_rn), " rows with NA/blank rownames after pivot.") + wide <- wide[!bad_rn, , drop = FALSE] + rn <- rn[!bad_rn] + } + + if (!nrow(wide)) { + warning("[make_feature_matrix] Wide table is empty after removing bad rownames; returning empty frame.") + out <- data.frame(check.names = FALSE) + return(out) + } + + # Finalize rownames (must be unique, non-empty) + rownames(wide) <- make.unique(as.character(rn), sep = "_dup") + wide[[sample_col]] <- NULL + wide +} + +make_drug_matrix <- function( + fits, metric = "uM_viability", + sample_col = "improve_sample_id", + drug_col = "improve_drug_id", + value_col = "dose_response_value", + metric_col = "dose_response_metric" +) { + fits %>% + dplyr::filter(.data[[metric_col]] == metric) %>% + dplyr::select(all_of(c(sample_col, drug_col, value_col))) %>% + tidyr::pivot_wider( + names_from = all_of(drug_col), + values_from = all_of(value_col), + values_fn = mean + ) %>% + tibble::column_to_rownames(sample_col) +} + +summarize_drugs <- function( + fits, metric = "uM_viability", metric_col = "dose_response_metric", + outdir = "figs", rotate_x = 45 +) { + ds <- fits %>% + dplyr::filter(.data[[metric_col]] == metric) %>% + group_by(.data$improve_drug_id) %>% + distinct() %>% + summarize( + meanResponse = mean(.data$dose_response_value, na.rm = TRUE), + nMeasured = n_distinct(.data$improve_sample_id), + variability = sd(.data$dose_response_value, na.rm = TRUE), + .groups = "drop" + ) + + p_eff <- ds %>% + arrange(desc(.data$meanResponse)) %>% + dplyr::filter(.data$meanResponse < 0.5) %>% + ggplot(aes(y = .data$meanResponse, x = .data$improve_drug_id, + colour = .data$nMeasured, size = .data$variability)) + + geom_point() + + theme_minimal() + + theme(axis.text.x = element_text(angle = rotate_x, hjust = 1)) + + labs(title = "Most efficacious drugs", + y = "Mean cell viability (fraction)", x = "Drug") + + p_var <- ds %>% + arrange(desc(.data$variability)) %>% + dplyr::filter(.data$variability > 0.15) %>% + ggplot(aes(y = .data$meanResponse, x = .data$improve_drug_id, + colour = .data$nMeasured, size = .data$variability)) + + geom_point() + + theme_minimal() + + theme(axis.text.x = element_text(angle = rotate_x, hjust = 1)) + + labs(title = "Most variable drugs", + y = "Mean cell viability (fraction)", x = "Drug") + + ggsave(file.path(outdir, "most_efficacious.pdf"), p_eff, width = 12, height = 8, dpi = 300) + ggsave(file.path(outdir, "most_variable.pdf"), p_var, width = 12, height = 8, dpi = 300) + + list(summary = ds, p_eff = p_eff, p_var = p_var) +} + +compute_cors <- function(drug_mat, feat_mat, shared_samples = NULL) { + if (is.null(shared_samples)) { + shared_samples <- base::intersect(rownames(drug_mat), rownames(feat_mat)) + } + if (length(shared_samples) == 0L) { + return(tibble( + drug = character(), feature = character(), + cor = numeric(), pval = numeric(), fdr = numeric(), + direction = character() + )) + } + + drug_mat <- drug_mat[shared_samples, , drop = FALSE] + feat_mat <- feat_mat[shared_samples, , drop = FALSE] + + cres <- suppressWarnings( + stats::cor(drug_mat, feat_mat, use = "pairwise.complete.obs", method = "spearman") + ) %>% + as.data.frame() %>% + tibble::rownames_to_column("drug") %>% + tidyr::pivot_longer(cols = - "drug", names_to = "feature", values_to = "cor") + + csig <- do.call(rbind, lapply(colnames(drug_mat), function(d) { + do.call(rbind, lapply(colnames(feat_mat), function(f) { + dv <- drug_mat[, d]; fv <- feat_mat[, f] + p <- NA_real_ + if (sum(is.finite(dv) & is.finite(fv)) >= 3) { + p <- tryCatch( + stats::cor.test(dv, fv, method = "spearman", use = "pairwise.complete.obs")$p.value, + error = function(e) NA_real_ + ) + } + c(drug = d, feature = f, pval = p) + })) %>% + as.data.frame() + })) %>% + as.data.frame() %>% + mutate(pval = as.numeric(.data$pval)) %>% + mutate(fdr = p.adjust(.data$pval, method = "BH")) + + left_join(cres, csig, by = c("drug","feature")) %>% + mutate(direction = ifelse(.data$cor < 0, "neg", "pos")) +} + +summarize_correlated_features <- function(cor_tbl, fdr_thresh = 0.25, outdir = "figs") { + if (nrow(cor_tbl) == 0L) return(list(summary = tibble(), plot = NULL)) + corsummary <- cor_tbl %>% + dplyr::filter(is.finite(.data$fdr), !is.na(.data$fdr), .data$fdr < fdr_thresh) %>% + mutate(direction = ifelse(.data$cor > 0, "pos", "neg")) %>% + group_by(.data$drug, .data$direction) %>% + summarize(features = n(), meanCor = mean(.data$cor), .groups = "drop") + + if (nrow(corsummary) == 0L) return(list(summary = corsummary, plot = NULL)) + + p <- corsummary %>% + dplyr::filter(.data$features > 1) %>% + ggplot(aes(x = .data$drug, y = .data$features, fill = .data$direction)) + + geom_col(position = "dodge") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + + labs(title = paste0("Significant feature counts per drug (FDR < ", fdr_thresh, ")"), + x = "Drug", y = "# Features") + + ggsave(file.path(outdir, "cor_features_by_drug.pdf"), p, width = 12, height = 6, units = "in") + list(summary = corsummary, plot = p) +} + +# --------------------------- +# Main wrapper +# --------------------------- +# Returns: list(drug_mat, feat_mat, shared_ids, cor_tbl, cor_summary, cor_plot, drug_summary) +analyze_modality <- function( + fits, + df_long, + sample_col, # e.g., "Specimen" + feature_col, # e.g., "feature_id" | "Gene" | "site" + value_col, # e.g., "correctedAbundance" + metric = "uM_viability", + outdir = "figs", + heatmap_filename = "drug_heatmap_large.pdf", + fdr_thresh = 0.25 +) { + # Pre-intersection like original (all metrics) + shared_ids <- base::intersect(unique(fits$improve_sample_id), unique(df_long[[sample_col]])) + + # Feature matrix over shared IDs + feat_mat <- make_feature_matrix( + df_long = df_long, + shared_ids = shared_ids, + sample_col = sample_col, + feature_col = feature_col, + value_col = value_col + ) + + # Drug matrix for the exact metric (no normalization/fallbacks) + drug_mat <- make_drug_matrix( + fits = fits, + metric = metric, + sample_col = "improve_sample_id", + drug_col = "improve_drug_id", + value_col = "dose_response_value", + metric_col = "dose_response_metric" + ) + + # Summaries & heatmap (original style) + dsum <- summarize_drugs( + fits, metric = metric, metric_col = "dose_response_metric", outdir = outdir + ) + + if (!is.null(heatmap_filename) && nrow(drug_mat) > 0 && ncol(drug_mat) > 0) { + fulldrugs <- dsum$summary %>% + dplyr::filter(.data$nMeasured == nrow(drug_mat)) %>% + pull(.data$improve_drug_id) + + subm <- drug_mat[, colnames(drug_mat) %in% fulldrugs, drop = FALSE] + if (nrow(subm) > 1 && ncol(subm) > 0) { + pheatmap::pheatmap( + as.matrix(subm), + filename = file.path(outdir, heatmap_filename), + width = 28, height = 16, + angle_col = 45, fontsize_col = 6, + cluster_rows = TRUE, cluster_cols = TRUE, + show_rownames = TRUE, show_colnames = TRUE + ) + } + } + + # Correlations (only if overlap) + shared_after <- base::intersect(rownames(drug_mat), rownames(feat_mat)) + cor_tbl <- if (length(shared_after) > 0L) { + compute_cors(drug_mat, feat_mat, shared_samples = shared_after) + } else { + tibble(drug = character(), feature = character(), cor = numeric(), + pval = numeric(), fdr = numeric(), direction = character()) + } + cor_res <- summarize_correlated_features(cor_tbl, fdr_thresh = fdr_thresh, outdir = outdir) + + list( + drug_mat = drug_mat, + feat_mat = feat_mat, + shared_ids = shared_ids, + cor_tbl = cor_tbl, + cor_summary = cor_res$summary, + cor_plot = cor_res$plot, + drug_summary = dsum$summary + ) +} diff --git a/04_analyze_modality_and_pathway_enrich.Rmd b/04_analyze_modality_and_pathway_enrich.Rmd new file mode 100644 index 0000000..c55b038 --- /dev/null +++ b/04_analyze_modality_and_pathway_enrich.Rmd @@ -0,0 +1,114 @@ +--- +title: "run_analyze_modality_and_enrichment" +author: "JJ" +date: "2025-11-12" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) + + +source("cNF_helper_code.R") +source("03_analyze_modality_correlations.R") +source("04_leapr_biomarker.R") +``` + + +```{r} + +drugs <- readr::read_tsv(synGet("syn69947322")$path) + +#RNA +rlong <- readr::read_csv(synGet('syn71333780')$path) +#Global +glong <- readr::read_csv(synGet('syn70078416')$path) +#Phospho +plong <- readr::read_csv(synGet('syn70078415')$path) + +corr_rna <- analyze_modality( + fits = drugs, + df_long = rlong, + sample_col = "Specimen", + feature_col = "feature_id", + value_col = "correctedAbundance", + metric = "uM_viability", + heatmap_filename = "rna_drug_heatmap_large_viability.pdf", + outdir = "new_figs" +) + +corr_global <- analyze_modality( + fits = drugs, + df_long = glong, + sample_col = "Specimen", + feature_col = "Gene", + value_col = "correctedAbundance", + metric = "uM_viability", + heatmap_filename = "global_drug_heatmap_large_viability.pdf", + outdir = "new_figs" +) + +corr_phospho <- analyze_modality( + fits = drugs, # syn69947322 TSV already read + df_long = plong, # your RNA or proteo/phospho long table + sample_col = "Specimen", + feature_col = "site", + value_col = "correctedAbundance", + metric = "uM_viability", + heatmap_filename = "phospho_drug_heatmap_large_viability.pdf", + outdir = "new_figs" +) + +``` + + +```{r warning=FALSE} + +res_bio_rna <- run_leapr_directional_one_cached( + drugs = drugs, + df_long = rlong, + sample_col = "Specimen", + feature_col = "feature_id", + value_col = "correctedAbundance", + omic_label = "rna", + cache_path = "leapR_RNA_krbpaths_enrichment_direction_split.Rdata", + write_csvs = TRUE, + always_rerun = FALSE, + test_one = FALSE, + geneset_name = "krbpaths" +) + + +res_bio_global <- run_leapr_directional_one_cached( + drugs = drugs, + df_long = glong, + sample_col = "Specimen", + feature_col = "Gene", + value_col = "correctedAbundance", + omic_label = "global", + cache_path = "leapR_Global_krbpaths_enrichment_direction_split.Rdata", + write_csvs = TRUE, + always_rerun = FALSE, + test_one = FALSE, + geneset_name = "krbpaths" +) + + +res_bio_phospho <- run_leapr_directional_one_cached( + drugs = drugs, + df_long = plong, + sample_col = "Specimen", + feature_col = "site", + value_col = "correctedAbundance", + omic_label = "phospho", + cache_path = "leapR_Phospho_kinasesubstrates_enrichment_direction_split.Rdata", + write_csvs = TRUE, + always_rerun = FALSE, + test_one = FALSE, + geneset_name = "kinasesubstrates" +) + + + + +``` diff --git a/04_leapr_biomarker.R b/04_leapr_biomarker.R new file mode 100644 index 0000000..f679d82 --- /dev/null +++ b/04_leapr_biomarker.R @@ -0,0 +1,467 @@ +# leapr_biomarker.R + +suppressPackageStartupMessages({ + library(dplyr) + library(tidyr) + library(readr) + library(stringr) + library(tibble) + library(SummarizedExperiment) + library(S4Vectors) + library(leapR) + library(ggplot2) + library(grDevices) +}) + +# ----------------------------- +# Helpers +# ----------------------------- +# Long to wide matrix (rows = samples, cols = features) +# df_long columns: +# sample_col : sample IDs (e.g., "Specimen") +# feature_col : feature IDs (e.g., "feature_id", "Gene", "site") depending on input group +# value_col : numeric values (e.g., "correctedAbundance") +long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { + if (is.null(df_long) || !nrow(df_long)) return(NULL) + + df <- df_long |> + dplyr::mutate( + !!sample_col := trimws(as.character(.data[[sample_col]])), + !!feature_col := trimws(as.character(.data[[feature_col]])) + ) + + # Drop NA/blank sample or feature IDs with messages + bad_sample <- is.na(df[[sample_col]]) | df[[sample_col]] == "" + bad_feature <- is.na(df[[feature_col]]) | df[[feature_col]] == "" + n_bad_s <- sum(bad_sample, na.rm = TRUE) + n_bad_f <- sum(bad_feature, na.rm = TRUE) + if (n_bad_s > 0) message("[long_to_matrix] Dropping ", n_bad_s, " rows with NA/blank ", sample_col) + if (n_bad_f > 0) message("[long_to_matrix] Dropping ", n_bad_f, " rows with NA/blank ", feature_col) + df <- df[!(bad_sample | bad_feature), , drop = FALSE] + + if (!nrow(df)) { + warning("[long_to_matrix] No rows left after removing NA/blank sample/feature IDs.") + return(NULL) + } + + # Pivot to wide + wide <- df %>% + dplyr::select( + !!rlang::sym(sample_col), + !!rlang::sym(feature_col), + !!rlang::sym(value_col) + ) %>% + tidyr::pivot_wider( + names_from = !!rlang::sym(feature_col), + values_from = !!rlang::sym(value_col), + values_fill = 0, + values_fn = mean + ) %>% + as.data.frame(check.names = FALSE) + + rn <- wide[[sample_col]] + bad_rn <- is.na(rn) | rn == "" + if (any(bad_rn)) { + message("[long_to_matrix] Removing ", sum(bad_rn), " rows with NA/blank rownames after pivot.") + wide <- wide[!bad_rn, , drop = FALSE] + rn <- rn[!bad_rn] + } + if (!nrow(wide)) { + warning("[long_to_matrix] Wide table is empty after cleaning.") + return(NULL) + } + + rownames(wide) <- make.unique(as.character(rn), sep = "_dup") + wide[[sample_col]] <- NULL + as.matrix(wide) +} + + + +# ---- PHOSPHO +.extract_gene_from_site <- function(site_id) { + if (is.na(site_id) || site_id == "") return(NA_character_) + x <- as.character(site_id) + + # Get chars before first '-' (e.g., "AAAS-S495s" -> "AAAS") + gene <- sub("^([^\\-]+)-.*$", "\\1", x, perl = TRUE) + + # If no '-' present, fall back to splitting on common delimiters + if (identical(gene, x)) { + parts <- strsplit(x, "[|:_\\-\\.]", fixed = FALSE)[[1]] + gene <- parts[1] + } + gene <- sub("^([A-Za-z0-9]+).*", "\\1", gene) + gene <- toupper(gene) + if (nchar(gene) == 0) return(NA_character_) + gene +} + +# Build phospho site gene map from long table +.build_phospho_gene_map_from_long <- function(df_long, feature_col) { + gene_cols <- c("Gene","gene","hgnc_id","hgnc_symbol","protein","Protein","Symbol","symbol") + has <- gene_cols[gene_cols %in% colnames(df_long)] + if (length(has)) { + gcol <- has[[1]] + mp <- df_long %>% + dplyr::select(!!rlang::sym(feature_col), !!rlang::sym(gcol)) %>% + dplyr::rename(site = !!rlang::sym(feature_col), gene = !!rlang::sym(gcol)) %>% + dplyr::mutate(site = trimws(as.character(site)), + gene = toupper(trimws(as.character(gene)))) %>% + dplyr::filter(!is.na(site), site != "", !is.na(gene), gene != "") %>% + dplyr::distinct(site, gene) + if (nrow(mp)) return(setNames(mp$gene, mp$site)) + } + sites <- unique(trimws(as.character(df_long[[feature_col]]))) + sites <- sites[!is.na(sites) & sites != ""] + if (!length(sites)) return(NULL) + genes <- vapply(sites, .extract_gene_from_site, FUN.VALUE = character(1)) + genes[genes == ""] <- NA_character_ + setNames(genes, sites) +} + + +.collapse_sites_to_genes <- function(cor_named_vec, map_site2gene, agg = c("mean","maxabs")) { + agg <- match.arg(agg) + if (is.null(map_site2gene) || !length(cor_named_vec)) return(cor_named_vec) + + # Align and drop unmapped + genes <- map_site2gene[names(cor_named_vec)] + keep <- !is.na(genes) & genes != "" + v <- cor_named_vec[keep] + g <- genes[keep] + if (!length(v)) return(setNames(numeric(0), character(0))) + + if (agg == "mean") { + # mean per gene + df <- tibble(gene = g, val = as.numeric(v)) %>% + group_by(gene) %>% summarise(val = mean(val, na.rm = TRUE), .groups = "drop") + out <- stats::setNames(df$val, df$gene) + } else { + # max by absolute value, keep sign + df <- tibble(gene = g, val = as.numeric(v)) %>% + mutate(ord = order(-abs(val))) %>% + group_by(gene) %>% + slice_max(order_by = abs(val), n = 1, with_ties = FALSE) %>% + ungroup() + out <- stats::setNames(df$val, df$gene) + } + out +} + +# ---- Normalize phosphosite IDs to match kinasesubstrates (e.g. "AAAS-S495s" -> "AAAS-S495") +.normalize_kinase_site_id <- function(x) { + x <- as.character(x) + x <- trimws(x) + # Drop trailing lowercase letters + sub("[a-z]+$", "", x) +} + +# Spearman correlations +.col_spearman <- function(vec, mat) { + shared <- intersect(names(vec), rownames(mat)) + if (length(shared) < 3) return(setNames(rep(NA_real_, ncol(mat)), colnames(mat))) + v <- vec[shared] + m <- as.matrix(mat[shared, , drop = FALSE]) + apply(m, 2, function(col) { + if (all(is.na(col))) return(NA_real_) + if (sd(col, na.rm = TRUE) == 0 || sd(v, na.rm = TRUE) == 0) return(NA_real_) + suppressWarnings(cor(v, col, method = "spearman", use = "pairwise.complete.obs")) + }) +} + +# Build SummarizedExperiment to feed into leapR +.build_se_from_corvec <- function(cor_named_vec, features_all, col_label, + map_to_gene = NULL, assay_label = "proteomics") { + v <- rep(NA_real_, length(features_all)); names(v) <- features_all + common <- intersect(names(cor_named_vec), features_all) + v[common] <- cor_named_vec[common] + mat <- matrix(v, nrow = length(v), ncol = 1, dimnames = list(features_all, col_label)) + rd <- S4Vectors::DataFrame(feature_id = features_all) + rd$hgnc_id <- if (is.null(map_to_gene)) features_all else map_to_gene[features_all] + se <- SummarizedExperiment::SummarizedExperiment( + assays = list(values = mat), + rowData = rd, + colData = S4Vectors::DataFrame(sample = col_label) + ) + SummarizedExperiment::assayNames(se) <- assay_label + se +} + +.safe_leapr <- function(...) { + tryCatch(leapR::leapR(...), + error = function(e) { message("[leapR] ", conditionMessage(e)); NULL }) +} + +# Load a leapR built-in geneset by name +.load_leapr_geneset_by_name <- function(name) { + valid <- c("kinasesubstrates", "ncipid", "krbpaths", "longlist", "shortlist") + if (!(name %in% valid)) { + stop("Unknown geneset name: '", name, "'. Valid: ", paste(valid, collapse = ", ")) + } + suppressWarnings(utils::data(list = name, package = "leapR", envir = environment())) + if (!exists(name, inherits = FALSE)) { + stop("leapR dataset '", name, "' not found in the installed {leapR}.") + } + get(name, inherits = FALSE) +} + +# Decide default geneset from omic label when no override is provided +.default_geneset_for_omic <- function(omic_label) { + ol <- tolower(omic_label) + if (ol %in% c("phospho","phosphoproteomics","phosphoprotein","phosphoproteome")) { + .load_leapr_geneset_by_name("kinasesubstrates") + } else { + .load_leapr_geneset_by_name("krbpaths") + } +} + +# ----------------------------- +# Main +# ----------------------------- +run_leapr_directional_one_cached <- function(drugs, + df_long, + sample_col, + feature_col, + value_col, + omic_label, + cache_path, + write_csvs = FALSE, + always_rerun = FALSE, + min_features = 5, + test_one = FALSE, + geneset_name = NULL, + geneset_object = NULL) { + + # cache check! If the cached value exists, stop there. + if (!always_rerun && is.character(cache_path) && nzchar(cache_path) && file.exists(cache_path)) { + load(cache_path) # loads res_list + if (exists("res_list")) return(res_list) + } + + # pivot long to matrix + feat_mat <- long_to_matrix(df_long, sample_col, feature_col, value_col) + if (is.null(feat_mat) || !nrow(feat_mat) || !ncol(feat_mat)) { + warning("[run_leapr_directional_one_cached] Empty feature matrix after pivot; returning empty list.") + return(list()) + } + + # pick geneset + if (!is.null(geneset_object)) { + geneset_db <- geneset_object + } else if (!is.null(geneset_name)) { + geneset_db <- .load_leapr_geneset_by_name(geneset_name) + } else { + geneset_db <- .default_geneset_for_omic(omic_label) + } + + # optional phospho site gene mapping + map_site2gene <- NULL + is_phospho <- tolower(omic_label) %in% c("phospho","phosphoproteomics","phosphoprotein","phosphoproteome") + if (is_phospho) { + map_site2gene <- .build_phospho_gene_map_from_long(df_long, feature_col) + if (is.null(map_site2gene) || !length(map_site2gene)) { + phos_features <- colnames(feat_mat) + map_site2gene <- setNames( + vapply(phos_features, .extract_gene_from_site, FUN.VALUE = character(1)), + phos_features + ) + } + # Print statements + feats <- colnames(feat_mat) + mapped <- map_site2gene[feats] + n_mapped <- sum(!is.na(mapped) & mapped != "") + message(sprintf("[phospho mapping] %d/%d sites mapped to gene symbols (%.1f%%)", + n_mapped, length(feats), 100 * n_mapped / max(1, length(feats)))) + if (n_mapped < length(feats)) { + unm <- feats[is.na(mapped) | mapped == ""] + if (length(unm)) { + show_n <- min(5L, length(unm)) + message("[phospho mapping] Unmapped examples: ", + paste(utils::head(unm, show_n), collapse = ", "), + if (length(unm) > show_n) paste0(" ... +", length(unm) - show_n, " more") else "") + } + } + } + + # For phospho, detect when we are using site-level kinase substrates + uses_kinase_sites <- is_phospho && { + if (!is.null(geneset_name)) { + identical(geneset_name, "kinasesubstrates") + } else { + identical(geneset_db, .load_leapr_geneset_by_name("kinasesubstrates")) + } + } + + # If using kinasesubstrates, normalize column names so they match site IDs + if (uses_kinase_sites) { + old_sites <- colnames(feat_mat) + norm_sites <- .normalize_kinase_site_id(old_sites) + if (!identical(old_sites, norm_sites)) { + message("[kinasesubstrates] Normalizing phosphosite IDs (e.g. 'AAAS-S495s' -> 'AAAS-S495')") + colnames(feat_mat) <- make.unique(norm_sites) + } + } + + res_list <- list() + out_csv_dir <- file.path("leapR_top_paths", "dir_split") + if (write_csvs && !dir.exists(out_csv_dir)) dir.create(out_csv_dir, recursive = TRUE) + + all_drugs <- unique(drugs$improve_drug_id) + if (test_one && length(all_drugs) > 0) { + message("[run_leapr_directional_one_cached] test_one=TRUE then running only the first drug: ", all_drugs[[1]]) + all_drugs <- all_drugs[[1]] + } else { + all_drugs <- sort(all_drugs) + } + + total <- length(all_drugs) + for (i in seq_along(all_drugs)) { + drug <- all_drugs[[i]] + message(sprintf("[%-3d/%-3d] %s", i, total, drug)) + + # mean response per sample for uM_viability + dv <- drugs %>% + dplyr::filter(.data$improve_drug_id == !!drug, + .data$dose_response_metric == "uM_viability") %>% + dplyr::group_by(.data$improve_sample_id) %>% + dplyr::summarise(resp = mean(.data$dose_response_value, na.rm = TRUE), + .groups = "drop") + + if (!nrow(dv)) { + message(" No response rows for metric 'uM_viability'; skipping.") + next + } + dv_vec <- stats::setNames(dv$resp, dv$improve_sample_id) + + # correlations at site-level (or normalized site-level for kinasesubstrates) + cors <- .col_spearman(dv_vec, feat_mat) + pos <- cors[!is.na(cors) & cors > 0] # resistant (TOP) + neg <- cors[!is.na(cors) & cors < 0] # sensitive (BOTTOM; flip) + message(sprintf(" Features (site-level): pos=%d, neg=%d (min_features=%d)", + length(pos), length(neg), min_features)) + + if (uses_kinase_sites && i == 1) { + ks_sites <- unique(unlist(geneset_db[["matrix"]])) + ov <- intersect(names(cors), ks_sites) + message("[kinasesubstrates] Overlapping sites with geneset (first drug): ", length(ov)) + if (length(ov)) { + message(" Example overlaps: ", paste(utils::head(ov, 5), collapse = ", ")) + } + } + + # For phospho + GENE-LEVEL sets, collapse site to gene before SE + if (is_phospho && !uses_kinase_sites) { + pos <- .collapse_sites_to_genes(pos, map_site2gene, agg = "mean") + neg <- .collapse_sites_to_genes(neg, map_site2gene, agg = "mean") + message(sprintf(" Gene-level: pos=%d, neg=%d", length(pos), length(neg))) + } + + res_list[[drug]] <- list(top = NULL, bottom = NULL) + + # TOP (resistant) + if (length(pos) >= min_features) { + feats_top <- names(pos) + se_top <- .build_se_from_corvec( + cor_named_vec = pos, + features_all = feats_top, + col_label = paste0(drug, "_TOP"), + map_to_gene = if (is_phospho) NULL else NULL, # not needed when features are genes/sites + assay_label = omic_label + ) + top_res <- .safe_leapr( + geneset = geneset_db, + enrichment_method = "enrichment_in_order", + eset = se_top, + assay_name = omic_label, + primary_columns = paste0(drug, "_TOP"), + id_column = NULL + ) + res_list[[drug]]$top <- top_res + message(" TOP (resistant): ", if (is.null(top_res)) "no result" else "OK") + if (write_csvs && !is.null(top_res)) { + utils::write.csv(as.data.frame(top_res), + file = file.path(out_csv_dir, paste0(drug, "_", omic_label, "_TOP.csv")), + row.names = FALSE) + } + } else { + message(" TOP (resistant): skipped (too few positive features)") + } + + # BOTTOM (sensitive) + if (length(neg) >= min_features) { + # flip sign so strong negatives rank to top + neg_flip <- -neg + feats_bot <- names(neg_flip) + se_bot <- .build_se_from_corvec( + cor_named_vec = neg_flip, + features_all = feats_bot, + col_label = paste0(drug, "_BOTTOM"), + map_to_gene = if (is_phospho) NULL else NULL, + assay_label = omic_label + ) + bot_res <- .safe_leapr( + geneset = geneset_db, + enrichment_method = "enrichment_in_order", + eset = se_bot, + assay_name = omic_label, + primary_columns = paste0(drug, "_BOTTOM"), + id_column = NULL + ) + res_list[[drug]]$bottom <- bot_res + message(" BOTTOM(sensitive): ", if (is.null(bot_res)) "no result" else "OK") + if (write_csvs && !is.null(bot_res)) { + utils::write.csv(as.data.frame(bot_res), + file = file.path(out_csv_dir, paste0(drug, "_", omic_label, "_BOTTOM.csv")), + row.names = FALSE) + } + } else { + message(" BOTTOM(sensitive): skipped (too few negative features)") + } + + if (isTRUE(test_one)) break + } + + if (is.character(cache_path) && nzchar(cache_path)) { + save(res_list, file = cache_path) + } + res_list +} + +# ----------------------------- +# Plot and save using leapR builtin plotter +# ----------------------------- +save_leapr_plots <- function(res_list, omic_label, top_n = 15) { + if (!length(res_list)) return(invisible(NULL)) + dir.create("figs", showWarnings = FALSE) + safelabel <- function(x) gsub("[^A-Za-z0-9_.-]", "_", x) + + for (drug in names(res_list)) { + two <- res_list[[drug]] + + # TOP (resistant) + if (!is.null(two$top)) { + p_top <- leapR::plot_leapr_bar(two$top, + title = paste0(drug, " — ", omic_label, " (Resistant)"), + top_n = top_n) + if (!is.null(p_top)) { + fn <- file.path("figs", paste0("pathways_", safelabel(drug), "_", omic_label, + "_resistant_top", top_n, ".pdf")) + ggplot2::ggsave(fn, p_top, width = 7, height = 5, device = grDevices::cairo_pdf) + } + } + + # BOTTOM (Sensitive) + if (!is.null(two$bottom)) { + p_bot <- leapR::plot_leapr_bar(two$bottom, + title = paste0(drug, " — ", omic_label, " (Sensitive)"), + top_n = top_n) + if (!is.null(p_bot)) { + fn <- file.path("figs", paste0("pathways_", safelabel(drug), "_", omic_label, + "_sensitive_top", top_n, ".pdf")) + ggplot2::ggsave(fn, p_bot, width = 7, height = 5, device = grDevices::cairo_pdf) + } + } + } + invisible(NULL) +} diff --git a/cNF_helper_code.R b/cNF_helper_code.R index 0670c6c..711858a 100644 --- a/cNF_helper_code.R +++ b/cNF_helper_code.R @@ -1,3 +1,4 @@ +#cNF_helper_code.R ##standard metadata across all cNFs, including colors if possible From 6737340066e9b7dc7d095e051b92691d50f743f0 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Mon, 24 Nov 2025 14:46:09 -0800 Subject: [PATCH 2/7] clean up --- 02_normalize_harmonize_proteomics.html | 841 ---------------- 02_normalize_harmonize_proteomics.rmd | 527 ---------- 03_drug_biomarkers_legacy_code.Rmd | 1265 ++++++++++++++++++++++++ cNF_Analysis_and_fig_descriptions.docx | Bin 0 -> 27471 bytes 4 files changed, 1265 insertions(+), 1368 deletions(-) delete mode 100644 02_normalize_harmonize_proteomics.html delete mode 100644 02_normalize_harmonize_proteomics.rmd create mode 100644 03_drug_biomarkers_legacy_code.Rmd create mode 100644 cNF_Analysis_and_fig_descriptions.docx diff --git a/02_normalize_harmonize_proteomics.html b/02_normalize_harmonize_proteomics.html deleted file mode 100644 index 803d592..0000000 --- a/02_normalize_harmonize_proteomics.html +++ /dev/null @@ -1,841 +0,0 @@ - - - - - - - - - - - - - - - -Reformat and process proteomics - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Normalize phospho-proteomics

-

We now have phospho proteomics from two cohorts. Here I’m trying to -collect data from both and normalize but am clearly missing something. I -do the following:

-
    -
  1. replace all zero values with NA to avoid skewing normalization
  2. -
  3. remove any features that are absent from >50% of the samples
  4. -
  5. take the log of the data, then take a modified z score
  6. -
-

Each dataset is done individually then combined at the end. There is -a clear batch effect.

-
-

Cohort 1 phospho

-

We start with the cohort 1 phospho data here.

-
##cohort 1 phospho
-##first we read in file, and get site info
-phospho1<- read.table(syn$get('syn69963552')$path,sep='\t',fill=NA,header=T,quote='"') |>
-  subset(!is.na(`Gene.Names`)) |>
-  subset(Gene.Names!='') |>
-  mutate(lsite=tolower(Residue)) |>
-  tidyr::unite(c(Residue,Site,lsite),col='site',sep='') |>
-  tidyr::unite(c(`Gene.Names`,site),col='site',sep='-') |>
-  as.data.frame()
-
-phospho1[which(phospho1==0,arr.ind=TRUE)]<-NA
-
-pfnames1 <- data.frame(fname=colnames(phospho1)[5:ncol(phospho1)])|>
-  mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[8]))|>
-  mutate(aliquot=as.double(aliquot))|>
-  mutate(cohort=1)
- 
-##logtransform##median transform
-pzeros<-which(apply(phospho1[,5:ncol(phospho1)],1,function(x)
-    length(which(is.na(x)))/length(x) < 0.5))
-
-pmat1<-apply(0.01+log2(phospho1[pzeros,5:ncol(phospho1)]),2,
-             function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |>
-  as.data.frame() |>
-  mutate(site=phospho1$site[pzeros])
-
-##move to long form, upload
-plong1<-pmat1|>
-  tidyr::pivot_longer(1:(ncol(pmat1)-1),names_to='fname',values_to='abundance')|>
-  left_join(pfnames1) |>
-  group_by(site,fname,aliquot,cohort) |>
-  summarize(meanAbundance=mean(abundance,na.rm=T))|>
-  subset(!is.na(meanAbundance))|>
-  left_join(meta)
-
## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'site', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-
readr::write_csv(plong1,file='log2normMedCenteredPhospho.csv')
-syn$store(File('log2normMedCenteredPhospho.csv',parentId='syn70078365'))
-
## File(id='syn65598472', synapseStore=True, modifiedOn='2025-10-07T16:10:10.287Z', dataFileHandleId='163828412', versionNumber=51, name='log2normMedCenteredPhospho.csv', createdBy='1418096', parentId='syn70078365', path='log2normMedCenteredPhospho.csv', _file_handle={'id': '163828412', 'etag': 'c532fccc-538b-4345-85ee-78a55c4db07b', 'createdBy': '1418096', 'createdOn': '2025-10-02T01:31:03.000Z', 'modifiedOn': '2025-10-02T01:31:03.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '479eaaac2d8f26d5ef94fbec8a7d4c6d', 'fileName': 'log2normMedCenteredPhospho.csv', 'storageLocationId': 1, 'contentSize': 11566930, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/aff2a055-c32f-4a5c-bc5b-421908f76e6b/log2normMedCenteredPhospho.csv', 'previewId': '163828414', 'isPreview': False, 'externalURL': None}, concreteType='org.sagebionetworks.repo.model.FileEntity', isLatestVersion=True, cacheDir='', etag='0fc57caf-2cbf-4e7e-88a1-d4fce206142e', files=['log2normMedCenteredPhospho.csv'], modifiedBy='1418096', versionLabel='51', createdOn='2025-03-20T19:21:02.864Z')
-

The file is uploaded to synapse.

-
-
-

Cohort 2 phospho

-

Now on October 7 we can process the second batch of phospho.

-
##cohort 2 phospho
-##1 read in data
-phospho2 <- read.table(syn$get('syn69947351')$path,sep='\t',fill=NA,header=T,quote='"') |>
-  subset(!is.na(`Gene.Names`)) |>
-  subset(Gene.Names!='') |>
-  mutate(lsite=tolower(Residue)) |>
-  tidyr::unite(c(Residue,Site,lsite),col='site',sep='') |>
-  tidyr::unite(c(`Gene.Names`,site),col='site',sep='-')
-
-
-phospho2[which(phospho2==0,arr.ind=TRUE)] <- NA
-
-pfnames2 <- data.frame(fname=colnames(phospho2)[5:ncol(phospho2)]) |>
-  mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[9])) |>
-  mutate(aliquot=as.double(aliquot))|>
-  mutate(cohort=2)
-
-##remove missingness 
-tm <- which(apply(phospho2[,5:ncol(phospho2)],1,function(x) length(which(is.na(x)))/length(x) < 0.5))
-
-##log2 adjusted z score
-pmat2<-apply(log2(0.01+phospho2[tm,5:ncol(phospho2)]),2,
-              function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |>
-  as.data.frame() |>
-  mutate(site=phospho2$site[tm])
-
-
-
-plong2<-pmat2|>
-  tidyr::pivot_longer(1:(ncol(pmat2)-1),names_to='fname',values_to='abundance') |>
-  left_join(pfnames2)|>
-  group_by(site,fname,aliquot,cohort) |>
-  summarize(meanAbundance=mean(abundance,na.rm=T)) |>
-  subset(!is.na(meanAbundance))|>
-  left_join(meta)
-
## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'site', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-
##save to file
-readr::write_csv(plong2,file='log2normMedCenteredPhospho_cohort2.csv')
-syn$store(File('log2normMedCenteredPhospho_cohort2.csv',parentId='syn70078365'))
-
## File(synapseStore=True, id='syn70078413', path='log2normMedCenteredPhospho_cohort2.csv', createdOn='2025-10-07T15:56:04.262Z', createdBy='1418096', files=['log2normMedCenteredPhospho_cohort2.csv'], _file_handle={'id': '163828418', 'etag': '1d896b0a-e1eb-4083-a6e0-b3f266dc59ce', 'createdBy': '1418096', 'createdOn': '2025-10-02T01:31:22.000Z', 'modifiedOn': '2025-10-02T01:31:22.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '84b9a91da42f0630ebbd37e0ad9b22a7', 'fileName': 'log2normMedCenteredPhospho_cohort2.csv', 'storageLocationId': 1, 'contentSize': 6535765, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/b3ab724b-02f7-4b08-8ea0-0191c2835ee6/log2normMedCenteredPhospho_cohort2.csv', 'previewId': '163828419', 'isPreview': False, 'externalURL': None}, name='log2normMedCenteredPhospho_cohort2.csv', modifiedOn='2025-10-07T16:10:11.331Z', dataFileHandleId='163828418', etag='755cc7b3-9fae-473e-b684-9e81d4200f95', concreteType='org.sagebionetworks.repo.model.FileEntity', versionNumber=3, isLatestVersion=True, cacheDir='', versionLabel='3', parentId='syn70078365', modifiedBy='1418096')
-

Now that we have two cohorts we can try to combine without batch -correction.

-
-
-

Combined phospho

-

Combining the phoshpo data here.

-
##now we move back to long form
-plong <- rbind(plong1,plong2)
-  #pmat |>
-#  as.data.frame()|>
-#  tibble::rownames_to_column('site')|>
-#  pivot_longer(-site,names_to='fname',values_to='abundance')|>
-#  left_join(rbind(pfnames1,pfnames2))|>
-#    group_by(site,fname,aliquot,cohort) |>
-#  summarize(meanAbundance=mean(abundance,na.rm=T)) |>
-#  left_join(meta)
-
-         
-compsites <- plong|>
-#  subset(meanAbundance>(-5))|>
-  group_by(site)|>
-  summarize(spec = n_distinct(Specimen))|>
-  subset(spec==31)
-
-#plong$meanAbundance[which(!is.finite(plong$meanAbundance))]<-0
-
-ppcs<-plong|>ungroup()|>
-  dplyr::select(Specimen,meanAbundance,site)|>
-  unique()|>
-  subset(site%in%compsites$site)|>
-  #subset(!is.na(site))|>
-  #subset(!is.na(meanAbundance))|>
-  tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance',
-                     values_fn=mean,values_fill=0)|>
-  tibble::column_to_rownames('site')|>
-  t()|>
-  prcomp()
-
-pplot<-ppcs$x|>
-  as.data.frame()|>
-  dplyr::select(PC1,PC2,PC3)|>
-  tibble::rownames_to_column('Specimen')|>
-  left_join(meta)|>
-  dplyr::select(PC2,PC1,Specimen,Patient,cohort)|>
-  mutate(cohort=as.factor(cohort))|>
-  distinct()|>
-  ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
-    geom_point()+
-    #ggrepel::geom_label_repel()+
-    ggtitle("Phospho samples")+
-  ggplot2::scale_color_manual(values=pcols)
-
## Joining with `by = join_by(Specimen)`
-
ph<- plong |>ungroup()|>
-  subset(site%in%compsites$site)|>
-  ggplot(aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram()
-
-cowplot::plot_grid(ph,pplot)
-
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-

-
ggsave('cNFPhosphoQC.png',width=10)
-
## Saving 10 x 5 in image
-
pplot
-

-
ggsave('phosphoPCA.pdf')
-
## Saving 7 x 5 in image
-

Clearly there is a strong batch effect.

-
-
-
-

Normalize golbal proteomics

-

Now we can move onto the global data

-
-

Cohort 1 global

-

Global proteomics in cohort 1 here.

-
####now process global
-#global1<-readr::read_tsv(syn$get('syn64906445')$path)
-global1 <- read.table(syn$get('syn69947355')$path,sep='\t',header=T,quote='"') |>
-  tidyr::separate_rows(Genes,sep=';')
-##logtransform, medina transform
-
-#global1[which(global1==0,arr.ind=TRUE)]<-NA
-
-gmat1<-apply(log2(global1[,5:ncol(global1)]),2,function(x) 0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T))
-
-gmat1<-gmat1|>
-  as.data.frame()|>
-  mutate(Genes=global1$Genes)
-
-##extract aliquot info from file name
-gfnames1 <- data.frame(fname=colnames(global1)[5:ncol(global1)]) |>
-  mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[6])) |>
-  mutate(aliquot=as.double(aliquot))|>
-  mutate(cohort=1)
-
-glong1<-gmat1|>
-    tidyr::pivot_longer(1:(ncol(gmat1)-1),names_to='fname',values_to='abundance')|>
- left_join(gfnames1)|>
-   group_by(Genes,fname,aliquot,cohort)|>
-  summarize(meanAbundance=mean(abundance))|>
-  subset(is.finite(meanAbundance))|>
-  left_join(meta)
-
## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'Genes', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-
readr::write_csv(glong1,file='log2normMedCenteredGlobal.csv')
-syn$store(File('log2normMedCenteredGlobal.csv',parentId='syn70078365'))
-
## File(files=['log2normMedCenteredGlobal.csv'], synapseStore=True, name='log2normMedCenteredGlobal.csv', _file_handle={'id': '163828754', 'etag': 'ecb596c9-4869-4413-9838-eb66cb83b76b', 'createdBy': '1418096', 'createdOn': '2025-10-02T02:09:54.000Z', 'modifiedOn': '2025-10-02T02:09:54.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': 'd6c01c9ff6bf97322703ae1d5bbc8349', 'fileName': 'log2normMedCenteredGlobal.csv', 'storageLocationId': 1, 'contentSize': 20723175, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/75b0981e-7e42-4c6d-b5aa-1443652f6c43/log2normMedCenteredGlobal.csv', 'previewId': '163828755', 'isPreview': False, 'externalURL': None}, etag='f8f7705a-9072-42f8-9616-31c447e8f787', createdBy='1418096', path='log2normMedCenteredGlobal.csv', createdOn='2025-03-20T19:29:54.101Z', modifiedOn='2025-10-07T16:10:14.595Z', concreteType='org.sagebionetworks.repo.model.FileEntity', versionLabel='43', isLatestVersion=True, cacheDir='', dataFileHandleId='163828754', parentId='syn70078365', id='syn65599827', versionNumber=43, modifiedBy='1418096')
-
-
-

Cohort 2 global

-

October 7 we process the second cohort.

-
global2<-read.table(syn$get('syn69947352')$path,header=T,sep='\t',quote='"')|>
-  tidyr::separate_rows(Genes,sep=';')
-
-#global2[which(global2==0,arr.ind=TRUE)]<-NA
-
-gmat2<-apply(log2(global2[,5:ncol(global2)]),2,function(x) 
-  0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T))
-rownames(gmat2)<-global2$Genes
-
-gmat2<-gmat2|>
-  as.data.frame()|>
-  mutate(Genes=global2$Genes)
-
-gfnames2 <- data.frame(fname=colnames(global2)[5:ncol(global2)]) |>
-  mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[7])) |>
-  mutate(aliquot=as.double(aliquot))|>
-  mutate(cohort=2)
-
-glong2<-gmat2|>
-  tidyr::pivot_longer(1:(ncol(gmat2)-1),names_to='fname',values_to='abundance')|>
-  left_join(gfnames2)|>
-  group_by(Genes,fname,aliquot,cohort)|>
-  summarize(meanAbundance=mean(abundance))|>
-  subset(is.finite(meanAbundance))|>
-  left_join(meta)
-
## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'Genes', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-
#dupes<-global|>group_by(Genes)|>summarize(numIso=n())|>
-#  subset(numIso>1)
-
-
-readr::write_csv(glong2,file='log2normMedCenteredGlobal_cohort2.csv')
-syn$store(File('log2normMedCenteredGlobal_cohort2.csv',parentId='syn70078365'))
-
## File(synapseStore=True, files=['log2normMedCenteredGlobal_cohort2.csv'], createdBy='1418096', dataFileHandleId='163828776', createdOn='2025-10-07T15:56:09.318Z', parentId='syn70078365', modifiedOn='2025-10-07T16:10:16.010Z', name='log2normMedCenteredGlobal_cohort2.csv', etag='d11f86af-5af5-49f7-abf6-2bae20de1c9b', concreteType='org.sagebionetworks.repo.model.FileEntity', versionNumber=3, isLatestVersion=True, _file_handle={'id': '163828776', 'etag': 'ea261f85-be11-4c15-b745-fe79324c9b19', 'createdBy': '1418096', 'createdOn': '2025-10-02T02:12:01.000Z', 'modifiedOn': '2025-10-02T02:12:01.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '4da78f673cfedc27c22cdb76a0663db8', 'fileName': 'log2normMedCenteredGlobal_cohort2.csv', 'storageLocationId': 1, 'contentSize': 13474091, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/6c66fd73-e5d0-4f05-acc1-2aca33440948/log2normMedCenteredGlobal_cohort2.csv', 'previewId': '163828779', 'isPreview': False, 'externalURL': None}, cacheDir='', versionLabel='3', id='syn70078414', modifiedBy='1418096', path='log2normMedCenteredGlobal_cohort2.csv')
-
-
-

Global combined without batch correction

-

Now we can combine the global withot batch correction.

-
#ma<-mean(glong$abundance,na.rm=T)
-#glong$meanAbundance[which(!is.finite(glong$meanAbundance))]<-0
-glong <- rbind(glong1,glong2)|>
-  subset(Genes!="")
-      
-compsites <- glong|>
-#  subset(meanAbundance>(-5))|>
-  group_by(Genes)|>
-  summarize(spec = n_distinct(Specimen))|>
-  subset(spec==31)
-
-gpcs<-glong|>ungroup()|>
-  dplyr::select(Specimen,meanAbundance,Genes)|>
-  subset(!is.na(Genes))|>
-  subset(Genes!="")|>
-  subset(Genes%in%compsites$Genes)|>
-  subset(!is.na(meanAbundance))|>
-  tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance',values_fn=mean,values_fill=0)|>
-  tibble::column_to_rownames('Genes')|>t()|>
-  prcomp()
-
-gplot<-gpcs$x|>
-  as.data.frame()|>
-  dplyr::select(PC1,PC2)|>
-  tibble::rownames_to_column('Specimen')|>
-  left_join(meta)|>
-    dplyr::select(PC1,PC2,Specimen,Patient,cohort)|>
-    mutate(cohort=as.factor(cohort))|>
-  distinct()|>
-  ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
-  geom_point()+ggrepel::geom_label_repel()+ggtitle("Global samples")+
-  scale_color_manual(values=pcols)
-
## Joining with `by = join_by(Specimen)`
-
hplot <- ggplot(glong,aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram()
-     
-
-cowplot::plot_grid(hplot,gplot)
-
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-

-
ggsave('cNFGlobalQC.png',width=10)
-
## Saving 10 x 5 in image
-
gplot
-

-
ggsave('globalPCA.pdf')
-
## Saving 7 x 5 in image
-
-
-
-

Evaluate batch correction

-

Now we have two separate long tables with metadata, but we would like -to combine into a single one and batch correct.We can update this with -each cohort.

-
##phospho 
-##TODO: ideally we should use the long tables and reconvert
-pmat <- merge(as.data.frame(pmat1),as.data.frame(pmat2))
-
-gmat <- merge(gmat1,gmat2)
-
-##remove duplicated sites
-dsites<- unique(pmat$site[which(duplicated(pmat$site))])
-mvals<-sapply(dsites,function(x) colSums(pmat[pmat$site==x,2:ncol(pmat)])) |>
-  t() |>
-  as.data.frame() |>
-  tibble::rownames_to_column('site')
-
-pmat <- pmat |>
-  subset(!site %in% dsites) |>
-  rbind(mvals)
-
-##now convert to matrix
-pmat <- pmat |>
-  tibble::remove_rownames() |>
-  tibble::column_to_rownames('site') |>
-  as.matrix()
-
-gmat <- gmat |>
-  subset(Genes!='')|>
-    tibble::remove_rownames() |>
-  tibble::column_to_rownames('Genes') |>
-    as.matrix()
-##sigh, batch correct?
-library(sva)
-
## Loading required package: mgcv
-
## Loading required package: nlme
-
## 
-## Attaching package: 'nlme'
-
## The following object is masked from 'package:dplyr':
-## 
-##     collapse
-
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
-
## Loading required package: genefilter
-
## Loading required package: BiocParallel
-
## Warning: package 'BiocParallel' was built under R version 4.4.3
-
  pmat[which(!is.finite(pmat),arr.ind=T)] <- 0.0
-  cbmat<-sva::ComBat(pmat,batch=meta$cohort,mean.only = FALSE)
-
## Found2batches
-
## Adjusting for0covariate(s) or covariate level(s)
-
## Standardizing Data across genes
-
## Fitting L/S model and finding priors
-
## Finding parametric adjustments
-
## Adjusting the Data
-
  gmat[which(!is.finite(gmat),arr.ind=T)] <- 0.0
-  cgmat <- sva::ComBat(gmat,batch=meta$cohort,mean.only = FALSE)
-
## Found2batches
-
## Adjusting for0covariate(s) or covariate level(s)
-
## Standardizing Data across genes
-
## Fitting L/S model and finding priors
-
## Finding parametric adjustments
-
## Adjusting the Data
-
 ppcs<-prcomp(t(cbmat))
- 
- pplot<-ppcs$x|>
-  as.data.frame()|>
-  dplyr::select(PC1,PC2)|>
-  tibble::rownames_to_column('fname')|>
-   left_join(rbind(pfnames1,pfnames2))|>
-  left_join(meta)|>
-    dplyr::select(PC1,PC2,Specimen,Patient,cohort)|>
-    mutate(cohort=as.factor(cohort))|>
-  distinct()|>
-  ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
-  geom_point()+ggrepel::geom_label_repel()+ggtitle("Corrected phospho samples")+
-  scale_color_manual(values=pcols)
-
## Joining with `by = join_by(fname)`
-
## Joining with `by = join_by(aliquot, cohort)`
-
 pplot
-
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
 gpcs<-prcomp(t(cgmat))
- gplot<-gpcs$x|>
-  as.data.frame()|>
-  dplyr::select(PC1,PC2)|>
-  tibble::rownames_to_column('fname')|>
-   left_join(rbind(gfnames1,gfnames2))|>
-  left_join(meta)|>
-    dplyr::select(PC1,PC2,Specimen,Patient,cohort)|>
-    mutate(cohort=as.factor(cohort))|>
-  distinct()|>
-  ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
-  geom_point()+ggrepel::geom_label_repel()+ggtitle("Corrected global samples")+
-  scale_color_manual(values=pcols)
-
## Joining with `by = join_by(fname)`
-## Joining with `by = join_by(aliquot, cohort)`
-
 gplot
-
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
-
-

Upload batch-corrected data to synapse

-

Now we can reformat the batch-corrected data and upload to syanps

-
pc_long <- cbmat |>
-  as.data.frame() |>
-    tibble::rownames_to_column('site') |>
-    pivot_longer(-site,names_to = 'fname',values_to = 'correctedAbundance') |>
-  left_join(rbind(pfnames1,pfnames2)) |>
-  left_join(meta) |>
-  distinct()
-
## Joining with `by = join_by(fname)`
-## Joining with `by = join_by(aliquot, cohort)`
-
gc_long <- cgmat |>
-   as.data.frame() |>
-    tibble::rownames_to_column('Gene') |>
-    pivot_longer(-Gene,names_to = 'fname',values_to = 'correctedAbundance') |>
-  left_join(rbind(gfnames1,gfnames2)) |>
-  left_join(meta) |>
-  distinct()
-
## Joining with `by = join_by(fname)`
-## Joining with `by = join_by(aliquot, cohort)`
-
readr::write_csv(pc_long,file='batch12_correctedPhospho.csv')
-readr::write_csv(gc_long,file='batch12_correctedGlobal.csv')
-
-syn$store(File('batch12_correctedPhospho.csv',parentId='syn70078365'))
-
## File(modifiedOn='2025-10-07T16:10:24.600Z', synapseStore=True, etag='4ea27841-ef8e-4944-9a83-56d5ece5a31b', createdBy='1418096', path='batch12_correctedPhospho.csv', files=['batch12_correctedPhospho.csv'], id='syn70078415', parentId='syn70078365', concreteType='org.sagebionetworks.repo.model.FileEntity', versionNumber=3, isLatestVersion=True, cacheDir='', name='batch12_correctedPhospho.csv', versionLabel='3', createdOn='2025-10-07T15:56:24.984Z', modifiedBy='1418096', _file_handle={'id': '163984316', 'etag': '634ec130-4da8-4401-9731-511b589704b8', 'createdBy': '1418096', 'createdOn': '2025-10-07T15:56:25.000Z', 'modifiedOn': '2025-10-07T15:56:25.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': 'df202d20b8b8bcee6b0fbf1a7a712f37', 'fileName': 'batch12_correctedPhospho.csv', 'storageLocationId': 1, 'contentSize': 17060666, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/167bfcda-58b7-43d4-9457-4238fdae1118/batch12_correctedPhospho.csv', 'previewId': '163984318', 'isPreview': False, 'externalURL': None}, dataFileHandleId='163984316')
-
syn$store(File('batch12_correctedGlobal.csv',parentId='syn70078365'))
-
## File(synapseStore=True, createdBy='1418096', files=['batch12_correctedGlobal.csv'], name='batch12_correctedGlobal.csv', _file_handle={'id': '163984320', 'etag': 'f63e08df-542c-4642-a0f8-5314cda43a43', 'createdBy': '1418096', 'createdOn': '2025-10-07T15:56:29.000Z', 'modifiedOn': '2025-10-07T15:56:29.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '27e185221a058dafc833d1c2a52e74c1', 'fileName': 'batch12_correctedGlobal.csv', 'storageLocationId': 1, 'contentSize': 33970670, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/c5da77ae-82a3-41a1-9045-d66fa87085a7/batch12_correctedGlobal.csv', 'previewId': '163984321', 'isPreview': False, 'externalURL': None}, id='syn70078416', createdOn='2025-10-07T15:56:29.301Z', concreteType='org.sagebionetworks.repo.model.FileEntity', cacheDir='', versionNumber=3, versionLabel='3', parentId='syn70078365', path='batch12_correctedGlobal.csv', modifiedBy='1418096', etag='935bbae3-5660-463b-b659-3d4a64de414b', isLatestVersion=True, modifiedOn='2025-10-07T16:10:25.266Z', dataFileHandleId='163984320')
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/02_normalize_harmonize_proteomics.rmd b/02_normalize_harmonize_proteomics.rmd deleted file mode 100644 index aa2f322..0000000 --- a/02_normalize_harmonize_proteomics.rmd +++ /dev/null @@ -1,527 +0,0 @@ ---- -title: "Reformat and process proteomics" -author: "Sara Gosline" -date: "2025-09-28" -output: html_document ---- - - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -library(ggplot2) -library(dplyr) -library(tidyr) -library(stringr) -library(synapser) -library(grid) -source('cNF_helper_code.R') -``` - -# Drop sample list. These were for protocol optimization -```{r} -remove_fname_substrings <- c( - "cNF_organoid_DIA_G_02_11Feb25", - "cNF_organoid_DIA_G_05_11Feb25", - "cNF_organoid_DIA_G_06_11Feb25", - "cNF_organoid_DIA_P_02_29Jan25", - "cNF_organoid_DIA_P_05_11Feb25", - "cNF_organoid_DIA_P_06_11Feb25" -) - -is_unwanted_fname <- function(x) { - vapply(x, function(s) - any(vapply(remove_fname_substrings, function(p) grepl(p, s, fixed = TRUE), logical(1))), - logical(1) - ) -} - - -``` - - - -## Normalize phospho-proteomics - -We now have phosphoproteomics from two cohorts. Here I'm trying to collect data from both and normalize but am clearly missing something. I do the following: - -1. replace all zero values with NA to avoid skewing normalization -2. remove any features that are absent from >50% of the samples -3. take the log of the data, then take a modified z score - -Each dataset is done individually then combined at the end. There is a clear batch effect. - -### Cohort 1 phospho - -We start with the cohort 1 phospho data here. - -```{r compare to proteomics,warning=FALSE} - -##cohort 1 phospho -##first we read in file, and get site info -phospho1<- read.table(syn$get('syn69963552')$path,sep='\t',fill=NA,header=T,quote='"') |> - subset(!is.na(`Gene.Names`)) |> - subset(Gene.Names!='') |> - mutate(lsite=tolower(Residue)) |> - tidyr::unite(c(Residue,Site,lsite),col='site',sep='') |> - tidyr::unite(c(`Gene.Names`,site),col='site',sep='-') |> - as.data.frame() - -phospho1[which(phospho1==0,arr.ind=TRUE)]<-NA - -pfnames1 <- data.frame(fname=colnames(phospho1)[5:ncol(phospho1)])|> - mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[8]))|> - mutate(aliquot=as.double(aliquot))|> - mutate(cohort=1) |> - dplyr::filter(!is_unwanted_fname(fname)) - - - -##logtransform##median transform -pzeros<-which(apply(phospho1[,5:ncol(phospho1)],1,function(x) - length(which(is.na(x)))/length(x) < 0.5)) - -pmat1<-apply(0.01+log2(phospho1[pzeros,5:ncol(phospho1)]),2, - function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |> - as.data.frame() |> - mutate(site=phospho1$site[pzeros]) - -##move to long form, upload -plong1<-pmat1|> - tidyr::pivot_longer(1:(ncol(pmat1)-1),names_to='fname',values_to='abundance')|> - left_join(pfnames1) |> - group_by(site,fname,aliquot,cohort) |> - summarize(meanAbundance=mean(abundance,na.rm=T))|> - subset(!is.na(meanAbundance))|> - left_join(meta) - -readr::write_csv(plong1,file='log2normMedCenteredPhospho.csv') -syn$store(File('log2normMedCenteredPhospho.csv',parentId='syn70078365')) - -``` -The file is uploaded to synapse. - -### Cohort 2 phospho - -Now on October 7 we can process the second batch of phospho. - -```{r cohort 2 phospho} - -##cohort 2 phospho -##1 read in data -phospho2 <- read.table(syn$get('syn69947351')$path,sep='\t',fill=NA,header=T,quote='"') |> - subset(!is.na(`Gene.Names`)) |> - subset(Gene.Names != '') |> - mutate(lsite = tolower(Residue)) |> - tidyr::unite(c(Residue,Site,lsite),col = 'site',sep='') |> - tidyr::unite(c(`Gene.Names`,site),col = 'site',sep='-') - - -phospho2[which(phospho2==0,arr.ind=TRUE)] <- NA - -pfnames2 <- data.frame(fname=colnames(phospho2)[5:ncol(phospho2)]) |> - mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[9])) |> - mutate(aliquot=as.double(aliquot))|> - mutate(cohort=2)|> - dplyr::filter(!is_unwanted_fname(fname)) - -##remove missingness -tm <- which(apply(phospho2[,5:ncol(phospho2)],1,function(x) length(which(is.na(x)))/length(x) < 0.5)) - -##log2 adjusted z score -pmat2<-apply(log2(0.01+phospho2[tm,5:ncol(phospho2)]),2, - function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |> - as.data.frame() |> - mutate(site=phospho2$site[tm]) - - - -plong2<-pmat2|> - tidyr::pivot_longer(1:(ncol(pmat2)-1),names_to='fname',values_to='abundance') |> - left_join(pfnames2)|> - group_by(site,fname,aliquot,cohort) |> - summarize(meanAbundance=mean(abundance,na.rm=T)) |> - subset(!is.na(meanAbundance))|> - left_join(meta) - -##save to file -readr::write_csv(plong2,file='log2normMedCenteredPhospho_cohort2.csv') -syn$store(File('log2normMedCenteredPhospho_cohort2.csv',parentId='syn70078365')) - - -``` - -Now that we have two cohorts we can try to combine without batch correction. - -### Combined phospho -Combining the phoshpo data here. - -```{r combined phospho} - -##now we move back to long form -# plong <- rbind(plong1,plong2) - -plong1 <- plong1 |> dplyr::filter(!is_unwanted_fname(fname)) -plong2 <- plong2 |> dplyr::filter(!is_unwanted_fname(fname)) -plong <- rbind(plong1, plong2) |> dplyr::filter(!is_unwanted_fname(fname)) - - #pmat |> -# as.data.frame()|> -# tibble::rownames_to_column('site')|> -# pivot_longer(-site,names_to='fname',values_to='abundance')|> -# left_join(rbind(pfnames1,pfnames2))|> -# group_by(site,fname,aliquot,cohort) |> -# summarize(meanAbundance=mean(abundance,na.rm=T)) |> -# left_join(meta) - - -compsites <- plong|> -# subset(meanAbundance>(-5))|> - group_by(site)|> - summarize(spec = n_distinct(Specimen))|> - subset(spec==31) - -#plong$meanAbundance[which(!is.finite(plong$meanAbundance))]<-0 - -ppcs<-plong|>ungroup()|> - dplyr::select(Specimen,meanAbundance,site)|> - unique()|> - subset(site%in%compsites$site)|> - #subset(!is.na(site))|> - #subset(!is.na(meanAbundance))|> - tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance', - values_fn=mean,values_fill=0)|> - tibble::column_to_rownames('site')|> - t()|> - prcomp() - -pplot<-ppcs$x|> - as.data.frame()|> - dplyr::select(PC1,PC2,PC3)|> - tibble::rownames_to_column('Specimen')|> - left_join(meta)|> - dplyr::select(PC2,PC1,Specimen,Patient,cohort)|> - mutate(cohort=as.factor(cohort))|> - distinct()|> - ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+ - geom_point()+ - #ggrepel::geom_label_repel()+ - ggtitle("Phospho samples")+ - ggplot2::scale_color_manual(values=pcols) - -ph<- plong |>ungroup()|> - subset(site%in%compsites$site)|> - ggplot(aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram() - -cowplot::plot_grid(ph,pplot) -ggsave('cNFPhosphoQC.png',width=10) - -pplot -ggsave('phosphoPCA.pdf') -``` - -Clearly there is a strong batch effect. -## Normalize golbal proteomics -Now we can move onto the global data - -### Cohort 1 global -Global proteomics in cohort 1 here. - -```{r global} -####now process global -#global1<-readr::read_tsv(syn$get('syn64906445')$path) -global1 <- read.table(syn$get('syn69947355')$path,sep='\t',header=T,quote='"') |> - tidyr::separate_rows(Genes,sep=';') -##logtransform, medina transform - -#global1[which(global1==0,arr.ind=TRUE)]<-NA - -gmat1<-apply(log2(global1[,5:ncol(global1)]),2,function(x) 0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)) - -gmat1<-gmat1|> - as.data.frame()|> - mutate(Genes=global1$Genes) - -##extract aliquot info from file name -gfnames1 <- data.frame(fname=colnames(global1)[5:ncol(global1)]) |> - mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[6])) |> - mutate(aliquot=as.double(aliquot))|> - mutate(cohort=1)|> - dplyr::filter(!is_unwanted_fname(fname)) - -glong1<-gmat1|> - tidyr::pivot_longer(1:(ncol(gmat1)-1),names_to='fname',values_to='abundance')|> - left_join(gfnames1)|> - group_by(Genes,fname,aliquot,cohort)|> - summarize(meanAbundance=mean(abundance))|> - subset(is.finite(meanAbundance))|> - left_join(meta) - - -readr::write_csv(glong1,file='log2normMedCenteredGlobal.csv') -syn$store(File('log2normMedCenteredGlobal.csv',parentId='syn70078365')) -``` - -### Cohort 2 global -October 7 we process the second cohort. -```{r batch 2 global} -global2<-read.table(syn$get('syn69947352')$path,header=T,sep='\t',quote='"')|> - tidyr::separate_rows(Genes,sep=';') - -#global2[which(global2==0,arr.ind=TRUE)]<-NA - -gmat2<-apply(log2(global2[,5:ncol(global2)]),2,function(x) - 0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)) -rownames(gmat2)<-global2$Genes - -gmat2<-gmat2|> - as.data.frame()|> - mutate(Genes=global2$Genes) - -gfnames2 <- data.frame(fname=colnames(global2)[5:ncol(global2)]) |> - mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[7])) |> - mutate(aliquot=as.double(aliquot))|> - mutate(cohort=2)|> - dplyr::filter(!is_unwanted_fname(fname)) -gfnames1 - -glong2<-gmat2|> - tidyr::pivot_longer(1:(ncol(gmat2)-1),names_to='fname',values_to='abundance')|> - left_join(gfnames2)|> - group_by(Genes,fname,aliquot,cohort)|> - summarize(meanAbundance=mean(abundance))|> - subset(is.finite(meanAbundance))|> - left_join(meta) - -#dupes<-global|>group_by(Genes)|>summarize(numIso=n())|> -# subset(numIso>1) - - -readr::write_csv(glong2,file='log2normMedCenteredGlobal_cohort2.csv') -syn$store(File('log2normMedCenteredGlobal_cohort2.csv',parentId='syn70078365')) -``` - -### Global combined without batch correction -Now we can combine the global withot batch correction. - -```{r combined global test} -#ma<-mean(glong$abundance,na.rm=T) -#glong$meanAbundance[which(!is.finite(glong$meanAbundance))]<-0 -glong1 <- glong1 |> dplyr::filter(!is_unwanted_fname(fname)) -glong2 <- glong2 |> dplyr::filter(!is_unwanted_fname(fname)) -glong <- rbind(glong1, glong2) |> dplyr::filter(!is_unwanted_fname(fname)) |> - subset(Genes!="") - -n_spec <- meta |> dplyr::distinct(Specimen) |> nrow() -compsites <- glong|> -# subset(meanAbundance>(-5))|> - group_by(Genes)|> - summarize(spec = n_distinct(Specimen))|> - subset(spec==n_spec) - -gpcs<-glong|>ungroup()|> - dplyr::select(Specimen,meanAbundance,Genes)|> - subset(!is.na(Genes))|> - subset(Genes!="")|> - subset(Genes%in%compsites$Genes)|> - subset(!is.na(meanAbundance))|> - tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance',values_fn=mean,values_fill=0)|> - tibble::column_to_rownames('Genes')|>t()|> - prcomp() - -gplot<-gpcs$x|> - as.data.frame()|> - dplyr::select(PC1,PC2)|> - tibble::rownames_to_column('Specimen')|> - left_join(meta)|> - dplyr::select(PC1,PC2,Specimen,Patient,cohort)|> - mutate(cohort=as.factor(cohort))|> - distinct()|> - ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+ - geom_point()+ggrepel::geom_label_repel()+ggtitle("Global samples")+ - scale_color_manual(values=pcols) - - -hplot <- ggplot(glong,aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram() - - -cowplot::plot_grid(hplot,gplot) -ggsave('cNFGlobalQC.png',width=10) - - -gplot - -ggsave('globalPCA.pdf') - -``` - - -## Evaluate batch correction - -Now we have two separate long tables with metadata, but we would like to combine into a single one and batch correct.We can update this with each cohort. - -```{r combine and correct} - -##phospho -##TODO: ideally we should use the long tables and reconvert -pmat <- merge(as.data.frame(pmat1),as.data.frame(pmat2)) - -gmat <- merge(gmat1,gmat2) - -##remove duplicated sites -dsites<- unique(pmat$site[which(duplicated(pmat$site))]) -mvals<-sapply(dsites,function(x) colSums(pmat[pmat$site==x,2:ncol(pmat)])) |> - t() |> - as.data.frame() |> - tibble::rownames_to_column('site') - -pmat <- pmat |> - subset(!site %in% dsites) |> - rbind(mvals) - -##now convert to matrix -pmat <- pmat |> - tibble::remove_rownames() |> - tibble::column_to_rownames('site') |> - as.matrix() - -gmat <- gmat |> - subset(Genes!='')|> - tibble::remove_rownames() |> - tibble::column_to_rownames('Genes') |> - as.matrix() -##sigh, batch correct? -library(sva) -# -# pmat[which(!is.finite(pmat),arr.ind=T)] <- 0.0 -# cbmat<-sva::ComBat(pmat,batch=meta$cohort,mean.only = FALSE) -# -# gmat[which(!is.finite(gmat),arr.ind=T)] <- 0.0 -# cgmat <- sva::ComBat(gmat,batch=meta$cohort,mean.only = FALSE) - - -pmat[which(!is.finite(pmat),arr.ind=T)] <- 0.0 -## align ComBat batch vector to pmat columns via filename→cohort maps -sample_meta_p <- data.frame(fname = colnames(pmat)) |> - dplyr::left_join(rbind(pfnames1, pfnames2), by = "fname") |> - dplyr::select(fname, cohort) -keep_p <- !is.na(sample_meta_p$cohort) -pmat <- pmat[, keep_p, drop = FALSE] -pbatch <- sample_meta_p$cohort[keep_p] -cbmat <- sva::ComBat(dat = pmat, batch = pbatch, mean.only = FALSE) - -gmat[which(!is.finite(gmat),arr.ind=T)] <- 0.0 -## align ComBat batch vector to gmat columns via filename→cohort maps -sample_meta_g <- data.frame(fname = colnames(gmat)) |> - dplyr::left_join(rbind(gfnames1, gfnames2), by = "fname") |> - dplyr::select(fname, cohort) -keep_g <- !is.na(sample_meta_g$cohort) -gmat <- gmat[, keep_g, drop = FALSE] -gbatch <- sample_meta_g$cohort[keep_g] -cgmat <- sva::ComBat(dat = gmat, batch = gbatch, mean.only = FALSE) - - -ppcs<-prcomp(t(cbmat)) -gpcs<-prcomp(t(cgmat)) - -``` - -# plot batch corrected data -```{r} - -# Reusable theme: white background + light grey gridlines -my_theme <- theme_bw() + - theme( - panel.grid.major = element_line(color = "grey85"), - panel.grid.minor = element_line(color = "grey92"), - panel.background = element_rect(fill = "white", color = NA), - plot.background = element_rect(fill = "white", color = NA), - legend.title = element_text(size = 9), - legend.text = element_text(size = 7), - legend.key.size = unit(0.5, "cm"), - legend.spacing.y = unit(0.2, "cm"), - legend.box.spacing= unit(0.3, "cm") - ) - - -# --- Corrected phospho plot (white bg + grey grid) --- -pplot <- ppcs$x |> - as.data.frame() |> - dplyr::select(PC1, PC2) |> - tibble::rownames_to_column("fname") |> - left_join(rbind(pfnames1, pfnames2)) |> - left_join(meta) |> - dplyr::select(PC1, PC2, Specimen, Patient, cohort) |> - mutate( - Tumor = str_extract(Specimen, "_T\\d+") |> str_remove("_"), - Tumor = factor(Tumor), - cohort = as.factor(cohort) - ) |> - distinct() |> - dplyr::filter(!is.na(Patient), !is.na(Tumor)) |> - ggplot(aes(x = PC1, y = PC2, col = Patient, shape = Tumor)) + - geom_point(size = 3) + - ggtitle("Corrected phospho samples") + - scale_color_manual(values = pcols) + - scale_shape_discrete(na.translate = FALSE) + - my_theme - -pplot - -# --- Corrected global plot (white bg + grey grid) --- -gplot <- gpcs$x |> - as.data.frame() |> - dplyr::select(PC1, PC2) |> - tibble::rownames_to_column("fname") |> - left_join(rbind(gfnames1, gfnames2)) |> - left_join(meta) |> - dplyr::select(PC1, PC2, Specimen, Patient, cohort) |> - mutate( - Tumor = str_extract(Specimen, "_T\\d+") |> str_remove("_"), - Tumor = factor(Tumor), - cohort = as.factor(cohort) - ) |> - distinct() |> - ggplot(aes(x = PC1, y = PC2, col = Patient, shape = Tumor)) + - geom_point(size = 3) + - ggtitle("Corrected global samples") + - scale_color_manual(values = pcols) + - my_theme - -gplot - - -ggsave("phosphoCorrectedPCA.pdf", plot = pplot, width = 7, height = 4.5, units = "in", device = cairo_pdf) -ggsave("globalCorrectedPCA.pdf", plot = gplot, width = 7, height = 4.5, units = "in", device = cairo_pdf) - -``` - -## Upload batch-corrected data to synapse - -Now we can reformat the batch-corrected data and upload to syanps - -```{r upload batch corrected} - -pc_long <- cbmat |> - as.data.frame() |> - tibble::rownames_to_column('site') |> - pivot_longer(-site,names_to = 'fname',values_to = 'correctedAbundance') |> - left_join(rbind(pfnames1,pfnames2)) |> - left_join(meta) |> - distinct() - -gc_long <- cgmat |> - as.data.frame() |> - tibble::rownames_to_column('Gene') |> - pivot_longer(-Gene,names_to = 'fname',values_to = 'correctedAbundance') |> - left_join(rbind(gfnames1,gfnames2)) |> - left_join(meta) |> - distinct() - - -readr::write_csv(pc_long,file = 'batch12_correctedPhospho.csv') -readr::write_csv(gc_long,file = 'batch12_correctedGlobal.csv') - -syn$store(File('batch12_correctedPhospho.csv',parentId = 'syn70078365')) -syn$store(File('batch12_correctedGlobal.csv',parentId = 'syn70078365')) - -``` diff --git a/03_drug_biomarkers_legacy_code.Rmd b/03_drug_biomarkers_legacy_code.Rmd new file mode 100644 index 0000000..ae9d97c --- /dev/null +++ b/03_drug_biomarkers_legacy_code.Rmd @@ -0,0 +1,1265 @@ +--- +title: "Evaluate drug and omics data for biomarker assessment" +author: "Sara gosline" +date: "2025-03-13" +output: html_document +--- + +This document is designed to be a working document where we can compare approaches to evaluate biomarkers of drug response across patient samples. We are collecting three types of data modalities: 1. RNA Sequencing 2. Global Proteomics 3. Phospho proteomics + +We also have drug sensitivity data (single dose viability, some curves) for many drugs. The question to ask is which molecules can predict drug response across patients? How robust/extendable is this? + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +library(synapser) +library(ggplot2) +library(dplyr) +library(tidyr) + +``` + +# Pull processed files from previous markdowns + +We have already run the previous scripts and stored data on Synapse + +```{r pull files} +Sys.setenv() + +source("cNF_helper_code.R") +traceback() + +source('cNF_helper_code.R') +##read in drug code +fits <- readr::read_tsv(synGet('syn69947322')$path) + + +##read in proteomic data +glong <- readr::read_csv(synGet('syn70078416')$path) +plong <- readr::read_csv(synGet('syn70078415')$path) + + +##read in transcrniptomic data +#TODO: process transcritpomic data into long format + + +``` + +## Format protein data to collect correlation values + +Do simple correlations to identify putative trends in the data. + +Get most efficacious, variable, and heatmap + +```{r} +# ensure an output folder + +outdir <- "figs" +if (!dir.exists(outdir)) dir.create(outdir, recursive = TRUE) + +shared <- intersect(fits$improve_sample_id, glong$Specimen) +message(sprintf("Found %d shared samples from %d drug experiments and %d proteomic experiments", +length(shared), length(unique(fits$improve_sample_id)), length(unique(glong$Specimen)))) + +glob_dat <- glong |> +ungroup() |> +subset(Specimen %in% shared) |> +dplyr::select(Specimen, Gene, correctedAbundance) |> +tidyr::pivot_wider( +names_from = "Gene", +values_from = "correctedAbundance", +values_fill = 0, +values_fn = mean +) |> +tibble::column_to_rownames("Specimen") + +phos_dat <- plong |> +ungroup() |> +subset(Specimen %in% shared) |> +dplyr::select(Specimen, site, correctedAbundance) |> +tidyr::pivot_wider( +names_from = "site", +values_from = "correctedAbundance", +values_fill = 0, +values_fn = mean +) |> +tibble::column_to_rownames("Specimen") + +## drug data to matrix here + +drug_dat <- fits |> +subset(dose_response_metric == "uM_viability") |> +dplyr::select(improve_sample_id, improve_drug_id, dose_response_value) |> +tidyr::pivot_wider( +names_from = "improve_drug_id", +values_from = "dose_response_value", +values_fn = mean +) |> +tibble::column_to_rownames("improve_sample_id") + +## summarize drugs + +drug_counts <- fits |> +subset(dose_response_metric == "uM_viability") |> +group_by(improve_drug_id) |> +distinct() |> +summarize( +meanResponse = mean(dose_response_value, na.rm = TRUE), +nMeasured = n_distinct(improve_sample_id), +variability = sd(dose_response_value, na.rm = TRUE), +.groups = "drop" +) + +# -------- Plot 1: most efficacious -------- + +p1 <- drug_counts |> +arrange(desc(meanResponse)) |> +subset(meanResponse < 0.5) |> +ggplot(aes(y = meanResponse, x = improve_drug_id, colour = nMeasured, size = variability)) + +geom_point() + +theme_minimal() + +theme( +axis.text.x = element_text(angle = 45, hjust = 1) # tilt x labels +) + +labs(title = "Most efficacious drugs", +y = "Mean cell viability (fraction)", +x = "Drug") + +ggsave(file.path("figs/most_efficacious.pdf"), p1, width = 12, height = 8, dpi = 300) + +# -------- Plot 2: most variable -------- + +p2 <- drug_counts |> +arrange(desc(variability)) |> +subset(variability > 0.15) |> +as.data.frame() |> +ggplot(aes(y = meanResponse, x = improve_drug_id, colour = nMeasured, size = variability)) + +geom_point() + +theme_minimal() + +theme( +axis.text.x = element_text(angle = 45, hjust = 1) # tilt x labels +) + +labs(title = "Most variable drugs", +y = "Mean cell viability (fraction)", +x = "Drug") + +ggsave(file.path("figs/most_variable.pdf"), p2, width = 12, height = 8, dpi = 300) + +# -------- Plot 3: heatmap of complete-measurement drugs -------- + +fulldrugs <- drug_counts |> +subset(nMeasured == nrow(drug_dat)) + +# Save large, rotate column labels, and shrink font to avoid overlap + +# pheatmap can write directly to a file via the `filename` arg. + +pheatmap::pheatmap( +as.matrix(drug_dat[, fulldrugs$improve_drug_id, drop = FALSE]), +filename = file.path(outdir, "drug_heatmap_large.pdf"), +width = 28, # inches: large so column labels are readable +height = 16, +dpi = 300, +angle_col = 45, # rotate column (drug) labels +fontsize_col = 6, # smaller font for many drugs +cluster_rows = TRUE, +cluster_cols = TRUE, +show_rownames = TRUE, +show_colnames = TRUE +) + +# Optional: also save a PDF (great for vector zooming) + +pheatmap::pheatmap( +as.matrix(drug_dat[, fulldrugs$improve_drug_id, drop = FALSE]), +filename = file.path("figs/drug_heatmap_large.pdf"), +width = 28, +height = 16, +angle_col = 45, +fontsize_col = 6, +cluster_rows = TRUE, +cluster_cols = TRUE, +show_rownames = TRUE, +show_colnames = TRUE +) + +# ---- PRINT plots in the document as well ---- + +print(p1) +print(p2) + +# Print the heatmap to the document (no filename draws it) + +pheatmap::pheatmap( +as.matrix(drug_dat[, fulldrugs$improve_drug_id, drop = FALSE]), +angle_col = 45, +fontsize_col = 6, +cluster_rows = TRUE, +cluster_cols = TRUE, +show_rownames = TRUE, +show_colnames = TRUE +) + +``` + + +# Basic correlation tests + +Can we simply find and rank proteins/psites/transcripts by correlation and do enrichment? + +We can define a simple test to correlate features and drugs and assess significance and correct: + +```{r correlation tests, warning=FALSE, error=FALSE, message = FALSE} + +#this function computes correlations between all columns for each drug/feature matrix, rows are the sample identifiers +#also coputes significance +computeCors <- function(drug_dat,feat_dat,shared){ + + cres <- cor(drug_dat[shared,],feat_dat[shared,],use='pairwise.complete.obs',method='spearman') |> + as.data.frame() |> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(feat_dat)),names_to='gene',values_to='cor') |> + arrange(desc(cor)) + + ##now lets try to get significance + csig <- do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(feat_dat),function(y){ + pval <- 1.0 + try(pval <- cor.test(drug_dat[shared,x], + feat_dat[shared,y], + use = 'pairwise.complete.obs', + method = 'spearman')$p.value,silent = TRUE) + + return(c(corp = pval,drug = x,gene = y)) + })) |> + as.data.frame() |> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + })) |> + as.data.frame() |> + mutate(drug = unlist(drug)) |> + mutate(gene = unlist(gene)) + + fullcors <- cres|>left_join(data.frame(csig)) |> + mutate(direction=ifelse(cor<0,'neg','pos')) + + return(fullcors) +} + +``` + +Now that we have a function we can compute correlations of each data type. + +```{r compute feature cors, warning=FALSE, error=FALSE, message = FALSE} + +gcor <- computeCors(drug_dat[,fulldrugs$improve_drug_id],glob_dat,shared) |> + mutate(data='proteins') +pcor <- computeCors(drug_dat[,fulldrugs$improve_drug_id],phos_dat,shared) |> + mutate(data = 'phosphosites') + +allcor <- rbind(gcor,pcor) + +corsummary<-allcor |> subset(fdr<0.25) |> + group_by(drug,data,direction) |> + summarize(features=n(),meanCor=mean(cor)) + + +p_features <- corsummary |> + subset(features > 1) |> + ggplot(aes(x = drug,y = features,fill = direction)) + + facet_grid(data~.) + + geom_bar(position='dodge',stat='identity') + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + +ggsave(filename = file.path("figs/cor_features_by_drug.pdf"), + plot = p_features, width = 12, height = 6, units = "in") + +corsummary |> + arrange(desc(features)) |> + subset(features > 100) |> + dplyr::select(drug,data,direction,features,meanCor) + + + +``` + +Now we have the correlation values. what do we do with them? +## Correlation based enrichment +Do not run this everytime - it is extremely slow, so its setup to run once and save the data. The next steps load this data. +```{r functional enrichment} +# === Direction-aware leapR enrichment: run "top" (up) and "bottom" (down) separately === +# Requires: glob_dat (samples x proteins), phos_dat (samples x phosphosites), fits (drug responses) +# Outputs: +# prot_enrich[[drug]]$top / $bottom +# phos_enrich[[drug]]$top / $bottom +# Optional CSVs in folder "leapR_top_paths/dir_split/" + +library(dplyr) +library(tidyr) +library(stringr) +library(SummarizedExperiment) +library(leapR) + +# ---- choose drugs to run (use your two, or set to a larger list) ---- +# target_drugs <- c("THZ1", "Onalespib") +target_drugs <- unique(fits$improve_drug_id) + +# ---- genesets ---- +data(msigdb); geneset_db <- msigdb +data(kinasesubstrates) + +# ---- helpers ---- +extract_gene_from_site <- function(site_id) { + if (is.na(site_id) || site_id == "") return(NA_character_) + g <- str_split(as.character(site_id), "[:_\\-\\.]")[[1]][1] + toupper(stringr::str_extract(g, "^[A-Za-z0-9]+")) +} + +# Correlate response vector vs each feature column (Spearman) +col_spearman <- function(vec, mat) { + shared <- intersect(names(vec), rownames(mat)) + if (length(shared) < 3) return(setNames(rep(NA_real_, ncol(mat)), colnames(mat))) + v <- vec[shared] + m <- as.matrix(mat[shared, , drop = FALSE]) + apply(m, 2, function(col) { + if (all(is.na(col))) return(NA_real_) + if (sd(col, na.rm = TRUE) == 0 || sd(v, na.rm = TRUE) == 0) return(NA_real_) + suppressWarnings(cor(v, col, method = "spearman", use = "pairwise.complete.obs")) + }) +} + +# Build one-column SE; column name must match primary_columns +build_se_from_corvec <- function(cor_named_vec, features_all, col_label, map_to_gene = NULL, assay_label = "proteomics") { + v <- rep(NA_real_, length(features_all)); names(v) <- features_all + common <- intersect(names(cor_named_vec), features_all) + v[common] <- cor_named_vec[common] + mat <- matrix(v, nrow = length(v), ncol = 1, dimnames = list(features_all, col_label)) + rd <- DataFrame(feature_id = features_all) + rd$hgnc_id <- if (is.null(map_to_gene)) features_all else map_to_gene[features_all] + se <- SummarizedExperiment(assays = list(values = mat), rowData = rd, colData = DataFrame(sample = col_label)) + assayNames(se) <- assay_label + se +} + +safe_leapr <- function(...) tryCatch(leapR::leapR(...), error = function(e) { message("[leapR] ", conditionMessage(e)); NULL }) + +# ---- feature and mapping vectors ---- +prot_features <- colnames(glob_dat) +phos_features <- colnames(phos_dat) +phos_to_gene <- setNames(vapply(phos_features, extract_gene_from_site, FUN.VALUE = character(1)), + phos_features) + +# ---- results containers ---- +prot_enrich <- list() +phos_enrich <- list() + +# optional: write CSVs? +write_csvs <- TRUE +outdir <- "leapR_top_paths/dir_split" +if (write_csvs && !dir.exists(outdir)) dir.create(outdir, recursive = TRUE) + +# ---- per-drug workflow ---- +for (drug in target_drugs) { + message("=== ", drug, " ===") + # build response vector (mean per sample if repeats) + dv <- fits %>% + filter(improve_drug_id == !!drug, dose_response_metric == "uM_viability") %>% + group_by(improve_sample_id) %>% summarize(resp = mean(dose_response_value, na.rm = TRUE), .groups = "drop") + if (nrow(dv) == 0) { message(" no response rows; skipping"); next } + dv_vec <- setNames(dv$resp, dv$improve_sample_id) + + # correlations + prot_cor <- col_spearman(dv_vec, glob_dat) + phos_cor <- col_spearman(dv_vec, phos_dat) + + # split by sign + prot_pos <- prot_cor[!is.na(prot_cor) & prot_cor > 0] + prot_neg <- prot_cor[!is.na(prot_cor) & prot_cor < 0] + phos_pos <- phos_cor[!is.na(phos_cor) & phos_cor > 0] + phos_neg <- phos_cor[!is.na(phos_cor) & phos_cor < 0] + + # ---- global: TOP (positives as-is), BOTTOM (negatives flipped so they rank to the top) ---- + prot_enrich[[drug]] <- list(top = NULL, bottom = NULL) + + # TOP + if (length(prot_pos) >= 5) { + se_prot_top <- build_se_from_corvec(prot_pos, prot_features, col_label = paste0(drug, "_TOP"), assay_label = "proteomics") + prot_top <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_prot_top, assay_name = "proteomics", + primary_columns = paste0(drug, "_TOP")) + prot_enrich[[drug]]$top <- prot_top + if (write_csvs && !is.null(prot_top)) { + write.csv(as.data.frame(prot_top), file = file.path(outdir, paste0(drug, "_global_TOP.csv"))) + } + } else message(" PROT top: too few positive features (", length(prot_pos), ")") + + # BOTTOM (flip sign so more negative = larger positive rank) + if (length(prot_neg) >= 5) { + se_prot_bot <- build_se_from_corvec(-prot_neg, prot_features, col_label = paste0(drug, "_BOTTOM"), assay_label = "proteomics") + prot_bot <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_prot_bot, assay_name = "proteomics", + primary_columns = paste0(drug, "_BOTTOM")) + prot_enrich[[drug]]$bottom <- prot_bot + if (write_csvs && !is.null(prot_bot)) { + write.csv(as.data.frame(prot_bot), file = file.path(outdir, paste0(drug, "_global_BOTTOM.csv"))) + } + } else message(" PROT bottom: too few negative features (", length(prot_neg), ")") + + # ---- PHOSPHO: TOP/BOTTOM for pathways (gene mapping via hgnc_id) ---- + phos_enrich[[drug]] <- list(top = NULL, bottom = NULL) + + # TOP + if (length(phos_pos) >= 5) { + se_phos_top <- build_se_from_corvec(phos_pos, phos_features, col_label = paste0(drug, "_TOP"), + map_to_gene = phos_to_gene, assay_label = "phosphoproteomics") + phos_top <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_phos_top, assay_name = "phosphoproteomics", + primary_columns = paste0(drug, "_TOP"), id_column = "hgnc_id") + phos_enrich[[drug]]$top <- phos_top + if (write_csvs && !is.null(phos_top)) { + write.csv(as.data.frame(phos_top), file = file.path(outdir, paste0(drug, "_phospho_TOP.csv"))) + } + } else message(" PHOS top: too few positive features (", length(phos_pos), ")") + + # BOTTOM (flip) + if (length(phos_neg) >= 5) { + se_phos_bot <- build_se_from_corvec(-phos_neg, phos_features, col_label = paste0(drug, "_BOTTOM"), + map_to_gene = phos_to_gene, assay_label = "phosphoproteomics") + phos_bot <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_phos_bot, assay_name = "phosphoproteomics", + primary_columns = paste0(drug, "_BOTTOM"), id_column = "hgnc_id") + phos_enrich[[drug]]$bottom <- phos_bot + if (write_csvs && !is.null(phos_bot)) { + write.csv(as.data.frame(phos_bot), file = file.path(outdir, paste0(drug, "_phospho_BOTTOM.csv"))) + } + } else message(" PHOS bottom: too few negative features (", length(phos_neg), ")") + +} + +# Save all direction-split results for later reuse +save(prot_enrich, phos_enrich, file = "leapR_enrichment_direction_split.Rdata") + +message("Finished direction-aware enrichment. Results in lists prot_enrich / phos_enrich, and CSVs (if enabled).") + + +``` + + + +For each drug, how many terms do we see active? how many kinases? +```{r functional enrichment} +# ==== Load saved enrichment & build summaries (no list-casts, no count()) ==== +library(dplyr) +library(tidyr) +library(purrr) +library(tibble) +library(ggplot2) +library(forcats) +library(stringr) +library(scales) + +# Always load the precomputed enrichment lists here +load("leapR_enrichment_direction_split.Rdata") +if (!exists("prot_enrich")) stop("prot_enrich not found in leapR_enrichment_direction_split.Rdata") +if (!exists("phos_enrich")) stop("phos_enrich not found in leapR_enrichment_direction_split.Rdata") + +alpha <- 0.05 +topN <- 15 # <<< top 15 +dirs <- c("resistant","sensitive") + +# ---------- helpers ---------- +pick_pcol <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return(list(kind="adj", col="BH_pvalue")) + if ("SignedBH_pvalue" %in% cols) return(list(kind="adj", col="SignedBH_pvalue")) + if ("adj.P.Val" %in% cols) return(list(kind="adj", col="adj.P.Val")) + if ("padj" %in% cols) return(list(kind="adj", col="padj")) + if ("pvalue" %in% cols) return(list(kind="raw", col="pvalue")) + if ("P.Value" %in% cols) return(list(kind="raw", col="P.Value")) + NULL +} + +extract_term_col <- function(df) { + cands <- c("term","Term","pathway","Pathway","set","Set","geneset","gene_set","Category") + hit <- cands[cands %in% names(df)] + if (length(hit)) hit[[1]] else NULL +} + +tidy_one_result <- function(x) { + if (is.null(x)) return(tibble(pathway = character(), adj_p = numeric())) + df <- as.data.frame(x) + if (!nrow(df)) return(tibble(pathway = character(), adj_p = numeric())) + + term_col <- extract_term_col(df) + if (is.null(term_col)) { + df <- tibble::rownames_to_column(df, "pathway") + } else { + df <- dplyr::mutate(df, pathway = .data[[term_col]]) + } + df$pathway <- as.character(df$pathway) + + pk <- pick_pcol(df) + if (is.null(pk)) return(tibble(pathway = character(), adj_p = numeric())) + adj <- if (pk$kind == "adj") df[[pk$col]] else p.adjust(df[[pk$col]], method = "BH") + + tibble(pathway = df$pathway, adj_p = as.numeric(adj)) |> + filter(is.finite(adj_p), !is.na(adj_p)) +} + +flatten_by_direction <- function(lst, omic_label) { + if (!length(lst)) return(tibble()) + purrr::imap_dfr(lst, function(two, drug) { + bind_rows( + tidy_one_result(two$top) |> mutate(direction = "resistant"), + tidy_one_result(two$bottom) |> mutate(direction = "sensitive") + ) |> + mutate(drug = as.character(drug), omic = omic_label) + }) +} + +# ---------- long-format enrichment and significance filter ---------- +prot_long <- flatten_by_direction(prot_enrich, "global") +phos_long <- flatten_by_direction(phos_enrich, "phospho") +enrich_long <- bind_rows(prot_long, phos_long) |> as_tibble() +stopifnot(all(c("pathway","adj_p","direction","drug","omic") %in% names(enrich_long))) + +enrich_sig <- enrich_long |> + filter(is.finite(adj_p), !is.na(adj_p), adj_p < alpha) + +if (nrow(enrich_sig) == 0) { + message("No significant pathways at FDR < ", alpha, ".") + pathway_summary <- tibble() + drug_counts <- tibble() +} else { + pathway_summary <- enrich_sig |> + group_by(omic, direction, pathway) |> + summarise(n_drugs = n_distinct(drug), .groups = "drop") |> + arrange(desc(n_drugs)) + + all_drugs <- sort(unique(enrich_long$drug)) + drug_counts <- enrich_sig |> + group_by(drug, direction) |> + summarise(n_pathways = n_distinct(pathway), .groups = "drop") |> + complete(drug = all_drugs, direction = dirs, fill = list(n_pathways = 0L)) |> + arrange(drug, direction) + + # ---------- summary figures ---------- + dir.create("figs", showWarnings = FALSE) + + reorder_within <- function(x, by, within, sep = "___") { + x2 <- paste(x, within, sep = sep); stats::reorder(x2, by) + } + scale_y_reordered_wrap <- function(width = 32, sep = "___") { + ggplot2::scale_y_discrete( + labels = function(x) stringr::str_wrap(gsub(paste0(sep, ".*$"), "", x), width = width) + ) + } + + pathway_summary_top <- pathway_summary |> + group_by(omic, direction) |> + slice_max(order_by = n_drugs, n = topN, with_ties = FALSE) |> + ungroup() |> + mutate(pathway_in_omic = reorder_within(pathway, n_drugs, omic)) + + p_pathways <- ggplot(pathway_summary_top, + aes(y = pathway_in_omic, x = n_drugs, fill = direction)) + + geom_col(position = position_dodge(width = 0.85), width = 0.85) + + facet_wrap(~ omic, scales = "free_y") + + scale_y_reordered_wrap(width = 36) + + scale_x_continuous(expand = expansion(mult = c(0, 0.05))) + + labs(title = paste0("Top ", topN, " pathways enriched across drugs"), + y = "Pathway", x = "# Drugs") + + theme_minimal(base_size = 11) + + theme( + legend.position = "top", + strip.text = element_text(face = "bold"), + axis.text.y = element_text(size = 7), + panel.grid.major.x = element_blank(), + plot.margin = margin(5.5, 30, 5.5, 5.5) + ) + + coord_cartesian(clip = "off") + + # --- PDF saves (summary figs) --- + ggsave("figs/pathways_across_drugs_top15.pdf", p_pathways, + width = 12, height = 10, units = "in", device = cairo_pdf) + print(p_pathways) + + drug_counts_full <- drug_counts |> + group_by(drug) |> + mutate(total = sum(n_pathways)) |> + ungroup() |> + mutate(drug = forcats::fct_reorder(drug, total)) + + p_counts <- ggplot(drug_counts_full, aes(x = drug, y = n_pathways, fill = direction)) + + geom_col(position = position_dodge(width = 0.9), width = 0.85) + + labs(title = "Number of enriched pathways per drug", + x = "Drug", y = "# Pathways") + + theme_minimal(base_size = 11) + + theme( + legend.position = "top", + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6), + panel.grid.major.x = element_blank() + ) + + ggsave("figs/enriched_pathways_per_drug_wide.pdf", p_counts, + width = 18, height = 7, units = "in", device = cairo_pdf) + print(p_counts) +} + +# ---------- Top-2 most efficacious & most variable drugs ---------- +top2_efficacious <- character(0) +top2_variable <- character(0) + +if (exists("fits")) { + eff_tbl <- fits |> + filter(dose_response_metric == "uM_viability") |> + group_by(improve_drug_id) |> + summarise( + meanResponse = mean(dose_response_value, na.rm = TRUE), + variability = sd(dose_response_value, na.rm = TRUE), + nMeasured = dplyr::n_distinct(improve_sample_id), + .groups = "drop" + ) + top2_efficacious <- eff_tbl |> + arrange(meanResponse, desc(nMeasured)) |> + slice_head(n = 2) |> + pull(improve_drug_id) + top2_variable <- eff_tbl |> + arrange(desc(variability), desc(nMeasured)) |> + slice_head(n = 2) |> + pull(improve_drug_id) +} else { + some <- unique(enrich_long$drug) + top2_efficacious <- head(some, 2) + top2_variable <- head(rev(some), 2) +} + +# ---- Force-include Onalespib in top_interest (case-insensitive) ---- +ona_matches <- unique(enrich_long$drug[grepl("^onalespib$", enrich_long$drug, ignore.case = TRUE)]) +if (length(ona_matches) == 0) ona_matches <- "Onalespib" + +top_interest <- unique(c(top2_efficacious, top2_variable, ona_matches)) +message("Top-2 most efficacious (lowest mean viability): ", paste(top2_efficacious, collapse = ", ")) +message("Top-2 most variable (highest SD): ", paste(top2_variable, collapse = ", ")) +message("Force-included: ", paste(ona_matches, collapse = ", ")) + +# ---------- per-drug pathway barplots: always top 15, star-annotate significance ---------- +pick_pcol_plot <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return("BH_pvalue") + if ("SignedBH_pvalue" %in% cols) return("SignedBH_pvalue") + if ("adj.P.Val" %in% cols) return("adj.P.Val") + if ("padj" %in% cols) return("padj") + if ("pvalue" %in% cols) return("pvalue") + if ("P.Value" %in% cols) return("P.Value") + NA_character_ +} + +sig_stars <- function(p) dplyr::case_when( + is.na(p) ~ "", + p < 0.001 ~ "***", + p < 0.01 ~ "**", + p < 0.05 ~ "*", + TRUE ~ "" +) + +prep_plot_df <- function(res_df, n = 15) { + if (is.null(res_df)) return(NULL) + df <- as.data.frame(res_df); if (!nrow(df)) return(NULL) + if (!("feature" %in% names(df))) df <- tibble::rownames_to_column(df, "feature") + col <- pick_pcol_plot(df); if (is.na(col)) return(NULL) + + # Always compute BH adj p; then take top 15 by smallest adj_p (no significance filter) + adj_p <- if (col %in% c("pvalue","P.Value")) p.adjust(df[[col]], method = "BH") else df[[col]] + + df |> + mutate( + adj_p = as.numeric(adj_p), + score = -log10(pmax(adj_p, 1e-300)), + stars = sig_stars(adj_p), + signif = !is.na(adj_p) & adj_p < 0.05, + feature = stringr::str_wrap(as.character(feature), width = 40) + ) |> + filter(is.finite(adj_p)) |> + arrange(adj_p, desc(score)) |> + slice_head(n = n) +} + +plot_bar <- function(df_plot, title_text, label_type) { + if (is.null(df_plot) || !nrow(df_plot)) return(NULL) + lbl <- ifelse(label_type == "top", "Resistant pathways", "Sensitive pathways") + fillc <- ifelse(label_type == "top", "#D7191C", "#2C7BB6") + star_pad <- 0.15 + + ggplot(df_plot, aes(x = reorder(feature, score), y = score)) + + geom_col(aes(alpha = signif), fill = fillc, width = 0.85) + + scale_alpha_manual(values = c(`FALSE` = 0.5, `TRUE` = 1), guide = "none") + + geom_text(aes(y = score + star_pad, label = stars), + size = 3, hjust = 0) + + coord_flip(clip = "off") + + scale_y_continuous(expand = expansion(mult = c(0, 0.15))) + + labs(title = paste0(title_text, ", ", lbl), + x = NULL, y = expression(-log[10]("FDR"))) + + theme_minimal(base_size = 9) + + theme( + plot.title = element_text(size = 11), + axis.text.y = element_text(size = 7), + axis.text.x = element_text(size = 8), + plot.margin = margin(5.5, 30, 5.5, 5.5) + ) +} + +safelabel <- function(x) gsub("[^A-Za-z0-9_.-]", "_", x) + +dir.create("figs", showWarnings = FALSE) +for (drug in top_interest) { + pt <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$top else NULL + pb <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$bottom else NULL + ft <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$top else NULL + fb <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$bottom else NULL + + p1 <- plot_bar(prep_plot_df(pt, n = 15), paste0(drug, " — global"), "top") + p2 <- plot_bar(prep_plot_df(pb, n = 15), paste0(drug, " — global"), "bottom") + p3 <- plot_bar(prep_plot_df(ft, n = 15), paste0(drug, " — Phospho"), "top") + p4 <- plot_bar(prep_plot_df(fb, n = 15), paste0(drug, " — Phospho"), "bottom") + + if (!is.null(p1)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_global_resistant_top15.pdf")), + p1, width = 7, height = 5, device = cairo_pdf); print(p1) + } + if (!is.null(p2)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_global_sensitive_top15.pdf")), + p2, width = 7, height = 5, device = cairo_pdf); print(p2) + } + if (!is.null(p3)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_phospho_resistant_top15.pdf")), + p3, width = 7, height = 5, device = cairo_pdf); print(p3) + } + if (!is.null(p4)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_phospho_sensitive_top15.pdf")), + p4, width = 7, height = 5, device = cairo_pdf); print(p4) + } +} + + + +``` + + + + +# Print siginficant results for all drugs if we want. +```{r} +library(dplyr) +library(ggplot2) +library(tibble) +library(rlang) + +alpha <- 0.05 # significance threshold +top_n_to_show <- 15 + +# Prefer adjusted p if available; fall back to raw and adjust per-run +pick_pcol <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return(list(kind="adj", col="BH_pvalue")) + if ("SignedBH_pvalue" %in% cols) return(list(kind="adj", col="SignedBH_pvalue")) + if ("pvalue" %in% cols) return(list(kind="raw", col="pvalue")) + return(NULL) +} + +prep_plot_df <- function(res_df, n = top_n_to_show) { + if (is.null(res_df)) return(NULL) + df <- as.data.frame(res_df) + if (!nrow(df)) return(NULL) + + pick <- pick_pcol(df) + if (is.null(pick)) return(NULL) + + # unify to adj p + if (pick$kind == "adj") { + df <- df %>% mutate(adj_p = !!sym(pick$col)) + } else { # raw p → adjust within this run + df <- df %>% mutate(adj_p = p.adjust(!!sym(pick$col), method = "BH")) + } + + df %>% + rownames_to_column("feature") %>% + arrange(adj_p) %>% + # keep only significant ones; if none, return empty (caller will message) + filter(is.finite(adj_p), !is.na(adj_p), adj_p < alpha) %>% + head(n) %>% + mutate(score = -log10(pmax(adj_p, 1e-300))) +} + +plot_bar <- function(df_plot, title_text, label_type) { + if (is.null(df_plot) || !nrow(df_plot)) { + message(" No significant pathways for ", title_text, " (FDR<", alpha, ").") + return(NULL) + } + # Correct labels & colors for uM_viability convention: + # TOP -> resistant (red), BOTTOM -> sensitive (blue) + lbl <- ifelse(label_type == "top", "Resistant pathways", "Sensitive pathways") + fillc <- ifelse(label_type == "top", "#D7191C", "#2C7BB6") + + p <- ggplot(df_plot, aes(x = reorder(feature, score), y = score)) + + geom_col(fill = fillc) + + coord_flip() + + labs(title = paste0(title_text, ", ", lbl), + x = NULL, y = expression(-log[10]("FDR"))) + + theme_minimal(base_size = 9) + + theme( + plot.title = element_text(size = 11), + axis.text.y = element_text(size = 6), + axis.text.x = element_text(size = 8) + ) + print(p); invisible(p) +} + +plot_drug_panels <- function(drug) { + message("\n=== ", drug, " ===") + + # global + pt <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$top else NULL + pb <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$bottom else NULL + plot_bar(prep_plot_df(pt), paste0(drug, ", Global Proteomics"), "top") + plot_bar(prep_plot_df(pb), paste0(drug, ", Global Proteomics"), "bottom") + + # Phospho pathways + ft <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$top else NULL + fb <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$bottom else NULL + plot_bar(prep_plot_df(ft), paste0(drug, ", Phosphoproteomics"), "top") + plot_bar(prep_plot_df(fb), paste0(drug, ", Phosphoproteomics"), "bottom") +} + + +# --- run for your drugs --- +for (d in target_drugs) plot_drug_panels(d) + +``` + + + + +# Basic drug list +```{r} +drug_counts <- fits %>% + filter(dose_response_metric == "uM_viability") %>% + group_by(improve_drug_id) %>% + summarise( + n_rows = dplyr::n(), # total rows/measurements + n_specimens = n_distinct(improve_sample_id), # unique samples tested + meanResponse = mean(dose_response_value, na.rm = TRUE), + sdResponse = sd(dose_response_value, na.rm = TRUE), + .groups = "drop" + ) %>% + arrange(desc(n_specimens), improve_drug_id) + +# Plain list of drugs + total count +drug_list <- sort(unique(drug_counts$improve_drug_id)) +n_drugs <- length(drug_list) + +message(sprintf("Total unique drugs: %d", n_drugs)) +print((drug_list)) + +``` + + + + + + + + + + + + + + + + + + + + + +## Visualization +How should we visualize? Here is some older code +```{r plot cors, eval=FALSE} + +plotCors <- function(features,druglist,dataType='proteins'){ + ##subset a list of features and drugs and plot those in a graph + require(ggplot2) + if(dataType=='proteins'){ + ptab<-glong|>dplyr::rename(feature='Gene') + }else{ + ptab<-plong|>dplyr::rename(feature='site') + } + dtab<-fits|> + subset(dose_response_metric=='uM_viability')|> + dplyr::rename(Specimen='improve_sample_id',Drug='improve_drug_id')|> + subset(Drug%in%druglist) + + + ftab<-features|>left_join(ptab)|>left_join(dtab)|> + subset(!is.na(Drug)) + + feats <- unique(features$feature) + plots <- lapply(feats,function(x){ + corval <- ftab[ftab$feature==x,'cor'] + #corval <- ftab[ftab$feature==x,'pCor'] + + ftab|>subset(feature==x)|> + ggplot(aes(x=correctedAbundance,y=dose_response_value, + col=Patient,size=1))+ + geom_point()+ + facet_grid(~Drug)+ + ggtitle(paste(x,'Drug correlation'))+ + scale_color_manual(values=pcols) + }) + cowplot::plot_grid(plotlist= plots,ncol=2) + +} + +druglist<-c('Onalespib') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.25)|> + subset(abs(cor)>0.7)|> + subset(data=='proteins')|> + arrange(desc(abs(cor))) + +plotCors(rename(features[1:10,],feature='gene'),druglist) + +ggsave('onalespibFDR0.25Cors.pdf',height=20) + +``` + +# Random forest predictor + +Here we try to use random forest to extract predictive features. First we need to assess if the model can accurately predict drug response from the data. From those predictive models, we can extract features/biomarkers. + +First we build the data frames needed - I've included cohort as a covariate but may remove it. + +```{r random forest} +## separate out cohorts for prediction +cohorts <- meta |> + select(Specimen,cohort) |> + distinct() |> + tibble::column_to_rownames('Specimen') + +##for each drug, build model of cohort + protein ~ drug response +#removed cohort for now +gdf <- as.data.frame(glob_dat)#|>mutate(cohort=cohorts[rownames(glob_dat),'cohort']) +gnas <- which(apply(gdf,2,function(x) any(is.na(x)))) + +pdf <- as.data.frame(phos_dat)#|> mutate(cohort=cohorts[rownames(phos_dat),'cohort']) +pnas <- which(apply(pdf,2,function(x) any(is.na(x)))) + +mdf <- meta[-c(2,5),]|> ##have duplication here + tibble::column_to_rownames('Specimen') + + +``` + +Now we can loop through every drug, build model, and assess accuracy. + +```{r evaluate predictivty} + +#trying ou tthis function tos ee how it goes +rfFeatures <- function(drug_dat,fdf, mdf){ + complete_drugs <- which(apply(drug_dat,2, + function(x) length(which(!is.na(x)))==length(x))) + print(paste("Evaluating random forest for ",length(complete_drugs),'drugs')) + all_preds <- do.call(rbind,lapply(names(complete_drugs),function(drug){ + + dg <- fdf#[,-gnas] + + ##create the metadata df with the drug of interest + dmdf <- mdf |> + mutate(drug=drug_dat[rownames(mdf),drug]) + + rf <- randomForest::randomForest(x=dg, + y=drug_dat[rownames(dg),drug], + importance=TRUE,ntree=500) + + im <- randomForest::importance(rf)|> + as.data.frame() |> + mutate(drug=drug) + pord <- intersect(rownames(mdf)[order(drug_dat[rownames(mdf),drug])],rownames(glob_dat)) + + #pheatmap::pheatmap(t(glob_dat[pord, + # rownames(im)]),annotation_col=dmdf, + # cellheight=10,cluster_cols = TRUE) + return(im) + ##what do we return?x + })) + return(all_preds) +} + + +``` + +Now what do we do with the importance features? + +```{r rf processing} + +##get importance for global +gimp <- rfFeatures(drug_dat=drug_dat,fdf=gdf,mdf=mdf) + +##get importance for phospho +pimp <- rfFeatures(drug_dat=drug_dat,fdf=pdf,mdf=mdf) + +``` + + + + + + + + + + + + + + + + + +# Old correlation code, dont run + +now we can visualize correlations + +```{r check out HSP90s, eval=FALSE} + +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>subset(drug=='Onalespib')|>subset(corp<0.2) +#print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) + +plotCors(rename(cor_hsps,feature='gene'),c('Onalespib')) + +ggsave('hspCorsOna.pdf',height=nrow(cor_hsps)*3) + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.2) +#print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) + +plotCors(rename(cor_hspps,feature='gene'),c('Onalespib'),'phospho') + +ggsave('hspPhosphoCorsOna.pdf',height=nrow(cor_hspps)*3) + + +``` + +### Now lets look only at IC50 values + +There are a few drugs for which we have IC50 values + +```{r check ic50 cors,warning=FALSE,error=FALSE, eval=FALSE} + +ifits<-subset(fits,dose_response_metric=='fit_ic50') + +shared<-intersect(ifits$improve_sample_id,glong$Specimen) +print(paste('Found',length(shared),'shared samples')) + +## a full join might be a challenge, maybe just take two matrices +drug_dat <- ifits|> + dplyr::select(improve_sample_id,improve_drug_id,dose_response_value)|> + tidyr::pivot_wider(names_from='improve_drug_id',values_from='dose_response_value',values_fn=mean)|> + tibble::column_to_rownames('improve_sample_id') + +gres<-cor(drug_dat[shared,],glob_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(glob_dat)),names_to='gene',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets try to get significance + +gsig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(glob_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],glob_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + }))|> + as.data.frame()|> + mutate(drug=unlist(drug))|> + mutate(gene=unlist(gene)) + +fullcors<-gres|>left_join(data.frame(gsig))|>mutate(data='global') + +pres<-cor(drug_dat[shared,],phos_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(phos_dat)),names_to='site',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets look at correlations + +psig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(phos_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],phos_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + }))|>as.data.frame() + +fullpcors<-pres|>rename(gene='site')|>left_join(data.frame(psig))|>mutate(data='phospho') + +#combine all correlations +allcor<-rbind(fullcors,fullpcors)|> + mutate(direction=ifelse(pCor<0,'neg','pos')) + + +##lets count the correlations and plot + +corsummary<-allcor|>subset(fdr<0.1)|> + group_by(drug,data,direction)|> + summarize(features=n(),meanCor=mean(pCor)) + +corsummary|> + #subset(features>1)|> + ggplot(aes(x=data,y=features,fill=drug))+ + facet_grid(~direction)+ + geom_bar(position='dodge',stat='identity') + + + +``` + +Again we have onalespib with numerous significantly correlated proteins, and one phosphosite for digoxin showing up . + +```{r plot individual sites, eval=FALSE} + +druglist<-c('Onalespib') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.05)|> + subset(data=='global') + +plotCors(rename(features,feature='gene'),druglist) + + +druglist<-c('Digoxin') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.1)|> + subset(data=='phospho') + +#plotCors(rename(features,feature='gene'),druglist,data='phospho') + + + +``` + +Now we can check the HSP proteins directly + +```{r HSP correlation, eval=FALSE} +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>subset(drug=='Onalespib')|>subset(corp<0.05) + +print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) + +plotCors(rename(cor_hsps,feature='gene'),'Onalespib') + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.1) + +print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) +``` + +The IC50 result is similar to the viability. Now we can check AUC + +```{r auc hsp check, error=FALSE, warning=FALSE, eval=FALSE} + +ifits<-subset(fits,dose_response_metric=='auc') + +shared<-intersect(ifits$improve_sample_id,glong$Specimen) +print(paste('Found',length(shared),'shared samples')) + +## a full join might be a challenge, maybe just take two matrices +drug_dat <- ifits|> + dplyr::select(improve_sample_id,improve_drug_id,dose_response_value)|> + tidyr::pivot_wider(names_from='improve_drug_id',values_from='dose_response_value',values_fn=mean)|> + tibble::column_to_rownames('improve_sample_id') + +gres<-cor(drug_dat[shared,],glob_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(glob_dat)),names_to='gene',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets try to get significance + +gsig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(glob_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],glob_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(bh_p=p.adjust(unlist(corp),method='BH')) + }))|> + as.data.frame()|> + mutate(drug=unlist(drug))|> + mutate(gene=unlist(gene)) + +fullcors<-gres|>left_join(data.frame(gsig))|>mutate(data='global') + +pres<-cor(drug_dat[shared,],phos_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(phos_dat)),names_to='site',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets look at correlations + +psig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(phos_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],phos_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(bh_p=p.adjust(unlist(corp),method='BH')) + }))|>as.data.frame() + +fullpcors<-pres|>rename(gene='site')|>left_join(data.frame(psig))|>mutate(data='phospho') + +#combine all correlations +allcor<-rbind(fullcors,fullpcors)|> + mutate(direction=ifelse(pCor<0,'neg','pos')) + + +##lets count the correlations and plot + +corsummary<-allcor|>subset(bh_p<0.1)|> + group_by(drug,data,direction)|> + summarize(features=n(),meanCor=mean(pCor)) + +corsummary|> + #subset(features>1)|> + ggplot(aes(x=data,y=features,fill=drug))+facet_grid(~direction)+geom_bar(position='dodge',stat='identity') + +print(corsummary) + +``` + +```{r HSP correlation again, eval=FALSE} +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>s +ubset(drug=='Onalespib')|>subset(corp<0.05) +print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) +cor_hsps + +plotCors(rename(cor_hsps, feature='gene'),'Onalespib') + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.1) +print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) +``` diff --git a/cNF_Analysis_and_fig_descriptions.docx b/cNF_Analysis_and_fig_descriptions.docx new file mode 100644 index 0000000000000000000000000000000000000000..016a474dbadaf63777738bfd1972473b9fe079b6 GIT binary patch literal 27471 zcmeFZ)0bw!)298D@sw@bwr$%syKJM&wyVqNvTd_VUAApc|K4wXYi8EWfACHoXl@E6IX^qX8fR&;S5{7{HR~sA&NL0F*!g0H^?HP+d_+2Ul|kS3@-~Cvz8l z22XoCq9SlmssaG$Kl}gJ{u|FgQ_7^xAQO`KbJ$nNe9Nj#2UW??cu~ALuGJez6q|zB zqeyYrSI<5da|szU+xQ64?1s-QG;_%O-z+;AoUpkxDUPTIAV1Ns0nY>M3! zrY+%djzBxMu5N5O4iF~N=E0+Rn$cvqIe}**$XllFkv{W$4>=B&aDi=_6{4Ozy(gUSD9OUOU zs^=VL15yU*FVIM+84(wBJVcECV)ad;vh{4Z`Yx3a#!!2nACn>+~;aayncjVSz6AMl?%4S$&1xiB*Pr~iMl{BNwq|K-)I zlluS35>DtkD)Tj8Wgmyv>fXCs?EyRC*SHSsQ#(3iIF@f(WDv{=ca4^%)7X^Z1`sw9ZVt2PsVp& z3q2H_uj8mAY(b7|V(liwT}Rv;ec}6mLF)h1N5B%{+$<9SpzIFjx=9*#fE7|k3_ z-R%Fxu>SmLMPA0@-=O;JIpnlLzM&JeI1>(hrwBNXd7{%`-yElf;KV{0LR4l33*vQz=FYf_u zSzZglVTrge*X}7`UXms~_*}JuUaj}%uXXcViAAe-_>(~2Cx_!Kao+EnFTwA!-d?^R zpuCyPf@wz3f;=5L>(}n3&U}JAd@&c)+1m5{Q_X`!up_rY_)`cWx2w%-yXHtd5tY?r zuIz|l+2Y6_dDgE$+P}zyYnqP|E+XeB1|uMkqCPfZ%$*aXN>_YR_jN9-1_V;wqe68e z>e>V{o%c#3#4|S5M4tv={ncp_Hd~Ir(4$$eIFmZ~RL=cLttUyFuQ?Y`c&5{k`7&4u zd+?xkr=oUExwvz*cUR1|0)vhiNV}BdVtw}}H>wTZ|TYx*7lTNW6&yY4DSOyL9 zO-oC{Y+aHS29RwznqBYziGO+DQ+$HFKl`Nn&^u1hA2vWMyAF3V8a2EIj)*An!ev4^ z!F^br&9GiEbzWQD9J1k?IimtFSQ-7Ue$@UUYj&%ynQbK|tebfl-kRnUxX+eUJHYAu zDQf2UCsX8=b1`!Y5OPo0gJ}iZr*6ZN?+S3U1`JxTBSyQ%v|#Pe%~aryI@>0x^B`=a z7@7iNyX;H zvUDV8_P=Z`fAgZTKGpN~Zl5wtmn}Lu;VWl&g@EqZ$`~Y%fV9Uez^FJ^^n>MKHgvEr zp44+B$`13Xoi{!NdNkMllj>d0G3KS$LM))z@w6|izc;=olS^rLtMDar&`xG|0z%-~ zmmz-d84vG@^GJ%Y%gD@Yq0p5DzUY+=VD?iRpoK2&N8|%TrOwh#7NrdhO-8RL%fsBs zr86~pK&!X)t*he-`wMOt2zizo@|6#uT&;OBt2FwDHN8J64G%5 zzW6m}qfIV23l(2%sS6tfU(nug_?8fBqyd+Ht%mrBh1%SqwAo;?B;T~q{#E>b+vPlP zmC{2frxUSdyQ}&WAsnqIwJwH26!Z?{j|sSw?J%fcT$v!6QAblF`B7w0{w|3MgU+L- z7)A1+Zj3o6!tf3y6%A|ix-}ElPK6ze9ZK9rPrzM%P6M2G!6~v_ciF-jOPH{NRCd5xYePnDTCKA}O7%weOAAl&UreG$ zvx7$~Or+q>$emwO3d8FAG+>y!jfyz-=I%(@h)UpAlSnd}pse^PsDJhO?uO}mT{GR{ zbGZ8*T_7V!wjB0P!qnRXBSFk0Fdckt#W+w#7~n2J?N^z=HGOXBolzsHZj!KESs*8v z27Zo&lszH*a#0(v=1zc~5OlvSXFnN2^Y7?3Ij<6o5?LX7#q6HevcU>t5#4G zEG8b(6={=poy_rosxHXRb+Dlkh<5o5VGcH!z`{4~JQcSCnK-z!i0s^DD5;F?JV; z-awOJSLzK8?XDa{`z7$GJXj@S3}l_&N~DfoEIceaOD`;w_W319!!WSCgT^yK$*@ul9y0W2#SD45JVgj)tz}&fTHJ2M$ ziGt5Rcg3ACU4{YY4@|BwUSm^@?0-Nwnt6fH9v__Uj|X7PNEm?9TtA=Pl$Kh!XK(J- zO=WY4o@|J1HQ_|rlx-&R9u(CIZIP_}`tf3Zhp*%bm0me(W_W zwgY}GgQ?Kib**!ISUOn|fPwMBTcq`Z3%}S&PR7sw{^uSCE?-l_{CVJeGy&4Hxe4?X zBy9^G{yUAM0&E19{LlIzYdMkef_Xd5(!5Q1qAT328mM|z6{j+jLbj?wJ2A#SbEV5Z&_+zk0!=rnkRLVj?_xmtuV zlwTLy5|dpGs*Ide93It1x{BVqz@?DkN?b4eYD){rLs&Q`SAi*^k`npj0zd*{WzA+- zAcK(MImoqi6PavSG5r8z%t$CPvRfni8Jvzas@KGvAc_dN^QHGt*(b}4`nAnH z-gts$X4N85j|q>)5V;5gH=W)SZ*T&@M3|I^qQoT(=b6_R=olpN*jwBHvfK~ECi!A2 z5gm)NjkWVZva1MT5QbmxOvKMkMJ&hK6&IO>-rm*&RoZ z(Aiom(Ky9cd0(MCf&z2fnN1q!SYA3qODYOemXB{YZ#I6v#I1TL+4x0-ZLyy=zFwiJ zn<VZ$*GvATi49~4cyaxb*2>*7(}Q^ zC$yznsXCp;dgwS(s%_hY_cC(ljjAtcrSIN^B8)DwWH?DEMEHfwx04<_VcoXGb<+i4 z6*}Qgh?r9}Or;4c@sVyLD~ksc^dZV^XI*KOYSE)_alndh`=;=`c`rNJ2F1P{%BnLM ztNzVq@gb0U^el|_Lpg;u%Krq*34kR4W0c92E@LYwUBF%k0aV7D7J|cN7JkMAO(pw= z2C*{_`k9FoqwIX6G)BszB8^Pnj`2Ecr72Yl<@H)g@)KyC89j*7kxzM}pxovJ4{_%HY1@_$x#yfuKWTA&~EnUYifO52Tsv15~iyxVSko>z-5OTp!ig z{%lcYYN4g{6GSXXchWw%xlJG+(JAM{l+!6^axbmB`;TK)=;&h;Z?Kpc9GniTxoLzB z#6}Adm;FFF@-IEq@v!e0$TlbW7j;RTaMeOf9EuZ z8Qv=i(NKf-Y?^~EcfP;Qp(u@!Vl>`cn^RS|}Jw@^EpuXDf4CdZDHRK07W==H9xN*O!|2G?J$ z&b>a{1^GGq9?ZDxA#cP6$RQkTNXq<4lVcqx;Vb*0m*W1Ovc1c|i|M!5=cbupy-vRa zrz<$65?5YBoZ>3%=<{@*pyx$(voA|!7Bdosw3UnD#Jp$t@05M{Er(O^jf*L2h|z-<_;8HXYm@a0A7e^wve0Q$<}WA1 zOABj1n!pWs$)cOK1_yw;%TAcfHy32e0{a9aM4wEVQpkt6?Z7`;QLlK85y;id9)91S z+hSZ6gD@oFubm>!qCL>2S4kR?NB)1B$b6xRP9xk`o4oZ&KfA5D43PDJPIuroRs05R2Gahu3cKPTJFo*@Z=xp_3@Y zcLp8@C}hOHKZr`fr&yRrT=6l8ktWjw!Dx_bz*&}dAtDybcp4MNQ1lf*2W!!ZGiOUS zi^#$^{)X3~x1WS*$uVN(4yyaPR5Nlbks(YxN-V2AmI1m#_9R{^ca=`PVpA^qC9^(< z)*WxwLHPYm;s67zCb~j6+4h)T>mW3#zhEn#Nwsk*>DKaPls4nr8E_X5lJ)4-a z>4_bQhy|%8aAjeY?it|}% z=>;hDyi_kLG*rXB1&7RywL5K`tkEmi0mN5;z21m>Fd=4Z49_VArVc8IR;r}FVks&{ z7Et-%y%OpzGI;BF6wcDkfmQfDc4|Rkgp^kgJkEv?h{*>0b7yRE^%(zXy-CJ?a~mMe zUyV6Wb%JS0G&U9((?t?j!hG8Y*zx-+mZSDH;S`FfJ*N8Gve31l`{BCS8V9TfMt{=CK#^LtRNGE=7^JwV^Vd4&2YsfskF*7) z=0}I&llS1Nj#1COgA#=tEzC8vE1}AwdX5|;OmwHLIlIuDX<1w2_je_1 zC;CcHUD+6_4S))dWtquIVHq`G85(SeISo&d>r^_9)#}l?s91=F(bGzO9_rJTgCgE! z)!^TAOtm#WpT$@nGhgEjeN8=P20sgabp!N%*sOs-B2xakj;n?Es|RJ^&?$)`9B;w=Tz44Mj+s-f>5-_Pbx^$|bR2rIhL0?>=$)%m-VC2{Y3#0`-2-zCb5 zTiMiW>c^%H6?0MlDzUZms?oM5L@h7B+hD@uzIL$dpCjDqK64cgL$gI+!gTW&5@tZK z`hI|xW_w!&jB7LF=V478)VKX0!Z>5p22MVXktJ3k?d3>3AwYp;(r8Ip6m3+xUa}x& zS13P9YBAu|3H-Ko2y+O1-PdZqd%q6YMb_dfzV7{Q-ec6CW}3{tQEFJpWPCQ-Vdkx z>2W@ltp4EFW81}Dwio+Vtn)fHHDTy+Et55obR26%17=<`1Y&m49#5k*zjo6dp1L_L zZ+;pwXt8-%uGHN>P}k{4^xSk4`MF$;2)})tYKtek{(QQkAgQ{ucduPsfsL;oaZV(&U?~b7;ND#~qz-B# zZ8ncCM|bl?zK8>{T{nTqZ>&QQ2FB8rGw5@=u!D+GaESOQFb?aGY@NqxR#L;p>okpb zu8RU1Oz#()W8lCVOG`eCO ze%0!V_`XclG$N-L@{$?1+4@12E3^$-4;OemNB(y5((+J4m5Y9tf>!*-^o~d2=YgU} zjPht4il+YFPx4;4Lj#?hC>yz~h(QgBK#`WO?{h_$J>3WO$NeP*st4cfp713w{*)2U zs959{6)Y!Oh`g;-&G&YglgJqO?$igBU<~$F=x1%kZOpM&$PVUon~Ti?HOV=@E*kjC zhwBNg;uFp`xOgSV&RaJsDpaBD{Ty;g?k{0aXynI?f^E>~3>%MFfEk!YM0E|O*}ZTL zpvL1UoZJVwp!*nzXf#x7wjRrU>Un7Vb0u{)OF?urs+?hRFA4QDUY138vzu-hsI==C z9#0rQowwjHDKD?*k>ZY;e=~YTrh75U-QM^5M-Ws|x^;&z{=0tOW<0zQl>c+U1b}`4 zrRQ($>YB057+_WI4WxJ;A4p;GK2C-(%X1v0JYsD4jb$(OM2!&Q2$J^jb`ZB|S0&hP zCUvwToPj)A2JpaN1J|1l1h)(dSpjYwVXb^qXZB0{C=YnER9DT4iqp)hw&5}Ghg3qo zDPiz6I?>%T4Y2rAhyoQfNzJemWkcrcjz0tvC!W7KSI?sJ@gXa!o~Z5jMHZwG91ySG z9g7)gLyCo$&jUFYgMvHvcN7sQW;)vuYLwvzOD=Gga4tU)Il9a01iT zPd+8K^NG32E0}wb4sAuU*@dc(O_{dMb(zO5K){Li27WI)52<;Ld73a@OrL0^4!Y-&N}j0wrXQV{r56yd2{?S6yAQyB3ZLWS@a?R0yG^HJ>UA zlrs@kOs4H)yZyk<3&57hbyIX}Ut133n8o_&8npO$Waab5t~2i#ih?{v92g}N9bmNG8Vua(sapWj6+SvC)3mw`hv#x##6kPzm`3(eQ`; zBTg%5HNxc_cQWzRcMeJzjL3h60>jJVhQerX$W)*n^@B_FN~}{TpM;ZFWX5^&)odo< zi?rZ?+s>H@)5_&)MmPuNb%o04S9}LBh4SPM6CL}}3qWbxG2-%4p_j*th{63Zv)g?E zEP}m--BlF=!s#>eF3m7U3HIr4r~5x{;5i56#)3^5UDF{e$`k)@jX=(Z7w2|uUe{KJ3{sg5_esLMK z5J(Qr5C|4T%$a=_*z(?-g|kS&;;{2u_ayrkG@2$G?O%0hIR?z&6Ad@hPT%Z_nARZVMBL=#_9T1 zwsq0}>MGdF9el?n&xM$zh?HPQX+pB1{^#^=!3r)?9j61TEfjaj<8@v$<~Dy3+IIeQ zCWI_nF8Ygu1c@tXo2*DsL{U6`TGnAb?Sex`p>@0e)?SKWeJIR=LspM2g2nIDoNbkG z{G|&UU0+WOUnfd|BUOw>SWneRDtAjX$wnvo6nP66oG!3l<_4y&)(l$o!NCkD<;tK? zPURN*G@86w{)@m)-i!R@0SNi?XDFE}{Sk=vMeLKPAgsHLwmQ1JZ>_ev(pBS;4P;ue z)8*CmegL{D+SmPyos^wpmZcWWlHUV>@?OkIt;TDayH1Wzx?PD`( zwsem4HoDP&dE8{1tU>K6!F9u&9@d_&qk?(Np%@%5z1sY$YNKBl{g zC~jv484x#mB!@4``5+Xr3QNh!JVlwN2b<+?AWw()sLM6?09p?zW@!R3_JT*Vz+GyBM7ldHeDS^<^`D zMp@&A5&rFV(9&dBeuHRVYTWUtFa>Hn4i4+hw?Zwzd?@qSZ5!f7cX#)7Z_W#6xXZLz1iXh`hJwRDC1Eyb`cjNW-Ce7{dvr@cF%T$ zE7VPMkxXmE*_-ZHQjW1G;ce%cdvMqmTJA+ZY%ThjjHD)4&bybr9`WAS<^F<7-Vd-c zyPI>M!;`g7j>f@WgNc^Q*%sV8DD^I@bPPJChESw3()(KP9@g| zi|=Vf+)=D35`H_aCzpO>r0N(xJLw2>G&aIX-g&`z?mj|!mGu7wS zucVK%tG5H&S4nov)+&-B^^HezZj%TS58mJo-Nfra=qVuiq#S}{xD0@1B;?~LPM z#>wibNuD3jID%_9#YumO~_Au!#Sh8$$D4b_{xgfuDbqm(jo%N{E z<5?+xTD@3VU+3^iW~o1za(HrYXMQtD^IJ!G7S6&$yCz1@IWbj+WpL$z9`P)=hLJ{Zw=_q-a;Iw6}|az?8I5JR{7qkzd^ zR}Sz~ur;49NcYh)ulB+BaA2b%DYEX4ua>h?rGj0=CU(pbY0-LMtzs=NyrYmDHKs9)BE)B@&;8)!hPtw)WlU3 z`VYCR6EyX8_lw(|8acN8Bprvx_!8r9H4zTtaVRZB^4V%E{Se5M(y0pzL3McP0CT<) zfmZ0ogYeek^J-}^t!hn6!%q$jv$xj9_-n6?JuD6ua&^05{>UvU1Ao%n@C~wND(b0w4_F`!9eL0N{WA|H2u;q(o zDN@(jY91-=U$a*{7X)IHl~%GkicZmYCb|pfSIj>tcD5+i`S6EUWPq)Y%TyGZu@iM- z?y4QCWNi)xXae?rEEMpj?WWl9=v%HA0pFhijLaK2tPC?{X!_}QZJlotHXjJJ4?(K> zDD}gF=RS2`vuu@N!Sp|(@s?+gFXBKY7GkAL+XGPQDbQf<6nM_}vr{G5GpDa8kx}ky zv7nRu+lP+x0u;~1YA`&7W<3z`^t|6ad7Ig?0R9BU<| z_h_zNRX5nG^wjtIwQR6y;YzRcq~rYH)mReoQ?Y|E$8$=u5J`8)sWHIfSh0BkhphbU zuhOY1*8~N2Qnoy*gU@jL) z^)O0y=Rt`$c{Fs`4oGj$a%QBQakYLyK#xI0{rP1ns7j^w2#wnuOL(R76;b_grTkFk z66he2B=UUR-0n1V{nDS(^vtnp3;6Q|Si)IvS4%1xr;&HM>cyYp9Iow_8t)elOdc9% zbMsXLum5)9hkP0TX7~4kgAUV3r@^r9^N$F@x0|wbtm1)88kTkJ@DxX`p9d{V68(8y zDp{UI{N?Jazs*~+Kc9?b(HUjXyNTjLu#qKn`YqXwFJw(;b7x5ABHzSz~NXRvBDl=Ke(GooFS5spo#hunSb zLv3sT(Xi=K=;?kK@gcHpT`zIrqUoop>(_$k31pkygLyN=F4wJ})21frWFy?#7Y%k} z4WGjrDT0YDB)0I)ZaJASSF6|x`OF{#?YlJz-jh#9q zZtHd?;lt!@x@TMi8_bCpnrJkyii|QRl6~~3aUan)X=t)@w2cr8+%^e3zZ%AI*| zJJzh-sGq!jcm$*87_)0M)-Q<7V39Q6R{z?UrxG!6>gxZ1Sqbrcy|wq>*4;}Z%!m(b z9e-?L!8OXg;a7T8z??4$wA0yUOCc7Y+QMD?`&qg${YEdq*wCH8=*sTGcomay5YxI`9s|MC5gn;mvB zLja1Q?<81%KmHO5?o^M-*pk92?Ql4(p_l6M(xMb}RK<7{Pc_$R)@IP(3$zU8e%rZ= zEcMLlAB|MWsXboNCivYq zhXAWu(Itd_u3Gpny&V+668+Lf+_7s$mDHbCTf-d{@Y(pAY*Q>XXEL(*O$P><8k(ny z+0s=W7x_3{mRqc;>TtXQEhkfko7U^J37}yJDi!mlBjuz+~?u zQAFF0tkxj|FlJSw5k{!VPrZE4(?zoT<6a^Qb^ZzROKa4bjTj_b-tX2Ftq>GP`iVzU zWfKD9`udcYql08})FohW9+2Aly<1I;diW%f5knW2tW6jOJ=@E;dYId@dbgmt+G0PK z-nKd@7Z)%o&*h2y8z&QAU9D%mXeDI+9Gm!+ATBmuz%8V{23l4rMXk20WA{1WlhA-k z*(^qEYkwEK1f2WKCF^L``6%5@C$_9kas_Hx1pN{8o z^yUz%0L|kn>=wTEt8g#1g$t@AmRAq|Y!26*_fzK(;y-2BA7zBwpO3_9hohWDkFZna z#R%)Gb{Ch78XOV&@~0YEXXTqSPUR{;qkQXU(eR$BqV@L#hh`do%1DCd34)Dv&UGG;Z?i|1cQpqIan*RLYc4Rxeo0%nnAjGAK z&#))&tM;`BJNZ4-qgT|8A)SKu+b?$|ZAaLCSYUN&qK^&#K$CTnH(^Ro!rmx^x-92@ zuW&q;MGBycLrqEhSo8LC)g*8_f8P&gwIrXzpP|lR2HCj671rq599fV6vY2e(A)QMq z*4N#AEHcA&z7+fQ$<4U;xAIAx5LB0*e(_-cC_q~8MgO8dtd7*R&bGOCLHE*Maq;|S z2UdAYqT8|ptteg|~pVV#(oedmWj&BUc61SPIOdnl1h z(J4h2f5>~%c$=`YPb@F6(uSVdlGyquinxL1uTXHPe(NFCNpqQ#LZ1Vh6*K*K(Y#@M z8q{(NHT2pKg<7YdK8}=^`!H#-5yz*_tv<+KuJwCD~?9BuLXCknQe<#_n7qy&G5s^vkkw{XjIFXy3qFM<}$4E9ObJyMz{ z-W;a;ev5ByjDd}FfHr)(m?%T=P65mC?pPp%M3l*mmN zu5Ojm)JvReFVu^kH1DnC9TEN&-c@V{Po8w~m8e_z-d;T)XeD$G<=ylYKe)`yP8Fo9 z6rd81`s|)MYGl|QiY=c~N@Jb|=!xxNDgw=I+I#+1QDm|cbOzgNCVp*ye|AxBM%4Z3 zDLv~p9!{`qim-{-%=1asdC_S0B(COfW)OaB$Z6uI{%NX#R=+k@{P)wDNV|)CF5sh+ z=}49pIMhtL_4wz}WU4S>hdW`9E`9PKQA!EcM5$qTZu|TFI}we$`9@U%4cfaRUzx|H zw|-=`!88HjSLt)^zi%-8qi1;C!F7W$r(UqiF;5e;1)Ovs&9zE9D%gC<9X|QrX_IOU zPqz?2>r!MsgI;hWOGg7`E6~>;-o}2sXf@Y)tMQ!FFxzL~u1<4gW<@S5^SqTZrzKr3 zO9$wxnp_zhLtqQA$a)c=ITor)Qf(?QVdN%5;!K*;%(}IP8~$m+rTye=1TaP#IlzS% z0cw4?;kX9_;^S_*A#+*&dA!{SzA|G)`T$TGfi3JTA1^KvZ1A)f$Q&I2L`ThO&Hg6QS0XwIm%&PM=7`RrfV*AY95#xisqTU~@EwkGWc6=M-(V#-UHUNaocGf}GdC4! z!&`#C1}Gr5{JZ9>1Dw6N=NMFJ5oynw^27CXlcL(px)!-x6N=6^D-QV^7_?o5 z5CS z;K*(Vd{4#N;8tLm=VcT{pJsk}$MJKhr>-i3Qr};q9yowq750qyJ1U zzA1RVG&QaB?mpc97`-nJhf^eZ+LeWGSc30fV{;?>4Z2pjsWEHiHchiu%h$1-U@yDU zskX~asq0)N)S|oBnfYP>k=uO@p;b*8dE6T5Z%dx9Ii=7Au%NBv*`|1N=zEj=*p_S% z$tfD$f!c7Dqmta;RUhmS$)4nDvV(<3hPe~*eEWL;-pfmUyqW<;^uuo=P^`(2Df^{9 z8PIpR-8oPHa@OK%Pn^eDA9l@*09snJlz< z+AbFqh2xk7`i2GPR(9K4S|*v4XqGA8%k)78Zm$zOk-KQKA??4$wNNLHudbkFjXr@M z!JpAh=Wi|PTy2U!89%HD-_;3U7Uixid;&C4men#6SARZC9g^#6iN(epgGgldkHbwi zNGR`+Gb54?6$m>x8X#aCu#?syLRlGm+7U|SS0iK04k(RLJ&G1p&Fg5@M#RoOd3EQ> zV&-v~o61p+lCVKDD8HA$Hbz|KYCW;q4fE_$b|1I}@L?2(HlkU*vh~J7U87}irwt(E z5=Ou{mJEc5+CoHg!n>@hK&E#2n$FoF@{Fid{-rhdY##N;*&4$h1zX(*YlE^FFOE;K zIWtD)V|!LQKrB#!3$B7D-VxhG;)@e|Jnr9bNnvGcD`w6(4Syw3Og*awk7Bx-qMK_#?kqtFhbC@o!Zdjf7`HV_NRHITR6sH;FzoHu zXTz+BK61x+4MbPdI#@ugUL7mhj@f<=@w|dy&g)YDMyZ9)q_z8`=f-60f!z}?QZ!hO zr?(wQtBF)mB!NVotQx`y9TB;aHv5))4 zqHIKF5&+QF38i%6PG#PVj3UPiALXJ5b(VOrtn1=ymo25d`(>WoaHM&lUTDlc!=irQ z&p)3HPlP)>9g_FY{8_?qYRb-7+rOz<8T1qIZ9mD?>tZNvT=0PM9<$x$weGfz5$vVA z!P8rT8ROoqR{AU9gr5g;EOG0HJ%u8>`*s!b8dY7HtI(+GjNij#*q+HtMRZOD3&Z}L z^hGl=C*SxWUiVB%{;SIxovnTxnqT1%r;>O(kde9g~8e9$k?Di!U8OC`Z zU;J-7uKl%Ux&_v*p3$2q53 z(m=xXTYqDMNmpQ;m6r2W6+-jFPyD&i^bsC(J5V&}=k0pm>jZei)5g z7j8cp?0kv2bX|}pGmy|MZb6+=dM!9cZ}5@}>%(ESD`7_0vO+wgU~a<>-}94mGANP! zAJ^{)|9$)a6QZ#U{Aiki1OTRq008uV7XP)3T&>LQ%^ClzXZ|mcMteFMmlLfQ^Gz_a zotGfKH{CNtuuwSG;;RVhz>FINxzxVa&skY1G-;jhr)FG}?hzM{bXZ z5RpL;y*?{Ym{_GrZK5l^_0NaoDkn*BDzSfAf-h7{j!>j$nbv{n8vH9dPO(&+JTp_JvSc4Bg7*jquhX3!(91agRm=n(UXDue+8n zi<*6PxGqTY_|!{wgse}4r9NAwJFU%uR{h_Lbhw`IG6c4|FEYLQ&u@?hA)vv&9(X0x!4%Qg<(t)XS*CKMeguTlrYI?(adxKe@c3}{@SCdkFqc(QuD7%3qH zN6bM$s3E2zJ@-z73No-W7dzR!*s4L`p|dn+&@ zYaH{ty}vFFJq$lz7gd2>(>qi^!aXEsCtz+cFIx(*NiI<0;*u;JWJvp``+*CVFuAGl zv$#=#BRx?Nh8|R=d)KI0M;*xT*?9N?Nsw%y9mp1(2$-i3^{JAD{-Qj)<}&R4SgYd9 zk@f~Yw4$Ah2WQtt{wT{;y}UW65ahHdi9UiK8ZpQptXyI<^Hnor@EqD^-2Pbs;J$^)n4*6Pfmb z09sJ&!F+Hme25D%|0Su2a6m9u>-dHRFyu;}>OO7h63PrYON!!VDtsd%#UhcP=NytL zrY6T3&8$BEHWN-68lj>tso)4f+5Qv?>#*aUKq9&Jx%D(Mm&%u zY^=0vT7-1C1-nH;((($ousa?psLUxCsqH-zUeR>$i24*aXo|*X%px0iE=GMR4k^7k ziN=90Zg>NFjr3p8CpB?2VLyvzXps+|%Nh%G96I=H#|%QB%Km z9Yipz&{P{2pDu=s-CoW$QBhT7Nd6?U)p*M_Y2Y72%c?9MT$a*G8}2}7_N2A`%hrspHeC>U-OVDM8r zgJ0XmdveglA9(tv9;+y@*S4IQzE`+2%f9j#yyzXO!1|ohUzRtoD$TRPa_w~auC3~= zI7Ih1feLBQo|Q8T9@#2@sHW|j+IRB`ili+a{%&afsug-e!vG9eeJp zP1_HtarVvl*&lDT=VolVMfGE+(%C=gluhvc6zgo}+R6OTtfNc8+z@X7Bu z237>gElhC>{i(2?B_zB`n5~n!@9C$J-pp>VZJ%)SDjRwygvPcxHG^ayUKOFgWUtUS z5>Ci;z|Zos91_om%vP>Kh*a0F(A8yZJw4Hq@K@BRav^Cer3q+eTGj7PlVb_G(l<|s z0-eHk;6~iE>0EfM+zehXe*&id7^Y{dy!=t9Uy~;LX)}K5)+M<>MFK6mWOfzwQLl(G z>G9kVaQ^AnM$`N|##-xF2LgkQ@#k01!C`MNl$Oqs@iZ>mtFI|Wo_Gj zxAZ*Gm9=v9f2XXQ%y-eS{`~-f3JLH(DN+}6S66EXOPBuyzb4J?_-rmDf5FqQ2&&n1 zct|*KH{C=+z1A}W#c7S_i@;#+z zX72(K76bkG`BYYdd76FhD)*IrMvqf6ocLcP>I$$1^XIpT6StkMz28qqNV1h^<>Sao zYAg(m=eGtGJb4P{EoJ11nq8&nfw~MbMLWtDrR2!Z^8DkT+Ni@E)kbGrZupc~9&8`p z8<8|uR21Zs;OV|qP0~}Xx@q%Qzgv-hueGvk-&IpvmrTW}3IlI~G=I$6tTm7P1Q(VdWZaMBiKEuayRZ9T$=ewk4DT!28O`A)9m|fU();}^`fYvB%8*&* z8-T5!UtG~b9(D1PPu*;?jh?~uVHRs0l-AQdU`Cn|h$DbZFi9M7SE8F%=};JNM;dA6 zBcZuqjn6!zJ8Y}uMYQ8`!b6{P`I9KK$V5LGE^Z@OQ|6{H^kJ$=68P~j^uZIKthyO{@r z4dQs^f>yrrLzEO9Ok$BDU?Ak0TZp#%o^tGYv%Zv?yfTRCvuIk5Vj*~`RSufLp%U-z zW0K?FBcs=WkVb8ZyfXD8Qp{XakMGWWiobIteSu9lZFKQcO}=w$Mx|fJKN6UHx@59F z3h*-;X5jr;V-#?yRj7<3gmB@s=3hDqtWfXg zjC)JBTIg?})Z=>c!W*X!klpMKm6U>$g7lTSU~3Labk+ zFVO4tt0E8@9T_*e@J_L3teYGKnY~R5b?U;gw5I2tdUa3Xn)x%{Lt$T;dnQ11)6F&T z%K?gvmiYDLKUF4MyQp4*j;^6T!ph-;H?k*w0HMm32 z!7V@r2oN+0PO#t!gS%^hJIS}t*?Z^gv+i2o?>n`oS9iVrJT+7EcFkK|PgjAl{09%X zpxcBNTB~D$6Izx7Z>VNJy7OU0KCgiyr?@7~@+h!z`S0PrqL&=?^G0Mreac-N@1Fu7ZdCFVPqg-9+M%}s*R#gaCW+NqhHU%%q~s{@IfRT5+E(d?#iinI(rB)5m6eNm7?=X;;>5 zYQ=tY_liw6Mg^q%L$7S5Us`9BX5>6xW5#WKZ=WN0Tx;8zuoU#PD7U?=gz1gMfJVzH zVS`wh(X{RNS^|+zSi)h_I)|CI*ctN*1`BOu5hE`{8?eP6D61~ zb$aJaC45(&-&JRFv#JCZ1KHrU33-gZ99e7FeJzbx&3`5|c}&Swr3fY?E`t`tO;a3% z9*TK9`nGQXqU+c}6|3K{(a*{VX!mxXCVO06lOiU@LKs`7z9_he#wHh7rmI{}=iMJX zzz-6e#G=2V$d5NWnQ|$-&mXv5E7`czYL|4HQ%DzY9%m!{as4Rue0O*@FJ(Lr;JJCJ zrjaX}2)~bS)JfQez!He`^}&MaR(vVGFaWiyg@pz}jyIa-9(* z?00&Q&&8arqIvA6Xp526m?GB$`MSYmRMzKbt&^#k+AuVVi<9?k$G=&=%vqWE8NtD; z#fKECr$16bL^=8suSrYN^iuk#=&g&NyhH6KVXijaSWFG>$}5&ud+KB2o8; zGMKg0#as2PZ_;(C+CF2)&JPk>46vxRsa3PtnVmi2#qpg)7C0M0mn`a`nG>CY6oKcu zbjE--klG%L0GGU(l{Q~ED1V&Q=hP#?P(W>6>5VOCvr|QBwQp|vYJ_@u_ z-e*b;TpfhRt8V)C3DcUgY0E{v$HJV=@(vZu9pb;T6Q?x^m0Z-``*NhSxj7G&6pf6vZ{g}WLTn+v791TCER}@Tz^1LD@6mjL_wV$&0AEH|ahyWtjJa)6 zh&}j@3-4upjF*y}xosbjEe1V}+uiu%(v|pYm-%xzi{)4uuCc?~Ogd~5YY44-B zFRnH8J^NX_I61?h-ey$vcP4^Djz!GaK7v1aY7#(r3q$%W(h+S_%0};Ejzj^`Mvizb zBuC_)z66+R=IwJBYUbZT)%8UAeLX3!r8rW9+^g&9`8-DuqNvF@j> zZ5{&;0gr(L84dp5iTrUGkV>NHtqJ_#eQNjO!l|U^hc0ZfNuA+)x84;~MK4}MWA?T; zpF@hr0#vZ=_;udZI|Xr6DgpIsH3|FqTN|;b8zpC&n6iIhayKVrf70fa(PbC}FQFFw zKy*Bm!=4VuqG z2N}46ta-p{7Hw{~*Rw7|Ojm<%0VLb~6W_oo#V^2xQLASct6QfdyEdB&(#G}t=Fv~D z`zH{`5`*v73m&en@8ygEOaNRasq8M@2vUXEBHa#QiP2tUlg1w`4XYK4!lZM_#O}Q9 z;>L}8wT9xD{qFrSz9cMe$F2Z?NEVe|W;JF^<{0#K7m-=(nqA?3B47SfVA4jcZ`yO) zOQZ1vIg*+u_$Po2)DC5=<>dZt3Tqw|sUhKEAlF8}+KH)SB7Z7BZSI!IkR_sP3?doA z3(%*o?>wWN0}V~p&^~+|o5q3YVu}P48+$m7=$B*(mBHnbgYP5K6y%ydbn8%2vl~3C zM;;SkSg~On=I>ofhg@ErnS1z%03w zc>lR;``li4FVU|k_>;!1oT3oHcO?lME6p`~-rLMMx%QHW_at^nsF&QjGy})+dv%VU zNPR$A6F$uNZysDb)&t+T*x)+ZA71*dyQc&Pf%###1WllRJ!h^;weHzJ! zXQ><-VzdjPyde&~&GSZIIt&|lX7bEWUk;TEX6BaDFg0Kl^}ao||E^U5v9zLhWTg1; z!>E@Aalo=ZrO?Hmp?-02^RT4u*jE1&HmO1zXY0x7LZTkW6LKr=gnY?&?BGE1uIS<@ za5MF!9Mv&5W`6LfUz6U$t;7B&Z#mXM?R%O$l_pYVzq1{KPtVy725TN`_J zVf-0X3erj4I}b$cNlAfmo;$vuJG|v?@&Z7Tc-s;ViisC`{6%hj_aFghSjHkO1ESY- z5A`M_jm9}X*ZcS;VE;fAe}_WxFjyJ7p#9t`fN?I{?kLG>Ptc-D?oH+VNd39RK#K=+ zQRKIX)!Y{OqU>1*QjO+4Va{9JEfR_}$c!)WL?4Qe?ifF7Vt%;JuctD;Jif*^>HdsA z<{6_G!G0=pY1a6~b4vD`fUoj!bx;nL6lp$=P}C6hNj*6hxwVC}#*mMKx_2-YX4=zs z8+vsA3NFZ(uw4lFLz-&_bHZU*_+zJ)d4l4n90j(knWGd z$AU|?zm+P#oJ1`u?qFiWwX5IF{KjPQ8VC$y0%y5Vq~yv9EO%Y^D%W!qN#AC=c}OaD zN8^DVTc4!hAUJ)7_;N#WyB(iG?;@=lDls*pcOsCgp9u~R&Cvq(Xm=!{UX(mJGtk?* zDbC-qcMyf#PljNdv*<&oU;WUhDHC^G*#SGy{emBQ zfTSo#<42x|y70;IrpQkP!x$cC*s1#UQ6wI{5TH3fCTNQrU}XJl_ddkR4JaRhQfw~s zN@L5NGZ&g+;v2*H%mlf)rH1Z!~qJUtE_QS6`ps$U2YDtz5%^nHoD5Jf6Waf zI39LWb~ec3RvWP(Z(o{S+S0xVBwgW(F>TX*e!I()lG2f`)6;3y($!tYTkuuEl44}} zl$9$(N8x)fQHq9-#r}lBz_f)B=Ri(YU;4V8g3{Lw%dU0n^!HYNGZO?Ye#;Y@yPiXh zD!ANw-m_b1Hct0%M~|sSeP%Y~9H0Z%3siPab89bcoQ#{8)4n~ynx01Zq$)H5*>8{j z=#;3DM2XDEBBW?z)Ji{(bP*jAKb=KQ$nO=f+s-1jIGeJinsj=pnb~$XgA6{v_3B6? z1oLl7Gme(uhF8_FLTNsZejmV|5IKO*i=xsA0l`n#_!YvLgcLWwu0?E7l`P_I*&>gj zL2Qxt-6k;RZ-MMltAsWF!w8Fbd0<3mv?g_WB>i%6YYbI(VK7#`f-_XJln!-1?;=<|E0Kp!tKO{|9_O{bZ^J4(dPV?_FGl{sMtlj zNn2yyjdn7wxjC> zuUkS1ud{q^qsB3rEr&9=z{P2TDeVC#E|Ygipk>I& zFhQSc`rKCh`9%VEop1}*xZ09d{I**LPpl0EsF%RcF)CslM{Gx45pCr)50$NQ62$)!ZD2puOtBc zpt*D!wiosnQ;OKXP1yMF%TajlwPjgdV5GFubV~AC*!cCoNb$O3?ctzm0%0d*6f(^k zHC3zz+QnqGSzOHFNWCR>AD*HdcSvNH+rX|TQ9ryA)DJ0BB(KUxy4W@IZa%2*Fm%wLiU`Qof3@Jy zj|kUw4M}r08(7o0E0ag$&#xq96s&%Zqft|r-D+Gv?6?H6Z%!nJZs;DESo%@v@Oo{ccp~VeHj_lctj@HC z`JKGpHxV6$kpn^lY9m6UyR{m~-dhy3z5+To0jz>LZ~}KO8CF?e;t< z#elxUjOoM__XQ7f*L4wByu@l_zO1M@;TO}p35r>FVP9^jrEU-v+dIEqjV8ZRFu0%p zSlqr6UOC%xrZ?+hpcGiPw(Hq6n*VH=K(_hEv7k1&X3`*9Pb$KdYmE zu*0EeQ<2Mk_ZzUw&BNw)I1tcdSFl}$rTPN8KbqO^Ey-Ju5;p?n9*wDFD;2_e7fe!h z_|)eNHksG@xbiek``toi`oYB)XDpQNNB4V3P4d)`p3HMs8#E8p(g#xciPUV0ve+;5 z_@ylbv**=kmo^YJCxXxQj~Uy@Xz+O**$a@0?u^Mp^719trJ2Obk~fQNl!1wy62S`o zI7UUXUra0QxF}SkVbcXl-3VWG0A)vCRoJ%rX5t;2#=5FSM!uSiqT$DruQ zycewqZBH?^7vmxrjec4W?&u2Rq7f-xQHy^34#MiE3Iuua9@_1!s)zHRCdV!{2s-3M zkQqo%KBerT7~~RUmeu(vg+6RZsI|^5;#o#jOCpgr_*qvk%05aq(z!r=3!mzDcBg>6vvJ)v#fK4^ zpkhleUR+{kn=&0S{Z=ft&jJP$anzZ>XFj_Aa+-uEok!$kiUDGuaNQRY1B%OU@53Vu zWCI_Cu!x#w_764Ud6QQ0P(Ix=_D4Tx0zY{RMk%gDQp!iAy$U)4abyMe)eSOG2<=#Q zx+!J$PTySZ>a}ZZm~yoH|LkW5R55pO5?5R}noz66Dwn)_;oja>XBiR13=Pm7Z6#dF zMfZwuiOTDYyx7>dYj?o!jsgoJ`75nh>;}8cUf`SLB zRZ?){0fi5yfof|dAXa+J)MU*q6r+O}K!O}od{V0D!K}|3hu0_OgZ}I!0B%jn^|Pui z%IXx`GPk;%K76v!3jQj)AJ+|AivvlgizSAIp>~F$v72vqt3V=LX4g2&lLcH1{g4Kd zN#V;*GQJHUPIfyNiK#7*Y*j0Gsy1`OyW_Daf>(Ekp(1lo{|)F#%w=2$u$vJbH&y^7_X1mU;A1%nqf#I4kQ#?I7X_(D8Ntr9Hj}eQ*w^G`_XQQp9y{xqJ zv`W(SE6!E{JR|&q_rU(LsrH7}Kztv8V-K zVE&UZt1D8zs2n!)w;9a(?*SES*aZFBX6BAozXnv&3|toIh+eF)>`Uqfy97mh=%iGt z7}()1^v>9gAwkK~Vv*M_4t+kT?o5G*EA7?ZrjgNp;|rAdmb1WV)1A$#Qkog@*^3GV zXqu(fkQ>oeP*5vx_6D0k#9Z^W9fVQzeO}O=``aucO)_e!kU|zJwcan8_5^Yi8DWe; zoa_v58ehjSBEGXHcPlY|Hcg!VUM7Nzwb+`p)D3_}8P^uZ5c|^ngzh+o(9H1RDLxAD zu_uVV1MI|A=u|Su&dfVOu`#SS0*#76eDqnAPPHLfpJapL(PZ4G!Z!f~?F2S;FFI-I z_Tf-69q-7BBk-li1S2(c0?|(GoZV8Aj^+?GGhTMt8IGR|h zM|=7yb7a!;I^gdjSrBwJW%a4)Kfo=z-QtMk2O3~hjK73OZ+!Bi4}2oFn z(s%mVGTn~w)g#$yl#Y)O+~Dr!kHamwjOpnGKyU2hO)uPNN10m4#p6(2_doXXG-A6& zr#?#85K`WC4HdR(58`JbLvh&3o@yo*fGj?*O)Bzc@JcIFD%ahX!>Zpv+kJ}A{>6=H zziFFVp&lBSzzC|s2JlsKdO8BMR6UPgIle4NW^~hxRVk2OM5HGFp~-3mZicXQOc%k% z^FA@gi3A1u`wKjY{Ny^-bx20uWz)CxNMO^Q(W6x}m34P5E}ou@OpJX-e#7e3P;R-F zC`KFeH94t|5vksx3FHc~0PWUp$|hZDQ`n9n;OfVEV?l5LgG;z!qalGLy-!*a*sA1M z{A|Qfe+sOozlA+LKO}1(nVk?pjl0z6J1gsCwwFaVDtjd5)jxjFmFSvqg&n8>g^T+yw}#Da|9@gtSk>hwuc-C#;C)8vwHM0I!XvlOG=7zN&w}N=F4>e4w zlj4XJd&qQ5Cz!fA2g_9}o*bE{qCkbeeqaaXXo|~IZhFF|zof+L!vW9-xs}Gzje@P? z1quxsQF4J3)Fj`~7!$tf^GtOxh$=prSK7&YOaU*?OGGCn8Vi+#W~-0P1hJBcM) z)S=)iGHL+hreB)srN46j`X=~+EM&A`CA%uZaK>0e%5T7S2f|J_cJ%lxX^FEdwCw_- z7O0Gk2UeC~)aI;77V5^riemG18m~2^>cuW#mN6B|j@=Df4j z5);oR2|+Ny(=w3)nub_A*RGxpiD{Zc^HSXqcw=(fNe`IXk~W~Ho52#bp6C0J;PUDe zPhFPdY0rR{pVS%qK#`~vH#F```jb4&4j@KverxR{#zCI{em6tT^J#%sLP!fT0qZQ? z?%Iab$02eC;N-5W#azHP`m)qvJP^-hvcP&qbWbElcUd^KY(hLLX6x}cgVsn^XJJbC z5Vyj3^G-t!5lY?_D$Cax72!q39PE`NWcL#R#c4qMx~n1`*9k9O08V~v$p<~-o^2X2 z^*U+jW4+I^_j(2=>SNKg^mCZ*v?Y?jW3wb>Rr;03cBZl_czOKz!nZ0%sITol7*wS6 zGu9@&84qXJq2*uo(+f<&*W8_1*FfRqOt+XL!}@v^CCyKPZKrk&ciKp`8(toXfRGU` z%i~KIxa}-VFX8Pe^doXkR7lL1tj5~g-DZ>TUVRC&tfa(gO<05zlBm|{c4;Rae14QX zEdNk+Tz{%rdcCN9?$n8-mv9&ro?a%ZV#RQUxI~}niE!s|IiBxuuUgXVPeWT%US~rb z)Ju~mq94935M2QKb^l)z4v)YI3mE@-<)B|q@R!X$tRtkZ^ml^4uWj-dEL;KXV*ATV zC%*%KU-aQmXctU<@*hh;{0{&7QxkuJ;ov%i{(%3NXDEJW`dv8xCl`|Ff05DuPVu|+ z@J|Y4=|3p`Dn$Gp{=4MnPq?!DAMn2kaDK=CJ|g-jeqZ@t#s7PJ^mp*@9f?1|r$&E( zf9qEK&hUGH`zJ%5$sY`V3wM9V|2>fS6AcIFWex}TA7RDs@V__ie}&Uo{RRF Date: Mon, 24 Nov 2025 14:46:58 -0800 Subject: [PATCH 3/7] moved legacy code --- 03_drug_biomarkers.Rmd | 1265 ---------------------------------------- 1 file changed, 1265 deletions(-) delete mode 100644 03_drug_biomarkers.Rmd diff --git a/03_drug_biomarkers.Rmd b/03_drug_biomarkers.Rmd deleted file mode 100644 index ae9d97c..0000000 --- a/03_drug_biomarkers.Rmd +++ /dev/null @@ -1,1265 +0,0 @@ ---- -title: "Evaluate drug and omics data for biomarker assessment" -author: "Sara gosline" -date: "2025-03-13" -output: html_document ---- - -This document is designed to be a working document where we can compare approaches to evaluate biomarkers of drug response across patient samples. We are collecting three types of data modalities: 1. RNA Sequencing 2. Global Proteomics 3. Phospho proteomics - -We also have drug sensitivity data (single dose viability, some curves) for many drugs. The question to ask is which molecules can predict drug response across patients? How robust/extendable is this? - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -library(synapser) -library(ggplot2) -library(dplyr) -library(tidyr) - -``` - -# Pull processed files from previous markdowns - -We have already run the previous scripts and stored data on Synapse - -```{r pull files} -Sys.setenv() - -source("cNF_helper_code.R") -traceback() - -source('cNF_helper_code.R') -##read in drug code -fits <- readr::read_tsv(synGet('syn69947322')$path) - - -##read in proteomic data -glong <- readr::read_csv(synGet('syn70078416')$path) -plong <- readr::read_csv(synGet('syn70078415')$path) - - -##read in transcrniptomic data -#TODO: process transcritpomic data into long format - - -``` - -## Format protein data to collect correlation values - -Do simple correlations to identify putative trends in the data. - -Get most efficacious, variable, and heatmap - -```{r} -# ensure an output folder - -outdir <- "figs" -if (!dir.exists(outdir)) dir.create(outdir, recursive = TRUE) - -shared <- intersect(fits$improve_sample_id, glong$Specimen) -message(sprintf("Found %d shared samples from %d drug experiments and %d proteomic experiments", -length(shared), length(unique(fits$improve_sample_id)), length(unique(glong$Specimen)))) - -glob_dat <- glong |> -ungroup() |> -subset(Specimen %in% shared) |> -dplyr::select(Specimen, Gene, correctedAbundance) |> -tidyr::pivot_wider( -names_from = "Gene", -values_from = "correctedAbundance", -values_fill = 0, -values_fn = mean -) |> -tibble::column_to_rownames("Specimen") - -phos_dat <- plong |> -ungroup() |> -subset(Specimen %in% shared) |> -dplyr::select(Specimen, site, correctedAbundance) |> -tidyr::pivot_wider( -names_from = "site", -values_from = "correctedAbundance", -values_fill = 0, -values_fn = mean -) |> -tibble::column_to_rownames("Specimen") - -## drug data to matrix here - -drug_dat <- fits |> -subset(dose_response_metric == "uM_viability") |> -dplyr::select(improve_sample_id, improve_drug_id, dose_response_value) |> -tidyr::pivot_wider( -names_from = "improve_drug_id", -values_from = "dose_response_value", -values_fn = mean -) |> -tibble::column_to_rownames("improve_sample_id") - -## summarize drugs - -drug_counts <- fits |> -subset(dose_response_metric == "uM_viability") |> -group_by(improve_drug_id) |> -distinct() |> -summarize( -meanResponse = mean(dose_response_value, na.rm = TRUE), -nMeasured = n_distinct(improve_sample_id), -variability = sd(dose_response_value, na.rm = TRUE), -.groups = "drop" -) - -# -------- Plot 1: most efficacious -------- - -p1 <- drug_counts |> -arrange(desc(meanResponse)) |> -subset(meanResponse < 0.5) |> -ggplot(aes(y = meanResponse, x = improve_drug_id, colour = nMeasured, size = variability)) + -geom_point() + -theme_minimal() + -theme( -axis.text.x = element_text(angle = 45, hjust = 1) # tilt x labels -) + -labs(title = "Most efficacious drugs", -y = "Mean cell viability (fraction)", -x = "Drug") - -ggsave(file.path("figs/most_efficacious.pdf"), p1, width = 12, height = 8, dpi = 300) - -# -------- Plot 2: most variable -------- - -p2 <- drug_counts |> -arrange(desc(variability)) |> -subset(variability > 0.15) |> -as.data.frame() |> -ggplot(aes(y = meanResponse, x = improve_drug_id, colour = nMeasured, size = variability)) + -geom_point() + -theme_minimal() + -theme( -axis.text.x = element_text(angle = 45, hjust = 1) # tilt x labels -) + -labs(title = "Most variable drugs", -y = "Mean cell viability (fraction)", -x = "Drug") - -ggsave(file.path("figs/most_variable.pdf"), p2, width = 12, height = 8, dpi = 300) - -# -------- Plot 3: heatmap of complete-measurement drugs -------- - -fulldrugs <- drug_counts |> -subset(nMeasured == nrow(drug_dat)) - -# Save large, rotate column labels, and shrink font to avoid overlap - -# pheatmap can write directly to a file via the `filename` arg. - -pheatmap::pheatmap( -as.matrix(drug_dat[, fulldrugs$improve_drug_id, drop = FALSE]), -filename = file.path(outdir, "drug_heatmap_large.pdf"), -width = 28, # inches: large so column labels are readable -height = 16, -dpi = 300, -angle_col = 45, # rotate column (drug) labels -fontsize_col = 6, # smaller font for many drugs -cluster_rows = TRUE, -cluster_cols = TRUE, -show_rownames = TRUE, -show_colnames = TRUE -) - -# Optional: also save a PDF (great for vector zooming) - -pheatmap::pheatmap( -as.matrix(drug_dat[, fulldrugs$improve_drug_id, drop = FALSE]), -filename = file.path("figs/drug_heatmap_large.pdf"), -width = 28, -height = 16, -angle_col = 45, -fontsize_col = 6, -cluster_rows = TRUE, -cluster_cols = TRUE, -show_rownames = TRUE, -show_colnames = TRUE -) - -# ---- PRINT plots in the document as well ---- - -print(p1) -print(p2) - -# Print the heatmap to the document (no filename draws it) - -pheatmap::pheatmap( -as.matrix(drug_dat[, fulldrugs$improve_drug_id, drop = FALSE]), -angle_col = 45, -fontsize_col = 6, -cluster_rows = TRUE, -cluster_cols = TRUE, -show_rownames = TRUE, -show_colnames = TRUE -) - -``` - - -# Basic correlation tests - -Can we simply find and rank proteins/psites/transcripts by correlation and do enrichment? - -We can define a simple test to correlate features and drugs and assess significance and correct: - -```{r correlation tests, warning=FALSE, error=FALSE, message = FALSE} - -#this function computes correlations between all columns for each drug/feature matrix, rows are the sample identifiers -#also coputes significance -computeCors <- function(drug_dat,feat_dat,shared){ - - cres <- cor(drug_dat[shared,],feat_dat[shared,],use='pairwise.complete.obs',method='spearman') |> - as.data.frame() |> - tibble::rownames_to_column('drug')|> - tidyr::pivot_longer(cols=2:(1+ncol(feat_dat)),names_to='gene',values_to='cor') |> - arrange(desc(cor)) - - ##now lets try to get significance - csig <- do.call(rbind,lapply(colnames(drug_dat),function(x){ - do.call(rbind,lapply(colnames(feat_dat),function(y){ - pval <- 1.0 - try(pval <- cor.test(drug_dat[shared,x], - feat_dat[shared,y], - use = 'pairwise.complete.obs', - method = 'spearman')$p.value,silent = TRUE) - - return(c(corp = pval,drug = x,gene = y)) - })) |> - as.data.frame() |> - mutate(fdr=p.adjust(unlist(corp),method='fdr')) - })) |> - as.data.frame() |> - mutate(drug = unlist(drug)) |> - mutate(gene = unlist(gene)) - - fullcors <- cres|>left_join(data.frame(csig)) |> - mutate(direction=ifelse(cor<0,'neg','pos')) - - return(fullcors) -} - -``` - -Now that we have a function we can compute correlations of each data type. - -```{r compute feature cors, warning=FALSE, error=FALSE, message = FALSE} - -gcor <- computeCors(drug_dat[,fulldrugs$improve_drug_id],glob_dat,shared) |> - mutate(data='proteins') -pcor <- computeCors(drug_dat[,fulldrugs$improve_drug_id],phos_dat,shared) |> - mutate(data = 'phosphosites') - -allcor <- rbind(gcor,pcor) - -corsummary<-allcor |> subset(fdr<0.25) |> - group_by(drug,data,direction) |> - summarize(features=n(),meanCor=mean(cor)) - - -p_features <- corsummary |> - subset(features > 1) |> - ggplot(aes(x = drug,y = features,fill = direction)) + - facet_grid(data~.) + - geom_bar(position='dodge',stat='identity') + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) - -ggsave(filename = file.path("figs/cor_features_by_drug.pdf"), - plot = p_features, width = 12, height = 6, units = "in") - -corsummary |> - arrange(desc(features)) |> - subset(features > 100) |> - dplyr::select(drug,data,direction,features,meanCor) - - - -``` - -Now we have the correlation values. what do we do with them? -## Correlation based enrichment -Do not run this everytime - it is extremely slow, so its setup to run once and save the data. The next steps load this data. -```{r functional enrichment} -# === Direction-aware leapR enrichment: run "top" (up) and "bottom" (down) separately === -# Requires: glob_dat (samples x proteins), phos_dat (samples x phosphosites), fits (drug responses) -# Outputs: -# prot_enrich[[drug]]$top / $bottom -# phos_enrich[[drug]]$top / $bottom -# Optional CSVs in folder "leapR_top_paths/dir_split/" - -library(dplyr) -library(tidyr) -library(stringr) -library(SummarizedExperiment) -library(leapR) - -# ---- choose drugs to run (use your two, or set to a larger list) ---- -# target_drugs <- c("THZ1", "Onalespib") -target_drugs <- unique(fits$improve_drug_id) - -# ---- genesets ---- -data(msigdb); geneset_db <- msigdb -data(kinasesubstrates) - -# ---- helpers ---- -extract_gene_from_site <- function(site_id) { - if (is.na(site_id) || site_id == "") return(NA_character_) - g <- str_split(as.character(site_id), "[:_\\-\\.]")[[1]][1] - toupper(stringr::str_extract(g, "^[A-Za-z0-9]+")) -} - -# Correlate response vector vs each feature column (Spearman) -col_spearman <- function(vec, mat) { - shared <- intersect(names(vec), rownames(mat)) - if (length(shared) < 3) return(setNames(rep(NA_real_, ncol(mat)), colnames(mat))) - v <- vec[shared] - m <- as.matrix(mat[shared, , drop = FALSE]) - apply(m, 2, function(col) { - if (all(is.na(col))) return(NA_real_) - if (sd(col, na.rm = TRUE) == 0 || sd(v, na.rm = TRUE) == 0) return(NA_real_) - suppressWarnings(cor(v, col, method = "spearman", use = "pairwise.complete.obs")) - }) -} - -# Build one-column SE; column name must match primary_columns -build_se_from_corvec <- function(cor_named_vec, features_all, col_label, map_to_gene = NULL, assay_label = "proteomics") { - v <- rep(NA_real_, length(features_all)); names(v) <- features_all - common <- intersect(names(cor_named_vec), features_all) - v[common] <- cor_named_vec[common] - mat <- matrix(v, nrow = length(v), ncol = 1, dimnames = list(features_all, col_label)) - rd <- DataFrame(feature_id = features_all) - rd$hgnc_id <- if (is.null(map_to_gene)) features_all else map_to_gene[features_all] - se <- SummarizedExperiment(assays = list(values = mat), rowData = rd, colData = DataFrame(sample = col_label)) - assayNames(se) <- assay_label - se -} - -safe_leapr <- function(...) tryCatch(leapR::leapR(...), error = function(e) { message("[leapR] ", conditionMessage(e)); NULL }) - -# ---- feature and mapping vectors ---- -prot_features <- colnames(glob_dat) -phos_features <- colnames(phos_dat) -phos_to_gene <- setNames(vapply(phos_features, extract_gene_from_site, FUN.VALUE = character(1)), - phos_features) - -# ---- results containers ---- -prot_enrich <- list() -phos_enrich <- list() - -# optional: write CSVs? -write_csvs <- TRUE -outdir <- "leapR_top_paths/dir_split" -if (write_csvs && !dir.exists(outdir)) dir.create(outdir, recursive = TRUE) - -# ---- per-drug workflow ---- -for (drug in target_drugs) { - message("=== ", drug, " ===") - # build response vector (mean per sample if repeats) - dv <- fits %>% - filter(improve_drug_id == !!drug, dose_response_metric == "uM_viability") %>% - group_by(improve_sample_id) %>% summarize(resp = mean(dose_response_value, na.rm = TRUE), .groups = "drop") - if (nrow(dv) == 0) { message(" no response rows; skipping"); next } - dv_vec <- setNames(dv$resp, dv$improve_sample_id) - - # correlations - prot_cor <- col_spearman(dv_vec, glob_dat) - phos_cor <- col_spearman(dv_vec, phos_dat) - - # split by sign - prot_pos <- prot_cor[!is.na(prot_cor) & prot_cor > 0] - prot_neg <- prot_cor[!is.na(prot_cor) & prot_cor < 0] - phos_pos <- phos_cor[!is.na(phos_cor) & phos_cor > 0] - phos_neg <- phos_cor[!is.na(phos_cor) & phos_cor < 0] - - # ---- global: TOP (positives as-is), BOTTOM (negatives flipped so they rank to the top) ---- - prot_enrich[[drug]] <- list(top = NULL, bottom = NULL) - - # TOP - if (length(prot_pos) >= 5) { - se_prot_top <- build_se_from_corvec(prot_pos, prot_features, col_label = paste0(drug, "_TOP"), assay_label = "proteomics") - prot_top <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", - eset = se_prot_top, assay_name = "proteomics", - primary_columns = paste0(drug, "_TOP")) - prot_enrich[[drug]]$top <- prot_top - if (write_csvs && !is.null(prot_top)) { - write.csv(as.data.frame(prot_top), file = file.path(outdir, paste0(drug, "_global_TOP.csv"))) - } - } else message(" PROT top: too few positive features (", length(prot_pos), ")") - - # BOTTOM (flip sign so more negative = larger positive rank) - if (length(prot_neg) >= 5) { - se_prot_bot <- build_se_from_corvec(-prot_neg, prot_features, col_label = paste0(drug, "_BOTTOM"), assay_label = "proteomics") - prot_bot <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", - eset = se_prot_bot, assay_name = "proteomics", - primary_columns = paste0(drug, "_BOTTOM")) - prot_enrich[[drug]]$bottom <- prot_bot - if (write_csvs && !is.null(prot_bot)) { - write.csv(as.data.frame(prot_bot), file = file.path(outdir, paste0(drug, "_global_BOTTOM.csv"))) - } - } else message(" PROT bottom: too few negative features (", length(prot_neg), ")") - - # ---- PHOSPHO: TOP/BOTTOM for pathways (gene mapping via hgnc_id) ---- - phos_enrich[[drug]] <- list(top = NULL, bottom = NULL) - - # TOP - if (length(phos_pos) >= 5) { - se_phos_top <- build_se_from_corvec(phos_pos, phos_features, col_label = paste0(drug, "_TOP"), - map_to_gene = phos_to_gene, assay_label = "phosphoproteomics") - phos_top <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", - eset = se_phos_top, assay_name = "phosphoproteomics", - primary_columns = paste0(drug, "_TOP"), id_column = "hgnc_id") - phos_enrich[[drug]]$top <- phos_top - if (write_csvs && !is.null(phos_top)) { - write.csv(as.data.frame(phos_top), file = file.path(outdir, paste0(drug, "_phospho_TOP.csv"))) - } - } else message(" PHOS top: too few positive features (", length(phos_pos), ")") - - # BOTTOM (flip) - if (length(phos_neg) >= 5) { - se_phos_bot <- build_se_from_corvec(-phos_neg, phos_features, col_label = paste0(drug, "_BOTTOM"), - map_to_gene = phos_to_gene, assay_label = "phosphoproteomics") - phos_bot <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", - eset = se_phos_bot, assay_name = "phosphoproteomics", - primary_columns = paste0(drug, "_BOTTOM"), id_column = "hgnc_id") - phos_enrich[[drug]]$bottom <- phos_bot - if (write_csvs && !is.null(phos_bot)) { - write.csv(as.data.frame(phos_bot), file = file.path(outdir, paste0(drug, "_phospho_BOTTOM.csv"))) - } - } else message(" PHOS bottom: too few negative features (", length(phos_neg), ")") - -} - -# Save all direction-split results for later reuse -save(prot_enrich, phos_enrich, file = "leapR_enrichment_direction_split.Rdata") - -message("Finished direction-aware enrichment. Results in lists prot_enrich / phos_enrich, and CSVs (if enabled).") - - -``` - - - -For each drug, how many terms do we see active? how many kinases? -```{r functional enrichment} -# ==== Load saved enrichment & build summaries (no list-casts, no count()) ==== -library(dplyr) -library(tidyr) -library(purrr) -library(tibble) -library(ggplot2) -library(forcats) -library(stringr) -library(scales) - -# Always load the precomputed enrichment lists here -load("leapR_enrichment_direction_split.Rdata") -if (!exists("prot_enrich")) stop("prot_enrich not found in leapR_enrichment_direction_split.Rdata") -if (!exists("phos_enrich")) stop("phos_enrich not found in leapR_enrichment_direction_split.Rdata") - -alpha <- 0.05 -topN <- 15 # <<< top 15 -dirs <- c("resistant","sensitive") - -# ---------- helpers ---------- -pick_pcol <- function(df) { - cols <- colnames(df) - if ("BH_pvalue" %in% cols) return(list(kind="adj", col="BH_pvalue")) - if ("SignedBH_pvalue" %in% cols) return(list(kind="adj", col="SignedBH_pvalue")) - if ("adj.P.Val" %in% cols) return(list(kind="adj", col="adj.P.Val")) - if ("padj" %in% cols) return(list(kind="adj", col="padj")) - if ("pvalue" %in% cols) return(list(kind="raw", col="pvalue")) - if ("P.Value" %in% cols) return(list(kind="raw", col="P.Value")) - NULL -} - -extract_term_col <- function(df) { - cands <- c("term","Term","pathway","Pathway","set","Set","geneset","gene_set","Category") - hit <- cands[cands %in% names(df)] - if (length(hit)) hit[[1]] else NULL -} - -tidy_one_result <- function(x) { - if (is.null(x)) return(tibble(pathway = character(), adj_p = numeric())) - df <- as.data.frame(x) - if (!nrow(df)) return(tibble(pathway = character(), adj_p = numeric())) - - term_col <- extract_term_col(df) - if (is.null(term_col)) { - df <- tibble::rownames_to_column(df, "pathway") - } else { - df <- dplyr::mutate(df, pathway = .data[[term_col]]) - } - df$pathway <- as.character(df$pathway) - - pk <- pick_pcol(df) - if (is.null(pk)) return(tibble(pathway = character(), adj_p = numeric())) - adj <- if (pk$kind == "adj") df[[pk$col]] else p.adjust(df[[pk$col]], method = "BH") - - tibble(pathway = df$pathway, adj_p = as.numeric(adj)) |> - filter(is.finite(adj_p), !is.na(adj_p)) -} - -flatten_by_direction <- function(lst, omic_label) { - if (!length(lst)) return(tibble()) - purrr::imap_dfr(lst, function(two, drug) { - bind_rows( - tidy_one_result(two$top) |> mutate(direction = "resistant"), - tidy_one_result(two$bottom) |> mutate(direction = "sensitive") - ) |> - mutate(drug = as.character(drug), omic = omic_label) - }) -} - -# ---------- long-format enrichment and significance filter ---------- -prot_long <- flatten_by_direction(prot_enrich, "global") -phos_long <- flatten_by_direction(phos_enrich, "phospho") -enrich_long <- bind_rows(prot_long, phos_long) |> as_tibble() -stopifnot(all(c("pathway","adj_p","direction","drug","omic") %in% names(enrich_long))) - -enrich_sig <- enrich_long |> - filter(is.finite(adj_p), !is.na(adj_p), adj_p < alpha) - -if (nrow(enrich_sig) == 0) { - message("No significant pathways at FDR < ", alpha, ".") - pathway_summary <- tibble() - drug_counts <- tibble() -} else { - pathway_summary <- enrich_sig |> - group_by(omic, direction, pathway) |> - summarise(n_drugs = n_distinct(drug), .groups = "drop") |> - arrange(desc(n_drugs)) - - all_drugs <- sort(unique(enrich_long$drug)) - drug_counts <- enrich_sig |> - group_by(drug, direction) |> - summarise(n_pathways = n_distinct(pathway), .groups = "drop") |> - complete(drug = all_drugs, direction = dirs, fill = list(n_pathways = 0L)) |> - arrange(drug, direction) - - # ---------- summary figures ---------- - dir.create("figs", showWarnings = FALSE) - - reorder_within <- function(x, by, within, sep = "___") { - x2 <- paste(x, within, sep = sep); stats::reorder(x2, by) - } - scale_y_reordered_wrap <- function(width = 32, sep = "___") { - ggplot2::scale_y_discrete( - labels = function(x) stringr::str_wrap(gsub(paste0(sep, ".*$"), "", x), width = width) - ) - } - - pathway_summary_top <- pathway_summary |> - group_by(omic, direction) |> - slice_max(order_by = n_drugs, n = topN, with_ties = FALSE) |> - ungroup() |> - mutate(pathway_in_omic = reorder_within(pathway, n_drugs, omic)) - - p_pathways <- ggplot(pathway_summary_top, - aes(y = pathway_in_omic, x = n_drugs, fill = direction)) + - geom_col(position = position_dodge(width = 0.85), width = 0.85) + - facet_wrap(~ omic, scales = "free_y") + - scale_y_reordered_wrap(width = 36) + - scale_x_continuous(expand = expansion(mult = c(0, 0.05))) + - labs(title = paste0("Top ", topN, " pathways enriched across drugs"), - y = "Pathway", x = "# Drugs") + - theme_minimal(base_size = 11) + - theme( - legend.position = "top", - strip.text = element_text(face = "bold"), - axis.text.y = element_text(size = 7), - panel.grid.major.x = element_blank(), - plot.margin = margin(5.5, 30, 5.5, 5.5) - ) + - coord_cartesian(clip = "off") - - # --- PDF saves (summary figs) --- - ggsave("figs/pathways_across_drugs_top15.pdf", p_pathways, - width = 12, height = 10, units = "in", device = cairo_pdf) - print(p_pathways) - - drug_counts_full <- drug_counts |> - group_by(drug) |> - mutate(total = sum(n_pathways)) |> - ungroup() |> - mutate(drug = forcats::fct_reorder(drug, total)) - - p_counts <- ggplot(drug_counts_full, aes(x = drug, y = n_pathways, fill = direction)) + - geom_col(position = position_dodge(width = 0.9), width = 0.85) + - labs(title = "Number of enriched pathways per drug", - x = "Drug", y = "# Pathways") + - theme_minimal(base_size = 11) + - theme( - legend.position = "top", - axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6), - panel.grid.major.x = element_blank() - ) - - ggsave("figs/enriched_pathways_per_drug_wide.pdf", p_counts, - width = 18, height = 7, units = "in", device = cairo_pdf) - print(p_counts) -} - -# ---------- Top-2 most efficacious & most variable drugs ---------- -top2_efficacious <- character(0) -top2_variable <- character(0) - -if (exists("fits")) { - eff_tbl <- fits |> - filter(dose_response_metric == "uM_viability") |> - group_by(improve_drug_id) |> - summarise( - meanResponse = mean(dose_response_value, na.rm = TRUE), - variability = sd(dose_response_value, na.rm = TRUE), - nMeasured = dplyr::n_distinct(improve_sample_id), - .groups = "drop" - ) - top2_efficacious <- eff_tbl |> - arrange(meanResponse, desc(nMeasured)) |> - slice_head(n = 2) |> - pull(improve_drug_id) - top2_variable <- eff_tbl |> - arrange(desc(variability), desc(nMeasured)) |> - slice_head(n = 2) |> - pull(improve_drug_id) -} else { - some <- unique(enrich_long$drug) - top2_efficacious <- head(some, 2) - top2_variable <- head(rev(some), 2) -} - -# ---- Force-include Onalespib in top_interest (case-insensitive) ---- -ona_matches <- unique(enrich_long$drug[grepl("^onalespib$", enrich_long$drug, ignore.case = TRUE)]) -if (length(ona_matches) == 0) ona_matches <- "Onalespib" - -top_interest <- unique(c(top2_efficacious, top2_variable, ona_matches)) -message("Top-2 most efficacious (lowest mean viability): ", paste(top2_efficacious, collapse = ", ")) -message("Top-2 most variable (highest SD): ", paste(top2_variable, collapse = ", ")) -message("Force-included: ", paste(ona_matches, collapse = ", ")) - -# ---------- per-drug pathway barplots: always top 15, star-annotate significance ---------- -pick_pcol_plot <- function(df) { - cols <- colnames(df) - if ("BH_pvalue" %in% cols) return("BH_pvalue") - if ("SignedBH_pvalue" %in% cols) return("SignedBH_pvalue") - if ("adj.P.Val" %in% cols) return("adj.P.Val") - if ("padj" %in% cols) return("padj") - if ("pvalue" %in% cols) return("pvalue") - if ("P.Value" %in% cols) return("P.Value") - NA_character_ -} - -sig_stars <- function(p) dplyr::case_when( - is.na(p) ~ "", - p < 0.001 ~ "***", - p < 0.01 ~ "**", - p < 0.05 ~ "*", - TRUE ~ "" -) - -prep_plot_df <- function(res_df, n = 15) { - if (is.null(res_df)) return(NULL) - df <- as.data.frame(res_df); if (!nrow(df)) return(NULL) - if (!("feature" %in% names(df))) df <- tibble::rownames_to_column(df, "feature") - col <- pick_pcol_plot(df); if (is.na(col)) return(NULL) - - # Always compute BH adj p; then take top 15 by smallest adj_p (no significance filter) - adj_p <- if (col %in% c("pvalue","P.Value")) p.adjust(df[[col]], method = "BH") else df[[col]] - - df |> - mutate( - adj_p = as.numeric(adj_p), - score = -log10(pmax(adj_p, 1e-300)), - stars = sig_stars(adj_p), - signif = !is.na(adj_p) & adj_p < 0.05, - feature = stringr::str_wrap(as.character(feature), width = 40) - ) |> - filter(is.finite(adj_p)) |> - arrange(adj_p, desc(score)) |> - slice_head(n = n) -} - -plot_bar <- function(df_plot, title_text, label_type) { - if (is.null(df_plot) || !nrow(df_plot)) return(NULL) - lbl <- ifelse(label_type == "top", "Resistant pathways", "Sensitive pathways") - fillc <- ifelse(label_type == "top", "#D7191C", "#2C7BB6") - star_pad <- 0.15 - - ggplot(df_plot, aes(x = reorder(feature, score), y = score)) + - geom_col(aes(alpha = signif), fill = fillc, width = 0.85) + - scale_alpha_manual(values = c(`FALSE` = 0.5, `TRUE` = 1), guide = "none") + - geom_text(aes(y = score + star_pad, label = stars), - size = 3, hjust = 0) + - coord_flip(clip = "off") + - scale_y_continuous(expand = expansion(mult = c(0, 0.15))) + - labs(title = paste0(title_text, ", ", lbl), - x = NULL, y = expression(-log[10]("FDR"))) + - theme_minimal(base_size = 9) + - theme( - plot.title = element_text(size = 11), - axis.text.y = element_text(size = 7), - axis.text.x = element_text(size = 8), - plot.margin = margin(5.5, 30, 5.5, 5.5) - ) -} - -safelabel <- function(x) gsub("[^A-Za-z0-9_.-]", "_", x) - -dir.create("figs", showWarnings = FALSE) -for (drug in top_interest) { - pt <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$top else NULL - pb <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$bottom else NULL - ft <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$top else NULL - fb <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$bottom else NULL - - p1 <- plot_bar(prep_plot_df(pt, n = 15), paste0(drug, " — global"), "top") - p2 <- plot_bar(prep_plot_df(pb, n = 15), paste0(drug, " — global"), "bottom") - p3 <- plot_bar(prep_plot_df(ft, n = 15), paste0(drug, " — Phospho"), "top") - p4 <- plot_bar(prep_plot_df(fb, n = 15), paste0(drug, " — Phospho"), "bottom") - - if (!is.null(p1)) { - ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_global_resistant_top15.pdf")), - p1, width = 7, height = 5, device = cairo_pdf); print(p1) - } - if (!is.null(p2)) { - ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_global_sensitive_top15.pdf")), - p2, width = 7, height = 5, device = cairo_pdf); print(p2) - } - if (!is.null(p3)) { - ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_phospho_resistant_top15.pdf")), - p3, width = 7, height = 5, device = cairo_pdf); print(p3) - } - if (!is.null(p4)) { - ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_phospho_sensitive_top15.pdf")), - p4, width = 7, height = 5, device = cairo_pdf); print(p4) - } -} - - - -``` - - - - -# Print siginficant results for all drugs if we want. -```{r} -library(dplyr) -library(ggplot2) -library(tibble) -library(rlang) - -alpha <- 0.05 # significance threshold -top_n_to_show <- 15 - -# Prefer adjusted p if available; fall back to raw and adjust per-run -pick_pcol <- function(df) { - cols <- colnames(df) - if ("BH_pvalue" %in% cols) return(list(kind="adj", col="BH_pvalue")) - if ("SignedBH_pvalue" %in% cols) return(list(kind="adj", col="SignedBH_pvalue")) - if ("pvalue" %in% cols) return(list(kind="raw", col="pvalue")) - return(NULL) -} - -prep_plot_df <- function(res_df, n = top_n_to_show) { - if (is.null(res_df)) return(NULL) - df <- as.data.frame(res_df) - if (!nrow(df)) return(NULL) - - pick <- pick_pcol(df) - if (is.null(pick)) return(NULL) - - # unify to adj p - if (pick$kind == "adj") { - df <- df %>% mutate(adj_p = !!sym(pick$col)) - } else { # raw p → adjust within this run - df <- df %>% mutate(adj_p = p.adjust(!!sym(pick$col), method = "BH")) - } - - df %>% - rownames_to_column("feature") %>% - arrange(adj_p) %>% - # keep only significant ones; if none, return empty (caller will message) - filter(is.finite(adj_p), !is.na(adj_p), adj_p < alpha) %>% - head(n) %>% - mutate(score = -log10(pmax(adj_p, 1e-300))) -} - -plot_bar <- function(df_plot, title_text, label_type) { - if (is.null(df_plot) || !nrow(df_plot)) { - message(" No significant pathways for ", title_text, " (FDR<", alpha, ").") - return(NULL) - } - # Correct labels & colors for uM_viability convention: - # TOP -> resistant (red), BOTTOM -> sensitive (blue) - lbl <- ifelse(label_type == "top", "Resistant pathways", "Sensitive pathways") - fillc <- ifelse(label_type == "top", "#D7191C", "#2C7BB6") - - p <- ggplot(df_plot, aes(x = reorder(feature, score), y = score)) + - geom_col(fill = fillc) + - coord_flip() + - labs(title = paste0(title_text, ", ", lbl), - x = NULL, y = expression(-log[10]("FDR"))) + - theme_minimal(base_size = 9) + - theme( - plot.title = element_text(size = 11), - axis.text.y = element_text(size = 6), - axis.text.x = element_text(size = 8) - ) - print(p); invisible(p) -} - -plot_drug_panels <- function(drug) { - message("\n=== ", drug, " ===") - - # global - pt <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$top else NULL - pb <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$bottom else NULL - plot_bar(prep_plot_df(pt), paste0(drug, ", Global Proteomics"), "top") - plot_bar(prep_plot_df(pb), paste0(drug, ", Global Proteomics"), "bottom") - - # Phospho pathways - ft <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$top else NULL - fb <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$bottom else NULL - plot_bar(prep_plot_df(ft), paste0(drug, ", Phosphoproteomics"), "top") - plot_bar(prep_plot_df(fb), paste0(drug, ", Phosphoproteomics"), "bottom") -} - - -# --- run for your drugs --- -for (d in target_drugs) plot_drug_panels(d) - -``` - - - - -# Basic drug list -```{r} -drug_counts <- fits %>% - filter(dose_response_metric == "uM_viability") %>% - group_by(improve_drug_id) %>% - summarise( - n_rows = dplyr::n(), # total rows/measurements - n_specimens = n_distinct(improve_sample_id), # unique samples tested - meanResponse = mean(dose_response_value, na.rm = TRUE), - sdResponse = sd(dose_response_value, na.rm = TRUE), - .groups = "drop" - ) %>% - arrange(desc(n_specimens), improve_drug_id) - -# Plain list of drugs + total count -drug_list <- sort(unique(drug_counts$improve_drug_id)) -n_drugs <- length(drug_list) - -message(sprintf("Total unique drugs: %d", n_drugs)) -print((drug_list)) - -``` - - - - - - - - - - - - - - - - - - - - - -## Visualization -How should we visualize? Here is some older code -```{r plot cors, eval=FALSE} - -plotCors <- function(features,druglist,dataType='proteins'){ - ##subset a list of features and drugs and plot those in a graph - require(ggplot2) - if(dataType=='proteins'){ - ptab<-glong|>dplyr::rename(feature='Gene') - }else{ - ptab<-plong|>dplyr::rename(feature='site') - } - dtab<-fits|> - subset(dose_response_metric=='uM_viability')|> - dplyr::rename(Specimen='improve_sample_id',Drug='improve_drug_id')|> - subset(Drug%in%druglist) - - - ftab<-features|>left_join(ptab)|>left_join(dtab)|> - subset(!is.na(Drug)) - - feats <- unique(features$feature) - plots <- lapply(feats,function(x){ - corval <- ftab[ftab$feature==x,'cor'] - #corval <- ftab[ftab$feature==x,'pCor'] - - ftab|>subset(feature==x)|> - ggplot(aes(x=correctedAbundance,y=dose_response_value, - col=Patient,size=1))+ - geom_point()+ - facet_grid(~Drug)+ - ggtitle(paste(x,'Drug correlation'))+ - scale_color_manual(values=pcols) - }) - cowplot::plot_grid(plotlist= plots,ncol=2) - -} - -druglist<-c('Onalespib') -features<-subset(allcor,drug%in%druglist)|> - subset(fdr<0.25)|> - subset(abs(cor)>0.7)|> - subset(data=='proteins')|> - arrange(desc(abs(cor))) - -plotCors(rename(features[1:10,],feature='gene'),druglist) - -ggsave('onalespibFDR0.25Cors.pdf',height=20) - -``` - -# Random forest predictor - -Here we try to use random forest to extract predictive features. First we need to assess if the model can accurately predict drug response from the data. From those predictive models, we can extract features/biomarkers. - -First we build the data frames needed - I've included cohort as a covariate but may remove it. - -```{r random forest} -## separate out cohorts for prediction -cohorts <- meta |> - select(Specimen,cohort) |> - distinct() |> - tibble::column_to_rownames('Specimen') - -##for each drug, build model of cohort + protein ~ drug response -#removed cohort for now -gdf <- as.data.frame(glob_dat)#|>mutate(cohort=cohorts[rownames(glob_dat),'cohort']) -gnas <- which(apply(gdf,2,function(x) any(is.na(x)))) - -pdf <- as.data.frame(phos_dat)#|> mutate(cohort=cohorts[rownames(phos_dat),'cohort']) -pnas <- which(apply(pdf,2,function(x) any(is.na(x)))) - -mdf <- meta[-c(2,5),]|> ##have duplication here - tibble::column_to_rownames('Specimen') - - -``` - -Now we can loop through every drug, build model, and assess accuracy. - -```{r evaluate predictivty} - -#trying ou tthis function tos ee how it goes -rfFeatures <- function(drug_dat,fdf, mdf){ - complete_drugs <- which(apply(drug_dat,2, - function(x) length(which(!is.na(x)))==length(x))) - print(paste("Evaluating random forest for ",length(complete_drugs),'drugs')) - all_preds <- do.call(rbind,lapply(names(complete_drugs),function(drug){ - - dg <- fdf#[,-gnas] - - ##create the metadata df with the drug of interest - dmdf <- mdf |> - mutate(drug=drug_dat[rownames(mdf),drug]) - - rf <- randomForest::randomForest(x=dg, - y=drug_dat[rownames(dg),drug], - importance=TRUE,ntree=500) - - im <- randomForest::importance(rf)|> - as.data.frame() |> - mutate(drug=drug) - pord <- intersect(rownames(mdf)[order(drug_dat[rownames(mdf),drug])],rownames(glob_dat)) - - #pheatmap::pheatmap(t(glob_dat[pord, - # rownames(im)]),annotation_col=dmdf, - # cellheight=10,cluster_cols = TRUE) - return(im) - ##what do we return?x - })) - return(all_preds) -} - - -``` - -Now what do we do with the importance features? - -```{r rf processing} - -##get importance for global -gimp <- rfFeatures(drug_dat=drug_dat,fdf=gdf,mdf=mdf) - -##get importance for phospho -pimp <- rfFeatures(drug_dat=drug_dat,fdf=pdf,mdf=mdf) - -``` - - - - - - - - - - - - - - - - - -# Old correlation code, dont run - -now we can visualize correlations - -```{r check out HSP90s, eval=FALSE} - -hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) - -cor_hsps<-subset(allcor,gene%in%hsps)|>subset(drug=='Onalespib')|>subset(corp<0.2) -#print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) - -plotCors(rename(cor_hsps,feature='gene'),c('Onalespib')) - -ggsave('hspCorsOna.pdf',height=nrow(cor_hsps)*3) - -hspp<-unique(plong$site[grep('^HSP',plong$site)]) -cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.2) -#print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) - -plotCors(rename(cor_hspps,feature='gene'),c('Onalespib'),'phospho') - -ggsave('hspPhosphoCorsOna.pdf',height=nrow(cor_hspps)*3) - - -``` - -### Now lets look only at IC50 values - -There are a few drugs for which we have IC50 values - -```{r check ic50 cors,warning=FALSE,error=FALSE, eval=FALSE} - -ifits<-subset(fits,dose_response_metric=='fit_ic50') - -shared<-intersect(ifits$improve_sample_id,glong$Specimen) -print(paste('Found',length(shared),'shared samples')) - -## a full join might be a challenge, maybe just take two matrices -drug_dat <- ifits|> - dplyr::select(improve_sample_id,improve_drug_id,dose_response_value)|> - tidyr::pivot_wider(names_from='improve_drug_id',values_from='dose_response_value',values_fn=mean)|> - tibble::column_to_rownames('improve_sample_id') - -gres<-cor(drug_dat[shared,],glob_dat[shared,],use='pairwise.complete.obs',method='pearson')|> - as.data.frame()|> - tibble::rownames_to_column('drug')|> - tidyr::pivot_longer(cols=2:(1+ncol(glob_dat)),names_to='gene',values_to='pCor')|> - arrange(desc(pCor)) - -##now lets try to get significance - -gsig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ - do.call(rbind,lapply(colnames(glob_dat),function(y){ - pval<-1.0 - try(pval<-cor.test(drug_dat[shared,x],glob_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) - return(c(corp=pval,drug=x,gene=y)) - } - ))|>as.data.frame()|> - mutate(fdr=p.adjust(unlist(corp),method='fdr')) - }))|> - as.data.frame()|> - mutate(drug=unlist(drug))|> - mutate(gene=unlist(gene)) - -fullcors<-gres|>left_join(data.frame(gsig))|>mutate(data='global') - -pres<-cor(drug_dat[shared,],phos_dat[shared,],use='pairwise.complete.obs',method='pearson')|> - as.data.frame()|> - tibble::rownames_to_column('drug')|> - tidyr::pivot_longer(cols=2:(1+ncol(phos_dat)),names_to='site',values_to='pCor')|> - arrange(desc(pCor)) - -##now lets look at correlations - -psig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ - do.call(rbind,lapply(colnames(phos_dat),function(y){ - pval<-1.0 - try(pval<-cor.test(drug_dat[shared,x],phos_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) - return(c(corp=pval,drug=x,gene=y)) - } - ))|>as.data.frame()|> - mutate(fdr=p.adjust(unlist(corp),method='fdr')) - }))|>as.data.frame() - -fullpcors<-pres|>rename(gene='site')|>left_join(data.frame(psig))|>mutate(data='phospho') - -#combine all correlations -allcor<-rbind(fullcors,fullpcors)|> - mutate(direction=ifelse(pCor<0,'neg','pos')) - - -##lets count the correlations and plot - -corsummary<-allcor|>subset(fdr<0.1)|> - group_by(drug,data,direction)|> - summarize(features=n(),meanCor=mean(pCor)) - -corsummary|> - #subset(features>1)|> - ggplot(aes(x=data,y=features,fill=drug))+ - facet_grid(~direction)+ - geom_bar(position='dodge',stat='identity') - - - -``` - -Again we have onalespib with numerous significantly correlated proteins, and one phosphosite for digoxin showing up . - -```{r plot individual sites, eval=FALSE} - -druglist<-c('Onalespib') -features<-subset(allcor,drug%in%druglist)|> - subset(fdr<0.05)|> - subset(data=='global') - -plotCors(rename(features,feature='gene'),druglist) - - -druglist<-c('Digoxin') -features<-subset(allcor,drug%in%druglist)|> - subset(fdr<0.1)|> - subset(data=='phospho') - -#plotCors(rename(features,feature='gene'),druglist,data='phospho') - - - -``` - -Now we can check the HSP proteins directly - -```{r HSP correlation, eval=FALSE} -hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) - -cor_hsps<-subset(allcor,gene%in%hsps)|>subset(drug=='Onalespib')|>subset(corp<0.05) - -print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) - -plotCors(rename(cor_hsps,feature='gene'),'Onalespib') - -hspp<-unique(plong$site[grep('^HSP',plong$site)]) -cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.1) - -print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) -``` - -The IC50 result is similar to the viability. Now we can check AUC - -```{r auc hsp check, error=FALSE, warning=FALSE, eval=FALSE} - -ifits<-subset(fits,dose_response_metric=='auc') - -shared<-intersect(ifits$improve_sample_id,glong$Specimen) -print(paste('Found',length(shared),'shared samples')) - -## a full join might be a challenge, maybe just take two matrices -drug_dat <- ifits|> - dplyr::select(improve_sample_id,improve_drug_id,dose_response_value)|> - tidyr::pivot_wider(names_from='improve_drug_id',values_from='dose_response_value',values_fn=mean)|> - tibble::column_to_rownames('improve_sample_id') - -gres<-cor(drug_dat[shared,],glob_dat[shared,],use='pairwise.complete.obs',method='pearson')|> - as.data.frame()|> - tibble::rownames_to_column('drug')|> - tidyr::pivot_longer(cols=2:(1+ncol(glob_dat)),names_to='gene',values_to='pCor')|> - arrange(desc(pCor)) - -##now lets try to get significance - -gsig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ - do.call(rbind,lapply(colnames(glob_dat),function(y){ - pval<-1.0 - try(pval<-cor.test(drug_dat[shared,x],glob_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) - return(c(corp=pval,drug=x,gene=y)) - } - ))|>as.data.frame()|> - mutate(bh_p=p.adjust(unlist(corp),method='BH')) - }))|> - as.data.frame()|> - mutate(drug=unlist(drug))|> - mutate(gene=unlist(gene)) - -fullcors<-gres|>left_join(data.frame(gsig))|>mutate(data='global') - -pres<-cor(drug_dat[shared,],phos_dat[shared,],use='pairwise.complete.obs',method='pearson')|> - as.data.frame()|> - tibble::rownames_to_column('drug')|> - tidyr::pivot_longer(cols=2:(1+ncol(phos_dat)),names_to='site',values_to='pCor')|> - arrange(desc(pCor)) - -##now lets look at correlations - -psig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ - do.call(rbind,lapply(colnames(phos_dat),function(y){ - pval<-1.0 - try(pval<-cor.test(drug_dat[shared,x],phos_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) - return(c(corp=pval,drug=x,gene=y)) - } - ))|>as.data.frame()|> - mutate(bh_p=p.adjust(unlist(corp),method='BH')) - }))|>as.data.frame() - -fullpcors<-pres|>rename(gene='site')|>left_join(data.frame(psig))|>mutate(data='phospho') - -#combine all correlations -allcor<-rbind(fullcors,fullpcors)|> - mutate(direction=ifelse(pCor<0,'neg','pos')) - - -##lets count the correlations and plot - -corsummary<-allcor|>subset(bh_p<0.1)|> - group_by(drug,data,direction)|> - summarize(features=n(),meanCor=mean(pCor)) - -corsummary|> - #subset(features>1)|> - ggplot(aes(x=data,y=features,fill=drug))+facet_grid(~direction)+geom_bar(position='dodge',stat='identity') - -print(corsummary) - -``` - -```{r HSP correlation again, eval=FALSE} -hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) - -cor_hsps<-subset(allcor,gene%in%hsps)|>s -ubset(drug=='Onalespib')|>subset(corp<0.05) -print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) -cor_hsps - -plotCors(rename(cor_hsps, feature='gene'),'Onalespib') - -hspp<-unique(plong$site[grep('^HSP',plong$site)]) -cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.1) -print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) -``` From 6196676b35795d68fa2ffcd1aa62db898d46faee Mon Sep 17 00:00:00 2001 From: Jeremy Date: Mon, 1 Dec 2025 15:20:09 -0800 Subject: [PATCH 4/7] Updated documentation for run_modality() in 02_run_normalize_omics.Rmd --- 02_run_normalize_omics.Rmd | 84 ++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 49 deletions(-) diff --git a/02_run_normalize_omics.Rmd b/02_run_normalize_omics.Rmd index 5c852ec..bf77c93 100644 --- a/02_run_normalize_omics.Rmd +++ b/02_run_normalize_omics.Rmd @@ -26,66 +26,52 @@ source("02_normalize_batchcorrect_omics.R") # Run batch correction / normalization across phospho, global, and rna samples. ```{r} - # --------------------------------------------------------------------------- # run_modality() — quick reference for args & expected batch structure # --------------------------------------------------------------------------- # Args: -# modality : Character. One of "Global", "Phospho", or "rna". -# Controls feature-ID parsing and normalization rules. -# -# batches : List of per-cohort lists describing each input table. -# See “Batch spec” below. -# -# meta : Data frame with sample metadata. Must contain at least: -# - cohort (int or factor; matches batches[[i]]$cohort) -# - Specimen, Patient, Tumor (optional but used if present) -# For RNA, if aliquot joins fail, headers are matched to -# meta$Specimen using a normalized form. Generated by cNF_helper_code.R -# -# syn : An initialized synapser client (e.g., syn <- synapser::synLogin()). -# -# drop_name_substrings: Character vector of substrings. Any sample column whose -# name contains one of these (regex OR) is dropped. -# Use NULL or character(0) to keep all samples. -# -# out_dir : Directory where outputs/QC are written. Created if missing. # -# out_prefix : String used in filenames (e.g., "global", "phospho", "rna"). -# If save_basename is NULL, this is also used as the basename. +# - modality : "global" | "phospho" | "rna" (case-insensitive). +# - batches : list of cohort specs; each has: +# syn_id, cohort, fname_aliquot_index, (optional) value_start_col +# (auto-detects; fallback = 5 for global/phospho, 3 for rna). +# - meta : data.frame joined by (aliquot, cohort). Uses Specimen/Patient/Tumor +# if present. For RNA, falls back to normalized Specimen matching. +# - syn : synapser client (e.g., syn <- synLogin()). +# - drop_name_substrings : character vector; OR-regex to drop sample cols (NULL = keep all). +# - out_dir : output dir (auto-created). +# - out_prefix: filename stem (used unless save_basename set). +# - upload_parent_id : Synapse folder ID for uploads (NULL = no upload). +# - pcols : named colors for Patient in PCA (optional). +# - write_outputs : TRUE = write CSV/PDF (+upload if parent set); FALSE = in-memory only. +# - save_basename : override file stem (else out_prefix). +# - do_batch_correct : TRUE = ComBat by cohort; FALSE = skip (adds *_noBatchCorrect). # -# upload_parent_id : Synapse folder/entity ID to upload written files. -# If NULL, no uploads occur. -# -# pcols : Optional named color vector for plotting (e.g., by patient). -# If NULL, ggplot defaults are used. -# -# write_outputs : Logical. If TRUE, write CSVs/PDFs to out_dir (and upload -# when upload_parent_id is set). If FALSE, nothing is written/ -# uploaded; results are returned in-memory only. -# -# save_basename : Optional string to control the root of output filenames. -# If NULL, falls back to out_prefix. Useful when you want the -# directory name (out_dir) and the file basename to differ. -# -# do_batch_correct : Logical. If TRUE, applies ComBat across cohorts (batch-only). -# If FALSE, skips ComBat; filenames will include “_noBatchCorrect”. -# -# Returns: -# A list with: -# - se_batches : List of per-batch SummarizedExperiment objects (normalized). -# - se_combined : Combined SE after feature intersection. -# - se_corrected : Post-ComBat SE (or the same as combined if do_batch_correct=FALSE). -# - long_pre : Long-format data frame pre-ComBat (finite values only). -# - long_post : Long-format data frame post-ComBat (or pre if no ComBat). -# - pca_df_pre / pca_df_post : Data frames used for PCA scatter plots. -# - plots : ggplot objects for PCA & histograms. -# - files : Paths of any written files (if write_outputs=TRUE). +# Per-modality normalization +# - phospho : 0→NA → drop >50% missing → log2(x+0.01) → per-sample modified z. +# - global : log2(x) → per-sample modified z. +# - rna : drop >50% missing → log2(TPM+1) → per-sample modified z. # +# Returns (list) +# - se_batches : per-batch normalized SEs. +# - se_combined: intersection-features combined SE. +# - se_corrected: post-ComBat SE (NULL if do_batch_correct=FALSE). +# - se_post : SE used “post” (se_corrected or se_combined). +# - did_combat: TRUE/FALSE. +# - long_pre : long table from se_combined (finite only). +# - long_post : long table from se_post. +# - pca_df_pre, pca_df_post : PCA inputs (complete-case features). +# - plots : pre_pca, pre_hist, pca, hist (ggplot). +# - files : if write_outputs=TRUE, $queued = written file paths. # +# Notes +# - ComBat drops samples with NA cohort; requires colData(se)$cohort. +# - Global: splits multi-symbol Genes by ';'. RNA aliquot may be NA (Specimen fallback). +# - Sample headers auto-detected if path-like (*.raw, *.mzML, paths). # --------------------------------------------------------------------------- + # Substrings to drop (These were the protocol optimization samples) drop_subs <- c( "cNF_organoid_DIA_G_02_11Feb25", From 56f25659577aa8c05857220a741fb5d6497a11e8 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Wed, 3 Dec 2025 10:04:23 -0800 Subject: [PATCH 5/7] cleaned up comments --- 02_normalize_batchcorrect_omics.R | 26 ++++++++++++-------------- 03_analyze_modality_correlations.R | 20 ++++++-------------- 2 files changed, 18 insertions(+), 28 deletions(-) diff --git a/02_normalize_batchcorrect_omics.R b/02_normalize_batchcorrect_omics.R index a5dad15..5fe3611 100644 --- a/02_normalize_batchcorrect_omics.R +++ b/02_normalize_batchcorrect_omics.R @@ -17,7 +17,6 @@ suppressPackageStartupMessages({ }) # Small helpers - modified_zscore <- function(x, na.rm = TRUE) { m <- suppressWarnings(stats::median(x, na.rm = na.rm)) md <- suppressWarnings(stats::mad(x, constant = 1, na.rm = na.rm)) @@ -55,7 +54,6 @@ make_dropper <- function(substrings) { } # Functions to clean up irregular names - basename_only <- function(x) sub("^.*[\\\\/]", "", x) basename_no_ext <- function(x) sub("\\.[^.]+$", "", basename_only(x)) @@ -121,7 +119,6 @@ parse_rna_header_triplet <- function(fnames) { } # Functions to get data from Synapse - read_wide_from_synapse <- function(syn, syn_id) { message(" Reading Synapse file: ", syn_id) df <- read.table( @@ -179,8 +176,9 @@ parse_fnames <- function(fnames, aliquot_field_index, cohort) { out } -# -------------------------- Feature ID builders ------------------------------ - +##### +#Feature ID builders +##### build_phospho_ids <- function(df) { lsite <- tolower(df$Residue) paste0(df$`Gene.Names`, "-", df$Residue, df$Site, lsite) @@ -587,9 +585,9 @@ perform_uploads <- function(paths, syn, parent_id) { message("Uploads complete.") } - - -# ------------------------------- Main entry ---------------------------------- +##### +# Main entry +##### # This is how we call the function / pipeline run_modality <- function( @@ -672,7 +670,7 @@ run_modality <- function( } } - # ---- Combine (INTERSECTION) & pre-QC --------------------------------------- + # Combine & pre-QC message("--------------------------------------------------") se_combined <- combine_batches_intersection(se_list) @@ -688,7 +686,7 @@ run_modality <- function( upload_queue <- c(upload_queue, pre_pca_pdf, pre_hist_pdf) } - # ---- Optional ComBat (batch-only) ------------------------------------------ + # ComBat if (isTRUE(do_batch_correct)) { message("--------------------------------------------------") se_post <- combat_by_cohort(se_combined) @@ -702,7 +700,7 @@ run_modality <- function( post_title <- paste0("Combined ", modality, " samples (no ComBat)") } - # ---- Exports ---------------------------------------------------------------- + # Exports message(" Building long tables") long_pre <- se_to_long(se_combined, modality) |> dplyr::filter(is.finite(correctedAbundance)) @@ -716,7 +714,7 @@ run_modality <- function( upload_queue <- c(upload_queue, path_pre, path_post) } - # ---- Post (either ComBat or not) QC ---------------------------------------- + # Post ComBat/QC message(" Post-QC plots (PCA & histogram)") pc_df <- pca_df_present_in_all(se_post) gpca <- plot_pca(pc_df, post_title, pcols = pcols) @@ -729,7 +727,7 @@ run_modality <- function( upload_queue <- c(upload_queue, post_pca_pdf, post_hist_pdf) } - # ---- Pack results for return ------------------------------------------------ + # Pack results for return results <- list( se_batches = se_list, se_combined = se_combined, @@ -744,7 +742,7 @@ run_modality <- function( files = if (write_outputs) list(queued = upload_queue) else list() ) - # ---- FINAL STEP: Uploads ---------------------------------------------------- + # FINAL STEP: Uploads if (write_outputs && !is.null(upload_parent_id)) { perform_uploads(upload_queue, syn, upload_parent_id) } else if (!write_outputs) { diff --git a/03_analyze_modality_correlations.R b/03_analyze_modality_correlations.R index 10fa56e..0ec90c1 100644 --- a/03_analyze_modality_correlations.R +++ b/03_analyze_modality_correlations.R @@ -13,12 +13,9 @@ dir.create("figs", showWarnings = FALSE) # Helpers make_feature_matrix <- function(df_long, shared_ids, sample_col, feature_col, value_col) { - # Strict cleaning: trim to character, drop NA/blank IDs, then pivot df <- df_long %>% ungroup() %>% - # keep only shared sample IDs first (as before) dplyr::filter(.data[[sample_col]] %in% shared_ids) %>% - # coerce and trim IDs mutate( !!sample_col := trimws(as.character(.data[[sample_col]])), !!feature_col := trimws(as.character(.data[[feature_col]])) @@ -27,8 +24,6 @@ make_feature_matrix <- function(df_long, shared_ids, sample_col, feature_col, va # Drop rows with NA/blank sample/feature IDs bad_sample <- is.na(df[[sample_col]]) | df[[sample_col]] == "" bad_feature <- is.na(df[[feature_col]]) | df[[feature_col]] == "" - if (any(bad_sample)) message("[make_feature_matrix] Dropping ", sum(bad_sample), " rows with NA/blank ", sample_col) - if (any(bad_feature)) message("[make_feature_matrix] Dropping ", sum(bad_feature), " rows with NA/blank ", feature_col) df <- df[!(bad_sample | bad_feature), , drop = FALSE] if (!nrow(df)) { @@ -48,7 +43,6 @@ make_feature_matrix <- function(df_long, shared_ids, sample_col, feature_col, va ) %>% as.data.frame(check.names = FALSE) - # Guard against NA/blank rownames after pivot rn <- wide[[sample_col]] bad_rn <- is.na(rn) | rn == "" if (any(bad_rn)) { @@ -63,7 +57,7 @@ make_feature_matrix <- function(df_long, shared_ids, sample_col, feature_col, va return(out) } - # Finalize rownames (must be unique, non-empty) + # Finalize rownames rownames(wide) <- make.unique(as.character(rn), sep = "_dup") wide[[sample_col]] <- NULL wide @@ -200,22 +194,20 @@ summarize_correlated_features <- function(cor_tbl, fdr_thresh = 0.25, outdir = " # --------------------------- # Main wrapper # --------------------------- -# Returns: list(drug_mat, feat_mat, shared_ids, cor_tbl, cor_summary, cor_plot, drug_summary) analyze_modality <- function( fits, df_long, sample_col, # e.g., "Specimen" feature_col, # e.g., "feature_id" | "Gene" | "site" value_col, # e.g., "correctedAbundance" - metric = "uM_viability", + metric = "uM_viability", # Or fit_auc outdir = "figs", heatmap_filename = "drug_heatmap_large.pdf", fdr_thresh = 0.25 ) { - # Pre-intersection like original (all metrics) + shared_ids <- base::intersect(unique(fits$improve_sample_id), unique(df_long[[sample_col]])) - # Feature matrix over shared IDs feat_mat <- make_feature_matrix( df_long = df_long, shared_ids = shared_ids, @@ -224,7 +216,7 @@ analyze_modality <- function( value_col = value_col ) - # Drug matrix for the exact metric (no normalization/fallbacks) + # Drug matrix for the metric drug_mat <- make_drug_matrix( fits = fits, metric = metric, @@ -234,7 +226,7 @@ analyze_modality <- function( metric_col = "dose_response_metric" ) - # Summaries & heatmap (original style) + # Summaries & heatmap dsum <- summarize_drugs( fits, metric = metric, metric_col = "dose_response_metric", outdir = outdir ) @@ -257,7 +249,7 @@ analyze_modality <- function( } } - # Correlations (only if overlap) + # Correlations shared_after <- base::intersect(rownames(drug_mat), rownames(feat_mat)) cor_tbl <- if (length(shared_after) > 0L) { compute_cors(drug_mat, feat_mat, shared_samples = shared_after) From 020b9088c1acb86a1b4ebfc2b6623f45420dbe69 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Fri, 5 Dec 2025 09:14:19 -0800 Subject: [PATCH 6/7] Added README to clarify run order. Moved all old code to leagcy code directory --- 02_run_normalize_omics.Rmd | 13 +- README.md | 13 + cNF_helper_code.R | 3 +- .../02_normalize_harmonize_proteomics.rmd | 527 +++++++ .../03_analyze_modality_correlations.R | 0 legacy_code/03_drug_biomarkers.Rmd | 1261 +++++++++++++++++ .../03_drug_biomarkers_legacy_code.Rmd | 0 7 files changed, 1809 insertions(+), 8 deletions(-) create mode 100644 legacy_code/02_normalize_harmonize_proteomics.rmd rename 03_analyze_modality_correlations.R => legacy_code/03_analyze_modality_correlations.R (100%) create mode 100644 legacy_code/03_drug_biomarkers.Rmd rename 03_drug_biomarkers_legacy_code.Rmd => legacy_code/03_drug_biomarkers_legacy_code.Rmd (100%) diff --git a/02_run_normalize_omics.Rmd b/02_run_normalize_omics.Rmd index bf77c93..1dbe95e 100644 --- a/02_run_normalize_omics.Rmd +++ b/02_run_normalize_omics.Rmd @@ -48,9 +48,9 @@ source("02_normalize_batchcorrect_omics.R") # - do_batch_correct : TRUE = ComBat by cohort; FALSE = skip (adds *_noBatchCorrect). # # Per-modality normalization -# - phospho : 0→NA → drop >50% missing → log2(x+0.01) → per-sample modified z. -# - global : log2(x) → per-sample modified z. -# - rna : drop >50% missing → log2(TPM+1) → per-sample modified z. +# - phospho : 0 toNA to drop >50% missing to log2(x+0.01) to per-sample modified z. +# - global : log2(x) to per-sample modified z. +# - rna : drop >50% missing to log2(TPM+1) to per-sample modified z. # # Returns (list) # - se_batches : per-batch normalized SEs. @@ -100,7 +100,7 @@ phospho <- run_modality( upload_parent_id = "syn70078365", pcols = pcols, write_outputs = FALSE, - save_basename = "Phospho_batch_corrected", + save_basename = "phospho_batch12_corrected", do_batch_correct = TRUE ) @@ -121,7 +121,7 @@ global <- run_modality( upload_parent_id = "syn70078365", pcols = pcols, write_outputs = FALSE, - save_basename = "Global_batch_corrected", + save_basename = "global_batch12_corrected", do_batch_correct = TRUE ) @@ -143,9 +143,8 @@ rna <- run_modality( upload_parent_id = "syn71099587", pcols = pcols, write_outputs = FALSE, - save_basename = "RNA_Matrix_no_batch_correct", + save_basename = "RNA_12_no_batch_correct", do_batch_correct = FALSE #Note this is set to false right now. Not needed for RNA ) - ``` diff --git a/README.md b/README.md index 7c6c957..69f7152 100644 --- a/README.md +++ b/README.md @@ -6,3 +6,16 @@ The data for this code is hosted on Synapse at http://synapse.org/cnfDrugRespons ## Current analysis We are collecting omics measurements from cNF organoid samples together with drug response data to identify potential biomarkers of drug response. + +## Quick run order (and what each file sources) +1) **01_harmonize_drug_data.Rmd** + - *Sources:* `cNF_helper_code.R` + - Download & quick QC of drug fits. + +2) **02_run_normalize_omics.Rmd** + - *Sources:* `cNF_helper_code.R`, `02_normalize_batchcorrect_omics.R` + - Runs per-modality normalization and ComBat; writes long tables/plots if enabled. + +3) **04_analyze_modality_and_pathway_enrich.Rmd** + - *Sources:* `cNF_helper_code.R`, `03_analyze_modality_correlations.R`, `04_leapr_biomarker.R` + - Builds drug/feature matrices, modality correlations, leapR pathway enrichment and plots for all. diff --git a/cNF_helper_code.R b/cNF_helper_code.R index 711858a..9ae26c5 100644 --- a/cNF_helper_code.R +++ b/cNF_helper_code.R @@ -6,7 +6,8 @@ library(synapser) synLogin() syn <- list(get = synapser::synGet, store = synapser::synStore) library(readxl) - +library(tidyr) +library(dplyr) meta1 <- readxl::read_xlsx(syn$get('syn65595365')$path) |> tidyr::separate(Specimen,into=c('Patient','Tumor'),sep='_',remove = FALSE)|> diff --git a/legacy_code/02_normalize_harmonize_proteomics.rmd b/legacy_code/02_normalize_harmonize_proteomics.rmd new file mode 100644 index 0000000..aa2f322 --- /dev/null +++ b/legacy_code/02_normalize_harmonize_proteomics.rmd @@ -0,0 +1,527 @@ +--- +title: "Reformat and process proteomics" +author: "Sara Gosline" +date: "2025-09-28" +output: html_document +--- + + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +library(ggplot2) +library(dplyr) +library(tidyr) +library(stringr) +library(synapser) +library(grid) +source('cNF_helper_code.R') +``` + +# Drop sample list. These were for protocol optimization +```{r} +remove_fname_substrings <- c( + "cNF_organoid_DIA_G_02_11Feb25", + "cNF_organoid_DIA_G_05_11Feb25", + "cNF_organoid_DIA_G_06_11Feb25", + "cNF_organoid_DIA_P_02_29Jan25", + "cNF_organoid_DIA_P_05_11Feb25", + "cNF_organoid_DIA_P_06_11Feb25" +) + +is_unwanted_fname <- function(x) { + vapply(x, function(s) + any(vapply(remove_fname_substrings, function(p) grepl(p, s, fixed = TRUE), logical(1))), + logical(1) + ) +} + + +``` + + + +## Normalize phospho-proteomics + +We now have phosphoproteomics from two cohorts. Here I'm trying to collect data from both and normalize but am clearly missing something. I do the following: + +1. replace all zero values with NA to avoid skewing normalization +2. remove any features that are absent from >50% of the samples +3. take the log of the data, then take a modified z score + +Each dataset is done individually then combined at the end. There is a clear batch effect. + +### Cohort 1 phospho + +We start with the cohort 1 phospho data here. + +```{r compare to proteomics,warning=FALSE} + +##cohort 1 phospho +##first we read in file, and get site info +phospho1<- read.table(syn$get('syn69963552')$path,sep='\t',fill=NA,header=T,quote='"') |> + subset(!is.na(`Gene.Names`)) |> + subset(Gene.Names!='') |> + mutate(lsite=tolower(Residue)) |> + tidyr::unite(c(Residue,Site,lsite),col='site',sep='') |> + tidyr::unite(c(`Gene.Names`,site),col='site',sep='-') |> + as.data.frame() + +phospho1[which(phospho1==0,arr.ind=TRUE)]<-NA + +pfnames1 <- data.frame(fname=colnames(phospho1)[5:ncol(phospho1)])|> + mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[8]))|> + mutate(aliquot=as.double(aliquot))|> + mutate(cohort=1) |> + dplyr::filter(!is_unwanted_fname(fname)) + + + +##logtransform##median transform +pzeros<-which(apply(phospho1[,5:ncol(phospho1)],1,function(x) + length(which(is.na(x)))/length(x) < 0.5)) + +pmat1<-apply(0.01+log2(phospho1[pzeros,5:ncol(phospho1)]),2, + function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |> + as.data.frame() |> + mutate(site=phospho1$site[pzeros]) + +##move to long form, upload +plong1<-pmat1|> + tidyr::pivot_longer(1:(ncol(pmat1)-1),names_to='fname',values_to='abundance')|> + left_join(pfnames1) |> + group_by(site,fname,aliquot,cohort) |> + summarize(meanAbundance=mean(abundance,na.rm=T))|> + subset(!is.na(meanAbundance))|> + left_join(meta) + +readr::write_csv(plong1,file='log2normMedCenteredPhospho.csv') +syn$store(File('log2normMedCenteredPhospho.csv',parentId='syn70078365')) + +``` +The file is uploaded to synapse. + +### Cohort 2 phospho + +Now on October 7 we can process the second batch of phospho. + +```{r cohort 2 phospho} + +##cohort 2 phospho +##1 read in data +phospho2 <- read.table(syn$get('syn69947351')$path,sep='\t',fill=NA,header=T,quote='"') |> + subset(!is.na(`Gene.Names`)) |> + subset(Gene.Names != '') |> + mutate(lsite = tolower(Residue)) |> + tidyr::unite(c(Residue,Site,lsite),col = 'site',sep='') |> + tidyr::unite(c(`Gene.Names`,site),col = 'site',sep='-') + + +phospho2[which(phospho2==0,arr.ind=TRUE)] <- NA + +pfnames2 <- data.frame(fname=colnames(phospho2)[5:ncol(phospho2)]) |> + mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[9])) |> + mutate(aliquot=as.double(aliquot))|> + mutate(cohort=2)|> + dplyr::filter(!is_unwanted_fname(fname)) + +##remove missingness +tm <- which(apply(phospho2[,5:ncol(phospho2)],1,function(x) length(which(is.na(x)))/length(x) < 0.5)) + +##log2 adjusted z score +pmat2<-apply(log2(0.01+phospho2[tm,5:ncol(phospho2)]),2, + function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |> + as.data.frame() |> + mutate(site=phospho2$site[tm]) + + + +plong2<-pmat2|> + tidyr::pivot_longer(1:(ncol(pmat2)-1),names_to='fname',values_to='abundance') |> + left_join(pfnames2)|> + group_by(site,fname,aliquot,cohort) |> + summarize(meanAbundance=mean(abundance,na.rm=T)) |> + subset(!is.na(meanAbundance))|> + left_join(meta) + +##save to file +readr::write_csv(plong2,file='log2normMedCenteredPhospho_cohort2.csv') +syn$store(File('log2normMedCenteredPhospho_cohort2.csv',parentId='syn70078365')) + + +``` + +Now that we have two cohorts we can try to combine without batch correction. + +### Combined phospho +Combining the phoshpo data here. + +```{r combined phospho} + +##now we move back to long form +# plong <- rbind(plong1,plong2) + +plong1 <- plong1 |> dplyr::filter(!is_unwanted_fname(fname)) +plong2 <- plong2 |> dplyr::filter(!is_unwanted_fname(fname)) +plong <- rbind(plong1, plong2) |> dplyr::filter(!is_unwanted_fname(fname)) + + #pmat |> +# as.data.frame()|> +# tibble::rownames_to_column('site')|> +# pivot_longer(-site,names_to='fname',values_to='abundance')|> +# left_join(rbind(pfnames1,pfnames2))|> +# group_by(site,fname,aliquot,cohort) |> +# summarize(meanAbundance=mean(abundance,na.rm=T)) |> +# left_join(meta) + + +compsites <- plong|> +# subset(meanAbundance>(-5))|> + group_by(site)|> + summarize(spec = n_distinct(Specimen))|> + subset(spec==31) + +#plong$meanAbundance[which(!is.finite(plong$meanAbundance))]<-0 + +ppcs<-plong|>ungroup()|> + dplyr::select(Specimen,meanAbundance,site)|> + unique()|> + subset(site%in%compsites$site)|> + #subset(!is.na(site))|> + #subset(!is.na(meanAbundance))|> + tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance', + values_fn=mean,values_fill=0)|> + tibble::column_to_rownames('site')|> + t()|> + prcomp() + +pplot<-ppcs$x|> + as.data.frame()|> + dplyr::select(PC1,PC2,PC3)|> + tibble::rownames_to_column('Specimen')|> + left_join(meta)|> + dplyr::select(PC2,PC1,Specimen,Patient,cohort)|> + mutate(cohort=as.factor(cohort))|> + distinct()|> + ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+ + geom_point()+ + #ggrepel::geom_label_repel()+ + ggtitle("Phospho samples")+ + ggplot2::scale_color_manual(values=pcols) + +ph<- plong |>ungroup()|> + subset(site%in%compsites$site)|> + ggplot(aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram() + +cowplot::plot_grid(ph,pplot) +ggsave('cNFPhosphoQC.png',width=10) + +pplot +ggsave('phosphoPCA.pdf') +``` + +Clearly there is a strong batch effect. +## Normalize golbal proteomics +Now we can move onto the global data + +### Cohort 1 global +Global proteomics in cohort 1 here. + +```{r global} +####now process global +#global1<-readr::read_tsv(syn$get('syn64906445')$path) +global1 <- read.table(syn$get('syn69947355')$path,sep='\t',header=T,quote='"') |> + tidyr::separate_rows(Genes,sep=';') +##logtransform, medina transform + +#global1[which(global1==0,arr.ind=TRUE)]<-NA + +gmat1<-apply(log2(global1[,5:ncol(global1)]),2,function(x) 0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)) + +gmat1<-gmat1|> + as.data.frame()|> + mutate(Genes=global1$Genes) + +##extract aliquot info from file name +gfnames1 <- data.frame(fname=colnames(global1)[5:ncol(global1)]) |> + mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[6])) |> + mutate(aliquot=as.double(aliquot))|> + mutate(cohort=1)|> + dplyr::filter(!is_unwanted_fname(fname)) + +glong1<-gmat1|> + tidyr::pivot_longer(1:(ncol(gmat1)-1),names_to='fname',values_to='abundance')|> + left_join(gfnames1)|> + group_by(Genes,fname,aliquot,cohort)|> + summarize(meanAbundance=mean(abundance))|> + subset(is.finite(meanAbundance))|> + left_join(meta) + + +readr::write_csv(glong1,file='log2normMedCenteredGlobal.csv') +syn$store(File('log2normMedCenteredGlobal.csv',parentId='syn70078365')) +``` + +### Cohort 2 global +October 7 we process the second cohort. +```{r batch 2 global} +global2<-read.table(syn$get('syn69947352')$path,header=T,sep='\t',quote='"')|> + tidyr::separate_rows(Genes,sep=';') + +#global2[which(global2==0,arr.ind=TRUE)]<-NA + +gmat2<-apply(log2(global2[,5:ncol(global2)]),2,function(x) + 0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)) +rownames(gmat2)<-global2$Genes + +gmat2<-gmat2|> + as.data.frame()|> + mutate(Genes=global2$Genes) + +gfnames2 <- data.frame(fname=colnames(global2)[5:ncol(global2)]) |> + mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[7])) |> + mutate(aliquot=as.double(aliquot))|> + mutate(cohort=2)|> + dplyr::filter(!is_unwanted_fname(fname)) +gfnames1 + +glong2<-gmat2|> + tidyr::pivot_longer(1:(ncol(gmat2)-1),names_to='fname',values_to='abundance')|> + left_join(gfnames2)|> + group_by(Genes,fname,aliquot,cohort)|> + summarize(meanAbundance=mean(abundance))|> + subset(is.finite(meanAbundance))|> + left_join(meta) + +#dupes<-global|>group_by(Genes)|>summarize(numIso=n())|> +# subset(numIso>1) + + +readr::write_csv(glong2,file='log2normMedCenteredGlobal_cohort2.csv') +syn$store(File('log2normMedCenteredGlobal_cohort2.csv',parentId='syn70078365')) +``` + +### Global combined without batch correction +Now we can combine the global withot batch correction. + +```{r combined global test} +#ma<-mean(glong$abundance,na.rm=T) +#glong$meanAbundance[which(!is.finite(glong$meanAbundance))]<-0 +glong1 <- glong1 |> dplyr::filter(!is_unwanted_fname(fname)) +glong2 <- glong2 |> dplyr::filter(!is_unwanted_fname(fname)) +glong <- rbind(glong1, glong2) |> dplyr::filter(!is_unwanted_fname(fname)) |> + subset(Genes!="") + +n_spec <- meta |> dplyr::distinct(Specimen) |> nrow() +compsites <- glong|> +# subset(meanAbundance>(-5))|> + group_by(Genes)|> + summarize(spec = n_distinct(Specimen))|> + subset(spec==n_spec) + +gpcs<-glong|>ungroup()|> + dplyr::select(Specimen,meanAbundance,Genes)|> + subset(!is.na(Genes))|> + subset(Genes!="")|> + subset(Genes%in%compsites$Genes)|> + subset(!is.na(meanAbundance))|> + tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance',values_fn=mean,values_fill=0)|> + tibble::column_to_rownames('Genes')|>t()|> + prcomp() + +gplot<-gpcs$x|> + as.data.frame()|> + dplyr::select(PC1,PC2)|> + tibble::rownames_to_column('Specimen')|> + left_join(meta)|> + dplyr::select(PC1,PC2,Specimen,Patient,cohort)|> + mutate(cohort=as.factor(cohort))|> + distinct()|> + ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+ + geom_point()+ggrepel::geom_label_repel()+ggtitle("Global samples")+ + scale_color_manual(values=pcols) + + +hplot <- ggplot(glong,aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram() + + +cowplot::plot_grid(hplot,gplot) +ggsave('cNFGlobalQC.png',width=10) + + +gplot + +ggsave('globalPCA.pdf') + +``` + + +## Evaluate batch correction + +Now we have two separate long tables with metadata, but we would like to combine into a single one and batch correct.We can update this with each cohort. + +```{r combine and correct} + +##phospho +##TODO: ideally we should use the long tables and reconvert +pmat <- merge(as.data.frame(pmat1),as.data.frame(pmat2)) + +gmat <- merge(gmat1,gmat2) + +##remove duplicated sites +dsites<- unique(pmat$site[which(duplicated(pmat$site))]) +mvals<-sapply(dsites,function(x) colSums(pmat[pmat$site==x,2:ncol(pmat)])) |> + t() |> + as.data.frame() |> + tibble::rownames_to_column('site') + +pmat <- pmat |> + subset(!site %in% dsites) |> + rbind(mvals) + +##now convert to matrix +pmat <- pmat |> + tibble::remove_rownames() |> + tibble::column_to_rownames('site') |> + as.matrix() + +gmat <- gmat |> + subset(Genes!='')|> + tibble::remove_rownames() |> + tibble::column_to_rownames('Genes') |> + as.matrix() +##sigh, batch correct? +library(sva) +# +# pmat[which(!is.finite(pmat),arr.ind=T)] <- 0.0 +# cbmat<-sva::ComBat(pmat,batch=meta$cohort,mean.only = FALSE) +# +# gmat[which(!is.finite(gmat),arr.ind=T)] <- 0.0 +# cgmat <- sva::ComBat(gmat,batch=meta$cohort,mean.only = FALSE) + + +pmat[which(!is.finite(pmat),arr.ind=T)] <- 0.0 +## align ComBat batch vector to pmat columns via filename→cohort maps +sample_meta_p <- data.frame(fname = colnames(pmat)) |> + dplyr::left_join(rbind(pfnames1, pfnames2), by = "fname") |> + dplyr::select(fname, cohort) +keep_p <- !is.na(sample_meta_p$cohort) +pmat <- pmat[, keep_p, drop = FALSE] +pbatch <- sample_meta_p$cohort[keep_p] +cbmat <- sva::ComBat(dat = pmat, batch = pbatch, mean.only = FALSE) + +gmat[which(!is.finite(gmat),arr.ind=T)] <- 0.0 +## align ComBat batch vector to gmat columns via filename→cohort maps +sample_meta_g <- data.frame(fname = colnames(gmat)) |> + dplyr::left_join(rbind(gfnames1, gfnames2), by = "fname") |> + dplyr::select(fname, cohort) +keep_g <- !is.na(sample_meta_g$cohort) +gmat <- gmat[, keep_g, drop = FALSE] +gbatch <- sample_meta_g$cohort[keep_g] +cgmat <- sva::ComBat(dat = gmat, batch = gbatch, mean.only = FALSE) + + +ppcs<-prcomp(t(cbmat)) +gpcs<-prcomp(t(cgmat)) + +``` + +# plot batch corrected data +```{r} + +# Reusable theme: white background + light grey gridlines +my_theme <- theme_bw() + + theme( + panel.grid.major = element_line(color = "grey85"), + panel.grid.minor = element_line(color = "grey92"), + panel.background = element_rect(fill = "white", color = NA), + plot.background = element_rect(fill = "white", color = NA), + legend.title = element_text(size = 9), + legend.text = element_text(size = 7), + legend.key.size = unit(0.5, "cm"), + legend.spacing.y = unit(0.2, "cm"), + legend.box.spacing= unit(0.3, "cm") + ) + + +# --- Corrected phospho plot (white bg + grey grid) --- +pplot <- ppcs$x |> + as.data.frame() |> + dplyr::select(PC1, PC2) |> + tibble::rownames_to_column("fname") |> + left_join(rbind(pfnames1, pfnames2)) |> + left_join(meta) |> + dplyr::select(PC1, PC2, Specimen, Patient, cohort) |> + mutate( + Tumor = str_extract(Specimen, "_T\\d+") |> str_remove("_"), + Tumor = factor(Tumor), + cohort = as.factor(cohort) + ) |> + distinct() |> + dplyr::filter(!is.na(Patient), !is.na(Tumor)) |> + ggplot(aes(x = PC1, y = PC2, col = Patient, shape = Tumor)) + + geom_point(size = 3) + + ggtitle("Corrected phospho samples") + + scale_color_manual(values = pcols) + + scale_shape_discrete(na.translate = FALSE) + + my_theme + +pplot + +# --- Corrected global plot (white bg + grey grid) --- +gplot <- gpcs$x |> + as.data.frame() |> + dplyr::select(PC1, PC2) |> + tibble::rownames_to_column("fname") |> + left_join(rbind(gfnames1, gfnames2)) |> + left_join(meta) |> + dplyr::select(PC1, PC2, Specimen, Patient, cohort) |> + mutate( + Tumor = str_extract(Specimen, "_T\\d+") |> str_remove("_"), + Tumor = factor(Tumor), + cohort = as.factor(cohort) + ) |> + distinct() |> + ggplot(aes(x = PC1, y = PC2, col = Patient, shape = Tumor)) + + geom_point(size = 3) + + ggtitle("Corrected global samples") + + scale_color_manual(values = pcols) + + my_theme + +gplot + + +ggsave("phosphoCorrectedPCA.pdf", plot = pplot, width = 7, height = 4.5, units = "in", device = cairo_pdf) +ggsave("globalCorrectedPCA.pdf", plot = gplot, width = 7, height = 4.5, units = "in", device = cairo_pdf) + +``` + +## Upload batch-corrected data to synapse + +Now we can reformat the batch-corrected data and upload to syanps + +```{r upload batch corrected} + +pc_long <- cbmat |> + as.data.frame() |> + tibble::rownames_to_column('site') |> + pivot_longer(-site,names_to = 'fname',values_to = 'correctedAbundance') |> + left_join(rbind(pfnames1,pfnames2)) |> + left_join(meta) |> + distinct() + +gc_long <- cgmat |> + as.data.frame() |> + tibble::rownames_to_column('Gene') |> + pivot_longer(-Gene,names_to = 'fname',values_to = 'correctedAbundance') |> + left_join(rbind(gfnames1,gfnames2)) |> + left_join(meta) |> + distinct() + + +readr::write_csv(pc_long,file = 'batch12_correctedPhospho.csv') +readr::write_csv(gc_long,file = 'batch12_correctedGlobal.csv') + +syn$store(File('batch12_correctedPhospho.csv',parentId = 'syn70078365')) +syn$store(File('batch12_correctedGlobal.csv',parentId = 'syn70078365')) + +``` diff --git a/03_analyze_modality_correlations.R b/legacy_code/03_analyze_modality_correlations.R similarity index 100% rename from 03_analyze_modality_correlations.R rename to legacy_code/03_analyze_modality_correlations.R diff --git a/legacy_code/03_drug_biomarkers.Rmd b/legacy_code/03_drug_biomarkers.Rmd new file mode 100644 index 0000000..2fa4b21 --- /dev/null +++ b/legacy_code/03_drug_biomarkers.Rmd @@ -0,0 +1,1261 @@ +--- +title: "Evaluate drug and omics data for biomarker assessment" +author: "Sara gosline" +date: "2025-03-13" +output: html_document +--- + +This document is designed to be a working document where we can compare approaches to evaluate biomarkers of drug response across patient samples. We are collecting three types of data modalities: 1. RNA Sequencing 2. Global Proteomics 3. Phospho proteomics + +We also have drug sensitivity data (single dose viability, some curves) for many drugs. The question to ask is which molecules can predict drug response across patients? How robust/extendable is this? + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +library(synapser) +library(ggplot2) +library(dplyr) +library(tidyr) + +``` + +# Pull processed files from previous markdowns + +We have already run the previous scripts and stored data on Synapse + +```{r pull files} +Sys.setenv(SYNAPSE_AUTH_TOKEN="eyJ0eXAiOiJKV1QiLCJraWQiOiJXN05OOldMSlQ6SjVSSzpMN1RMOlQ3TDc6M1ZYNjpKRU9VOjY0NFI6VTNJWDo1S1oyOjdaQ0s6RlBUSCIsImFsZyI6IlJTMjU2In0.eyJhY2Nlc3MiOnsic2NvcGUiOlsidmlldyIsImRvd25sb2FkIiwibW9kaWZ5Il0sIm9pZGNfY2xhaW1zIjp7fX0sInRva2VuX3R5cGUiOiJQRVJTT05BTF9BQ0NFU1NfVE9LRU4iLCJpc3MiOiJodHRwczovL3JlcG8tcHJvZC5wcm9kLnNhZ2ViYXNlLm9yZy9hdXRoL3YxIiwiYXVkIjoiMCIsIm5iZiI6MTc1ODMyNTgwMiwiaWF0IjoxNzU4MzI1ODAyLCJqdGkiOiIyNjE1MSIsInN1YiI6IjM0NTM5NTUifQ.VcGcVRv0P50Cb6mm7B7hGxzcWdxG4TvMhq8lRZDNEgktWNxdhMA0zacJ1jeOEilfEI-9RRpA7jE2WIM3zjIgYTL-l-UobBMKnvL_gu6itQuf2DyKR6K9OBQJER4cy7N0o6_4qwq5YPflpF6uWuvgAfskuPmQH8Yz9Z80UjeLxFw2yKUJcvtanghAWwFerOEJxxb-PDxHHC6gM_VK-HGGprPQy9_Z33dCZYcmrDbCgV5rWUV5AdTyCHhDBrx4YMw43J2U7os88SPpEEmbvxVpfSFTusOsP1FzYn7ifnpw2t6Ip5ZwLmwShBViShMlTe9X0tgjO5htY5UdXyJBMVK0Qw") + +source("cNF_helper_code.R") +traceback() + +source('cNF_helper_code.R') +##read in drug code +fits <- readr::read_tsv(synGet('syn69947322')$path) + + +##read in proteomic data +glong <- readr::read_csv(synGet('syn70078416')$path) +plong <- readr::read_csv(synGet('syn70078415')$path) + + +##read in transcrniptomic data +#TODO: process transcritpomic data into long format + + +``` + +## Format protein data to collect correlation values + +Do simple correlations to identify putative trends in the data. + +Get most efficacious, variable, and heatmap + +```{r} +# ensure an output folder + +outdir <- "figs" +if (!dir.exists(outdir)) dir.create(outdir, recursive = TRUE) + +shared <- intersect(fits$improve_sample_id, glong$Specimen) +message(sprintf("Found %d shared samples from %d drug experiments and %d proteomic experiments", +length(shared), length(unique(fits$improve_sample_id)), length(unique(glong$Specimen)))) + +glob_dat <- glong |> +ungroup() |> +subset(Specimen %in% shared) |> +dplyr::select(Specimen, Gene, correctedAbundance) |> +tidyr::pivot_wider( +names_from = "Gene", +values_from = "correctedAbundance", +values_fill = 0, +values_fn = mean +) |> +tibble::column_to_rownames("Specimen") + +phos_dat <- plong |> +ungroup() |> +subset(Specimen %in% shared) |> +dplyr::select(Specimen, site, correctedAbundance) |> +tidyr::pivot_wider( +names_from = "site", +values_from = "correctedAbundance", +values_fill = 0, +values_fn = mean +) |> +tibble::column_to_rownames("Specimen") + +## drug data to matrix here + +drug_dat <- fits |> +subset(dose_response_metric == "uM_viability") |> +dplyr::select(improve_sample_id, improve_drug_id, dose_response_value) |> +tidyr::pivot_wider( +names_from = "improve_drug_id", +values_from = "dose_response_value", +values_fn = mean +) |> +tibble::column_to_rownames("improve_sample_id") + +## summarize drugs + +drug_counts <- fits |> +subset(dose_response_metric == "uM_viability") |> +group_by(improve_drug_id) |> +distinct() |> +summarize( +meanResponse = mean(dose_response_value, na.rm = TRUE), +nMeasured = n_distinct(improve_sample_id), +variability = sd(dose_response_value, na.rm = TRUE), +.groups = "drop" +) + +# -------- Plot 1: most efficacious -------- + +p1 <- drug_counts |> +arrange(desc(meanResponse)) |> +subset(meanResponse < 0.5) |> +ggplot(aes(y = meanResponse, x = improve_drug_id, colour = nMeasured, size = variability)) + +geom_point() + +theme_minimal() + +theme( +axis.text.x = element_text(angle = 45, hjust = 1) # tilt x labels +) + +labs(title = "Most efficacious drugs", +y = "Mean cell viability (fraction)", +x = "Drug") + +ggsave(file.path("figs/most_efficacious.pdf"), p1, width = 12, height = 8, dpi = 300) + +# -------- Plot 2: most variable -------- + +p2 <- drug_counts |> +arrange(desc(variability)) |> +subset(variability > 0.15) |> +as.data.frame() |> +ggplot(aes(y = meanResponse, x = improve_drug_id, colour = nMeasured, size = variability)) + +geom_point() + +theme_minimal() + +theme( +axis.text.x = element_text(angle = 45, hjust = 1) # tilt x labels +) + +labs(title = "Most variable drugs", +y = "Mean cell viability (fraction)", +x = "Drug") + +ggsave(file.path("figs/most_variable.pdf"), p2, width = 12, height = 8, dpi = 300) + +# -------- Plot 3: heatmap of complete-measurement drugs -------- + +fulldrugs <- drug_counts |> +subset(nMeasured == nrow(drug_dat)) + +# Save large, rotate column labels, and shrink font to avoid overlap + +# pheatmap can write directly to a file via the `filename` arg. +# 1. Sort samples alphanumerically +drug_dat_alpha <- drug_dat[order(rownames(drug_dat)), , drop = FALSE] + +# 2. Get patient prefix (e.g. "NF0017" from "NF0017_T1") +samples <- rownames(drug_dat_alpha) +prefixes <- sub("_.*$", "", samples) + +# 3. Indices where the prefix changes (group boundaries) +gap_idx <- which(prefixes[-1] != prefixes[-length(prefixes)]) + +# 4. Heatmap with thin black lines + group separators, saved to file +pheatmap::pheatmap( + as.matrix(drug_dat_alpha[, fulldrugs$improve_drug_id, drop = FALSE]), + filename = file.path("figs/drug_heatmap_grouped.pdf"), + width = 28, + height = 16, + angle_col = 45, + fontsize_col = 6, + cluster_rows = FALSE, # keep alphabetical order + cluster_cols = TRUE, + show_rownames = TRUE, + show_colnames = TRUE, + gaps_row = gap_idx, # group separators + border_color = "black" # thin black grid lines +) + + + +# ---- PRINT plots in the document as well ---- + +print(p1) +print(p2) + +pheatmap::pheatmap( +as.matrix(drug_dat_alpha[, fulldrugs$improve_drug_id, drop = FALSE]), +angle_col = 45, +fontsize_col = 6, +cluster_rows = FALSE, +cluster_cols = TRUE, +show_rownames = TRUE, +show_colnames = TRUE +) + +``` + + +# Basic correlation tests + +Can we simply find and rank proteins/psites/transcripts by correlation and do enrichment? + +We can define a simple test to correlate features and drugs and assess significance and correct: + +```{r correlation tests, warning=FALSE, error=FALSE, message = FALSE} + +#this function computes correlations between all columns for each drug/feature matrix, rows are the sample identifiers +#also coputes significance +computeCors <- function(drug_dat,feat_dat,shared){ + + cres <- cor(drug_dat[shared,],feat_dat[shared,],use='pairwise.complete.obs',method='spearman') |> + as.data.frame() |> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(feat_dat)),names_to='gene',values_to='cor') |> + arrange(desc(cor)) + + ##now lets try to get significance + csig <- do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(feat_dat),function(y){ + pval <- 1.0 + try(pval <- cor.test(drug_dat[shared,x], + feat_dat[shared,y], + use = 'pairwise.complete.obs', + method = 'spearman')$p.value,silent = TRUE) + + return(c(corp = pval,drug = x,gene = y)) + })) |> + as.data.frame() |> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + })) |> + as.data.frame() |> + mutate(drug = unlist(drug)) |> + mutate(gene = unlist(gene)) + + fullcors <- cres|>left_join(data.frame(csig)) |> + mutate(direction=ifelse(cor<0,'neg','pos')) + + return(fullcors) +} + +``` + +Now that we have a function we can compute correlations of each data type. + +```{r compute feature cors, warning=FALSE, error=FALSE, message = FALSE} + +gcor <- computeCors(drug_dat[,fulldrugs$improve_drug_id],glob_dat,shared) |> + mutate(data='proteins') +pcor <- computeCors(drug_dat[,fulldrugs$improve_drug_id],phos_dat,shared) |> + mutate(data = 'phosphosites') + +allcor <- rbind(gcor,pcor) + +corsummary<-allcor |> subset(fdr<0.25) |> + group_by(drug,data,direction) |> + summarize(features=n(),meanCor=mean(cor)) + + +p_features <- corsummary |> + subset(features > 1) |> + ggplot(aes(x = drug,y = features,fill = direction)) + + facet_grid(data~.) + + geom_bar(position='dodge',stat='identity') + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + +ggsave(filename = file.path("figs/cor_features_by_drug.pdf"), + plot = p_features, width = 12, height = 6, units = "in") + +corsummary |> + arrange(desc(features)) |> + subset(features > 100) |> + dplyr::select(drug,data,direction,features,meanCor) + + + +``` + +Now we have the correlation values. what do we do with them? +## Correlation based enrichment +Do not run this everytime - it is extremely slow, so its setup to run once and save the data. The next steps load this data. +```{r functional enrichment} +# === Direction-aware leapR enrichment: run "top" (up) and "bottom" (down) separately === +# Requires: glob_dat (samples x proteins), phos_dat (samples x phosphosites), fits (drug responses) +# Outputs: +# prot_enrich[[drug]]$top / $bottom +# phos_enrich[[drug]]$top / $bottom +# Optional CSVs in folder "leapR_top_paths/dir_split/" + +library(dplyr) +library(tidyr) +library(stringr) +library(SummarizedExperiment) +library(leapR) + +# ---- choose drugs to run (use your two, or set to a larger list) ---- +# target_drugs <- c("THZ1", "Onalespib") +target_drugs <- unique(fits$improve_drug_id) + +# ---- genesets ---- +data(msigdb); geneset_db <- msigdb # or ncipid +data(kinasesubstrates) + +# ---- helpers ---- +extract_gene_from_site <- function(site_id) { + if (is.na(site_id) || site_id == "") return(NA_character_) + g <- str_split(as.character(site_id), "[:_\\-\\.]")[[1]][1] + toupper(stringr::str_extract(g, "^[A-Za-z0-9]+")) +} + +# Correlate response vector vs each feature column (Spearman) +col_spearman <- function(vec, mat) { + shared <- intersect(names(vec), rownames(mat)) + if (length(shared) < 3) return(setNames(rep(NA_real_, ncol(mat)), colnames(mat))) + v <- vec[shared] + m <- as.matrix(mat[shared, , drop = FALSE]) + apply(m, 2, function(col) { + if (all(is.na(col))) return(NA_real_) + if (sd(col, na.rm = TRUE) == 0 || sd(v, na.rm = TRUE) == 0) return(NA_real_) + suppressWarnings(cor(v, col, method = "spearman", use = "pairwise.complete.obs")) + }) +} + +# Build one-column SE; column name must match primary_columns +build_se_from_corvec <- function(cor_named_vec, features_all, col_label, map_to_gene = NULL, assay_label = "proteomics") { + v <- rep(NA_real_, length(features_all)); names(v) <- features_all + common <- intersect(names(cor_named_vec), features_all) + v[common] <- cor_named_vec[common] + mat <- matrix(v, nrow = length(v), ncol = 1, dimnames = list(features_all, col_label)) + rd <- DataFrame(feature_id = features_all) + rd$hgnc_id <- if (is.null(map_to_gene)) features_all else map_to_gene[features_all] + se <- SummarizedExperiment(assays = list(values = mat), rowData = rd, colData = DataFrame(sample = col_label)) + assayNames(se) <- assay_label + se +} + +safe_leapr <- function(...) tryCatch(leapR::leapR(...), error = function(e) { message("[leapR] ", conditionMessage(e)); NULL }) + +# ---- feature and mapping vectors ---- +prot_features <- colnames(glob_dat) +phos_features <- colnames(phos_dat) +phos_to_gene <- setNames(vapply(phos_features, extract_gene_from_site, FUN.VALUE = character(1)), + phos_features) + +# ---- results containers ---- +prot_enrich <- list() +phos_enrich <- list() + +# optional: write CSVs? +write_csvs <- TRUE +outdir <- "leapR_top_paths/dir_split" +if (write_csvs && !dir.exists(outdir)) dir.create(outdir, recursive = TRUE) + +# ---- per-drug workflow ---- +for (drug in target_drugs) { + message("=== ", drug, " ===") + # build response vector (mean per sample if repeats) + dv <- fits %>% + filter(improve_drug_id == !!drug, dose_response_metric == "uM_viability") %>% + group_by(improve_sample_id) %>% summarize(resp = mean(dose_response_value, na.rm = TRUE), .groups = "drop") + if (nrow(dv) == 0) { message(" no response rows; skipping"); next } + dv_vec <- setNames(dv$resp, dv$improve_sample_id) + + # correlations + prot_cor <- col_spearman(dv_vec, glob_dat) + phos_cor <- col_spearman(dv_vec, phos_dat) + + # split by sign + prot_pos <- prot_cor[!is.na(prot_cor) & prot_cor > 0] + prot_neg <- prot_cor[!is.na(prot_cor) & prot_cor < 0] + phos_pos <- phos_cor[!is.na(phos_cor) & phos_cor > 0] + phos_neg <- phos_cor[!is.na(phos_cor) & phos_cor < 0] + + # ---- global: TOP (positives as-is), BOTTOM (negatives flipped so they rank to the top) ---- + prot_enrich[[drug]] <- list(top = NULL, bottom = NULL) + + # TOP + if (length(prot_pos) >= 5) { + se_prot_top <- build_se_from_corvec(prot_pos, prot_features, col_label = paste0(drug, "_TOP"), assay_label = "proteomics") + prot_top <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_prot_top, assay_name = "proteomics", + primary_columns = paste0(drug, "_TOP")) + prot_enrich[[drug]]$top <- prot_top + if (write_csvs && !is.null(prot_top)) { + write.csv(as.data.frame(prot_top), file = file.path(outdir, paste0(drug, "_global_TOP.csv"))) + } + } else message(" PROT top: too few positive features (", length(prot_pos), ")") + + # BOTTOM (flip sign so more negative = larger positive rank) + if (length(prot_neg) >= 5) { + se_prot_bot <- build_se_from_corvec(-prot_neg, prot_features, col_label = paste0(drug, "_BOTTOM"), assay_label = "proteomics") + prot_bot <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_prot_bot, assay_name = "proteomics", + primary_columns = paste0(drug, "_BOTTOM")) + prot_enrich[[drug]]$bottom <- prot_bot + if (write_csvs && !is.null(prot_bot)) { + write.csv(as.data.frame(prot_bot), file = file.path(outdir, paste0(drug, "_global_BOTTOM.csv"))) + } + } else message(" PROT bottom: too few negative features (", length(prot_neg), ")") + + # ---- PHOSPHO: TOP/BOTTOM for pathways (gene mapping via hgnc_id) ---- + phos_enrich[[drug]] <- list(top = NULL, bottom = NULL) + + # TOP + if (length(phos_pos) >= 5) { + se_phos_top <- build_se_from_corvec(phos_pos, phos_features, col_label = paste0(drug, "_TOP"), + map_to_gene = phos_to_gene, assay_label = "phosphoproteomics") + phos_top <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_phos_top, assay_name = "phosphoproteomics", + primary_columns = paste0(drug, "_TOP"), id_column = "hgnc_id") + phos_enrich[[drug]]$top <- phos_top + if (write_csvs && !is.null(phos_top)) { + write.csv(as.data.frame(phos_top), file = file.path(outdir, paste0(drug, "_phospho_TOP.csv"))) + } + } else message(" PHOS top: too few positive features (", length(phos_pos), ")") + + # BOTTOM (flip) + if (length(phos_neg) >= 5) { + se_phos_bot <- build_se_from_corvec(-phos_neg, phos_features, col_label = paste0(drug, "_BOTTOM"), + map_to_gene = phos_to_gene, assay_label = "phosphoproteomics") + phos_bot <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_phos_bot, assay_name = "phosphoproteomics", + primary_columns = paste0(drug, "_BOTTOM"), id_column = "hgnc_id") + phos_enrich[[drug]]$bottom <- phos_bot + if (write_csvs && !is.null(phos_bot)) { + write.csv(as.data.frame(phos_bot), file = file.path(outdir, paste0(drug, "_phospho_BOTTOM.csv"))) + } + } else message(" PHOS bottom: too few negative features (", length(phos_neg), ")") + +} + +# Save all direction-split results for later reuse +save(prot_enrich, phos_enrich, file = "leapR_enrichment_direction_split.Rdata") + +message("Finished direction-aware enrichment. Results in lists prot_enrich / phos_enrich, and CSVs (if enabled).") + + +``` + + + +For each drug, how many terms do we see active? how many kinases? +```{r functional enrichment} +# ==== Load saved enrichment & build summaries (no list-casts, no count()) ==== +library(dplyr) +library(tidyr) +library(purrr) +library(tibble) +library(ggplot2) +library(forcats) +library(stringr) +library(scales) + +# Always load the precomputed enrichment lists here +load("leapR_enrichment_direction_split.Rdata") +if (!exists("prot_enrich")) stop("prot_enrich not found in leapR_enrichment_direction_split.Rdata") +if (!exists("phos_enrich")) stop("phos_enrich not found in leapR_enrichment_direction_split.Rdata") + +alpha <- 0.05 +topN <- 15 # <<< top 15 +dirs <- c("resistant","sensitive") + +# ---------- helpers ---------- +pick_pcol <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return(list(kind="adj", col="BH_pvalue")) + if ("SignedBH_pvalue" %in% cols) return(list(kind="adj", col="SignedBH_pvalue")) + if ("adj.P.Val" %in% cols) return(list(kind="adj", col="adj.P.Val")) + if ("padj" %in% cols) return(list(kind="adj", col="padj")) + if ("pvalue" %in% cols) return(list(kind="raw", col="pvalue")) + if ("P.Value" %in% cols) return(list(kind="raw", col="P.Value")) + NULL +} + +extract_term_col <- function(df) { + cands <- c("term","Term","pathway","Pathway","set","Set","geneset","gene_set","Category") + hit <- cands[cands %in% names(df)] + if (length(hit)) hit[[1]] else NULL +} + +tidy_one_result <- function(x) { + if (is.null(x)) return(tibble(pathway = character(), adj_p = numeric())) + df <- as.data.frame(x) + if (!nrow(df)) return(tibble(pathway = character(), adj_p = numeric())) + + term_col <- extract_term_col(df) + if (is.null(term_col)) { + df <- tibble::rownames_to_column(df, "pathway") + } else { + df <- dplyr::mutate(df, pathway = .data[[term_col]]) + } + df$pathway <- as.character(df$pathway) + + pk <- pick_pcol(df) + if (is.null(pk)) return(tibble(pathway = character(), adj_p = numeric())) + adj <- if (pk$kind == "adj") df[[pk$col]] else p.adjust(df[[pk$col]], method = "BH") + + tibble(pathway = df$pathway, adj_p = as.numeric(adj)) |> + filter(is.finite(adj_p), !is.na(adj_p)) +} + +flatten_by_direction <- function(lst, omic_label) { + if (!length(lst)) return(tibble()) + purrr::imap_dfr(lst, function(two, drug) { + bind_rows( + tidy_one_result(two$top) |> mutate(direction = "resistant"), + tidy_one_result(two$bottom) |> mutate(direction = "sensitive") + ) |> + mutate(drug = as.character(drug), omic = omic_label) + }) +} + +# ---------- long-format enrichment and significance filter ---------- +prot_long <- flatten_by_direction(prot_enrich, "global") +phos_long <- flatten_by_direction(phos_enrich, "phospho") +enrich_long <- bind_rows(prot_long, phos_long) |> as_tibble() +stopifnot(all(c("pathway","adj_p","direction","drug","omic") %in% names(enrich_long))) + +enrich_sig <- enrich_long |> + filter(is.finite(adj_p), !is.na(adj_p), adj_p < alpha) + +if (nrow(enrich_sig) == 0) { + message("No significant pathways at FDR < ", alpha, ".") + pathway_summary <- tibble() + drug_counts <- tibble() +} else { + pathway_summary <- enrich_sig |> + group_by(omic, direction, pathway) |> + summarise(n_drugs = n_distinct(drug), .groups = "drop") |> + arrange(desc(n_drugs)) + + all_drugs <- sort(unique(enrich_long$drug)) + drug_counts <- enrich_sig |> + group_by(drug, direction) |> + summarise(n_pathways = n_distinct(pathway), .groups = "drop") |> + complete(drug = all_drugs, direction = dirs, fill = list(n_pathways = 0L)) |> + arrange(drug, direction) + + # ---------- summary figures ---------- + dir.create("figs", showWarnings = FALSE) + + reorder_within <- function(x, by, within, sep = "___") { + x2 <- paste(x, within, sep = sep); stats::reorder(x2, by) + } + scale_y_reordered_wrap <- function(width = 32, sep = "___") { + ggplot2::scale_y_discrete( + labels = function(x) stringr::str_wrap(gsub(paste0(sep, ".*$"), "", x), width = width) + ) + } + + pathway_summary_top <- pathway_summary |> + group_by(omic, direction) |> + slice_max(order_by = n_drugs, n = topN, with_ties = FALSE) |> + ungroup() |> + mutate(pathway_in_omic = reorder_within(pathway, n_drugs, omic)) + + p_pathways <- ggplot(pathway_summary_top, + aes(y = pathway_in_omic, x = n_drugs, fill = direction)) + + geom_col(position = position_dodge(width = 0.85), width = 0.85) + + facet_wrap(~ omic, scales = "free_y") + + scale_y_reordered_wrap(width = 36) + + scale_x_continuous(expand = expansion(mult = c(0, 0.05))) + + labs(title = paste0("Top ", topN, " pathways enriched across drugs"), + y = "Pathway", x = "# Drugs") + + theme_minimal(base_size = 11) + + theme( + legend.position = "top", + strip.text = element_text(face = "bold"), + axis.text.y = element_text(size = 7), + panel.grid.major.x = element_blank(), + plot.margin = margin(5.5, 30, 5.5, 5.5) + ) + + coord_cartesian(clip = "off") + + # --- PDF saves (summary figs) --- + ggsave("figs/pathways_across_drugs_top15.pdf", p_pathways, + width = 12, height = 10, units = "in", device = cairo_pdf) + print(p_pathways) + + drug_counts_full <- drug_counts |> + group_by(drug) |> + mutate(total = sum(n_pathways)) |> + ungroup() |> + mutate(drug = forcats::fct_reorder(drug, total)) + + p_counts <- ggplot(drug_counts_full, aes(x = drug, y = n_pathways, fill = direction)) + + geom_col(position = position_dodge(width = 0.9), width = 0.85) + + labs(title = "Number of enriched pathways per drug", + x = "Drug", y = "# Pathways") + + theme_minimal(base_size = 11) + + theme( + legend.position = "top", + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6), + panel.grid.major.x = element_blank() + ) + + ggsave("figs/enriched_pathways_per_drug_wide.pdf", p_counts, + width = 18, height = 7, units = "in", device = cairo_pdf) + print(p_counts) +} + +# ---------- Top-2 most efficacious & most variable drugs ---------- +top2_efficacious <- character(0) +top2_variable <- character(0) + +if (exists("fits")) { + eff_tbl <- fits |> + filter(dose_response_metric == "uM_viability") |> + group_by(improve_drug_id) |> + summarise( + meanResponse = mean(dose_response_value, na.rm = TRUE), + variability = sd(dose_response_value, na.rm = TRUE), + nMeasured = dplyr::n_distinct(improve_sample_id), + .groups = "drop" + ) + top2_efficacious <- eff_tbl |> + arrange(meanResponse, desc(nMeasured)) |> + slice_head(n = 2) |> + pull(improve_drug_id) + top2_variable <- eff_tbl |> + arrange(desc(variability), desc(nMeasured)) |> + slice_head(n = 2) |> + pull(improve_drug_id) +} else { + some <- unique(enrich_long$drug) + top2_efficacious <- head(some, 2) + top2_variable <- head(rev(some), 2) +} + +# ---- Force-include Onalespib in top_interest (case-insensitive) ---- +ona_matches <- unique(enrich_long$drug[grepl("^onalespib$", enrich_long$drug, ignore.case = TRUE)]) +if (length(ona_matches) == 0) ona_matches <- "Onalespib" + +top_interest <- unique(c(top2_efficacious, top2_variable, ona_matches)) +message("Top-2 most efficacious (lowest mean viability): ", paste(top2_efficacious, collapse = ", ")) +message("Top-2 most variable (highest SD): ", paste(top2_variable, collapse = ", ")) +message("Force-included: ", paste(ona_matches, collapse = ", ")) + +# ---------- per-drug pathway barplots: always top 15, star-annotate significance ---------- +pick_pcol_plot <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return("BH_pvalue") + if ("SignedBH_pvalue" %in% cols) return("SignedBH_pvalue") + if ("adj.P.Val" %in% cols) return("adj.P.Val") + if ("padj" %in% cols) return("padj") + if ("pvalue" %in% cols) return("pvalue") + if ("P.Value" %in% cols) return("P.Value") + NA_character_ +} + +sig_stars <- function(p) dplyr::case_when( + is.na(p) ~ "", + p < 0.001 ~ "***", + p < 0.01 ~ "**", + p < 0.05 ~ "*", + TRUE ~ "" +) + +prep_plot_df <- function(res_df, n = 15) { + if (is.null(res_df)) return(NULL) + df <- as.data.frame(res_df); if (!nrow(df)) return(NULL) + if (!("feature" %in% names(df))) df <- tibble::rownames_to_column(df, "feature") + col <- pick_pcol_plot(df); if (is.na(col)) return(NULL) + + # Always compute BH adj p; then take top 15 by smallest adj_p (no significance filter) + adj_p <- if (col %in% c("pvalue","P.Value")) p.adjust(df[[col]], method = "BH") else df[[col]] + + df |> + mutate( + adj_p = as.numeric(adj_p), + score = -log10(pmax(adj_p, 1e-300)), + stars = sig_stars(adj_p), + signif = !is.na(adj_p) & adj_p < 0.05, + feature = stringr::str_wrap(as.character(feature), width = 40) + ) |> + filter(is.finite(adj_p)) |> + arrange(adj_p, desc(score)) |> + slice_head(n = n) +} + +plot_bar <- function(df_plot, title_text, label_type) { + if (is.null(df_plot) || !nrow(df_plot)) return(NULL) + lbl <- ifelse(label_type == "top", "Resistant pathways", "Sensitive pathways") + fillc <- ifelse(label_type == "top", "#D7191C", "#2C7BB6") + star_pad <- 0.15 + + ggplot(df_plot, aes(x = reorder(feature, score), y = score)) + + geom_col(aes(alpha = signif), fill = fillc, width = 0.85) + + scale_alpha_manual(values = c(`FALSE` = 0.5, `TRUE` = 1), guide = "none") + + geom_text(aes(y = score + star_pad, label = stars), + size = 3, hjust = 0) + + coord_flip(clip = "off") + + scale_y_continuous(expand = expansion(mult = c(0, 0.15))) + + labs(title = paste0(title_text, ", ", lbl), + x = NULL, y = expression(-log[10]("FDR"))) + + theme_minimal(base_size = 9) + + theme( + plot.title = element_text(size = 11), + axis.text.y = element_text(size = 7), + axis.text.x = element_text(size = 8), + plot.margin = margin(5.5, 30, 5.5, 5.5) + ) +} + +safelabel <- function(x) gsub("[^A-Za-z0-9_.-]", "_", x) + +dir.create("figs", showWarnings = FALSE) +for (drug in top_interest) { + pt <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$top else NULL + pb <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$bottom else NULL + ft <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$top else NULL + fb <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$bottom else NULL + + p1 <- plot_bar(prep_plot_df(pt, n = 15), paste0(drug, " - global"), "top") + p2 <- plot_bar(prep_plot_df(pb, n = 15), paste0(drug, " - global"), "bottom") + p3 <- plot_bar(prep_plot_df(ft, n = 15), paste0(drug, " - Phospho"), "top") + p4 <- plot_bar(prep_plot_df(fb, n = 15), paste0(drug, " - Phospho"), "bottom") + + if (!is.null(p1)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_global_resistant_top15.pdf")), + p1, width = 7, height = 5, device = cairo_pdf); print(p1) + } + if (!is.null(p2)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_global_sensitive_top15.pdf")), + p2, width = 7, height = 5, device = cairo_pdf); print(p2) + } + if (!is.null(p3)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_phospho_resistant_top15.pdf")), + p3, width = 7, height = 5, device = cairo_pdf); print(p3) + } + if (!is.null(p4)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_phospho_sensitive_top15.pdf")), + p4, width = 7, height = 5, device = cairo_pdf); print(p4) + } +} + + + +``` + + + + +# Print siginficant results for all drugs if we want. +```{r} +library(dplyr) +library(ggplot2) +library(tibble) +library(rlang) + +alpha <- 0.05 # significance threshold +top_n_to_show <- 15 + +# Prefer adjusted p if available; fall back to raw and adjust per-run +pick_pcol <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return(list(kind="adj", col="BH_pvalue")) + if ("SignedBH_pvalue" %in% cols) return(list(kind="adj", col="SignedBH_pvalue")) + if ("pvalue" %in% cols) return(list(kind="raw", col="pvalue")) + return(NULL) +} + +prep_plot_df <- function(res_df, n = top_n_to_show) { + if (is.null(res_df)) return(NULL) + df <- as.data.frame(res_df) + if (!nrow(df)) return(NULL) + + pick <- pick_pcol(df) + if (is.null(pick)) return(NULL) + + # unify to adj p + if (pick$kind == "adj") { + df <- df %>% mutate(adj_p = !!sym(pick$col)) + } else { # raw p → adjust within this run + df <- df %>% mutate(adj_p = p.adjust(!!sym(pick$col), method = "BH")) + } + + df %>% + rownames_to_column("feature") %>% + arrange(adj_p) %>% + # keep only significant ones; if none, return empty (caller will message) + filter(is.finite(adj_p), !is.na(adj_p), adj_p < alpha) %>% + head(n) %>% + mutate(score = -log10(pmax(adj_p, 1e-300))) +} + +plot_bar <- function(df_plot, title_text, label_type) { + if (is.null(df_plot) || !nrow(df_plot)) { + message(" No significant pathways for ", title_text, " (FDR<", alpha, ").") + return(NULL) + } + # Correct labels & colors for uM_viability convention: + # TOP -> resistant (red), BOTTOM -> sensitive (blue) + lbl <- ifelse(label_type == "top", "Resistant pathways", "Sensitive pathways") + fillc <- ifelse(label_type == "top", "#D7191C", "#2C7BB6") + + p <- ggplot(df_plot, aes(x = reorder(feature, score), y = score)) + + geom_col(fill = fillc) + + coord_flip() + + labs(title = paste0(title_text, ", ", lbl), + x = NULL, y = expression(-log[10]("FDR"))) + + theme_minimal(base_size = 9) + + theme( + plot.title = element_text(size = 11), + axis.text.y = element_text(size = 6), + axis.text.x = element_text(size = 8) + ) + print(p); invisible(p) +} + +plot_drug_panels <- function(drug) { + message("\n=== ", drug, " ===") + + # global + pt <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$top else NULL + pb <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$bottom else NULL + plot_bar(prep_plot_df(pt), paste0(drug, ", Global Proteomics"), "top") + plot_bar(prep_plot_df(pb), paste0(drug, ", Global Proteomics"), "bottom") + + # Phospho pathways + ft <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$top else NULL + fb <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$bottom else NULL + plot_bar(prep_plot_df(ft), paste0(drug, ", Phosphoproteomics"), "top") + plot_bar(prep_plot_df(fb), paste0(drug, ", Phosphoproteomics"), "bottom") +} + + +# --- run for your drugs --- +for (d in target_drugs) plot_drug_panels(d) + +``` + + + + +# Basic drug list +```{r} +drug_counts <- fits %>% + filter(dose_response_metric == "uM_viability") %>% + group_by(improve_drug_id) %>% + summarise( + n_rows = dplyr::n(), # total rows/measurements + n_specimens = n_distinct(improve_sample_id), # unique samples tested + meanResponse = mean(dose_response_value, na.rm = TRUE), + sdResponse = sd(dose_response_value, na.rm = TRUE), + .groups = "drop" + ) %>% + arrange(desc(n_specimens), improve_drug_id) + +# Plain list of drugs + total count +drug_list <- sort(unique(drug_counts$improve_drug_id)) +n_drugs <- length(drug_list) + +message(sprintf("Total unique drugs: %d", n_drugs)) +print((drug_list)) + +``` + + + + + + + + + + + + + + + + + + + + + +## Visualization +How should we visualize? Here is some older code +```{r plot cors, eval=FALSE} + +plotCors <- function(features,druglist,dataType='proteins'){ + ##subset a list of features and drugs and plot those in a graph + require(ggplot2) + if(dataType=='proteins'){ + ptab<-glong|>dplyr::rename(feature='Gene') + }else{ + ptab<-plong|>dplyr::rename(feature='site') + } + dtab<-fits|> + subset(dose_response_metric=='uM_viability')|> + dplyr::rename(Specimen='improve_sample_id',Drug='improve_drug_id')|> + subset(Drug%in%druglist) + + + ftab<-features|>left_join(ptab)|>left_join(dtab)|> + subset(!is.na(Drug)) + + feats <- unique(features$feature) + plots <- lapply(feats,function(x){ + corval <- ftab[ftab$feature==x,'cor'] + #corval <- ftab[ftab$feature==x,'pCor'] + + ftab|>subset(feature==x)|> + ggplot(aes(x=correctedAbundance,y=dose_response_value, + col=Patient,size=1))+ + geom_point()+ + facet_grid(~Drug)+ + ggtitle(paste(x,'Drug correlation'))+ + scale_color_manual(values=pcols) + }) + cowplot::plot_grid(plotlist= plots,ncol=2) + +} + +druglist<-c('Onalespib') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.25)|> + subset(abs(cor)>0.7)|> + subset(data=='proteins')|> + arrange(desc(abs(cor))) + +plotCors(rename(features[1:10,],feature='gene'),druglist) + +ggsave('onalespibFDR0.25Cors.pdf',height=20) + +``` + +# Random forest predictor + +Here we try to use random forest to extract predictive features. First we need to assess if the model can accurately predict drug response from the data. From those predictive models, we can extract features/biomarkers. + +First we build the data frames needed - I've included cohort as a covariate but may remove it. + +```{r random forest} +## separate out cohorts for prediction +cohorts <- meta |> + select(Specimen,cohort) |> + distinct() |> + tibble::column_to_rownames('Specimen') + +##for each drug, build model of cohort + protein ~ drug response +#removed cohort for now +gdf <- as.data.frame(glob_dat)#|>mutate(cohort=cohorts[rownames(glob_dat),'cohort']) +gnas <- which(apply(gdf,2,function(x) any(is.na(x)))) + +pdf <- as.data.frame(phos_dat)#|> mutate(cohort=cohorts[rownames(phos_dat),'cohort']) +pnas <- which(apply(pdf,2,function(x) any(is.na(x)))) + +mdf <- meta[-c(2,5),]|> ##have duplication here + tibble::column_to_rownames('Specimen') + + +``` + +Now we can loop through every drug, build model, and assess accuracy. + +```{r evaluate predictivty} + +#trying ou tthis function tos ee how it goes +rfFeatures <- function(drug_dat,fdf, mdf){ + complete_drugs <- which(apply(drug_dat,2, + function(x) length(which(!is.na(x)))==length(x))) + print(paste("Evaluating random forest for ",length(complete_drugs),'drugs')) + all_preds <- do.call(rbind,lapply(names(complete_drugs),function(drug){ + + dg <- fdf#[,-gnas] + + ##create the metadata df with the drug of interest + dmdf <- mdf |> + mutate(drug=drug_dat[rownames(mdf),drug]) + + rf <- randomForest::randomForest(x=dg, + y=drug_dat[rownames(dg),drug], + importance=TRUE,ntree=500) + + im <- randomForest::importance(rf)|> + as.data.frame() |> + mutate(drug=drug) + pord <- intersect(rownames(mdf)[order(drug_dat[rownames(mdf),drug])],rownames(glob_dat)) + + #pheatmap::pheatmap(t(glob_dat[pord, + # rownames(im)]),annotation_col=dmdf, + # cellheight=10,cluster_cols = TRUE) + return(im) + ##what do we return?x + })) + return(all_preds) +} + + +``` + +Now what do we do with the importance features? + +```{r rf processing} + +##get importance for global +gimp <- rfFeatures(drug_dat=drug_dat,fdf=gdf,mdf=mdf) + +##get importance for phospho +pimp <- rfFeatures(drug_dat=drug_dat,fdf=pdf,mdf=mdf) + +``` + + + + + + + + + + + + + + + + + +# Old correlation code, dont run + +now we can visualize correlations + +```{r check out HSP90s, eval=FALSE} + +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>subset(drug=='Onalespib')|>subset(corp<0.2) +#print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) + +plotCors(rename(cor_hsps,feature='gene'),c('Onalespib')) + +ggsave('hspCorsOna.pdf',height=nrow(cor_hsps)*3) + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.2) +#print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) + +plotCors(rename(cor_hspps,feature='gene'),c('Onalespib'),'phospho') + +ggsave('hspPhosphoCorsOna.pdf',height=nrow(cor_hspps)*3) + + +``` + +### Now lets look only at IC50 values + +There are a few drugs for which we have IC50 values + +```{r check ic50 cors,warning=FALSE,error=FALSE, eval=FALSE} + +ifits<-subset(fits,dose_response_metric=='fit_ic50') + +shared<-intersect(ifits$improve_sample_id,glong$Specimen) +print(paste('Found',length(shared),'shared samples')) + +## a full join might be a challenge, maybe just take two matrices +drug_dat <- ifits|> + dplyr::select(improve_sample_id,improve_drug_id,dose_response_value)|> + tidyr::pivot_wider(names_from='improve_drug_id',values_from='dose_response_value',values_fn=mean)|> + tibble::column_to_rownames('improve_sample_id') + +gres<-cor(drug_dat[shared,],glob_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(glob_dat)),names_to='gene',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets try to get significance + +gsig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(glob_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],glob_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + }))|> + as.data.frame()|> + mutate(drug=unlist(drug))|> + mutate(gene=unlist(gene)) + +fullcors<-gres|>left_join(data.frame(gsig))|>mutate(data='global') + +pres<-cor(drug_dat[shared,],phos_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(phos_dat)),names_to='site',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets look at correlations + +psig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(phos_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],phos_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + }))|>as.data.frame() + +fullpcors<-pres|>rename(gene='site')|>left_join(data.frame(psig))|>mutate(data='phospho') + +#combine all correlations +allcor<-rbind(fullcors,fullpcors)|> + mutate(direction=ifelse(pCor<0,'neg','pos')) + + +##lets count the correlations and plot + +corsummary<-allcor|>subset(fdr<0.1)|> + group_by(drug,data,direction)|> + summarize(features=n(),meanCor=mean(pCor)) + +corsummary|> + #subset(features>1)|> + ggplot(aes(x=data,y=features,fill=drug))+ + facet_grid(~direction)+ + geom_bar(position='dodge',stat='identity') + + + +``` + +Again we have onalespib with numerous significantly correlated proteins, and one phosphosite for digoxin showing up . + +```{r plot individual sites, eval=FALSE} + +druglist<-c('Onalespib') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.05)|> + subset(data=='global') + +plotCors(rename(features,feature='gene'),druglist) + + +druglist<-c('Digoxin') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.1)|> + subset(data=='phospho') + +#plotCors(rename(features,feature='gene'),druglist,data='phospho') + + + +``` + +Now we can check the HSP proteins directly + +```{r HSP correlation, eval=FALSE} +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>subset(drug=='Onalespib')|>subset(corp<0.05) + +print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) + +plotCors(rename(cor_hsps,feature='gene'),'Onalespib') + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.1) + +print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) +``` + +The IC50 result is similar to the viability. Now we can check AUC + +```{r auc hsp check, error=FALSE, warning=FALSE, eval=FALSE} + +ifits<-subset(fits,dose_response_metric=='auc') + +shared<-intersect(ifits$improve_sample_id,glong$Specimen) +print(paste('Found',length(shared),'shared samples')) + +## a full join might be a challenge, maybe just take two matrices +drug_dat <- ifits|> + dplyr::select(improve_sample_id,improve_drug_id,dose_response_value)|> + tidyr::pivot_wider(names_from='improve_drug_id',values_from='dose_response_value',values_fn=mean)|> + tibble::column_to_rownames('improve_sample_id') + +gres<-cor(drug_dat[shared,],glob_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(glob_dat)),names_to='gene',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets try to get significance + +gsig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(glob_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],glob_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(bh_p=p.adjust(unlist(corp),method='BH')) + }))|> + as.data.frame()|> + mutate(drug=unlist(drug))|> + mutate(gene=unlist(gene)) + +fullcors<-gres|>left_join(data.frame(gsig))|>mutate(data='global') + +pres<-cor(drug_dat[shared,],phos_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(phos_dat)),names_to='site',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets look at correlations + +psig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(phos_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],phos_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(bh_p=p.adjust(unlist(corp),method='BH')) + }))|>as.data.frame() + +fullpcors<-pres|>rename(gene='site')|>left_join(data.frame(psig))|>mutate(data='phospho') + +#combine all correlations +allcor<-rbind(fullcors,fullpcors)|> + mutate(direction=ifelse(pCor<0,'neg','pos')) + + +##lets count the correlations and plot + +corsummary<-allcor|>subset(bh_p<0.1)|> + group_by(drug,data,direction)|> + summarize(features=n(),meanCor=mean(pCor)) + +corsummary|> + #subset(features>1)|> + ggplot(aes(x=data,y=features,fill=drug))+facet_grid(~direction)+geom_bar(position='dodge',stat='identity') + +print(corsummary) + +``` + +```{r HSP correlation again, eval=FALSE} +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>s +ubset(drug=='Onalespib')|>subset(corp<0.05) +print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) +cor_hsps + +plotCors(rename(cor_hsps, feature='gene'),'Onalespib') + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.1) +print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) +``` diff --git a/03_drug_biomarkers_legacy_code.Rmd b/legacy_code/03_drug_biomarkers_legacy_code.Rmd similarity index 100% rename from 03_drug_biomarkers_legacy_code.Rmd rename to legacy_code/03_drug_biomarkers_legacy_code.Rmd From 08f7fe7e6baa3876e57557969641c0aa360aacdf Mon Sep 17 00:00:00 2001 From: Jeremy Date: Fri, 5 Dec 2025 09:16:50 -0800 Subject: [PATCH 7/7] moved 03_analyze_modality_correlations.R out of legacy code --- ..._modality_correlations.R => 03_analyze_modality_correlations.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename legacy_code/03_analyze_modality_correlations.R => 03_analyze_modality_correlations.R (100%) diff --git a/legacy_code/03_analyze_modality_correlations.R b/03_analyze_modality_correlations.R similarity index 100% rename from legacy_code/03_analyze_modality_correlations.R rename to 03_analyze_modality_correlations.R