UMCUGenetics · ALuesink · Dec 15, 2025 · Dec 15, 2025 · Dec 15, 2025 · Dec 15, 2025
diff --git a/.github/workflows/dims_test.yml b/.github/workflows/dims_test.yml
@@ -23,7 +23,7 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Install dependencies
-        run: Rscript -e "install.packages(c('testthat', 'withr'))"
+        run: Rscript -e "install.packages(c('testthat', 'withr', 'vdiffr'))"
 
       - name: Run tests
         run: Rscript tests/testthat.R
diff --git a/.lintr b/.lintr
@@ -1,6 +1,7 @@
 linters: linters_with_defaults(
     line_length_linter(127),
     object_usage_linter = NULL,
-    return_linter = NULL
+    return_linter = NULL,
+    pipe_consistency_linter = lintr::pipe_consistency_linter("auto")
   )
 encoding: "UTF-8"
diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R
@@ -115,10 +115,6 @@ if (z_score == 1) {
   # save outlist for GenerateQC step
   save(outlist, file = "outlist.RData")
 
-  # get the IDs of the patients and sort
-  patient_ids <- unique(gsub("\\.[0-9]*", "", patient_columns))
-  patient_ids <- patient_ids[order(nchar(patient_ids), patient_ids)]
-
   # get Helix IDs for extra Excel file
   metabolite_files <- list.files(
     path = paste(path_metabolite_groups, "Diagnostics", sep = "/"),
@@ -151,51 +147,56 @@ if (z_score == 1) {
     relocate(c(HMDB_name, HMDB_name_all, HMDB_ID_all, sec_HMDB_ID), .after = last_col()) %>%
     rename(Name = H_Name)
 
-  for (row_index in seq_len(nrow(outlist))) {
-    # get HMDB ID
-    hmdb_name <- rownames(outlist[row_index, ])
+  # Get intensity columns for controls and patients
+  intensities_plots_df <- outlist %>% select(HMDB_key, matches("^C|^P[0-9]"), -ends_with("_Zscore"))
 
-    # get intensities of controls and patient for a metabolite, get intensity columns,
-    # pivot to long format, arrange Samples nummerically, change Sample names, get group size and
-    # set Intensities to numeric.
-    intensities <- outlist %>%
+  for (row_index in seq_len(nrow(intensities_plots_df))) {
+    # get HMDB ID
+    hmdb_id <- intensities_plots_df %>%
       slice(row_index) %>%
-      select(all_of(intensity_col_ids)) %>%
-      as.data.frame() %>%
-      pivot_longer(everything(), names_to = "Samples", values_to = "Intensities") %>%
-      arrange(nchar(Samples)) %>%
-      mutate(
-        Samples = gsub("\\..*", "", Samples),
-        Samples = gsub("(C).*", "\\1", Samples),
-        Intensities = as.numeric(Intensities),
-        type = ifelse(Samples == "C", "Control", "Patients")
-      ) %>%
-      group_by(Samples) %>%
-      mutate(group_size = n()) %>%
-      ungroup()
+      pull(HMDB_key)
+
+    # Transform dataframe to long format
+    intensities_plots_df_long <- transform_ints_df_plots(intensities_plots_df, row_index)
 
     # set plot width to 40 times the number of samples
-    plot_width <- length(unique(intensities$Samples)) * 40
+    plot_width <- length(unique(intensities_plots_df_long$Samples)) * 40
     col_width <- plot_width * 2
 
+    if (hmdb_id %in% metab_list_helix) {
+      # Make separate plot for Helix Excel containing all samples
+      plot.new()
+      tmp_png_helix <- paste0("plots/plot_helix_", hmdb_id, ".png")
+      png(filename = tmp_png_helix, width = plot_width, height = 300)
+
+      boxplot_excel_helix <- create_boxplot_excel(intensities_plots_df_long, hmdb_id)
+
+      print(boxplot_excel_helix)
+      dev.off()
+
+      openxlsx::insertImage(
+        wb_helix_intensities,
+        sheetname,
+        tmp_png_helix,
+        startRow = row_helix,
+        startCol = 1,
+        height = 560,
+        width = col_width,
+        units = "px"
+      )
+      row_helix <- row_helix + 1
+    }
+
+    # Remove postive controls and SST mix samples, (e.g. P1001, P1002, P1003, P1005)
+    intensities_plots_df_long <- intensities_plots_df_long %>% filter(!grepl("^P[0-9]{4}$", Samples))
+
     plot.new()
-    tmp_png <- paste0("plots/plot_", hmdb_name, ".png")
+    tmp_png <- paste0("plots/plot_", hmdb_id, ".png")
     png(filename = tmp_png, width = plot_width, height = 300)
 
-    # plot intensities for the controls and patients, use boxplot if group size is above 2, otherwise use a dash/line
-    p <- ggplot(intensities, aes(Samples, Intensities)) +
-      geom_boxplot(data = subset(intensities, group_size > 2), aes(fill = type)) +
-      geom_point(data = subset(intensities, group_size <= 2), shape = "-", size = 10, aes(colour = type, fill = type)) +
-      scale_fill_manual(values = c("Control" = "green", "Patients" = "#b20000")) +
-      scale_color_manual(values = c("Control" = "black", "Patients" = "#b20000")) +
-      theme(
-        legend.position = "none", axis.text.x = element_text(angle = 90, hjust = 1), axis.title = element_blank(),
-        plot.title = element_text(hjust = 0.5, size = 18, face = "bold"), axis.text = element_text(size = 12, face = "bold"),
-        panel.background = element_rect(fill = "white", colour = "black")
-      ) +
-      ggtitle(hmdb_name)
-
-    print(p)
+    boxplot_excel <- create_boxplot_excel(intensities_plots_df_long, hmdb_id)
+
+    print(boxplot_excel)
     dev.off()
 
     # place the plot in the Excel file
@@ -209,21 +210,6 @@ if (z_score == 1) {
       width = col_width,
       units = "px"
     )
-
-    if (hmdb_name %in% metab_list_helix) {
-      print(row_helix)
-      openxlsx::insertImage(
-        wb_helix_intensities,
-        sheetname,
-        tmp_png,
-        startRow = row_helix,
-        startCol = 1,
-        height = 560,
-        width = col_width,
-        units = "px"
-      )
-      row_helix <- row_helix + 1
-    }
   }
   wb_intensities <- set_row_height_col_width_wb(
     wb_intensities,

diff --git a/DIMS/export/generate_excel_functions.R b/DIMS/export/generate_excel_functions.R
@@ -34,22 +34,29 @@ calculate_zscores <- function(outlist, zscore_type, control_cols, stat_filter, i
   if (zscore_type == "_Zscore") {
     # Calculate mean and sd with all controls
     outlist$avg_ctrls <- apply(control_cols, 1, function(x) mean(as.numeric(x), na.rm = TRUE))
-    outlist$sd_ctrls  <- apply(control_cols, 1, function(x) sd(as.numeric(x), na.rm = TRUE))
+    outlist$sd_ctrls <- apply(control_cols, 1, function(x) sd(as.numeric(x), na.rm = TRUE))
   } else {
     if (length(control_cols) > 3) {
       for (metabolite_index in seq_len(nrow(outlist))) {
         if (zscore_type == "_RobustZscore") {
           # Calculate mean and sd, remove outlier controls by using robust scaler
-          outlist$avg_ctrls[metabolite_index] <- mean(robust_scaler(outlist[metabolite_index, control_cols],
-                                                                    control_cols, stat_filter))
-          outlist$sd_ctrls[metabolite_index]  <-   sd(robust_scaler(outlist[metabolite_index, control_cols],
-                                                                    control_cols, stat_filter))
+          outlist$avg_ctrls[metabolite_index] <- mean(robust_scaler(
+            outlist[metabolite_index, control_cols],
+            control_cols, stat_filter
+          ))
+          outlist$sd_ctrls[metabolite_index] <- sd(robust_scaler(
+            outlist[metabolite_index, control_cols],
+            control_cols, stat_filter
+          ))
         } else {
           # Calculate mean, sd and number of remaining controls, remove outlier controls by using grubbs test
-          intensities_without_outliers <- remove_outliers_grubbs(as.numeric(outlist[metabolite_index, control_cols]), stat_filter)
+          intensities_without_outliers <- remove_outliers_grubbs(
+            as.numeric(outlist[metabolite_index, control_cols]),
+            stat_filter
+          )
           outlist$avg_ctrls[metabolite_index] <- mean(intensities_without_outliers)
-          outlist$sd_ctrls[metabolite_index]  <- sd(intensities_without_outliers)
-          outlist$nr_ctrls[metabolite_index]  <- length(intensities_without_outliers)
+          outlist$sd_ctrls[metabolite_index] <- sd(intensities_without_outliers)
+          outlist$nr_ctrls[metabolite_index] <- length(intensities_without_outliers)
         }
       }
     }
@@ -61,7 +68,7 @@ calculate_zscores <- function(outlist, zscore_type, control_cols, stat_filter, i
   })
   outlist <- cbind(outlist, outlist_zscores)
   colnames(outlist)[startcol:ncol(outlist)] <- paste0(colnames(outlist)[intensity_col_ids], zscore_type)
-  
+
   return(outlist)
 }
 
@@ -75,7 +82,7 @@ robust_scaler <- function(control_intensities, control_col_ids, perc = 5) {
   #' @return trimmed_control_intensities: Intensities trimmed for outliers
   nr_to_remove <- ceiling(length(control_col_ids) * perc / 100)
   sorted_control_intensities <- sort(as.numeric(control_intensities))
-  trimmed_control_intensities <- sorted_control_intensities[(nr_to_remove + 1) :
+  trimmed_control_intensities <- sorted_control_intensities[(nr_to_remove + 1):
                                                               (length(sorted_control_intensities) - nr_to_remove)]
   return(trimmed_control_intensities)
 }
@@ -129,3 +136,64 @@ set_row_height_col_width_wb <- function(wb, sheetname, num_rows_df, num_cols_df,
   }
   return(wb)
 }
+
+#' Transform a dataframe with intensities to long format
+#'
+#' Get intensities of controls and patient for the selected metabolite,
+#' pivot to long format, arrange Samples nummerically, change Sample names, get group size and
+#' set Intensities to numeric.
+#'
+#' @param intensities_plots_df: a dataframe with HMDB_key column and intensities for all samples
+#'
+#' @returns intensities_plots_df_long: a dataframe with on each row a sample and their intensity
+transform_ints_df_plots <- function(intensities_plots_df, row_index) {
+  intensities_plots_df_long <- intensities_plots_df %>%
+    slice(row_index) %>%
+    select(-HMDB_key) %>%
+    as.data.frame() %>%
+    pivot_longer(everything(), names_to = "Samples", values_to = "Intensities") %>%
+    arrange(nchar(Samples)) %>%
+    mutate(
+      Samples = gsub("\\..*", "", Samples),
+      Samples = gsub("(C).*", "\\1", Samples),
+      Intensities = as.numeric(Intensities),
+      type = ifelse(Samples == "C", "Control", "Patients")
+    ) %>%
+    group_by(Samples) %>%
+    mutate(group_size = n()) %>%
+    ungroup()
+
+  return(intensities_plots_df_long)
+}
+
+
+#' Create a plot of intensities of samples for Excel
+#' Use boxplot if group size is above 2, otherwise use a dash/line
+#'
+#' @param intensities_plots_df_long: a dataframe with on each row a sample and their intensity
+#' @param hmdb_id: HMDB ID of the selected metabolite
+#'
+#' @returns boxplot_excel: ggplot2 object containing the plot of intensities
+create_boxplot_excel <- function(intensities_plots_df_long, hmdb_id) {
+  boxplot_excel <- ggplot(intensities_plots_df_long, aes(Samples, Intensities)) +
+    geom_boxplot(data = subset(intensities_plots_df_long, group_size > 2), aes(fill = type)) +
+    geom_point(
+      data = subset(intensities_plots_df_long, group_size <= 2),
+      shape = "-",
+      size = 10,
+      aes(colour = type, fill = type)
+    ) +
+    scale_fill_manual(values = c("Control" = "green", "Patients" = "#b20000")) +
+    scale_color_manual(values = c("Control" = "black", "Patients" = "#b20000")) +
+    theme(
+      legend.position = "none",
+      axis.text = element_text(size = 12, face = "bold"),
+      axis.text.x = element_text(angle = 90, hjust = 1),
+      axis.title = element_blank(),
+      plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
+      panel.background = element_rect(fill = "white", colour = "black")
+    ) +
+    ggtitle(hmdb_id)
+
+  return(boxplot_excel)
+}
diff --git a/DIMS/tests/testthat/_snaps/generate_excel/create-boxplot-excel.svg b/DIMS/tests/testthat/_snaps/generate_excel/create-boxplot-excel.svg