-
Notifications
You must be signed in to change notification settings - Fork 0
Feature/DIMS_Excel_plots #92
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
195ed54
aae998e
cab9ce7
9588296
aa56eb2
9e638ae
08dbf31
97ec94d
1fc1d3a
bb007f0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| linters: linters_with_defaults( | ||
| line_length_linter(127), | ||
| object_usage_linter = NULL, | ||
| return_linter = NULL | ||
| return_linter = NULL, | ||
| pipe_consistency_linter = lintr::pipe_consistency_linter("auto") | ||
| ) | ||
| encoding: "UTF-8" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -115,10 +115,6 @@ if (z_score == 1) { | |
| # save outlist for GenerateQC step | ||
| save(outlist, file = "outlist.RData") | ||
|
|
||
| # get the IDs of the patients and sort | ||
| patient_ids <- unique(gsub("\\.[0-9]*", "", patient_columns)) | ||
| patient_ids <- patient_ids[order(nchar(patient_ids), patient_ids)] | ||
|
|
||
| # get Helix IDs for extra Excel file | ||
| metabolite_files <- list.files( | ||
| path = paste(path_metabolite_groups, "Diagnostics", sep = "/"), | ||
|
|
@@ -151,51 +147,56 @@ if (z_score == 1) { | |
| relocate(c(HMDB_name, HMDB_name_all, HMDB_ID_all, sec_HMDB_ID), .after = last_col()) %>% | ||
| rename(Name = H_Name) | ||
|
|
||
| for (row_index in seq_len(nrow(outlist))) { | ||
| # get HMDB ID | ||
| hmdb_name <- rownames(outlist[row_index, ]) | ||
| # Get intensity columns for controls and patients | ||
| intensities_plots_df <- outlist %>% select(HMDB_key, matches("^C|^P[0-9]"), -ends_with("_Zscore")) | ||
|
|
||
| # get intensities of controls and patient for a metabolite, get intensity columns, | ||
| # pivot to long format, arrange Samples nummerically, change Sample names, get group size and | ||
| # set Intensities to numeric. | ||
| intensities <- outlist %>% | ||
| for (row_index in seq_len(nrow(intensities_plots_df))) { | ||
| # get HMDB ID | ||
| hmdb_id <- intensities_plots_df %>% | ||
| slice(row_index) %>% | ||
| select(all_of(intensity_col_ids)) %>% | ||
| as.data.frame() %>% | ||
| pivot_longer(everything(), names_to = "Samples", values_to = "Intensities") %>% | ||
| arrange(nchar(Samples)) %>% | ||
| mutate( | ||
| Samples = gsub("\\..*", "", Samples), | ||
| Samples = gsub("(C).*", "\\1", Samples), | ||
| Intensities = as.numeric(Intensities), | ||
| type = ifelse(Samples == "C", "Control", "Patients") | ||
| ) %>% | ||
| group_by(Samples) %>% | ||
| mutate(group_size = n()) %>% | ||
| ungroup() | ||
| pull(HMDB_key) | ||
|
|
||
| # Transform dataframe to long format | ||
| intensities_plots_df_long <- transform_ints_df_plots(intensities_plots_df, row_index) | ||
|
|
||
| # set plot width to 40 times the number of samples | ||
| plot_width <- length(unique(intensities$Samples)) * 40 | ||
| plot_width <- length(unique(intensities_plots_df_long$Samples)) * 40 | ||
| col_width <- plot_width * 2 | ||
|
|
||
| if (hmdb_id %in% metab_list_helix) { | ||
| # Make separate plot for Helix Excel containing all samples | ||
| plot.new() | ||
| tmp_png_helix <- paste0("plots/plot_helix_", hmdb_id, ".png") | ||
| png(filename = tmp_png_helix, width = plot_width, height = 300) | ||
|
|
||
| boxplot_excel_helix <- create_boxplot_excel(intensities_plots_df_long, hmdb_id) | ||
|
|
||
| print(boxplot_excel_helix) | ||
| dev.off() | ||
|
Comment on lines
+168
to
+175
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Zou dit ook niet in de 'create_boxplot_excel' functie passen? Het lijkt er nu op de het maken van de boxplot feitelijk nog hier gebeurd, create boxplot genereerd nu enkel de ggplot config? |
||
|
|
||
| openxlsx::insertImage( | ||
| wb_helix_intensities, | ||
| sheetname, | ||
| tmp_png_helix, | ||
| startRow = row_helix, | ||
| startCol = 1, | ||
| height = 560, | ||
| width = col_width, | ||
| units = "px" | ||
| ) | ||
| row_helix <- row_helix + 1 | ||
| } | ||
|
|
||
| # Remove postive controls and SST mix samples, (e.g. P1001, P1002, P1003, P1005) | ||
| intensities_plots_df_long <- intensities_plots_df_long %>% filter(!grepl("^P[0-9]{4}$", Samples)) | ||
|
|
||
| plot.new() | ||
| tmp_png <- paste0("plots/plot_", hmdb_name, ".png") | ||
| tmp_png <- paste0("plots/plot_", hmdb_id, ".png") | ||
| png(filename = tmp_png, width = plot_width, height = 300) | ||
|
|
||
| # plot intensities for the controls and patients, use boxplot if group size is above 2, otherwise use a dash/line | ||
| p <- ggplot(intensities, aes(Samples, Intensities)) + | ||
| geom_boxplot(data = subset(intensities, group_size > 2), aes(fill = type)) + | ||
| geom_point(data = subset(intensities, group_size <= 2), shape = "-", size = 10, aes(colour = type, fill = type)) + | ||
| scale_fill_manual(values = c("Control" = "green", "Patients" = "#b20000")) + | ||
| scale_color_manual(values = c("Control" = "black", "Patients" = "#b20000")) + | ||
| theme( | ||
| legend.position = "none", axis.text.x = element_text(angle = 90, hjust = 1), axis.title = element_blank(), | ||
| plot.title = element_text(hjust = 0.5, size = 18, face = "bold"), axis.text = element_text(size = 12, face = "bold"), | ||
| panel.background = element_rect(fill = "white", colour = "black") | ||
| ) + | ||
| ggtitle(hmdb_name) | ||
|
|
||
| print(p) | ||
| boxplot_excel <- create_boxplot_excel(intensities_plots_df_long, hmdb_id) | ||
|
|
||
| print(boxplot_excel) | ||
| dev.off() | ||
|
Comment on lines
+197
to
200
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hier wordt de code van hierboven vrijwel hetzelfde nog een keer gebruikt? |
||
|
|
||
| # place the plot in the Excel file | ||
|
|
@@ -209,21 +210,6 @@ if (z_score == 1) { | |
| width = col_width, | ||
| units = "px" | ||
| ) | ||
|
|
||
| if (hmdb_name %in% metab_list_helix) { | ||
| print(row_helix) | ||
| openxlsx::insertImage( | ||
| wb_helix_intensities, | ||
| sheetname, | ||
| tmp_png, | ||
| startRow = row_helix, | ||
| startCol = 1, | ||
| height = 560, | ||
| width = col_width, | ||
| units = "px" | ||
| ) | ||
| row_helix <- row_helix + 1 | ||
| } | ||
| } | ||
| wb_intensities <- set_row_height_col_width_wb( | ||
| wb_intensities, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,22 +34,29 @@ calculate_zscores <- function(outlist, zscore_type, control_cols, stat_filter, i | |
| if (zscore_type == "_Zscore") { | ||
| # Calculate mean and sd with all controls | ||
| outlist$avg_ctrls <- apply(control_cols, 1, function(x) mean(as.numeric(x), na.rm = TRUE)) | ||
| outlist$sd_ctrls <- apply(control_cols, 1, function(x) sd(as.numeric(x), na.rm = TRUE)) | ||
| outlist$sd_ctrls <- apply(control_cols, 1, function(x) sd(as.numeric(x), na.rm = TRUE)) | ||
| } else { | ||
| if (length(control_cols) > 3) { | ||
| for (metabolite_index in seq_len(nrow(outlist))) { | ||
| if (zscore_type == "_RobustZscore") { | ||
| # Calculate mean and sd, remove outlier controls by using robust scaler | ||
| outlist$avg_ctrls[metabolite_index] <- mean(robust_scaler(outlist[metabolite_index, control_cols], | ||
| control_cols, stat_filter)) | ||
| outlist$sd_ctrls[metabolite_index] <- sd(robust_scaler(outlist[metabolite_index, control_cols], | ||
| control_cols, stat_filter)) | ||
| outlist$avg_ctrls[metabolite_index] <- mean(robust_scaler( | ||
| outlist[metabolite_index, control_cols], | ||
| control_cols, stat_filter | ||
| )) | ||
| outlist$sd_ctrls[metabolite_index] <- sd(robust_scaler( | ||
| outlist[metabolite_index, control_cols], | ||
| control_cols, stat_filter | ||
| )) | ||
| } else { | ||
| # Calculate mean, sd and number of remaining controls, remove outlier controls by using grubbs test | ||
| intensities_without_outliers <- remove_outliers_grubbs(as.numeric(outlist[metabolite_index, control_cols]), stat_filter) | ||
| intensities_without_outliers <- remove_outliers_grubbs( | ||
| as.numeric(outlist[metabolite_index, control_cols]), | ||
| stat_filter | ||
| ) | ||
| outlist$avg_ctrls[metabolite_index] <- mean(intensities_without_outliers) | ||
| outlist$sd_ctrls[metabolite_index] <- sd(intensities_without_outliers) | ||
| outlist$nr_ctrls[metabolite_index] <- length(intensities_without_outliers) | ||
| outlist$sd_ctrls[metabolite_index] <- sd(intensities_without_outliers) | ||
| outlist$nr_ctrls[metabolite_index] <- length(intensities_without_outliers) | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -61,7 +68,7 @@ calculate_zscores <- function(outlist, zscore_type, control_cols, stat_filter, i | |
| }) | ||
| outlist <- cbind(outlist, outlist_zscores) | ||
| colnames(outlist)[startcol:ncol(outlist)] <- paste0(colnames(outlist)[intensity_col_ids], zscore_type) | ||
|
|
||
| return(outlist) | ||
| } | ||
|
|
||
|
|
@@ -75,7 +82,7 @@ robust_scaler <- function(control_intensities, control_col_ids, perc = 5) { | |
| #' @return trimmed_control_intensities: Intensities trimmed for outliers | ||
| nr_to_remove <- ceiling(length(control_col_ids) * perc / 100) | ||
| sorted_control_intensities <- sort(as.numeric(control_intensities)) | ||
| trimmed_control_intensities <- sorted_control_intensities[(nr_to_remove + 1) : | ||
| trimmed_control_intensities <- sorted_control_intensities[(nr_to_remove + 1): | ||
| (length(sorted_control_intensities) - nr_to_remove)] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Formatting een beetje vreemd? |
||
| return(trimmed_control_intensities) | ||
| } | ||
|
|
@@ -129,3 +136,64 @@ set_row_height_col_width_wb <- function(wb, sheetname, num_rows_df, num_cols_df, | |
| } | ||
| return(wb) | ||
| } | ||
|
|
||
| #' Transform a dataframe with intensities to long format | ||
| #' | ||
| #' Get intensities of controls and patient for the selected metabolite, | ||
| #' pivot to long format, arrange Samples nummerically, change Sample names, get group size and | ||
| #' set Intensities to numeric. | ||
| #' | ||
| #' @param intensities_plots_df: a dataframe with HMDB_key column and intensities for all samples | ||
| #' | ||
| #' @returns intensities_plots_df_long: a dataframe with on each row a sample and their intensity | ||
| transform_ints_df_plots <- function(intensities_plots_df, row_index) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ik zou het woord 'plots' uit de functie naam halen, functie maakt namelijk geen plots. Gezien je docstring header zou ik de functie noemen: Interne variable |
||
| intensities_plots_df_long <- intensities_plots_df %>% | ||
| slice(row_index) %>% | ||
| select(-HMDB_key) %>% | ||
| as.data.frame() %>% | ||
| pivot_longer(everything(), names_to = "Samples", values_to = "Intensities") %>% | ||
| arrange(nchar(Samples)) %>% | ||
| mutate( | ||
| Samples = gsub("\\..*", "", Samples), | ||
| Samples = gsub("(C).*", "\\1", Samples), | ||
| Intensities = as.numeric(Intensities), | ||
| type = ifelse(Samples == "C", "Control", "Patients") | ||
| ) %>% | ||
| group_by(Samples) %>% | ||
| mutate(group_size = n()) %>% | ||
| ungroup() | ||
|
|
||
| return(intensities_plots_df_long) | ||
| } | ||
|
|
||
|
|
||
| #' Create a plot of intensities of samples for Excel | ||
| #' Use boxplot if group size is above 2, otherwise use a dash/line | ||
| #' | ||
| #' @param intensities_plots_df_long: a dataframe with on each row a sample and their intensity | ||
| #' @param hmdb_id: HMDB ID of the selected metabolite | ||
| #' | ||
| #' @returns boxplot_excel: ggplot2 object containing the plot of intensities | ||
| create_boxplot_excel <- function(intensities_plots_df_long, hmdb_id) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ook hier zo ik van maken: |
||
| boxplot_excel <- ggplot(intensities_plots_df_long, aes(Samples, Intensities)) + | ||
| geom_boxplot(data = subset(intensities_plots_df_long, group_size > 2), aes(fill = type)) + | ||
| geom_point( | ||
| data = subset(intensities_plots_df_long, group_size <= 2), | ||
| shape = "-", | ||
| size = 10, | ||
| aes(colour = type, fill = type) | ||
| ) + | ||
| scale_fill_manual(values = c("Control" = "green", "Patients" = "#b20000")) + | ||
| scale_color_manual(values = c("Control" = "black", "Patients" = "#b20000")) + | ||
| theme( | ||
| legend.position = "none", | ||
| axis.text = element_text(size = 12, face = "bold"), | ||
| axis.text.x = element_text(angle = 90, hjust = 1), | ||
| axis.title = element_blank(), | ||
| plot.title = element_text(hjust = 0.5, size = 18, face = "bold"), | ||
| panel.background = element_rect(fill = "white", colour = "black") | ||
| ) + | ||
| ggtitle(hmdb_id) | ||
|
|
||
| return(boxplot_excel) | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ook hier
intensities_plots_df->intensities_dfhet is immers een df van intensities niet een df van intensities plots.