## Re-build the package #### devtools::document() Sys.setenv('_R_CHECK_SYSTEM_CLOCK_' = 0) devtools::check() devtools::build() # build package and tar ball devtools::build_manual(path = "../ume/inst") # build pdf documentation library(testthat) library(ume) testthat::test_dir("tests/testthat/") # Check package for suitability in CRAN system("cd D:/_AWI/_Daten/_git") system("R CMD check --as-cran ume_0.2.20.tar.gz") # Copy tar ball and pdf to AWI server #### file.copy( from = paste0(getwd(), "_", utils::packageVersion("ume"), ".tar.gz"), to = paste0( r"(\\smb.isibhv.dmawi.de\projects-noreplica\p_ume\UME\ume_)", utils::packageVersion("ume"), ".tar.gz" ), overwrite = T ) file.copy( from = paste0(getwd(), "/inst/ume_", utils::packageVersion("ume"), ".pdf"), to = paste0( r"(\\smb.isibhv.dmawi.de\projects-noreplica\p_ume\UME\ume_)", utils::packageVersion("ume"), ".pdf" ), overwrite = T ) # Overview on package functions #### all_funs <- ls("package:ume", all.names = TRUE) all_funs # List exported functions by reading the NAMESPACE file exported_funs <- grep("^export\\(", readLines(system.file("NAMESPACE", package = "ume")), value = TRUE) exported_funs <- gsub("export\\(|\\)", "", exported_funs) # Identify non-exported functions non_exported_funs <- setdiff(all_funs, exported_funs) non_exported_funs # Install ume package #### # In case you already installed a previous version of ume: rm(list = ls()) # Bereinige die Arbeitsumgebung detach("package:ume", unload = TRUE) .rs.restartR() update.packages(ask = FALSE) install.packages("remotes") remotes::install_github("pmbrophy/mzDataTable") # Install package with pre-built molecular formula libraries: devtools::install_gitlab( repo = 'bkoch/ume', host = "https://gitlab.awi.de", build_vignettes = TRUE, force = FALSE, dependencies = TRUE, upgrade = "ask" ) # Local installation utils::install.packages( "\\\\smb.isibhv.dmawi.de\\projects-noreplica\\p_ume\\UME\\ume_0.2.20.tar.gz", repos = NULL, type = "source" ) # Install molecular formula library package: devtools::install_gitlab( repo = 'bkoch/ume.formulas', host = "https://gitlab.awi.de", build_vignettes = TRUE, build_manual = TRUE, dependencies = TRUE, force = TRUE ) library(ume) packageVersion("ume") vignette("ume") news(package = "ume") library(data.table) library(plotly) mfd_filt_cal <- ume::process_orbi_data( fn = fn[1:2], auto_calibrate = TRUE, calibr_list = "marine_dom", c_iso_check = TRUE, formula_library = ume.formulas::lib_02, pol = "neg", ma_dev = 2, msg = TRUE ) ## Test existing UME tools: ## Settings #### setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) WD <- setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) WD source("../mcupw.R") # Application: Fjord demo peaklist #### library(ume) vignette("ume") library(ume.formulas) # only if not already installed library(data.table) library(plotly) # Select peaklist from Fjord demo set #### pl <- ume.formulas::ume_test_fjords check_peaklist(pl) data(package = "ume.formulas", lib_02) # Recalibrate spectra #### out <- ume::calc_recalibrate_ms( pl = pl, calibr_list = "cal_SRFA_OL_neg", pol = "neg", ma_dev = 1, msg = TRUE ) summary(out) out$cal_stats plot(out$cal_stats[, .(median_ma_before, median_ma_after)]) out$fig_hist_before out$fig_hist_after out$fig_box_before out$fig_box_after # Assign formulas #### pl <- ume.formulas::ume_test_fjords mfd <- ume_assign_formulas( pl = pl , formula_library = ume.formulas::lib_02 , pol = "neg" , ma_dev = 0.5 , msg = FALSE ) mfd <- calc_norm_int(mfd, normalization = "sum_ubiq") calc_data_summary(mfd) names(mfd) mfd_filt <- ume::ume_filter_formulas( mfd = mfd # , select_file_ids = c("Nsea_a", "Nsea_b", "Nsea_c") , remove_blank_list = c("Blank") , normalization = "sum_ubiq" , norm_int_max = 0.9 # , select_category = c("marine_dom") , exclude_category = c("surfactant") # , c_iso_check = T # , n_iso_check = T # , s_iso_check = T # , ma_dev = 0.2 # , dbe_max = 2 # , dbe_o_min = 0 , dbe_o_max = 10 , p_min = 0, p_max = 0 , s_min = 0, s_max = 1 , n_min = 0, n_max = 2 # , norm_int_min = 0.8 # , n_rank = 400 # , oc_min = 0, oc_max = 2.5 # , hc_min = 0, hc_max = 3 # , nc_min = 0, nc_max = 2 # , mz_min = 200, mz_max = 650 , msg = TRUE ) mfd_filt[, max(norm_int)] uplot.vk(mfd_filt[ai > -2], z_var = "ai") mfd_filt[n_occurrence_orig == 12, .(sum(i_magnitude), .N), file_id] calc_data_summary(mfd_filt) names(mfd_filt) mfd_filt[, .N, .(file_id, int_ref)] # Alternative using pipe operator: mf_data_demo |> eval_isotopes(remove_isotopes = T) |> calc_eval_params() |> add_known_mf() |> order_columns() # Alternative: using pipe operator. mfd_filt <- ume.formulas::ume_test_fjords |> ume_assign_formulas( formula_library = ume.formulas::lib_02, pol = "neg", ma_dev = 0.5, msg = T ) |> filter_mf_data( remove_blank_list = "Blank" , exclude_category = c("surfactant") , p_max = 0, n_max = 2, s_max = 1, dbe_o_max = 10, msg = T ) |> calc_norm_int(normalization = "sum_ubiq", msg = T) filter_int(norm_int_max = 3, msg = T) dim(mfd_filt) ## Benchmarking and memory issues #### # https://stackoverflow.com/a/45458117 # http://adv-r.had.co.nz/Profiling.html # http://adv-r.had.co.nz/memory.html#memory-profiling devtools::install_github("hadley/lineprof") library(lineprof) library(microbenchmark) library(data.table) ume.formulas::lib_02[, .N, s] # benchmarking microbenchmark( mfd <- ume::assign_formulas( pl = ume.formulas::ume_test_fjords, formula_library = lib_02, pol = "neg", ma_dev = 0.2, msg = F ), mfd_old <- ume::assign_formulas_old( pl = ume.formulas::ume_test_fjords, formula_library = lib_02, pol = "neg", ma_dev = 0.2, msg = F ), times = 3 ) identical(mfd, mfd_old) # Memory issues l_new <- lineprof( assign_formulas( pl = ume.formulas::ume_test_fjords, formula_library = lib_02, pol = "neg", ma_dev = 0.2, msg = T ) ) l_new shine(l_new) l_old <- lineprof( assign_formulas_old( pl = ume.formulas::ume_test_fjords, formula_library = lib_02, pol = "neg", ma_dev = 0.2, msg = T ) ) l_old shine(l_old) ## Memory issues: column formats (factor data type is smaller than character) ls("package:ume") pl <- ume.formulas::ume_test_fjords pl[, file_id := as.factor(file_id)] # to do: check with Fabian and Marlo a general concept on "file_id" str(pl) sapply(pl, object.size) object.size(pl) # eval_isotopes ume_new <- lineprof( ume_assign_formulas( pl = ume.formulas::ume_test_fjords , formula_library = ume.formulas::lib_02 , pol = "neg" , ma_dev = 0.2 , msg = T ) ) ## Improve documentation #### ## https://roxygen2.r-lib.org/articles/rd.html ? assign_formulas browser(devtools::check()) ume::chec# For UME package development #### ume::load_mzml() library("devtools") # For package building library(roxygen2) # For package building require(usethis) # For package building library(testthat) library(utils) #library(available) # Checks if a package name already exists or name is problematic # roxygenize roxygen2::roxygenise() # Call packages from tidyverse (contains pipe operator %>%, stringr, tibble, purrr (for map() function)): library(magrittr) # pipe operator #library(EnvStats) tools::package_dependencies("ume", check = FALSE, depLevel = "Depends") library(tidyverse) avail_pks <- available.packages() deps <- tools::package_dependencies(packages = avail_pks[1:200, "Package"], recursive = TRUE) #argg <- c(as.list(environment()), list(...)) # This yields all function arguments and values in ellipsis #names(inargs) # names of all arguments in ellipsis #names(args) # names of all arguments in ellipsis # box-plot in plotly #### summary(out) plot_ly( data = out$pl, y = ~ ppm, type = "box" , color = ~ as.factor(calibration) ) x <- fig_old %>% add_boxplot(fig_new) x y <- x %>% add_annotations( text = "Test", x = 0.2, y = 0.2, xref = "paper", yref = "paper", align = "left", bgcolor = "lightgrey", opacity = 0.8 ) y fig2 <- plot_ly(mf_data_demo[, .(nsp_type, ppm)], x = ~ nsp_type, y = ~ ppm, type = "box") %>% layout( title = "Avg. mass error for NSP combinations", xaxis = list(title = "All combinations of N, S, and P atoms"), yaxis = list(title = "Mass accuracy (ppm)") ) fig2 # MAINTENANCE #### # Update known_mf #### library(RMySQL) library(data.table) library(sam) ch <- sam::f_sam_connect(user, pw) known_mf <- data.table(DBI::dbGetQuery(ch, "SELECT * FROM MarChem.tab_ume_info_known_mf")) DBI::dbDisconnect(ch) known_mf[mf_kf_id == 302870] usethis::use_data(known_mf, overwrite = TRUE, version = 3, compress = "xz") # save(known_mf, file = "data/known_mf.rda", version = 2) known_mf <- ume::known_mf load("data/known_mf.rda") known_mf ## New functions:_________________________ #### ## Intensity significance threshold (IST; provided by Maria da Silva) #### ## Taken from RMD file (email) # Filter formulas shared in the repeated measurements formulas.pool.shared <- formulas.pool[occurrence_count == max(formulas.pool$occurrence_count)] length(unique(formulas.pool.shared$cf_id)) length(unique(formulas.pool.shared$measurement_id)) #scale rank in the samples and repeated measurements formulas.pool.shared <- formulas.pool.shared %>% .[, `:=`(peak_intensity_rank, rank(peak_intensity, ties.method = "min")), by = "measurement_id"] %>% .[, `:=`(measurement_rank, rank(peak_intensity_rank, ties.method = "min")), by = "cf_id"] formulas.smp.shared <- formulas.smp.shared %>% .[, `:=`(peak_intensity_rank, rank(peak_intensity, ties.method = "min")), by = "measurement_id"] %>% .[, `:=`(measurement_rank, rank(peak_intensity_rank, ties.method = "min")), by = "cf_id"] #Function to test significance #define groups Groups.qc <- data.table(Measurement = unique(formulas.pool.shared$measurement_name) , Group = "QC") #calculate average spectra LM_stats.intensity_average(formulas.pool.shared, Groups.qc, suffix = "qc") #plot var_tic <- plot_RI_repro_TIC(formulas.qc.avg) var_bp <- plot_RI_repro_BP(formulas.qc.avg) plot_intensityErrorDistribution(formulas.qc.avg, "peak_relint_tic") plot_intensityErrorDistribution(formulas.qc.avg, "peak_relint_bp") #rsd from average spectra formulas.qc.avg[, peak_relint_tic_rsd := peak_relint_tic_sd / peak_relint_tic] formulas.qc.avg[, peak_relint_bp_rsd := peak_relint_bp_sd / peak_relint_bp] ##scale ranks formulas.pool.shared <- formulas.pool.shared %>% group_by(measurement_id) %>% mutate(xnorm = (peak_intensity_rank - min(peak_intensity_rank)) / (max(peak_intensity_rank) - min(peak_intensity_rank))) formulas.smp.shared <- formulas.smp.shared %>% group_by(measurement_id) %>% mutate(xnorm = (peak_intensity_rank - min(peak_intensity_rank)) / (max(peak_intensity_rank) - min(peak_intensity_rank))) #take scaled ranks across measurements x <- dcast(formulas.pool.shared, formula_mass ~ measurement_id, value.var = "xnorm") #calculate mean rank rowMean <- apply(x[, -1], 1, FUN = mean) #take max rank rowMax <- apply(x[, -1], 1, FUN = max) #calculate median rank rowMedian <- apply(x[, -1], 1, FUN = median) z <- melt(x, "formula_mass") #create function for confidence interval cof_int <- function(n, q, z) { j <- n * q - z * sqrt(n * q * (1 - q)) k <- n * q + z * sqrt(n * q * (1 - q)) return(list(j, k)) } #find ith confidence interval cof_int(10, 0.68, 1) #find values for confidence interval a <- 1 lower_limit <- vector() upper_limit <- vector() for (i in seq(1, nrow(x))) { row_mass <- sort(x[i, -1]) lower_limit[a] <- nth(row_mass, 6) upper_limit[a] <- nth(row_mass, 9) a <- a + 1 } #find rank threshold limits <- as.data.frame(cbind(lower_limit, upper_limit, rowMean, x[, 1])) limits$iqr <- upper_limit - lower_limit limits$rsd <- (limits$iqr / limits$rowMean) colnames(limits) <- c("lower_limit", "upper_limit", "peak_intensity_rank", "formula_mass", "iqr", "rsd") #plot rank var_rank <- ggplot(limits, aes(y = rsd * 100, x = peak_intensity_rank, color = measurement_name)) + geom_point(alpha = 0.1, color = "#1b9e77") + geom_smooth(method = "gam", color = "red") + labs(title = "", y = "RSD Normalized Intensity [%]", x = "mean ranked intensity") + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) ggarrange( var_tic, var_bp, var_rank, labels = c("A", "B", "C"), ncol = 3, nrow = 1 ) #adjust model gam.mdl_tic <- gam((peak_relint_tic_sd / peak_relint_tic) ~ s(peak_relint_tic), data = formulas.qc.avg) gam.mdl_bp <- gam((peak_relint_bp_sd / peak_relint_bp) ~ s(peak_relint_bp), data = formulas.qc.avg) gam.mdl_rank <- gam(rsd ~ s(peak_intensity_rank), data = limits) #predict values df_mdl_int_tic <- data.frame( "measurement_id" = formulas.smp.shared$measurement_id, "cf_id" = formulas.smp.shared$cf_id, "peak_relint_tic" = formulas.smp.shared$peak_relint_tic ) df_mdl_int_tic$predict_cv <- predict.gam(gam.mdl_tic, df_mdl_int_tic) df_mdl_int_tic$sd <- df_mdl_int_tic$peak_relint_tic * df_mdl_int_tic$predict_cv df_mdl_int_bp <- data.frame( "measurement_id" = formulas.smp.shared$measurement_id, "cf_id" = formulas.smp.shared$cf_id, "peak_relint_bp" = formulas.smp.shared$peak_relint_bp ) df_mdl_int_bp$predict_cv <- predict.gam(gam.mdl_bp, df_mdl_int_bp) df_mdl_int_bp$sd <- df_mdl_int_bp$peak_relint_bp * df_mdl_int_bp$predict_cv df_mdl_int_rank <- data.frame( "measurement_id" = formulas.smp.shared$measurement_id, "cf_id" = formulas.smp.shared$cf_id, "peak_intensity_rank" = formulas.smp.shared$xnorm, "peak_rank_orig" = formulas.smp.shared$peak_intensity_rank ) df_mdl_int_rank$predict_cv <- predict.gam(gam.mdl_rank, df_mdl_int_rank) df_mdl_int_rank$sd <- df_mdl_int_rank$peak_rank_orig * df_mdl_int_rank$predict_cv #transform data df_mdl_int_tic <- as.data.table(df_mdl_int_tic) df_mdl_int_bp <- as.data.table(df_mdl_int_bp) df_mdl_int_rank <- as.data.table(df_mdl_int_rank) #find lowest intensity across samples for each cf_id low_mdl_tic <- df_mdl_int_tic[, .(min(peak_relint_tic), mean(peak_relint_tic), sd(peak_relint_tic)), by = "cf_id"] low_mdl_bp <- df_mdl_int_bp[, .(min(peak_relint_bp), mean(peak_relint_bp), sd(peak_relint_bp)), by = "cf_id"] low_mdl_rank <- df_mdl_int_rank[, .(min(peak_intensity_rank), mean(peak_rank_orig), sd(peak_rank_orig)), by = "cf_id"] #rename columns colnames(low_mdl_tic) <- c("cf_id", "peak_relint_tic", "peak_relint_tic_mean", "peak_relint_tic_sd") colnames(low_mdl_bp) <- c("cf_id", "peak_relint_bp", "peak_relint_bp_mean", "peak_relint_bp_sd") colnames(low_mdl_rank) <- c( "cf_id", "peak_intensity_rank", "peak_intensity_rank_mean", "peak_intensity_rank_sd" ) #calculate cv low_mdl_tic$calculated_cv <- low_mdl_tic$peak_relint_tic_sd / low_mdl_tic$peak_relint_tic_mean low_mdl_bp$calculated_cv <- low_mdl_bp$peak_relint_bp_sd / low_mdl_bp$peak_relint_bp_mean low_mdl_rank$calculated_cv <- low_mdl_rank$peak_intensity_rank_sd / low_mdl_rank$peak_intensity_rank_mean #find modeled cv for lowest intensity low_mdl_tic$predict_cv <- predict.gam(gam.mdl_tic, low_mdl_tic) low_mdl_bp$predict_cv <- predict.gam(gam.mdl_bp, low_mdl_bp) low_mdl_rank$predict_cv <- predict.gam(gam.mdl_rank, low_mdl_rank) #find formulas that are excluded by intensity threshold n_excluded_tic <- low_mdl_tic[predict_cv < calculated_cv] n_excluded_bp <- low_mdl_bp[predict_cv < calculated_cv] n_excluded_rank <- low_mdl_rank[predict_cv < calculated_cv] ## Statistics: Cluster, MDS, PCA aus UltraMassExplorer #### # Spreadsheet 09: "Statistics" ======================================== # _____________________________________________________________ # Cluster analysis and multi-dimensional scaling #### # _____________________________________________________________ um_plot.cluster <- function(df, grp1) { print("***************************************************") print("Plotting cluster diagram & multi-dimensional scaling...") df_pivot <- dcast( df, get(grp1) ~ mf, value.var = ri_stats, fun = mean, fill = 0 ) max_char <- max(nchar(as.character(df_pivot[, grp1]))) # Determine the length of axis label df_pivot <- data.frame(df_pivot) # convert data.table to dataframe rownames(df_pivot) = df_pivot[, 1] # create rownames from "grp1" df_pivot[, 1] = NULL # delete grp1 names if (length(unique(df$file_id)) > 2) { par(mfrow = c(1, 2)) par(mar = c(max_char + 1, 7, 4, 2)) # optimize margins according to length of labels d <- vegdist(df_pivot, method = "bray") * 100 # scale distances to 100 instead 1 h <- hclust(d, method = "average") plot( as.dendrogram(h), cex = 1, horiz = F, nodePar = NULL, ylab = "Bray-Curtis dissimilarity" ) # plot cluster # nodePar <- list(lab.cex = 0.6, pch = c(NA, 19), cex = 0.8, col = "blue") # plot(hcd, type = "rectangle", nodePar = nodePar, xlab = "Height", horiz = TRUE) # Clustering as tree #cut = rect.hclust(hc,k=3) #plot(h, ylab="Bray-Curtis Similarity", main="", sub = "", xlab="", axes = FALSE, hang = -1) #lines(x = c(0,0), y = c(0,100), type = "n") # force extension of y axis #axis(side = 2, at = seq(0,100,10), labels = seq(100,0,-10), ylab="Bray-Curtis Similarity") # improvements: # http://www.sthda.com/english/wiki/beautiful-dendrogram-visualizations-in-r-5-must-known-methods-unsupervised-machine-learning # p <- ggdendro::ggdendrogram(h, rotate = FALSE, size = 2) # ggplotly(p) # _____________________________________________________________ # Multi-dimensional scaling # _____________________________________________________________ mds <- metaMDS(d, k = 2) fig <- ordiplot(mds, type = "none", display = "sites") points(fig, "sites", pch = "+", col = "black") orditorp( mds, display = "sites", cex = 1.2, air = .7, col = "grey" ) } if (length(unique(df$file_id)) <= 2) { print("Statistical evaluation requires more than 2 samples in analysis!") } } # _____________________________________________________________ # Principal component analysis (PCA) #### # https://sites.google.com/site/mb3gustame/reference/dissimilarity # _____________________________________________________________ um_plot.pca <- function(df, grp1) { df_pivot <- dcast( df, get(grp1) ~ mf, value.var = ri_stats, fun = mean, fill = 0 ) max_char <- max(nchar(as.character(df_pivot[, grp1]))) # Determine the length of axis label df_pivot <- data.frame(df_pivot) # convert data.table to dataframe rownames(df_pivot) = df_pivot[, 1] # create rownames from "grp1" df_pivot[, 1] = NULL # delete grp1 names # Remove columns with zero variance (PCA won't work with those) # https://stackoverflow.com/questions/40315227/how-to-solve-prcomp-default-cannot-rescale-a-constant-zero-column-to-unit-var df_pivot <- df_pivot[, which(apply(df_pivot, 2, var) != 0)] pca <- prcomp(df_pivot, scale. = T, rank. = 5) #summary(pca) #head(pca,2) #eigen(pca) t_score <- data.frame(pca$x) # show all scores t_score$files <- rownames(pca$x) # add file id as columnm t_score <- data.table(t_score) # convert to data.table #wcsv(t_score, "PC_scores.csv") # plot(pca) # Scree-Plot: how much variance in which PC? par(mfrow = c(1, 2)) # plot rotated data ("scores") of samples, only component 1 vs 2 plot( t_score[, 1:2], pch = 16, lwd = 2, xlab = paste("PC1", " (", signif(pca$sdev[1] ^ 2 / sum(pca$sdev ^ 2) * 100, 3), "%)", sep = ""), ylab = paste("PC2", " (", signif(pca$sdev[2] ^ 2 / sum(pca$sdev ^ 2) * 100, 3), "%)", sep = ""), main = "", xlim = c(min(t_score$PC1), max(t_score$PC1) * 1.5) ) #points(pca$x[,c(1,2)], pch = 5, cex = 2, lwd = 2, col = "blue") # add points to an existing plot text( t_score[, 1:2], t_score$files, offset = 1, pos = 4, col = "red" ) # add text to a plot #biplot(pca) #scores(pca) #res_cov2 <- cov(df_pivot) # calculate covariance matrix #eigen(res_cov2) # eigenvalues and eigenvectors (careful this costs computing power!!!) t <- data.frame(pca[2]) colnames(t) <- colnames(pca$x) t$mf <- row.names(t) t <- data.table(t) df <- t[df, on = "mf"] # add principal components info df <- df[!is.na(PC1)] ume::uplot.vk(df, col = "redblue", col_bar = T, z_var = "PC1") return(pca) } ri_stats = "norm_int" um_plot.cluster(df = mfd_filt, grp1 = "sample_tag",)