Skip to content
Snippets Groups Projects
Commit ae2b688a authored by Boris Koch's avatar Boris Koch
Browse files

filter for multiple assignments

parent 1c4e8624
No related branches found
No related tags found
No related merge requests found
#' @title Filter multiple assignments
#' @name filter_multiple_assignments
#' @description Automated filtering of multiple assignments.
# @inheritParams assign_formulas
# @inheritDotParams calc_neutral_mass
# @inheritDotParams calc_ma_abs
#' @param col_formula_class Name of the column having the formula type, such as CHNOS, CHNOSP, etc. (string)
#' @param col_mz Name of the column having the measured molecular m/z value (string)
#' A standard library for marine dissolved organic matter would be *ume.formulas::lib_02*
#' @param col_ma Name of the column having the relative mass accuracy calculated from col_mass_mf and col_mass (string)
#' @param col_mass_mf Name of the column having the calculated mass of the molecular formula (string)
#' @param col_mass Name of the column having the neutral molecular mass information (string)
#' @param ... zero or more objects which can be coerced to character
#' @import data.table plotly ggplot2 cowplot dplyr
# @export
#' @return A list object containing data.tables and figures: (i) calibration summary data, (ii) figures
#' that compare the calibration status before and after recalibration, (iii) recalibrated peaklist.
#' @author Shuxian Gao
filter_multiple_assignments <- function(mfd,
col_mass = "mass",
col_mass_mf = "formula_mass",
col_ma = "formula_error_relative",
col_mz = "peak_mz",
col_formula_class = "formula_class",
...) {
# this is the format of the formula list in Shuxians data
#
# measurement_name, peak_mz, peak_mass_deconvoluted, peak_intensity, peak_intensity_rank,
# peak_relint_bp, peak_relint_tic, formula_string, formula_mass, formula_error_relative,
# C, X13C, H, N, P, X34S, Cl, O, S,
# formula_class, formula_hc, formula_oc, formula_nc, formula_sc, formula_nosc,
# formula_dbe, formula_dbe_o, formula_ai_mod, occurrence_count
# load("formulas_raw.RData")
# col_mass <- "peak_mass_deconvoluted"
# col_mass <- "m"
# col_mass_mf <- "formula_mass"
# col_mass_mf <- "m_cal"
# col_ma <- "formula_error_relative"
# col_ma <- "ppm"
# col_mz <- "peak_mz"
# col_mz <- "mz"
# col_formula_class <- "formula_class"
# ume::mf_data_demo
## Selection of every multiple assignments (MultiAs). ####
myduplicates <- duplicated(
formulas_raw[, get(col_mass)]
) | duplicated(
formulas_raw[, get(col_mass)], fromLast = TRUE
)
formulas_multi_as <- formulas_raw[myduplicates, ]
## Calculation of KMD (CH2 scale) and mass error (Merr_mDa).####
formulas_multi_as$kmd_ch2 <- round(
ceiling(
formulas_multi_as[, get(col_mass_mf)] * 14 / 14.01565
) - formulas_multi_as[, get(col_mass_mf)] * 14 / 14.01565, 3
)
formulas_multi_as$Merr_mDa <- formulas_multi_as$formula_mass *
formulas_multi_as[, get(col_ma)] / 1000
## Select MultiAs caused by the same replacement pairs.####
merr_compi <- formulas_multi_as |>
dplyr::group_by(get(col_mz)) |>
dplyr::summarize(combinations = list(combn(formula_mass, 2, simplify = FALSE))) |>
tidyr::unnest(combinations) |>
dplyr::rowwise() |>
dplyr::mutate(Combi1 = combinations[[1]],
Combi2 = combinations[[2]]) |>
dplyr::select(-combinations)
merr_compi <- data.table::as.data.table(merr_compi)
colnames(merr_compi) <- c("peak_mz", "MF1", "MF2")
merr_compi$mass_diff <- round(abs(merr_compi$MF1 - merr_compi$MF2) * 1000, 3)
## Example of Merr_mDa distribution of MultiAs only caused by ####
# C1 35Cl1 (CHOCl MF) / O1P1 (CHOP MF)
# with mass difference of 0.176 mDa.
library(ggplot2)
formulas_multi_as_chocl_chop <- formulas_multi_as[
formula_mass %in% merr_compi[mass_diff == 0.176, c(MF1, MF2)],
]
figure_multi_as_0_176_mda <- ggplot(
formulas_multi_as_chocl_chop,
aes(x = Merr_mDa)
) +
geom_bar(
stat = "bin",
binwidth = 0.01,
aes(fill = get(col_formula_class))
) +
scale_fill_manual(
values = c(
"CHO" = "#44AADD",
"CHNO" = "#A40228",
"CHNOS" = "#008877",
"CHOS" = "#E6AE13",
"CHOP" = "#69217C",
"CHNOP" = "#AA36C9",
"CHOPS" = "#855493",
"CHNOPS" = "#00D4B7"
)
) +
ggtitle("Merr distribution of MultiAs caused by C1^35Cl1 / O1P1") +
labs(
x = "Mass error (mDa)",
y = "Count",
fill = "Formula class"
) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black")
) +
theme(
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14)
)
figure_multi_as_0_176_mda
## Merr_mDa distribution of all MultiAs. ####
figure_multi_as_mda <- ggplot(
formulas_multi_as,
aes(x = Merr_mDa)
) +
geom_bar(
stat = "bin",
binwidth = 0.01,
aes(fill = formula_class)
) +
scale_fill_manual(
values = c(
"CHO" = "#44AADD",
"CHNO" = "#A40228",
"CHNOS" = "#008877",
"CHOS" = "#E6AE13",
"CHOP" = "#69217C",
"CHNOP" = "#AA36C9",
"CHOPS" = "#855493",
"CHNOPS" = "#00D4B7"
)
) +
labs(
x = "Mass error (mDa)",
y = "Count",
fill = "Formula class"
) +
ggtitle("Merr distribution of all MultiAs") +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black")
) +
theme(
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14)
)
figure_multi_as_mda
## Estimation of the median values within subgroups.####
median <- formulas_multi_as[
,
quantile(
Merr_mDa, probs = 0.5, type = 8
)[[1]], keyby = .(formula_class, kmd_ch2)
]
colnames(median) <- c("formula_class", "kmd_ch2", "median")
formulas_median <- merge(
formulas_multi_as, median, by = c("formula_class", "kmd_ch2")
)
## Data filtering: formulas with smaller medians. ####
formulas_multi_as_filtered <- formulas_median |>
dplyr::group_by(peak_mz) |>
dplyr::slice(which.min(abs(median)))
## Mass error (mDa) distribution: MultiAs filtered molecular formulas.####
formulas_multi_as_filtered <- data.table::as.data.table(formulas_multi_as_filtered)
figure_multi_a_filtered <- ggplot(
formulas_multi_as_filtered,
aes(x = Merr_mDa)
) +
xlim(-0.3, 0.3) +
geom_bar(
stat = "bin",
binwidth = 0.01,
aes(
fill = formula_class
)
) +
scale_fill_manual(
values = c(
"CHO" = "#44AADD",
"CHNO" = "#A40228",
"CHNOS" = "#008877",
"CHOS" = "#E6AE13",
"CHOP" = "#69217C",
"CHNOP" = "#AA36C9",
"CHOPS" = "#855493",
"CHNOPS" = "#00D4B7"
)
) +
labs(
x = "Mass error (mDa)",
y = "Count",
fill = "Formula class"
) +
ggtitle("Merr distribution of filtered MultiAs") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black")) +
theme(axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14))
# figure_multi_a_filtered
## Comparison: initial MultiAs and filtered molecular formulas (regared as validated). ####
fig <- cowplot::plot_grid(
figure_multi_as_mda,
figure_multi_a_filtered,
labels = c("A", "B"),
nrow = 2,
ncol = 1
)
l <- list()
l$fig <- fig
l$mfd_filt_ma <- formulas_multi_as_filtered
return(l)
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment