Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
U
UME
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Boris Koch
UME
Commits
ae2b688a
Commit
ae2b688a
authored
8 months ago
by
Boris Koch
Browse files
Options
Downloads
Patches
Plain Diff
filter for multiple assignments
parent
1c4e8624
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
R/filter_multiple_assignments.R
+257
-0
257 additions, 0 deletions
R/filter_multiple_assignments.R
with
257 additions
and
0 deletions
R/filter_multiple_assignments.R
0 → 100644
+
257
−
0
View file @
ae2b688a
#' @title Filter multiple assignments
#' @name filter_multiple_assignments
#' @description Automated filtering of multiple assignments.
# @inheritParams assign_formulas
# @inheritDotParams calc_neutral_mass
# @inheritDotParams calc_ma_abs
#' @param col_formula_class Name of the column having the formula type, such as CHNOS, CHNOSP, etc. (string)
#' @param col_mz Name of the column having the measured molecular m/z value (string)
#' A standard library for marine dissolved organic matter would be *ume.formulas::lib_02*
#' @param col_ma Name of the column having the relative mass accuracy calculated from col_mass_mf and col_mass (string)
#' @param col_mass_mf Name of the column having the calculated mass of the molecular formula (string)
#' @param col_mass Name of the column having the neutral molecular mass information (string)
#' @param ... zero or more objects which can be coerced to character
#' @import data.table plotly ggplot2 cowplot dplyr
# @export
#' @return A list object containing data.tables and figures: (i) calibration summary data, (ii) figures
#' that compare the calibration status before and after recalibration, (iii) recalibrated peaklist.
#' @author Shuxian Gao
filter_multiple_assignments
<-
function
(
mfd
,
col_mass
=
"mass"
,
col_mass_mf
=
"formula_mass"
,
col_ma
=
"formula_error_relative"
,
col_mz
=
"peak_mz"
,
col_formula_class
=
"formula_class"
,
...
)
{
# this is the format of the formula list in Shuxians data
#
# measurement_name, peak_mz, peak_mass_deconvoluted, peak_intensity, peak_intensity_rank,
# peak_relint_bp, peak_relint_tic, formula_string, formula_mass, formula_error_relative,
# C, X13C, H, N, P, X34S, Cl, O, S,
# formula_class, formula_hc, formula_oc, formula_nc, formula_sc, formula_nosc,
# formula_dbe, formula_dbe_o, formula_ai_mod, occurrence_count
# load("formulas_raw.RData")
# col_mass <- "peak_mass_deconvoluted"
# col_mass <- "m"
# col_mass_mf <- "formula_mass"
# col_mass_mf <- "m_cal"
# col_ma <- "formula_error_relative"
# col_ma <- "ppm"
# col_mz <- "peak_mz"
# col_mz <- "mz"
# col_formula_class <- "formula_class"
# ume::mf_data_demo
## Selection of every multiple assignments (MultiAs). ####
myduplicates
<-
duplicated
(
formulas_raw
[,
get
(
col_mass
)]
)
|
duplicated
(
formulas_raw
[,
get
(
col_mass
)],
fromLast
=
TRUE
)
formulas_multi_as
<-
formulas_raw
[
myduplicates
,
]
## Calculation of KMD (CH2 scale) and mass error (Merr_mDa).####
formulas_multi_as
$
kmd_ch2
<-
round
(
ceiling
(
formulas_multi_as
[,
get
(
col_mass_mf
)]
*
14
/
14.01565
)
-
formulas_multi_as
[,
get
(
col_mass_mf
)]
*
14
/
14.01565
,
3
)
formulas_multi_as
$
Merr_mDa
<-
formulas_multi_as
$
formula_mass
*
formulas_multi_as
[,
get
(
col_ma
)]
/
1000
## Select MultiAs caused by the same replacement pairs.####
merr_compi
<-
formulas_multi_as
|>
dplyr
::
group_by
(
get
(
col_mz
))
|>
dplyr
::
summarize
(
combinations
=
list
(
combn
(
formula_mass
,
2
,
simplify
=
FALSE
)))
|>
tidyr
::
unnest
(
combinations
)
|>
dplyr
::
rowwise
()
|>
dplyr
::
mutate
(
Combi1
=
combinations
[[
1
]],
Combi2
=
combinations
[[
2
]])
|>
dplyr
::
select
(
-
combinations
)
merr_compi
<-
data.table
::
as.data.table
(
merr_compi
)
colnames
(
merr_compi
)
<-
c
(
"peak_mz"
,
"MF1"
,
"MF2"
)
merr_compi
$
mass_diff
<-
round
(
abs
(
merr_compi
$
MF1
-
merr_compi
$
MF2
)
*
1000
,
3
)
## Example of Merr_mDa distribution of MultiAs only caused by ####
# C1 35Cl1 (CHOCl MF) / O1P1 (CHOP MF)
# with mass difference of 0.176 mDa.
library
(
ggplot2
)
formulas_multi_as_chocl_chop
<-
formulas_multi_as
[
formula_mass
%in%
merr_compi
[
mass_diff
==
0.176
,
c
(
MF1
,
MF2
)],
]
figure_multi_as_0_176_mda
<-
ggplot
(
formulas_multi_as_chocl_chop
,
aes
(
x
=
Merr_mDa
)
)
+
geom_bar
(
stat
=
"bin"
,
binwidth
=
0.01
,
aes
(
fill
=
get
(
col_formula_class
))
)
+
scale_fill_manual
(
values
=
c
(
"CHO"
=
"#44AADD"
,
"CHNO"
=
"#A40228"
,
"CHNOS"
=
"#008877"
,
"CHOS"
=
"#E6AE13"
,
"CHOP"
=
"#69217C"
,
"CHNOP"
=
"#AA36C9"
,
"CHOPS"
=
"#855493"
,
"CHNOPS"
=
"#00D4B7"
)
)
+
ggtitle
(
"Merr distribution of MultiAs caused by C1^35Cl1 / O1P1"
)
+
labs
(
x
=
"Mass error (mDa)"
,
y
=
"Count"
,
fill
=
"Formula class"
)
+
theme
(
panel.grid.major
=
element_blank
(),
panel.grid.minor
=
element_blank
(),
panel.background
=
element_blank
(),
axis.line
=
element_line
(
colour
=
"black"
)
)
+
theme
(
axis.title.x
=
element_text
(
size
=
16
),
axis.title.y
=
element_text
(
size
=
16
),
axis.text.x
=
element_text
(
size
=
14
),
axis.text.y
=
element_text
(
size
=
14
)
)
figure_multi_as_0_176_mda
## Merr_mDa distribution of all MultiAs. ####
figure_multi_as_mda
<-
ggplot
(
formulas_multi_as
,
aes
(
x
=
Merr_mDa
)
)
+
geom_bar
(
stat
=
"bin"
,
binwidth
=
0.01
,
aes
(
fill
=
formula_class
)
)
+
scale_fill_manual
(
values
=
c
(
"CHO"
=
"#44AADD"
,
"CHNO"
=
"#A40228"
,
"CHNOS"
=
"#008877"
,
"CHOS"
=
"#E6AE13"
,
"CHOP"
=
"#69217C"
,
"CHNOP"
=
"#AA36C9"
,
"CHOPS"
=
"#855493"
,
"CHNOPS"
=
"#00D4B7"
)
)
+
labs
(
x
=
"Mass error (mDa)"
,
y
=
"Count"
,
fill
=
"Formula class"
)
+
ggtitle
(
"Merr distribution of all MultiAs"
)
+
theme
(
panel.grid.major
=
element_blank
(),
panel.grid.minor
=
element_blank
(),
panel.background
=
element_blank
(),
axis.line
=
element_line
(
colour
=
"black"
)
)
+
theme
(
axis.title.x
=
element_text
(
size
=
16
),
axis.title.y
=
element_text
(
size
=
16
),
axis.text.x
=
element_text
(
size
=
14
),
axis.text.y
=
element_text
(
size
=
14
)
)
figure_multi_as_mda
## Estimation of the median values within subgroups.####
median
<-
formulas_multi_as
[
,
quantile
(
Merr_mDa
,
probs
=
0.5
,
type
=
8
)[[
1
]],
keyby
=
.
(
formula_class
,
kmd_ch2
)
]
colnames
(
median
)
<-
c
(
"formula_class"
,
"kmd_ch2"
,
"median"
)
formulas_median
<-
merge
(
formulas_multi_as
,
median
,
by
=
c
(
"formula_class"
,
"kmd_ch2"
)
)
## Data filtering: formulas with smaller medians. ####
formulas_multi_as_filtered
<-
formulas_median
|>
dplyr
::
group_by
(
peak_mz
)
|>
dplyr
::
slice
(
which.min
(
abs
(
median
)))
## Mass error (mDa) distribution: MultiAs filtered molecular formulas.####
formulas_multi_as_filtered
<-
data.table
::
as.data.table
(
formulas_multi_as_filtered
)
figure_multi_a_filtered
<-
ggplot
(
formulas_multi_as_filtered
,
aes
(
x
=
Merr_mDa
)
)
+
xlim
(
-0.3
,
0.3
)
+
geom_bar
(
stat
=
"bin"
,
binwidth
=
0.01
,
aes
(
fill
=
formula_class
)
)
+
scale_fill_manual
(
values
=
c
(
"CHO"
=
"#44AADD"
,
"CHNO"
=
"#A40228"
,
"CHNOS"
=
"#008877"
,
"CHOS"
=
"#E6AE13"
,
"CHOP"
=
"#69217C"
,
"CHNOP"
=
"#AA36C9"
,
"CHOPS"
=
"#855493"
,
"CHNOPS"
=
"#00D4B7"
)
)
+
labs
(
x
=
"Mass error (mDa)"
,
y
=
"Count"
,
fill
=
"Formula class"
)
+
ggtitle
(
"Merr distribution of filtered MultiAs"
)
+
theme
(
panel.grid.major
=
element_blank
(),
panel.grid.minor
=
element_blank
(),
panel.background
=
element_blank
(),
axis.line
=
element_line
(
colour
=
"black"
))
+
theme
(
axis.title.x
=
element_text
(
size
=
16
),
axis.title.y
=
element_text
(
size
=
16
),
axis.text.x
=
element_text
(
size
=
14
),
axis.text.y
=
element_text
(
size
=
14
))
# figure_multi_a_filtered
## Comparison: initial MultiAs and filtered molecular formulas (regared as validated). ####
fig
<-
cowplot
::
plot_grid
(
figure_multi_as_mda
,
figure_multi_a_filtered
,
labels
=
c
(
"A"
,
"B"
),
nrow
=
2
,
ncol
=
1
)
l
<-
list
()
l
$
fig
<-
fig
l
$
mfd_filt_ma
<-
formulas_multi_as_filtered
return
(
l
)
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment