3
|
1
|
|
2 #' ======================================================================
|
|
3 #' wl-26-03-2019, Tue: taken from R package 'ecipex'. This is the only
|
|
4 #' function in the package. https://cran.r-project.org/package=ecipex
|
|
5 ecipex <- function(formulas, isoinfo = nistiso, limit = 1e-12, id = FALSE, sortby = "abundance") {
|
|
6 sortby <- match.arg(sortby, c("abundance", "mass"))
|
|
7
|
|
8 # determine which elements are present in each molecule and the corresponding copy number
|
|
9 #' elementCount <- lapply(formulas, CHNOSZ::count.elements)
|
|
10 elementCount <- lapply(formulas, count.elements)
|
|
11
|
|
12 # these lists will be used as indices later
|
|
13 elementList <- lapply(elementCount, names)
|
|
14 copyList <- lapply(elementCount, format, scientific = FALSE, trim = TRUE)
|
|
15
|
|
16 # treat them as vectors for now
|
|
17 elements <- unlist(lapply(elementCount, names))
|
|
18 copies <- as.numeric(unlist(elementCount))
|
|
19
|
|
20 uniqueElements <- unique(unlist(elements))
|
|
21
|
|
22 # define a list with an entry for each unique element, stating the unique set of copy numbers that will be required for the formulas provided
|
|
23 compositionList <- lapply(uniqueElements, function(x) {
|
|
24 sort(unique(copies[elements == x]))
|
|
25 })
|
|
26 names(compositionList) <- uniqueElements
|
|
27
|
|
28 # determine the maximum copy number for each element in all of the formulas provided. Needed to determine dimensions of isofft array.
|
|
29 maxElements <- sapply(compositionList, max)
|
|
30
|
|
31 # define a list with an entry for each unique element containing the isotopic expansions for all copy numbers required by the formulas
|
|
32 pureIsos <- lapply(seq(uniqueElements), function(x) {
|
|
33 abundance <- isoinfo$abundance[isoinfo$element == uniqueElements[x]]
|
|
34 mass <- isoinfo$mass[isoinfo$element == uniqueElements[x]]
|
|
35 nucleons <- isoinfo$nucleons[isoinfo$element == uniqueElements[x]]
|
|
36
|
|
37 # in case abundances of zero were included
|
|
38 mass <- mass[abundance > 0]
|
|
39 abundance <- abundance[abundance > 0]
|
|
40
|
|
41 # dimensions of isofft array will be isoLength^isoDimension
|
|
42 isoLength <- nextn(maxElements[x] + 1) # may increase length of isofft but makes it more "composite" so that fft can more efficiently be taken
|
|
43 isoDimension <- length(abundance) - 1
|
|
44
|
|
45 # for elements with only one isotopic variant
|
|
46 if (isoDimension == 0) {
|
|
47 abundanceOut <- lapply(seq(length(compositionList[[x]])), function(y) {
|
|
48 1
|
|
49 })
|
|
50 massOut <- lapply(seq(length(compositionList[[x]])), function(y) {
|
|
51 compositionList[[x]][y] * mass
|
|
52 })
|
|
53
|
|
54 names(abundanceOut) <- compositionList[[x]]
|
|
55 names(massOut) <- compositionList[[x]]
|
|
56
|
|
57 if (!id) {
|
|
58 return(list("mass" = massOut, "abundance" = abundanceOut))
|
|
59 }
|
|
60 if (id) {
|
|
61 speciesOut <- lapply(seq(length(compositionList[[x]])), function(y) {
|
|
62 s <- matrix(compositionList[[x]][y])
|
|
63 colnames(s) <- paste(nucleons, uniqueElements[x], sep = "")
|
|
64 return(s)
|
|
65 })
|
|
66 names(speciesOut) <- compositionList[[x]]
|
|
67 return(list("mass" = massOut, "abundance" = abundanceOut, "isoSpecies" = speciesOut))
|
|
68 }
|
|
69 }
|
|
70
|
|
71 # define array containing abundances for a single atom of the current element. Its dimensions must be able to accomodate the appropriate maxElements-fold convolution without confounding overlap
|
|
72 abundanceIn <- array(rep.int(0, isoLength^isoDimension), dim = rep(isoLength, isoDimension))
|
|
73 abundanceIn[1] <- abundance[1]
|
|
74 abundanceIn[1 + isoLength^seq(0, isoDimension - 1)] <- abundance[seq(2, isoDimension + 1)]
|
|
75
|
|
76 # The output of the fft is stored in a hypercube with isoLength^isoDimension entries. We require only the entries in a series of the simplexes it contains. We therefore define an array of the same dimensions as abundanceIn (isoLength^isoDimension) which is enumerated in such a way that it is easy to extract the simplexes and save memory
|
|
77 simplexIndex <- seq(0, isoLength - 1, by = 1)
|
|
78 i <- 1
|
|
79 while (i < isoDimension) {
|
|
80 simplexIndex <- outer(simplexIndex, seq(0, isoLength - 1, by = 1), FUN = "+")
|
|
81 i <- i + 1
|
|
82 }
|
|
83
|
|
84 # perform the actual multidimensional fft convolution to obtain the desired abundances. Also remove abundance-elements lower than than the specified limit to save memory.
|
|
85 fftIn <- fft(abundanceIn)
|
|
86 convolOut <- lapply(compositionList[[x]], function(n) {
|
|
87 arrayOut <- Re(fft(fftIn^n, inverse = TRUE)) / (isoLength)^isoDimension
|
|
88 species <- which((arrayOut >= limit) & (simplexIndex <= n), arr.ind = TRUE) # these indices enable us to determine which isotopic species we are dealing with
|
|
89 return(list("abundance" = arrayOut[species], "species" = species))
|
|
90 })
|
|
91
|
|
92 # obtain the corresponding masses and remove abundance-elements lower than than the specified limit to save memory.
|
|
93 massDiffs <- outer(mass[-1] - mass[1], seq(0, isoLength - 1))
|
|
94 output <- massDiffs[1, ]
|
|
95 if (length(mass) > 2) {
|
|
96 for (i in 2:nrow(massDiffs)) {
|
|
97 output <- outer(output, massDiffs[i, ], FUN = "+")
|
|
98 }
|
|
99 }
|
|
100 massOut <- lapply(seq_along(compositionList[[x]]), function(n) {
|
|
101 mass[1] * compositionList[[x]][n] + output[convolOut[[n]]$species]
|
|
102 })
|
|
103
|
|
104 # sort entries in order of descending abundance
|
|
105 descendingOrder <- lapply(seq_along(compositionList[[x]]), function(y) {
|
|
106 order(convolOut[[y]]$abundance, decreasing = TRUE)
|
|
107 })
|
|
108 abundanceOut <- lapply(seq_along(compositionList[[x]]), function(y) {
|
|
109 convolOut[[y]]$abundance[descendingOrder[[y]]]
|
|
110 })
|
|
111 massOut <- lapply(seq_along(compositionList[[x]]), function(y) {
|
|
112 massOut[[y]][descendingOrder[[y]]]
|
|
113 })
|
|
114
|
|
115 names(abundanceOut) <- format(compositionList[[x]], scientific = FALSE, trim = TRUE)
|
|
116 names(massOut) <- format(compositionList[[x]], scientific = FALSE, trim = TRUE)
|
|
117
|
|
118 if (!id) {
|
|
119 return(list("mass" = massOut, "abundance" = abundanceOut))
|
|
120 }
|
|
121 if (id) {
|
|
122 speciesOut <- lapply(seq_along(compositionList[[x]]), function(y) { # determine the isotopic species of the values listed in massOut and abundanceOut
|
|
123 specMat <- cbind(compositionList[[x]][y] - apply(as.matrix(convolOut[[y]]$species[descendingOrder[[y]], ]) - 1, 1, sum), convolOut[[y]]$species[descendingOrder[[y]], ] - 1)
|
|
124 colnames(specMat) <- paste(nucleons, uniqueElements[x], sep = "")
|
|
125 return(specMat)
|
|
126 })
|
|
127 names(speciesOut) <- format(compositionList[[x]], scientific = FALSE, trim = TRUE)
|
|
128
|
|
129 return(list("mass" = massOut, "abundance" = abundanceOut, "isoSpecies" = speciesOut))
|
|
130 }
|
|
131 })
|
|
132 names(pureIsos) <- uniqueElements
|
|
133
|
|
134
|
|
135 # list that will contain the full isotopic expansion for all formulas
|
|
136 fullIsos <- lapply(seq(formulas), function(x) {
|
|
137
|
|
138 # extract highest abundance for each atomic species in current formula
|
|
139 maxProbs <- sapply(seq(elementList[[x]]), function(i) {
|
|
140 pureIsos[[elementList[[x]][i]]]$abundance[[copyList[[x]][i]]][1]
|
|
141 })
|
|
142
|
|
143 # for each atomic species we immediately discard abundances which are less than the limit after multiplication by highest abundances in all other species. Similarly for masses, and isotopic species
|
|
144 ma <- lapply(seq(elementList[[x]]), function(i) {
|
|
145 z <- pureIsos[[elementList[[x]][i]]]$mass[[copyList[[x]][i]]][pureIsos[[elementList[[x]][i]]]$abundance[[copyList[[x]][i]]] * prod(maxProbs[-i]) > limit]
|
|
146 if (length(z) == 0) {
|
|
147 return(pureIsos[[elementList[[x]][i]]]$mass[[copyList[[x]][i]]][which.max(pureIsos[[elementList[[x]][i]]]$abundance[[copyList[[x]][i]]])])
|
|
148 }
|
|
149 return(z)
|
|
150 })
|
|
151 ab <- lapply(seq(elementList[[x]]), function(i) {
|
|
152 z <- pureIsos[[elementList[[x]][i]]]$abundance[[copyList[[x]][i]]][pureIsos[[elementList[[x]][i]]]$abundance[[copyList[[x]][i]]] * prod(maxProbs[-i]) > limit]
|
|
153 if (length(z) == 0) {
|
|
154 return(max(pureIsos[[elementList[[x]][i]]]$abundance[[copyList[[x]][i]]]))
|
|
155 }
|
|
156 return(z)
|
|
157 })
|
|
158 if (id) {
|
|
159 is <- lapply(seq(elementList[[x]]), function(i) {
|
|
160 if (ncol(pureIsos[[elementList[[x]][i]]]$isoSpecies[[copyList[[x]][i]]]) == 1) {
|
|
161 return(pureIsos[[elementList[[x]][i]]]$isoSpecies[[copyList[[x]][i]]])
|
|
162 } # if there is only 1 isotope
|
|
163 z <- as.matrix(pureIsos[[elementList[[x]][i]]]$isoSpecies[[copyList[[x]][i]]][pureIsos[[elementList[[x]][i]]]$abundance[[copyList[[x]][i]]] * prod(maxProbs[-i]) > limit, ])
|
|
164 if (ncol(z) <= 1) {
|
|
165 return(t(as.matrix(pureIsos[[elementList[[x]][i]]]$isoSpecies[[copyList[[x]][i]]][which.max(pureIsos[[elementList[[x]][i]]]$abundance[[copyList[[x]][i]]]), ])))
|
|
166 }
|
|
167 return(z)
|
|
168 })
|
|
169 }
|
|
170
|
|
171 # these vectors will contain full isotopic expansions, but initially they contain the first elemental expansion
|
|
172 moleculeMass <- ma[[1]]
|
|
173 moleculeAbundance <- ab[[1]]
|
|
174 if (id) {
|
|
175 moleculeSpecies <- is[[1]]
|
|
176 }
|
|
177
|
|
178 # fold them an appropriate number of times
|
|
179 # each time we fold, a number of abundances will fall below limit and these are excluded. Furthermore, we can exclude abundances that are less than the limit after they have been multiplied by the product of the highest abundances in the remaining atomic species
|
|
180 remainingMaxAbundance <- c(cumprod(maxProbs[c(length(maxProbs):1)])[c(length(maxProbs):1)], 1) # compare with e.g. prod(maxProbs[c(3:5)])
|
|
181 i <- 2
|
|
182 while (i <= length(elementList[[x]])) {
|
|
183 moleculeMass <- outer(moleculeMass, pureIsos[[elementList[[x]][i]]]$mass[[copyList[[x]][i]]], FUN = "+")
|
|
184 moleculeAbundance <- outer(moleculeAbundance, pureIsos[[elementList[[x]][i]]]$abundance[[copyList[[x]][i]]])
|
|
185 keepers <- which(moleculeAbundance > limit / remainingMaxAbundance[i + 1], arr.ind = TRUE)
|
|
186
|
|
187 moleculeMass <- moleculeMass[keepers]
|
|
188 moleculeAbundance <- moleculeAbundance[keepers]
|
|
189
|
|
190 if (id) {
|
|
191 moleculeSpecies <- cbind(matrix(moleculeSpecies[keepers[, 1], ], ncol = ncol(moleculeSpecies), dimnames = list(NULL, colnames(moleculeSpecies))), matrix(pureIsos[[elementList[[x]][i]]]$isoSpecies[[copyList[[x]][i]]][keepers[, 2], ], ncol = ncol(pureIsos[[elementList[[x]][i]]]$isoSpecies[[copyList[[x]][i]]]), dimnames = list(NULL, colnames(pureIsos[[elementList[[x]][i]]]$isoSpecies[[copyList[[x]][i]]]))))
|
|
192 }
|
|
193 i <- i + 1
|
|
194 }
|
|
195
|
|
196 if (sortby == "abundance") {
|
|
197 dex <- order(moleculeAbundance, decreasing = TRUE)
|
|
198 }
|
|
199 if (sortby == "mass") {
|
|
200 dex <- order(moleculeMass, decreasing = FALSE)
|
|
201 }
|
|
202
|
|
203 if (id) {
|
|
204 return(cbind(data.frame("mass" = moleculeMass[dex], "abundance" = moleculeAbundance[dex]), matrix(moleculeSpecies[dex, ], ncol = ncol(moleculeSpecies), dimnames = list(NULL, colnames(moleculeSpecies)))))
|
|
205 }
|
|
206 if (!id) {
|
|
207 return(data.frame("mass" = moleculeMass[dex], "abundance" = moleculeAbundance[dex]))
|
|
208 }
|
|
209 })
|
|
210 names(fullIsos) <- formulas
|
|
211
|
|
212 return(fullIsos)
|
|
213 }
|
|
214
|
|
215 #' ======================================================================
|
|
216 #' wl-26-03-2019, Tue: From function 'makeup.R' of R package 'CHNOSZ'
|
|
217 #' https://cran.r-project.org/package=CHNOSZ
|
|
218 count.elements <- function(formula) {
|
|
219 # count the elements in a chemical formula 20120111 jmd
|
|
220 # this function expects a simple formula,
|
|
221 # no charge or parenthetical or suffixed subformulas
|
|
222 # regular expressions inspired by an answer on
|
|
223 # http://stackoverflow.com/questions/4116786/parsing-a-chemical-formula-from-a-string-in-c
|
|
224 #elementRegex <- "([A-Z][a-z]*)([0-9]*)"
|
|
225 elementSymbol <- "([A-Z][a-z]*)"
|
|
226 # here, element coefficients can be signed (+ or -) and have a decimal point
|
|
227 elementCoeff <- "((\\+|-|\\.|[0-9])*)"
|
|
228 elementRegex <- paste(elementSymbol, elementCoeff, sep="")
|
|
229 # stop if it doesn't look like a chemical formula
|
|
230 validateRegex <- paste("^(", elementRegex, ")+$", sep="")
|
|
231 if(length(grep(validateRegex, formula)) == 0)
|
|
232 stop(paste("'",formula,"' is not a simple chemical formula", sep="", collapse="\n"))
|
|
233 # where to put the output
|
|
234 element <- character()
|
|
235 count <- numeric()
|
|
236 # from now use "f" for formula to make writing the code easier
|
|
237 f <- formula
|
|
238 # we want to find the starting positions of all the elemental symbols
|
|
239 # make substrings, starting at every position in the formula
|
|
240 fsub <- sapply(1:nchar(f), function(i) substr(f, i, nchar(f)))
|
|
241 # get the numbers (positions) that start with an elemental symbol
|
|
242 # i.e. an uppercase letter
|
|
243 ielem <- grep("^[A-Z]", fsub)
|
|
244 # for each elemental symbol, j is the position before the start of the next
|
|
245 # symbol (or the position of the last character of the formula)
|
|
246 jelem <- c(tail(ielem - 1, -1), nchar(f))
|
|
247 # assemble the stuff: each symbol-coefficient combination
|
|
248 ec <- sapply(seq_along(ielem), function(i) substr(f, ielem[i], jelem[i]))
|
|
249 # get the individual element symbols and coefficients
|
|
250 myelement <- gsub(elementCoeff, "", ec)
|
|
251 mycount <- as.numeric(gsub(elementSymbol, "", ec))
|
|
252 # any missing coefficients are unity
|
|
253 mycount[is.na(mycount)] <- 1
|
|
254 # append to the output
|
|
255 element <- c(element, myelement)
|
|
256 count <- c(count, mycount)
|
|
257 # in case there are repeated elements, sum all of their counts
|
|
258 # (tapply hint from https://stat.ethz.ch/pipermail/r-help/2011-January/265341.html)
|
|
259 # use simplify=FALSE followed by unlist to get a vector, not array 20171005
|
|
260 out <- unlist(tapply(count, element, sum, simplify=FALSE))
|
|
261 # tapply returns alphabetical sorted list. keep the order appearing in the formula
|
|
262 out <- out[match(unique(element), names(out))]
|
|
263 return(out)
|
|
264 }
|