comparison parseHGVS.R @ 0:c12a4d187121 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hgvsparser/ commit f9deb29cdbd2d2a5f2f4fbd470b1078431a36ae0
author iuc
date Fri, 07 Jun 2024 15:21:07 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c12a4d187121
1 # Copyright (C) 2018 Jochen Weile, Roth Lab
2 #
3 # This file is part of hgvsParseR.
4 #
5 # hgvsParseR is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation, either version 3 of the License, or
8 # (at your option) any later version.
9 #
10 # hgvsParseR is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License
16 # along with hgvsParseR. If not, see <https://www.gnu.org/licenses/>.
17
18 #' HGVS Parser
19 #'
20 #' Parses HGVS strings
21 #' @param strings A character vector containing the HGVS strings
22 #' @param aacode allowed values: 1, 3, or NA. Determines whether 1-letter codes or 3-letter codes should be forced. NA uses input format.
23 #' @return A \code{data.frame} with the following columns:
24 #' @keywords HGVS parsing
25 #' @export
26 #' @examples
27 #' result <- parseHGVS(c("g.1318G>T","c.123_125inv","p.R123_L152del"))
28
29 parseHGVS <- function(strings,aacode=c(NA,1,3)) {
30
31 #Check that parameters are valid
32 if (!is.character(strings)) {
33 stop("Input for 'parse' function must be a character vector! Found '",class(strings),"' instead.")
34 }
35
36 aacode <- aacode[[1]]
37 if (!is.na(aacode) && !(aacode %in% c(1,3))) {
38 warning("Invalid aacode parameter, defaulting to NA!")
39 aacode <- NA
40 }
41
42 #Helper function: turns a list of lists (lol) in to a dataframe
43 to.df <- function(lol) {
44 colnames <- unique(do.call(c,lapply(lol,names)))
45 columns <- lapply(colnames,function(cn) sapply(lol,function(row) {
46 if (cn %in% names(row)) row[[cn]] else NA
47 }))
48 names(columns) <- colnames
49 empty <- which(sapply(columns,function(xs)all(is.na(xs))))
50 columns[empty] <- NULL
51 do.call(data.frame,columns)
52 }
53
54 # ###
55 # # Binds matrices of same size together to a 3D matrix, analogously
56 # # to cbind and rbind.
57 # #
58 # zbind <- function(...) {
59 # x <- list(...)
60 # y <- array(0,dim=c(nrow(x[[1]]),ncol(x[[1]]),length(x)),dimnames=dimnames(x[[1]]))
61 # for (i in 1:length(x)) y[,,i] <- x[[i]]
62 # y
63 # }
64
65
66 ###
67 # Function to *locally* excise regex groups from string vectors.
68 # I.e. only extract the first occurrence of each group within each string.
69 # x = string vector
70 # re = regular expression with groups
71 #
72 extract.groups <- function(x, re) {
73 matches <- regexpr(re,x,perl=TRUE)
74 start <- attr(matches,"capture.start")
75 end <- start + attr(matches,"capture.length") - 1
76 do.call(cbind,lapply(1:ncol(start), function(i) {
77 sapply(1:nrow(start),function(j){
78 if (start[j,i] > -1) substr(x[[j]],start[j,i],end[j,i]) else NA
79 })
80 }))
81 }
82
83 # ###
84 # # Function to *globally* excise regex groups from string vectors.
85 # # x = string vector
86 # # re = regular expression with groups
87 # #
88 # global.extract.groups <- function(x,re) {
89 # all.matches <- gregexpr(re,x,perl=TRUE)
90 # mapply(function(matches,x) {
91 # start <- attr(matches,"capture.start")
92 # end <- start + attr(matches,"capture.length") - 1
93 # apply(zbind(start,end),c(1,2),function(pos) substr(x,pos[[1]],pos[[2]]) )
94 # },matches=all.matches,x=x,SIMPLIFY=FALSE)
95 # }
96
97 ###
98 # Helper function to split multi-mutant bodies into their individual
99 # elements. Returns a vector of strings containing these elements.
100 # An attribute "multi" is attached to the vector, detailing the type
101 # of multi-mutant
102 #
103 splitMulti <- function(body) {
104 if (regexpr("\\[.+\\];\\[.+\\]",body) > 0) {
105 out <- strsplit(substr(body,2,nchar(body)-1),"\\];\\[")[[1]]
106 attr(out,"multi") <- "trans"
107 } else if (regexpr("\\[.+\\(;\\).+\\]",body) > 0) {
108 out <- strsplit(substr(body,2,nchar(body)-1),"\\(;\\)")[[1]]
109 attr(out,"multi") <- "unknown"
110 } else if (regexpr("\\[.+;.+\\]",body) > 0) {
111 out <- strsplit(substr(body,2,nchar(body)-1),";")[[1]]
112 attr(out,"multi") <- "cis"
113 } else {
114 out <- body
115 attr(out,"multi") <- "single"
116 }
117 return(out)
118 }
119
120 ###
121 # Helper function:
122 # Given an HGVS body and a list of regexes corresponding to types,
123 # find the (first) matching type.
124 findType <- function(body,types) {
125 i <- 0
126 found <- done <- FALSE
127 while (!found && !done) {
128 found <- regexpr(types[[i <- i+1]],body) > 0
129 done <- i >= length(types)
130 }
131 if (found) {
132 return(names(types)[[i]])
133 } else {
134 return("invalid")
135 }
136 }
137
138 out <- lapply(strings,function(s) {
139
140 if (regexpr("^[gcnmrp]\\.",s) < 1) {
141 return(list(list(hgvs=s,subject="invalid",type="invalid")))
142 }
143
144 body <- substr(s,3,nchar(s))
145
146 subbodies <- splitMulti(body)
147
148 subjects <- c(
149 g="genomic",c="coding",n="noncoding",
150 m="mitochondrial",r="rna",p="protein"
151 )
152 subject <- subjects[[substr(s,1,1)]]
153
154 if (subject=="genomic") {
155
156 types <- c(
157 substitution="\\d+[ACGT]>[ACGT]", singledeletion="^\\d+del$",
158 deletion="\\d+_\\d+del$",inversion="\\d+_\\d+inv",
159 duplication="\\d+_\\d+dup",insertion="\\d+_\\d+ins[ATCG]+",
160 conversion="\\d+_\\d+con\\d+_\\d+",delins="\\d+_\\d+delins[ATCG]+",
161 amplification="\\d+_\\d+\\[\\d+\\]"
162 )
163
164 phasing <- attr(subbodies,"multi")
165 isMulti <- length(subbodies) > 1
166
167 lapply(1:length(subbodies), function(i.multi) {
168 body <- subbodies[[i.multi]]
169
170 type <- findType(body,types)
171
172 if (type == "substitution") {
173 groups <- extract.groups(body,"(\\d+)([ACGT])>([ACGT])")[1,]
174 position <- as.integer(groups[[1]])
175 ancestral <- groups[[2]]
176 variant <- groups[[3]]
177 if (isMulti) {
178 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
179 type=type,start=position,ancestral=ancestral,variant=variant))
180 } else {
181 return(list(hgvs=s,subject=subject,type=type,start=position,
182 ancestral=ancestral,variant=variant))
183 }
184
185 } else if (type == "singledeletion") {
186 groups <- extract.groups(body,"(\\d+)del")[1,]
187 position <- as.integer(groups[[1]])
188 if (isMulti) {
189 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
190 type=type,start=position))
191 } else {
192 return(list(hgvs=s,subject=subject,type=type,start=position))
193 }
194
195 } else if (type == "deletion") {
196 groups <- extract.groups(body,"(\\d+)_(\\d+)del")
197 start <- as.integer(groups[[1]])
198 end <- as.integer(groups[[2]])
199 if (isMulti) {
200 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
201 type=type,start=start,end=end))
202 } else {
203 return(list(hgvs=s,subject=subject,type=type,start=start,end=end))
204 }
205
206 } else if (type == "inversion") {
207 groups <- extract.groups(body,"(\\d+)_(\\d+)inv")
208 start <- as.integer(groups[[1]])
209 end <- as.integer(groups[[2]])
210 if (isMulti) {
211 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
212 type=type,start=start,end=end))
213 } else {
214 return(list(hgvs=s,subject=subject,type=type,start=start,end=end))
215 }
216
217 } else if (type == "duplication") {
218 groups <- extract.groups(body,"(\\d+)_(\\d+)dup")
219 start <- as.integer(groups[[1]])
220 end <- as.integer(groups[[2]])
221 if (isMulti) {
222 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
223 type=type,start=start,end=end))
224 } else {
225 return(list(hgvs=s,subject=subject,type=type,start=start,end=end))
226 }
227
228 } else if (type == "insertion") {
229 groups <- extract.groups(body,"(\\d+)_(\\d+)ins([ATCG]+)")
230 start <- as.integer(groups[[1]])
231 end <- as.integer(groups[[2]])
232 if (abs(end-start)!=1) {
233 warning("Invalid insertion definition:
234 Start and end positions must be adjacent!")
235 }
236 variant <- groups[[3]]
237 if (isMulti) {
238 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
239 type=type,start=start,end=end,variant=variant))
240 } else {
241 return(list(hgvs=s,subject=subject,type=type,start=start,end=end,
242 variant=variant))
243 }
244
245 } else if (type == "conversion") {
246 groups <- extract.groups(body,"(\\d+)_(\\d+)con(\\d+)_(\\d+)")
247 start <- as.integer(groups[[1]])
248 end <- as.integer(groups[[2]])
249 tStart <- as.integer(groups[[3]])
250 tEnd <- as.integer(groups[[4]])
251 if (isMulti) {
252 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
253 type=type,start=start,end=end,
254 templateStart=tStart,templateEnd=tEnd))
255 } else {
256 return(list(hgvs=s,subject=subject,type=type,start=start,end=end,
257 templateStart=tStart,templateEnd=tEnd))
258 }
259
260 } else if (type == "delins") {
261 groups <- extract.groups(body,"(\\d+)_(\\d+)delins([ATCG]+)")
262 start <- as.integer(groups[[1]])
263 end <- as.integer(groups[[2]])
264 variant <- groups[[3]]
265 if (isMulti) {
266 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
267 type=type,start=start,end=end,variant=variant))
268 } else {
269 return(list(hgvs=s,subject=subject,type=type,start=start,end=end,
270 variant=variant))
271 }
272
273 } else if (type == "amplification") {
274 groups <- extract.groups(body,"(\\d+)_(\\d+)\\[(\\d+)\\]")
275 start <- as.integer(groups[[1]])
276 end <- as.integer(groups[[2]])
277 copies <- as.integer(groups[[3]])
278 if (isMulti) {
279 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
280 type=type,start=start,end=end,copies=copies))
281 } else {
282 return(list(hgvs=s,subject=subject,type=type,
283 start=start,end=end,copies=copies))
284 }
285
286 } else {
287 if (isMulti) {
288 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,type="invalid"))
289 } else {
290 return(list(hgvs=s,subject=subject,type="invalid"))
291 }
292 }
293
294 })
295
296 } else if (subject=="coding") {
297 #coding needs to be handled separately from genomic, as the syntax may differ
298 #e.g. it allows for offset descriptions relative to exon-intron borders
299
300 types <- c(
301 substitution="\\d+([+-]\\d+)?[ACGT]>[ACGT]",
302 singledeletion="^\\d+([+-]\\d+)?del$",
303 deletion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?del$",
304 inversion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?inv",
305 duplication="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?dup",
306 insertion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?ins[ATCG]+",
307 conversion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?con\\d+([+-]\\d+)?_\\d+([+-]\\d+)?",
308 delins="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?delins[ATCG]+",
309 amplification="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?\\[\\d+\\]"
310 )
311
312 phasing <- attr(subbodies,"multi")
313 isMulti <- length(subbodies) > 1
314
315 lapply(1:length(subbodies), function(i.multi) {
316 body <- subbodies[[i.multi]]
317
318 type <- findType(body,types)
319
320 if (type == "substitution") {
321 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?([ACGT])>([ACGT])")[1,]
322 position <- as.integer(groups[[1]])
323 intronOffset <- as.integer(groups[[2]])
324 ancestral <- groups[[3]]
325 variant <- groups[[4]]
326 if (isMulti) {
327 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
328 type=type,start=position,startIntron=intronOffset,ancestral=ancestral,
329 variant=variant))
330 } else {
331 return(list(hgvs=s,subject=subject,type=type,start=position,
332 startIntron=intronOffset,ancestral=ancestral,variant=variant))
333 }
334
335 } else if (type == "singledeletion") {
336 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?del")[1,]
337 position <- as.integer(groups[[1]])
338 intronOffset <- as.integer(groups[[2]])
339 if (isMulti) {
340 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
341 type=type,start=position,startIntron=intronOffset))
342 } else {
343 return(list(hgvs=s,subject=subject,type=type,start=position,
344 startIntron=intronOffset))
345 }
346 } else if (type == "deletion") {
347 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?del")
348 start <- as.integer(groups[[1]])
349 intronOffset <- as.integer(groups[[2]])
350 end <- as.integer(groups[[3]])
351 intronOffset2 <- as.integer(groups[[4]])
352 if (isMulti) {
353 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
354 type=type,start=start,startIntron=intronOffset,
355 end=end,endIntron=intronOffset2))
356 } else {
357 return(list(hgvs=s,subject=subject,type=type,start=start,
358 startIntron=intronOffset,end=end,endIntron=intronOffset2))
359 }
360 } else if (type == "inversion") {
361 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?inv")
362 start <- as.integer(groups[[1]])
363 intronOffset <- as.integer(groups[[2]])
364 end <- as.integer(groups[[3]])
365 intronOffset2 <- as.integer(groups[[4]])
366 if (isMulti) {
367 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
368 type=type,start=start,startIntron=intronOffset,
369 end=end,endIntron=intronOffset2))
370 } else {
371 return(list(hgvs=s,subject=subject,type=type,start=start,
372 startIntron=intronOffset,end=end,endIntron=intronOffset2))
373 }
374 } else if (type == "duplication") {
375 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?dup")
376 start <- as.integer(groups[[1]])
377 intronOffset <- as.integer(groups[[2]])
378 end <- as.integer(groups[[3]])
379 intronOffset2 <- as.integer(groups[[4]])
380 if (isMulti) {
381 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
382 type=type,start=start,startIntron=intronOffset,
383 end=end,endIntron=intronOffset2))
384 } else {
385 return(list(hgvs=s,subject=subject,type=type,start=start,
386 startIntron=intronOffset,end=end,endIntron=intronOffset2))
387 }
388 } else if (type == "insertion") {
389 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?ins([ATCG]+)")
390 start <- as.integer(groups[[1]])
391 intronOffset <- as.integer(groups[[2]])
392 end <- as.integer(groups[[3]])
393 intronOffset2 <- as.integer(groups[[4]])
394 variant <- groups[[5]]
395 if (isMulti) {
396 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
397 type=type,start=start,startIntron=intronOffset,
398 end=end,endIntron=intronOffset2,variant=variant))
399 } else {
400 return(list(hgvs=s,subject=subject,type=type,start=start,
401 startIntron=intronOffset,end=end,endIntron=intronOffset2,variant=variant))
402 }
403 } else if (type == "conversion") {
404 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?con(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?")
405 start <- as.integer(groups[[1]])
406 intronOffset <- as.integer(groups[[2]])
407 end <- as.integer(groups[[3]])
408 intronOffset2 <- as.integer(groups[[4]])
409 tStart <- as.integer(groups[[5]])
410 intronOffset3 <- as.integer(groups[[6]])
411 tEnd <- as.integer(groups[[7]])
412 intronOffset4 <- as.integer(groups[[8]])
413 if (isMulti) {
414 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
415 type=type,start=start,startIntron=intronOffset,
416 end=end,endIntron=intronOffset2,templateStart=tStart,
417 templateStartIntron=intronOffset3,templateEnd=tEnd,
418 templateEndIntron=intronOffset4))
419 } else {
420 return(list(hgvs=s,subject=subject,type=type,
421 start=start,startIntron=intronOffset,
422 end=end,endIntron=intronOffset2,templateStart=tStart,
423 templateStartIntron=intronOffset3,templateEnd=tEnd,
424 templateEndIntron=intronOffset4))
425 }
426 } else if (type == "delins") {
427 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?delins([ATCG]+)")
428 start <- as.integer(groups[[1]])
429 intronOffset <- as.integer(groups[[2]])
430 end <- as.integer(groups[[3]])
431 intronOffset2 <- as.integer(groups[[4]])
432 variant <- groups[[5]]
433 if (isMulti) {
434 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
435 type=type,start=start,startIntron=intronOffset,
436 end=end,endIntron=intronOffset2,variant=variant))
437 } else {
438 return(list(hgvs=s,subject=subject,type=type,start=start,
439 startIntron=intronOffset,end=end,endIntron=intronOffset2,variant=variant))
440 }
441 } else if (type == "amplification") {
442 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?\\[(\\d+)\\]")
443 start <- as.integer(groups[[1]])
444 intronOffset <- as.integer(groups[[2]])
445 end <- as.integer(groups[[3]])
446 intronOffset2 <- as.integer(groups[[4]])
447 copies <- as.integer(groups[[3]])
448 if (isMulti) {
449 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
450 type=type,start=start,startIntron=intronOffset,
451 end=end,endIntron=intronOffset2,copies=copies))
452 } else {
453 return(list(hgvs=s,subject=subject,type=type,start=start,
454 startIntron=intronOffset,end=end,endIntron=intronOffset2,copies=copies))
455 }
456 } else {
457 if (isMulti) {
458 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
459 type="invalid"))
460 } else {
461 return(list(hgvs=s,subject=subject,type="invalid"))
462 }
463 }
464
465 })
466
467 } else if (subject=="protein") {
468
469 one2three <- c(A="Ala",C="Cys",D="Asp",E="Glu",F="Phe",G="Gly",H="His",
470 I="Ile",K="Lys",L="Leu",M="Met",N="Asn",P="Pro",Q="Gln",R="Arg",
471 S="Ser",T="Thr",V="Val",W="Trp",Y="Tyr",`*`="Ter")
472 three2one <- c(Ala="A",Arg="R",Asn="N",Asp="D",Cys="C",Gln="Q",Glu="E",
473 Gly="G",His="H",Ile="I",Leu="L",Lys="K",Met="M",Phe="F",Pro="P",
474 Ser="S",Thr="T",Trp="W",Tyr="Y",Val="V",Ter="*")
475 codes <- paste(c(one2three,three2one[-21],"\\*"),collapse="|")
476
477 types <- list(
478 synonymous1="^=$",
479 synonymous2=paste0("^(",codes,")(\\d+)=$"),
480 substitution=paste0("^(",codes,")(\\d+)(",codes,")$"),
481 singledeletion=paste0("^(",codes,")(\\d+)del$"),
482 deletion=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)del$"),
483 duplication=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)dup$"),
484 insertion=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)ins((",codes,")+)$"),
485 delins=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)delins((",codes,")+)$"),
486 frameshift1=paste0("^(",codes,")(\\d+)fs$"),
487 frameshift2=paste0("^(",codes,")(\\d+)(",codes,")fs(Ter|\\*)(\\d+)$")
488 )
489
490 phasing <- attr(subbodies,"multi")
491 isMulti <- length(subbodies) > 1
492
493 lapply(1:length(subbodies), function(i.multi) {
494 body <- subbodies[[i.multi]]
495
496 type <- findType(body,types)
497
498 if (type == "synonymous1") {
499 if (isMulti) {
500 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,type=type))
501 } else {
502 return(list(hgvs=s,subject=subject,type="synonymous"))
503 }
504 } else if (type == "synonymous2") {
505 groups <- extract.groups(body,types$synonymous2)
506 aa1 <- groups[[1]]
507 pos <- as.integer(groups[[2]])
508 if (aa1 %in% c(one2three,three2one)) {
509 if (is.na(aacode)) {
510 #do nothing
511 } else if (aacode == 1) {
512 if (nchar(aa1) == 3) aa1 <- three2one[[aa1]]
513 } else if (aacode ==3) {
514 if (nchar(aa1) == 1) aa1 <- one2three[[aa1]]
515 } else {
516 #this should never happen, as it's supposed to be detected at start of function
517 stop("Invalid aacode. If you see this, report this as a bug!")
518 }
519 if (isMulti) {
520 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
521 type="synonymous",start=pos,ancestral=aa1))
522 } else {
523 return(list(hgvs=s,subject=subject,type="synonymous",start=pos,
524 ancestral=aa1))
525 }
526 } else {#not valid amino acid
527 if (isMulti) {
528 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
529 type="invalid"))
530 } else {
531 return(list(hgvs=s,subject=subject,type="invalid"))
532 }
533 }
534 } else if (type == "substitution") {
535 groups <- extract.groups(body,types$substitution)
536 aa1 <- groups[[1]]
537 pos <- as.integer(groups[[2]])
538 aa2 <- groups[[3]]
539 if (aa1 %in% c(one2three,three2one) && aa2 %in% c(one2three,three2one)) {
540 if (is.na(aacode)) {
541 #do nothing
542 } else if (aacode == 1) {
543 if (nchar(aa1) == 3) aa1 <- three2one[[aa1]]
544 if (nchar(aa2) == 3) aa2 <- three2one[[aa2]]
545 } else if (aacode ==3) {
546 if (nchar(aa1) == 1) aa1 <- one2three[[aa1]]
547 if (nchar(aa2) == 1) aa2 <- one2three[[aa2]]
548 } else {
549 #this should never happen, as it's supposed to be detected at start of function
550 stop("Invalid aacode. If you see this, report this as a bug!")
551 }
552 if (isMulti) {
553 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
554 type=type,start=pos,ancestral=aa1,variant=aa2))
555 } else {
556 return(list(hgvs=s,subject=subject,type=type,start=pos,
557 ancestral=aa1,variant=aa2))
558 }
559 } else {#not valid amino acid
560 if (isMulti) {
561 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
562 type="invalid"))
563 } else {
564 return(list(hgvs=s,subject=subject,type="invalid"))
565 }
566 }
567
568 } else if (type == "singledeletion") {
569 groups <- extract.groups(body,types$singledeletion)
570 aa1 <- groups[[1]]
571 pos <- as.integer(groups[[2]])
572 if (is.na(aacode)) {
573 #do nothing
574 } else if (aacode == 1) {
575 if (nchar(aa1) == 3) aa1 <- three2one[[aa1]]
576 } else if (aacode == 3) {
577 if (nchar(aa1) == 1) aa1 <- one2three[[aa1]]
578 } else {
579 #this should never happen, as it's supposed to be detected at start of function
580 stop("Invalid aacode. If you see this, report this as a bug!")
581 }
582 if (isMulti) {
583 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
584 type=type,start=pos,ancestral=aa1))
585 } else {
586 return(list(hgvs=s,subject=subject,type=type,start=pos,ancestral=aa1))
587 }
588 } else if (type == "deletion") {
589 groups <- extract.groups(body,types$deletion)
590 aa1 <- groups[[1]]
591 pos <- as.integer(groups[[2]])
592 aa2 <- groups[[3]]
593 pos2 <- as.integer(groups[[4]])
594 if (is.na(aacode)) {
595 #do nothing
596 } else if (aacode == 1) {
597 if (nchar(aa1) == 3) aa1 <- three2one[[aa1]]
598 if (nchar(aa2) == 3) aa2 <- three2one[[aa2]]
599 } else if (aacode == 3) {
600 if (nchar(aa1) == 1) aa1 <- one2three[[aa1]]
601 if (nchar(aa2) == 1) aa2 <- one2three[[aa2]]
602 } else {
603 #this should never happen, as it's supposed to be detected at start of function
604 stop("Invalid aacode. If you see this, report this as a bug!")
605 }
606 if (isMulti) {
607 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
608 type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2))
609 } else {
610 return(list(hgvs=s,subject=subject,type=type,start=pos,
611 ancestral=aa1,end=pos2,ancestral2=aa2))
612 }
613
614 } else if (type == "duplication") {
615 groups <- extract.groups(body,types$duplication)
616 aa1 <- groups[[1]]
617 pos <- as.integer(groups[[2]])
618 aa2 <- groups[[3]]
619 pos2 <- as.integer(groups[[4]])
620 if (is.na(aacode)) {
621 #do nothing
622 } else if (aacode == 1) {
623 if (nchar(aa1) == 3) aa1 <- three2one[[aa1]]
624 if (nchar(aa2) == 3) aa2 <- three2one[[aa2]]
625 } else if (aacode == 3) {
626 if (nchar(aa1) == 1) aa1 <- one2three[[aa1]]
627 if (nchar(aa2) == 1) aa2 <- one2three[[aa2]]
628 } else {
629 #this should never happen, as it's supposed to be detected at start of function
630 stop("Invalid aacode. If you see this, report this as a bug!")
631 }
632 if (isMulti) {
633 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
634 type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2))
635 } else {
636 return(list(hgvs=s,subject=subject,type=type,start=pos,
637 ancestral=aa1,end=pos2,ancestral2=aa2))
638 }
639
640 } else if (type == "insertion") {
641 groups <- extract.groups(body,types$insertion)
642 aa1 <- groups[[1]]
643 pos <- as.integer(groups[[2]])
644 aa2 <- groups[[3]]
645 pos2 <- as.integer(groups[[4]])
646 insert <- groups[[5]]
647 #TODO: Implement code conversion
648 if (isMulti) {
649 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
650 type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert))
651 } else {
652 return(list(hgvs=s,subject=subject,type=type,start=pos,
653 ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert))
654 }
655
656 } else if (type == "delins") {
657 groups <- extract.groups(body,types$delins)
658 aa1 <- groups[[1]]
659 pos <- as.integer(groups[[2]])
660 aa2 <- groups[[3]]
661 pos2 <- as.integer(groups[[4]])
662 insert <- groups[[5]]
663 #TODO: Implement code conversion
664 if (isMulti) {
665 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
666 type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert))
667 } else {
668 return(list(hgvs=s,subject=subject,type=type,start=pos,
669 ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert))
670 }
671
672 } else if (type == "frameshift1") {
673 groups <- extract.groups(body,types$frameshift1)
674 aa1 <- groups[[1]]
675 pos <- as.integer(groups[[2]])
676 #TODO: Implement code conversion
677 if (isMulti) {
678 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
679 type="frameshift",start=pos,ancestral=aa1))
680 } else {
681 return(list(hgvs=s,subject=subject,type="frameshift",start=pos,
682 ancestral=aa1))
683 }
684
685 } else if (type == "frameshift2") {
686 groups <- extract.groups(body,types$frameshift2)
687 aa1 <- groups[[1]]
688 pos <- as.integer(groups[[2]])
689 aa2 <- groups[[3]]
690 term <- as.integer(groups[[5]])
691 #TODO: Implement code conversion
692 if (isMulti) {
693 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
694 type="frameshift",start=pos,ancestral=aa1,variant=aa2,end=term))
695 } else {
696 return(list(hgvs=s,subject=subject,type="frameshift",start=pos,
697 ancestral=aa1,variant=aa2,end=term))
698 }
699
700 } else {#unmatched type
701 if (isMulti) {
702 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
703 type="invalid"))
704 } else {
705 return(list(hgvs=s,subject=subject,type="invalid"))
706 }
707 }
708
709 })
710
711 } else if (subject=="noncoding") {
712 #FIXME: These need to be list of lists to match postprocessing
713 return(list(list(hgvs=s,subject="not_implemented",type="not_implemented")))
714 } else if (subject=="mitochondrial") {
715 return(list(list(hgvs=s,subject="not_implemented",type="not_implemented")))
716 } else if (subject=="rna") {
717 return(list(list(hgvs=s,subject="not_implemented",type="not_implemented")))
718 } else {#unmatched subject, shouldn't happen
719 stop("Unmatched subject! If you see this, report it as a bug!")
720 }
721 })
722
723 #demote multimutants to enforce simple list structure
724 multiLengths <- sapply(out,length)
725 ids <- do.call(c,lapply(1:length(multiLengths),
726 function(i) if (multiLengths[[i]]==1) as.character(i) else paste0(i,".",1:multiLengths[[i]])
727 ))
728 out2 <- do.call(c,out)
729 names(out2) <- ids
730
731
732
733 to.df(out2)
734 }