Mercurial > repos > iuc > hgvsparser
comparison parseHGVS.R @ 0:c12a4d187121 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hgvsparser/ commit f9deb29cdbd2d2a5f2f4fbd470b1078431a36ae0
| author | iuc |
|---|---|
| date | Fri, 07 Jun 2024 15:21:07 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c12a4d187121 |
|---|---|
| 1 # Copyright (C) 2018 Jochen Weile, Roth Lab | |
| 2 # | |
| 3 # This file is part of hgvsParseR. | |
| 4 # | |
| 5 # hgvsParseR is free software: you can redistribute it and/or modify | |
| 6 # it under the terms of the GNU General Public License as published by | |
| 7 # the Free Software Foundation, either version 3 of the License, or | |
| 8 # (at your option) any later version. | |
| 9 # | |
| 10 # hgvsParseR is distributed in the hope that it will be useful, | |
| 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 13 # GNU General Public License for more details. | |
| 14 # | |
| 15 # You should have received a copy of the GNU General Public License | |
| 16 # along with hgvsParseR. If not, see <https://www.gnu.org/licenses/>. | |
| 17 | |
| 18 #' HGVS Parser | |
| 19 #' | |
| 20 #' Parses HGVS strings | |
| 21 #' @param strings A character vector containing the HGVS strings | |
| 22 #' @param aacode allowed values: 1, 3, or NA. Determines whether 1-letter codes or 3-letter codes should be forced. NA uses input format. | |
| 23 #' @return A \code{data.frame} with the following columns: | |
| 24 #' @keywords HGVS parsing | |
| 25 #' @export | |
| 26 #' @examples | |
| 27 #' result <- parseHGVS(c("g.1318G>T","c.123_125inv","p.R123_L152del")) | |
| 28 | |
| 29 parseHGVS <- function(strings,aacode=c(NA,1,3)) { | |
| 30 | |
| 31 #Check that parameters are valid | |
| 32 if (!is.character(strings)) { | |
| 33 stop("Input for 'parse' function must be a character vector! Found '",class(strings),"' instead.") | |
| 34 } | |
| 35 | |
| 36 aacode <- aacode[[1]] | |
| 37 if (!is.na(aacode) && !(aacode %in% c(1,3))) { | |
| 38 warning("Invalid aacode parameter, defaulting to NA!") | |
| 39 aacode <- NA | |
| 40 } | |
| 41 | |
| 42 #Helper function: turns a list of lists (lol) in to a dataframe | |
| 43 to.df <- function(lol) { | |
| 44 colnames <- unique(do.call(c,lapply(lol,names))) | |
| 45 columns <- lapply(colnames,function(cn) sapply(lol,function(row) { | |
| 46 if (cn %in% names(row)) row[[cn]] else NA | |
| 47 })) | |
| 48 names(columns) <- colnames | |
| 49 empty <- which(sapply(columns,function(xs)all(is.na(xs)))) | |
| 50 columns[empty] <- NULL | |
| 51 do.call(data.frame,columns) | |
| 52 } | |
| 53 | |
| 54 # ### | |
| 55 # # Binds matrices of same size together to a 3D matrix, analogously | |
| 56 # # to cbind and rbind. | |
| 57 # # | |
| 58 # zbind <- function(...) { | |
| 59 # x <- list(...) | |
| 60 # y <- array(0,dim=c(nrow(x[[1]]),ncol(x[[1]]),length(x)),dimnames=dimnames(x[[1]])) | |
| 61 # for (i in 1:length(x)) y[,,i] <- x[[i]] | |
| 62 # y | |
| 63 # } | |
| 64 | |
| 65 | |
| 66 ### | |
| 67 # Function to *locally* excise regex groups from string vectors. | |
| 68 # I.e. only extract the first occurrence of each group within each string. | |
| 69 # x = string vector | |
| 70 # re = regular expression with groups | |
| 71 # | |
| 72 extract.groups <- function(x, re) { | |
| 73 matches <- regexpr(re,x,perl=TRUE) | |
| 74 start <- attr(matches,"capture.start") | |
| 75 end <- start + attr(matches,"capture.length") - 1 | |
| 76 do.call(cbind,lapply(1:ncol(start), function(i) { | |
| 77 sapply(1:nrow(start),function(j){ | |
| 78 if (start[j,i] > -1) substr(x[[j]],start[j,i],end[j,i]) else NA | |
| 79 }) | |
| 80 })) | |
| 81 } | |
| 82 | |
| 83 # ### | |
| 84 # # Function to *globally* excise regex groups from string vectors. | |
| 85 # # x = string vector | |
| 86 # # re = regular expression with groups | |
| 87 # # | |
| 88 # global.extract.groups <- function(x,re) { | |
| 89 # all.matches <- gregexpr(re,x,perl=TRUE) | |
| 90 # mapply(function(matches,x) { | |
| 91 # start <- attr(matches,"capture.start") | |
| 92 # end <- start + attr(matches,"capture.length") - 1 | |
| 93 # apply(zbind(start,end),c(1,2),function(pos) substr(x,pos[[1]],pos[[2]]) ) | |
| 94 # },matches=all.matches,x=x,SIMPLIFY=FALSE) | |
| 95 # } | |
| 96 | |
| 97 ### | |
| 98 # Helper function to split multi-mutant bodies into their individual | |
| 99 # elements. Returns a vector of strings containing these elements. | |
| 100 # An attribute "multi" is attached to the vector, detailing the type | |
| 101 # of multi-mutant | |
| 102 # | |
| 103 splitMulti <- function(body) { | |
| 104 if (regexpr("\\[.+\\];\\[.+\\]",body) > 0) { | |
| 105 out <- strsplit(substr(body,2,nchar(body)-1),"\\];\\[")[[1]] | |
| 106 attr(out,"multi") <- "trans" | |
| 107 } else if (regexpr("\\[.+\\(;\\).+\\]",body) > 0) { | |
| 108 out <- strsplit(substr(body,2,nchar(body)-1),"\\(;\\)")[[1]] | |
| 109 attr(out,"multi") <- "unknown" | |
| 110 } else if (regexpr("\\[.+;.+\\]",body) > 0) { | |
| 111 out <- strsplit(substr(body,2,nchar(body)-1),";")[[1]] | |
| 112 attr(out,"multi") <- "cis" | |
| 113 } else { | |
| 114 out <- body | |
| 115 attr(out,"multi") <- "single" | |
| 116 } | |
| 117 return(out) | |
| 118 } | |
| 119 | |
| 120 ### | |
| 121 # Helper function: | |
| 122 # Given an HGVS body and a list of regexes corresponding to types, | |
| 123 # find the (first) matching type. | |
| 124 findType <- function(body,types) { | |
| 125 i <- 0 | |
| 126 found <- done <- FALSE | |
| 127 while (!found && !done) { | |
| 128 found <- regexpr(types[[i <- i+1]],body) > 0 | |
| 129 done <- i >= length(types) | |
| 130 } | |
| 131 if (found) { | |
| 132 return(names(types)[[i]]) | |
| 133 } else { | |
| 134 return("invalid") | |
| 135 } | |
| 136 } | |
| 137 | |
| 138 out <- lapply(strings,function(s) { | |
| 139 | |
| 140 if (regexpr("^[gcnmrp]\\.",s) < 1) { | |
| 141 return(list(list(hgvs=s,subject="invalid",type="invalid"))) | |
| 142 } | |
| 143 | |
| 144 body <- substr(s,3,nchar(s)) | |
| 145 | |
| 146 subbodies <- splitMulti(body) | |
| 147 | |
| 148 subjects <- c( | |
| 149 g="genomic",c="coding",n="noncoding", | |
| 150 m="mitochondrial",r="rna",p="protein" | |
| 151 ) | |
| 152 subject <- subjects[[substr(s,1,1)]] | |
| 153 | |
| 154 if (subject=="genomic") { | |
| 155 | |
| 156 types <- c( | |
| 157 substitution="\\d+[ACGT]>[ACGT]", singledeletion="^\\d+del$", | |
| 158 deletion="\\d+_\\d+del$",inversion="\\d+_\\d+inv", | |
| 159 duplication="\\d+_\\d+dup",insertion="\\d+_\\d+ins[ATCG]+", | |
| 160 conversion="\\d+_\\d+con\\d+_\\d+",delins="\\d+_\\d+delins[ATCG]+", | |
| 161 amplification="\\d+_\\d+\\[\\d+\\]" | |
| 162 ) | |
| 163 | |
| 164 phasing <- attr(subbodies,"multi") | |
| 165 isMulti <- length(subbodies) > 1 | |
| 166 | |
| 167 lapply(1:length(subbodies), function(i.multi) { | |
| 168 body <- subbodies[[i.multi]] | |
| 169 | |
| 170 type <- findType(body,types) | |
| 171 | |
| 172 if (type == "substitution") { | |
| 173 groups <- extract.groups(body,"(\\d+)([ACGT])>([ACGT])")[1,] | |
| 174 position <- as.integer(groups[[1]]) | |
| 175 ancestral <- groups[[2]] | |
| 176 variant <- groups[[3]] | |
| 177 if (isMulti) { | |
| 178 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 179 type=type,start=position,ancestral=ancestral,variant=variant)) | |
| 180 } else { | |
| 181 return(list(hgvs=s,subject=subject,type=type,start=position, | |
| 182 ancestral=ancestral,variant=variant)) | |
| 183 } | |
| 184 | |
| 185 } else if (type == "singledeletion") { | |
| 186 groups <- extract.groups(body,"(\\d+)del")[1,] | |
| 187 position <- as.integer(groups[[1]]) | |
| 188 if (isMulti) { | |
| 189 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 190 type=type,start=position)) | |
| 191 } else { | |
| 192 return(list(hgvs=s,subject=subject,type=type,start=position)) | |
| 193 } | |
| 194 | |
| 195 } else if (type == "deletion") { | |
| 196 groups <- extract.groups(body,"(\\d+)_(\\d+)del") | |
| 197 start <- as.integer(groups[[1]]) | |
| 198 end <- as.integer(groups[[2]]) | |
| 199 if (isMulti) { | |
| 200 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 201 type=type,start=start,end=end)) | |
| 202 } else { | |
| 203 return(list(hgvs=s,subject=subject,type=type,start=start,end=end)) | |
| 204 } | |
| 205 | |
| 206 } else if (type == "inversion") { | |
| 207 groups <- extract.groups(body,"(\\d+)_(\\d+)inv") | |
| 208 start <- as.integer(groups[[1]]) | |
| 209 end <- as.integer(groups[[2]]) | |
| 210 if (isMulti) { | |
| 211 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 212 type=type,start=start,end=end)) | |
| 213 } else { | |
| 214 return(list(hgvs=s,subject=subject,type=type,start=start,end=end)) | |
| 215 } | |
| 216 | |
| 217 } else if (type == "duplication") { | |
| 218 groups <- extract.groups(body,"(\\d+)_(\\d+)dup") | |
| 219 start <- as.integer(groups[[1]]) | |
| 220 end <- as.integer(groups[[2]]) | |
| 221 if (isMulti) { | |
| 222 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 223 type=type,start=start,end=end)) | |
| 224 } else { | |
| 225 return(list(hgvs=s,subject=subject,type=type,start=start,end=end)) | |
| 226 } | |
| 227 | |
| 228 } else if (type == "insertion") { | |
| 229 groups <- extract.groups(body,"(\\d+)_(\\d+)ins([ATCG]+)") | |
| 230 start <- as.integer(groups[[1]]) | |
| 231 end <- as.integer(groups[[2]]) | |
| 232 if (abs(end-start)!=1) { | |
| 233 warning("Invalid insertion definition: | |
| 234 Start and end positions must be adjacent!") | |
| 235 } | |
| 236 variant <- groups[[3]] | |
| 237 if (isMulti) { | |
| 238 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 239 type=type,start=start,end=end,variant=variant)) | |
| 240 } else { | |
| 241 return(list(hgvs=s,subject=subject,type=type,start=start,end=end, | |
| 242 variant=variant)) | |
| 243 } | |
| 244 | |
| 245 } else if (type == "conversion") { | |
| 246 groups <- extract.groups(body,"(\\d+)_(\\d+)con(\\d+)_(\\d+)") | |
| 247 start <- as.integer(groups[[1]]) | |
| 248 end <- as.integer(groups[[2]]) | |
| 249 tStart <- as.integer(groups[[3]]) | |
| 250 tEnd <- as.integer(groups[[4]]) | |
| 251 if (isMulti) { | |
| 252 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 253 type=type,start=start,end=end, | |
| 254 templateStart=tStart,templateEnd=tEnd)) | |
| 255 } else { | |
| 256 return(list(hgvs=s,subject=subject,type=type,start=start,end=end, | |
| 257 templateStart=tStart,templateEnd=tEnd)) | |
| 258 } | |
| 259 | |
| 260 } else if (type == "delins") { | |
| 261 groups <- extract.groups(body,"(\\d+)_(\\d+)delins([ATCG]+)") | |
| 262 start <- as.integer(groups[[1]]) | |
| 263 end <- as.integer(groups[[2]]) | |
| 264 variant <- groups[[3]] | |
| 265 if (isMulti) { | |
| 266 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 267 type=type,start=start,end=end,variant=variant)) | |
| 268 } else { | |
| 269 return(list(hgvs=s,subject=subject,type=type,start=start,end=end, | |
| 270 variant=variant)) | |
| 271 } | |
| 272 | |
| 273 } else if (type == "amplification") { | |
| 274 groups <- extract.groups(body,"(\\d+)_(\\d+)\\[(\\d+)\\]") | |
| 275 start <- as.integer(groups[[1]]) | |
| 276 end <- as.integer(groups[[2]]) | |
| 277 copies <- as.integer(groups[[3]]) | |
| 278 if (isMulti) { | |
| 279 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 280 type=type,start=start,end=end,copies=copies)) | |
| 281 } else { | |
| 282 return(list(hgvs=s,subject=subject,type=type, | |
| 283 start=start,end=end,copies=copies)) | |
| 284 } | |
| 285 | |
| 286 } else { | |
| 287 if (isMulti) { | |
| 288 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,type="invalid")) | |
| 289 } else { | |
| 290 return(list(hgvs=s,subject=subject,type="invalid")) | |
| 291 } | |
| 292 } | |
| 293 | |
| 294 }) | |
| 295 | |
| 296 } else if (subject=="coding") { | |
| 297 #coding needs to be handled separately from genomic, as the syntax may differ | |
| 298 #e.g. it allows for offset descriptions relative to exon-intron borders | |
| 299 | |
| 300 types <- c( | |
| 301 substitution="\\d+([+-]\\d+)?[ACGT]>[ACGT]", | |
| 302 singledeletion="^\\d+([+-]\\d+)?del$", | |
| 303 deletion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?del$", | |
| 304 inversion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?inv", | |
| 305 duplication="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?dup", | |
| 306 insertion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?ins[ATCG]+", | |
| 307 conversion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?con\\d+([+-]\\d+)?_\\d+([+-]\\d+)?", | |
| 308 delins="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?delins[ATCG]+", | |
| 309 amplification="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?\\[\\d+\\]" | |
| 310 ) | |
| 311 | |
| 312 phasing <- attr(subbodies,"multi") | |
| 313 isMulti <- length(subbodies) > 1 | |
| 314 | |
| 315 lapply(1:length(subbodies), function(i.multi) { | |
| 316 body <- subbodies[[i.multi]] | |
| 317 | |
| 318 type <- findType(body,types) | |
| 319 | |
| 320 if (type == "substitution") { | |
| 321 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?([ACGT])>([ACGT])")[1,] | |
| 322 position <- as.integer(groups[[1]]) | |
| 323 intronOffset <- as.integer(groups[[2]]) | |
| 324 ancestral <- groups[[3]] | |
| 325 variant <- groups[[4]] | |
| 326 if (isMulti) { | |
| 327 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 328 type=type,start=position,startIntron=intronOffset,ancestral=ancestral, | |
| 329 variant=variant)) | |
| 330 } else { | |
| 331 return(list(hgvs=s,subject=subject,type=type,start=position, | |
| 332 startIntron=intronOffset,ancestral=ancestral,variant=variant)) | |
| 333 } | |
| 334 | |
| 335 } else if (type == "singledeletion") { | |
| 336 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?del")[1,] | |
| 337 position <- as.integer(groups[[1]]) | |
| 338 intronOffset <- as.integer(groups[[2]]) | |
| 339 if (isMulti) { | |
| 340 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 341 type=type,start=position,startIntron=intronOffset)) | |
| 342 } else { | |
| 343 return(list(hgvs=s,subject=subject,type=type,start=position, | |
| 344 startIntron=intronOffset)) | |
| 345 } | |
| 346 } else if (type == "deletion") { | |
| 347 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?del") | |
| 348 start <- as.integer(groups[[1]]) | |
| 349 intronOffset <- as.integer(groups[[2]]) | |
| 350 end <- as.integer(groups[[3]]) | |
| 351 intronOffset2 <- as.integer(groups[[4]]) | |
| 352 if (isMulti) { | |
| 353 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 354 type=type,start=start,startIntron=intronOffset, | |
| 355 end=end,endIntron=intronOffset2)) | |
| 356 } else { | |
| 357 return(list(hgvs=s,subject=subject,type=type,start=start, | |
| 358 startIntron=intronOffset,end=end,endIntron=intronOffset2)) | |
| 359 } | |
| 360 } else if (type == "inversion") { | |
| 361 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?inv") | |
| 362 start <- as.integer(groups[[1]]) | |
| 363 intronOffset <- as.integer(groups[[2]]) | |
| 364 end <- as.integer(groups[[3]]) | |
| 365 intronOffset2 <- as.integer(groups[[4]]) | |
| 366 if (isMulti) { | |
| 367 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 368 type=type,start=start,startIntron=intronOffset, | |
| 369 end=end,endIntron=intronOffset2)) | |
| 370 } else { | |
| 371 return(list(hgvs=s,subject=subject,type=type,start=start, | |
| 372 startIntron=intronOffset,end=end,endIntron=intronOffset2)) | |
| 373 } | |
| 374 } else if (type == "duplication") { | |
| 375 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?dup") | |
| 376 start <- as.integer(groups[[1]]) | |
| 377 intronOffset <- as.integer(groups[[2]]) | |
| 378 end <- as.integer(groups[[3]]) | |
| 379 intronOffset2 <- as.integer(groups[[4]]) | |
| 380 if (isMulti) { | |
| 381 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 382 type=type,start=start,startIntron=intronOffset, | |
| 383 end=end,endIntron=intronOffset2)) | |
| 384 } else { | |
| 385 return(list(hgvs=s,subject=subject,type=type,start=start, | |
| 386 startIntron=intronOffset,end=end,endIntron=intronOffset2)) | |
| 387 } | |
| 388 } else if (type == "insertion") { | |
| 389 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?ins([ATCG]+)") | |
| 390 start <- as.integer(groups[[1]]) | |
| 391 intronOffset <- as.integer(groups[[2]]) | |
| 392 end <- as.integer(groups[[3]]) | |
| 393 intronOffset2 <- as.integer(groups[[4]]) | |
| 394 variant <- groups[[5]] | |
| 395 if (isMulti) { | |
| 396 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 397 type=type,start=start,startIntron=intronOffset, | |
| 398 end=end,endIntron=intronOffset2,variant=variant)) | |
| 399 } else { | |
| 400 return(list(hgvs=s,subject=subject,type=type,start=start, | |
| 401 startIntron=intronOffset,end=end,endIntron=intronOffset2,variant=variant)) | |
| 402 } | |
| 403 } else if (type == "conversion") { | |
| 404 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?con(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?") | |
| 405 start <- as.integer(groups[[1]]) | |
| 406 intronOffset <- as.integer(groups[[2]]) | |
| 407 end <- as.integer(groups[[3]]) | |
| 408 intronOffset2 <- as.integer(groups[[4]]) | |
| 409 tStart <- as.integer(groups[[5]]) | |
| 410 intronOffset3 <- as.integer(groups[[6]]) | |
| 411 tEnd <- as.integer(groups[[7]]) | |
| 412 intronOffset4 <- as.integer(groups[[8]]) | |
| 413 if (isMulti) { | |
| 414 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 415 type=type,start=start,startIntron=intronOffset, | |
| 416 end=end,endIntron=intronOffset2,templateStart=tStart, | |
| 417 templateStartIntron=intronOffset3,templateEnd=tEnd, | |
| 418 templateEndIntron=intronOffset4)) | |
| 419 } else { | |
| 420 return(list(hgvs=s,subject=subject,type=type, | |
| 421 start=start,startIntron=intronOffset, | |
| 422 end=end,endIntron=intronOffset2,templateStart=tStart, | |
| 423 templateStartIntron=intronOffset3,templateEnd=tEnd, | |
| 424 templateEndIntron=intronOffset4)) | |
| 425 } | |
| 426 } else if (type == "delins") { | |
| 427 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?delins([ATCG]+)") | |
| 428 start <- as.integer(groups[[1]]) | |
| 429 intronOffset <- as.integer(groups[[2]]) | |
| 430 end <- as.integer(groups[[3]]) | |
| 431 intronOffset2 <- as.integer(groups[[4]]) | |
| 432 variant <- groups[[5]] | |
| 433 if (isMulti) { | |
| 434 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 435 type=type,start=start,startIntron=intronOffset, | |
| 436 end=end,endIntron=intronOffset2,variant=variant)) | |
| 437 } else { | |
| 438 return(list(hgvs=s,subject=subject,type=type,start=start, | |
| 439 startIntron=intronOffset,end=end,endIntron=intronOffset2,variant=variant)) | |
| 440 } | |
| 441 } else if (type == "amplification") { | |
| 442 groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?\\[(\\d+)\\]") | |
| 443 start <- as.integer(groups[[1]]) | |
| 444 intronOffset <- as.integer(groups[[2]]) | |
| 445 end <- as.integer(groups[[3]]) | |
| 446 intronOffset2 <- as.integer(groups[[4]]) | |
| 447 copies <- as.integer(groups[[3]]) | |
| 448 if (isMulti) { | |
| 449 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 450 type=type,start=start,startIntron=intronOffset, | |
| 451 end=end,endIntron=intronOffset2,copies=copies)) | |
| 452 } else { | |
| 453 return(list(hgvs=s,subject=subject,type=type,start=start, | |
| 454 startIntron=intronOffset,end=end,endIntron=intronOffset2,copies=copies)) | |
| 455 } | |
| 456 } else { | |
| 457 if (isMulti) { | |
| 458 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 459 type="invalid")) | |
| 460 } else { | |
| 461 return(list(hgvs=s,subject=subject,type="invalid")) | |
| 462 } | |
| 463 } | |
| 464 | |
| 465 }) | |
| 466 | |
| 467 } else if (subject=="protein") { | |
| 468 | |
| 469 one2three <- c(A="Ala",C="Cys",D="Asp",E="Glu",F="Phe",G="Gly",H="His", | |
| 470 I="Ile",K="Lys",L="Leu",M="Met",N="Asn",P="Pro",Q="Gln",R="Arg", | |
| 471 S="Ser",T="Thr",V="Val",W="Trp",Y="Tyr",`*`="Ter") | |
| 472 three2one <- c(Ala="A",Arg="R",Asn="N",Asp="D",Cys="C",Gln="Q",Glu="E", | |
| 473 Gly="G",His="H",Ile="I",Leu="L",Lys="K",Met="M",Phe="F",Pro="P", | |
| 474 Ser="S",Thr="T",Trp="W",Tyr="Y",Val="V",Ter="*") | |
| 475 codes <- paste(c(one2three,three2one[-21],"\\*"),collapse="|") | |
| 476 | |
| 477 types <- list( | |
| 478 synonymous1="^=$", | |
| 479 synonymous2=paste0("^(",codes,")(\\d+)=$"), | |
| 480 substitution=paste0("^(",codes,")(\\d+)(",codes,")$"), | |
| 481 singledeletion=paste0("^(",codes,")(\\d+)del$"), | |
| 482 deletion=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)del$"), | |
| 483 duplication=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)dup$"), | |
| 484 insertion=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)ins((",codes,")+)$"), | |
| 485 delins=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)delins((",codes,")+)$"), | |
| 486 frameshift1=paste0("^(",codes,")(\\d+)fs$"), | |
| 487 frameshift2=paste0("^(",codes,")(\\d+)(",codes,")fs(Ter|\\*)(\\d+)$") | |
| 488 ) | |
| 489 | |
| 490 phasing <- attr(subbodies,"multi") | |
| 491 isMulti <- length(subbodies) > 1 | |
| 492 | |
| 493 lapply(1:length(subbodies), function(i.multi) { | |
| 494 body <- subbodies[[i.multi]] | |
| 495 | |
| 496 type <- findType(body,types) | |
| 497 | |
| 498 if (type == "synonymous1") { | |
| 499 if (isMulti) { | |
| 500 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,type=type)) | |
| 501 } else { | |
| 502 return(list(hgvs=s,subject=subject,type="synonymous")) | |
| 503 } | |
| 504 } else if (type == "synonymous2") { | |
| 505 groups <- extract.groups(body,types$synonymous2) | |
| 506 aa1 <- groups[[1]] | |
| 507 pos <- as.integer(groups[[2]]) | |
| 508 if (aa1 %in% c(one2three,three2one)) { | |
| 509 if (is.na(aacode)) { | |
| 510 #do nothing | |
| 511 } else if (aacode == 1) { | |
| 512 if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] | |
| 513 } else if (aacode ==3) { | |
| 514 if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] | |
| 515 } else { | |
| 516 #this should never happen, as it's supposed to be detected at start of function | |
| 517 stop("Invalid aacode. If you see this, report this as a bug!") | |
| 518 } | |
| 519 if (isMulti) { | |
| 520 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 521 type="synonymous",start=pos,ancestral=aa1)) | |
| 522 } else { | |
| 523 return(list(hgvs=s,subject=subject,type="synonymous",start=pos, | |
| 524 ancestral=aa1)) | |
| 525 } | |
| 526 } else {#not valid amino acid | |
| 527 if (isMulti) { | |
| 528 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 529 type="invalid")) | |
| 530 } else { | |
| 531 return(list(hgvs=s,subject=subject,type="invalid")) | |
| 532 } | |
| 533 } | |
| 534 } else if (type == "substitution") { | |
| 535 groups <- extract.groups(body,types$substitution) | |
| 536 aa1 <- groups[[1]] | |
| 537 pos <- as.integer(groups[[2]]) | |
| 538 aa2 <- groups[[3]] | |
| 539 if (aa1 %in% c(one2three,three2one) && aa2 %in% c(one2three,three2one)) { | |
| 540 if (is.na(aacode)) { | |
| 541 #do nothing | |
| 542 } else if (aacode == 1) { | |
| 543 if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] | |
| 544 if (nchar(aa2) == 3) aa2 <- three2one[[aa2]] | |
| 545 } else if (aacode ==3) { | |
| 546 if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] | |
| 547 if (nchar(aa2) == 1) aa2 <- one2three[[aa2]] | |
| 548 } else { | |
| 549 #this should never happen, as it's supposed to be detected at start of function | |
| 550 stop("Invalid aacode. If you see this, report this as a bug!") | |
| 551 } | |
| 552 if (isMulti) { | |
| 553 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 554 type=type,start=pos,ancestral=aa1,variant=aa2)) | |
| 555 } else { | |
| 556 return(list(hgvs=s,subject=subject,type=type,start=pos, | |
| 557 ancestral=aa1,variant=aa2)) | |
| 558 } | |
| 559 } else {#not valid amino acid | |
| 560 if (isMulti) { | |
| 561 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 562 type="invalid")) | |
| 563 } else { | |
| 564 return(list(hgvs=s,subject=subject,type="invalid")) | |
| 565 } | |
| 566 } | |
| 567 | |
| 568 } else if (type == "singledeletion") { | |
| 569 groups <- extract.groups(body,types$singledeletion) | |
| 570 aa1 <- groups[[1]] | |
| 571 pos <- as.integer(groups[[2]]) | |
| 572 if (is.na(aacode)) { | |
| 573 #do nothing | |
| 574 } else if (aacode == 1) { | |
| 575 if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] | |
| 576 } else if (aacode == 3) { | |
| 577 if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] | |
| 578 } else { | |
| 579 #this should never happen, as it's supposed to be detected at start of function | |
| 580 stop("Invalid aacode. If you see this, report this as a bug!") | |
| 581 } | |
| 582 if (isMulti) { | |
| 583 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 584 type=type,start=pos,ancestral=aa1)) | |
| 585 } else { | |
| 586 return(list(hgvs=s,subject=subject,type=type,start=pos,ancestral=aa1)) | |
| 587 } | |
| 588 } else if (type == "deletion") { | |
| 589 groups <- extract.groups(body,types$deletion) | |
| 590 aa1 <- groups[[1]] | |
| 591 pos <- as.integer(groups[[2]]) | |
| 592 aa2 <- groups[[3]] | |
| 593 pos2 <- as.integer(groups[[4]]) | |
| 594 if (is.na(aacode)) { | |
| 595 #do nothing | |
| 596 } else if (aacode == 1) { | |
| 597 if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] | |
| 598 if (nchar(aa2) == 3) aa2 <- three2one[[aa2]] | |
| 599 } else if (aacode == 3) { | |
| 600 if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] | |
| 601 if (nchar(aa2) == 1) aa2 <- one2three[[aa2]] | |
| 602 } else { | |
| 603 #this should never happen, as it's supposed to be detected at start of function | |
| 604 stop("Invalid aacode. If you see this, report this as a bug!") | |
| 605 } | |
| 606 if (isMulti) { | |
| 607 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 608 type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2)) | |
| 609 } else { | |
| 610 return(list(hgvs=s,subject=subject,type=type,start=pos, | |
| 611 ancestral=aa1,end=pos2,ancestral2=aa2)) | |
| 612 } | |
| 613 | |
| 614 } else if (type == "duplication") { | |
| 615 groups <- extract.groups(body,types$duplication) | |
| 616 aa1 <- groups[[1]] | |
| 617 pos <- as.integer(groups[[2]]) | |
| 618 aa2 <- groups[[3]] | |
| 619 pos2 <- as.integer(groups[[4]]) | |
| 620 if (is.na(aacode)) { | |
| 621 #do nothing | |
| 622 } else if (aacode == 1) { | |
| 623 if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] | |
| 624 if (nchar(aa2) == 3) aa2 <- three2one[[aa2]] | |
| 625 } else if (aacode == 3) { | |
| 626 if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] | |
| 627 if (nchar(aa2) == 1) aa2 <- one2three[[aa2]] | |
| 628 } else { | |
| 629 #this should never happen, as it's supposed to be detected at start of function | |
| 630 stop("Invalid aacode. If you see this, report this as a bug!") | |
| 631 } | |
| 632 if (isMulti) { | |
| 633 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 634 type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2)) | |
| 635 } else { | |
| 636 return(list(hgvs=s,subject=subject,type=type,start=pos, | |
| 637 ancestral=aa1,end=pos2,ancestral2=aa2)) | |
| 638 } | |
| 639 | |
| 640 } else if (type == "insertion") { | |
| 641 groups <- extract.groups(body,types$insertion) | |
| 642 aa1 <- groups[[1]] | |
| 643 pos <- as.integer(groups[[2]]) | |
| 644 aa2 <- groups[[3]] | |
| 645 pos2 <- as.integer(groups[[4]]) | |
| 646 insert <- groups[[5]] | |
| 647 #TODO: Implement code conversion | |
| 648 if (isMulti) { | |
| 649 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 650 type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert)) | |
| 651 } else { | |
| 652 return(list(hgvs=s,subject=subject,type=type,start=pos, | |
| 653 ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert)) | |
| 654 } | |
| 655 | |
| 656 } else if (type == "delins") { | |
| 657 groups <- extract.groups(body,types$delins) | |
| 658 aa1 <- groups[[1]] | |
| 659 pos <- as.integer(groups[[2]]) | |
| 660 aa2 <- groups[[3]] | |
| 661 pos2 <- as.integer(groups[[4]]) | |
| 662 insert <- groups[[5]] | |
| 663 #TODO: Implement code conversion | |
| 664 if (isMulti) { | |
| 665 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 666 type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert)) | |
| 667 } else { | |
| 668 return(list(hgvs=s,subject=subject,type=type,start=pos, | |
| 669 ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert)) | |
| 670 } | |
| 671 | |
| 672 } else if (type == "frameshift1") { | |
| 673 groups <- extract.groups(body,types$frameshift1) | |
| 674 aa1 <- groups[[1]] | |
| 675 pos <- as.integer(groups[[2]]) | |
| 676 #TODO: Implement code conversion | |
| 677 if (isMulti) { | |
| 678 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 679 type="frameshift",start=pos,ancestral=aa1)) | |
| 680 } else { | |
| 681 return(list(hgvs=s,subject=subject,type="frameshift",start=pos, | |
| 682 ancestral=aa1)) | |
| 683 } | |
| 684 | |
| 685 } else if (type == "frameshift2") { | |
| 686 groups <- extract.groups(body,types$frameshift2) | |
| 687 aa1 <- groups[[1]] | |
| 688 pos <- as.integer(groups[[2]]) | |
| 689 aa2 <- groups[[3]] | |
| 690 term <- as.integer(groups[[5]]) | |
| 691 #TODO: Implement code conversion | |
| 692 if (isMulti) { | |
| 693 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 694 type="frameshift",start=pos,ancestral=aa1,variant=aa2,end=term)) | |
| 695 } else { | |
| 696 return(list(hgvs=s,subject=subject,type="frameshift",start=pos, | |
| 697 ancestral=aa1,variant=aa2,end=term)) | |
| 698 } | |
| 699 | |
| 700 } else {#unmatched type | |
| 701 if (isMulti) { | |
| 702 return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, | |
| 703 type="invalid")) | |
| 704 } else { | |
| 705 return(list(hgvs=s,subject=subject,type="invalid")) | |
| 706 } | |
| 707 } | |
| 708 | |
| 709 }) | |
| 710 | |
| 711 } else if (subject=="noncoding") { | |
| 712 #FIXME: These need to be list of lists to match postprocessing | |
| 713 return(list(list(hgvs=s,subject="not_implemented",type="not_implemented"))) | |
| 714 } else if (subject=="mitochondrial") { | |
| 715 return(list(list(hgvs=s,subject="not_implemented",type="not_implemented"))) | |
| 716 } else if (subject=="rna") { | |
| 717 return(list(list(hgvs=s,subject="not_implemented",type="not_implemented"))) | |
| 718 } else {#unmatched subject, shouldn't happen | |
| 719 stop("Unmatched subject! If you see this, report it as a bug!") | |
| 720 } | |
| 721 }) | |
| 722 | |
| 723 #demote multimutants to enforce simple list structure | |
| 724 multiLengths <- sapply(out,length) | |
| 725 ids <- do.call(c,lapply(1:length(multiLengths), | |
| 726 function(i) if (multiLengths[[i]]==1) as.character(i) else paste0(i,".",1:multiLengths[[i]]) | |
| 727 )) | |
| 728 out2 <- do.call(c,out) | |
| 729 names(out2) <- ids | |
| 730 | |
| 731 | |
| 732 | |
| 733 to.df(out2) | |
| 734 } |
