comparison customProDB.R @ 1:ad130eaa3a05 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit e025f5b4d590c44537cf0702e2fb040a28f98fec
author galaxyp
date Fri, 12 May 2017 13:17:40 -0400
parents 8ccfff69dd57
children 2cba79e6037e
comparison
equal deleted inserted replaced
0:8ccfff69dd57 1:ad130eaa3a05
25 option_list$ids <- make_option('--ids', type='character') 25 option_list$ids <- make_option('--ids', type='character')
26 option_list$dbsnpinCoding <- make_option('--dbsnpinCoding', type='character') 26 option_list$dbsnpinCoding <- make_option('--dbsnpinCoding', type='character')
27 option_list$cosmic <- make_option('--cosmic', type='character') 27 option_list$cosmic <- make_option('--cosmic', type='character')
28 option_list$annotationFromHistory <- make_option('--annotationFromHistory', type='logical', action="store_true", default=FALSE) 28 option_list$annotationFromHistory <- make_option('--annotationFromHistory', type='logical', action="store_true", default=FALSE)
29 option_list$rpkmCutoff <- make_option('--rpkmCutoff', type='character') 29 option_list$rpkmCutoff <- make_option('--rpkmCutoff', type='character')
30 #option_list$outputIndels <- make_option('--outputIndels', type='logical', action="store_true", default=FALSE) 30 option_list$outputIndels <- make_option('--outputIndels', type='logical', action="store_true", default=FALSE)
31 #option_list$outputNovelJunctions <- make_option('--outputNovelJunctions', type='logical', action="store_true", default=FALSE) 31 #option_list$outputNovelJunctions <- make_option('--outputNovelJunctions', type='logical', action="store_true", default=FALSE)
32 option_list$outputFile <- make_option('--outputFile', type='character') 32 #option_list$bedFile <- make_option('--bedFile', type='character')
33 #option_list$bsGenome <- make_option('--bsGenome', type='character')
34 option_list$outputRData <- make_option('--outputRData', type='logical', action="store_true", default=FALSE)
35 option_list$outputSQLite <- make_option('--outputSQLite', type='logical', action="store_true", default=FALSE)
33 36
34 37
35 opt <- parse_args(OptionParser(option_list=option_list)) 38 opt <- parse_args(OptionParser(option_list=option_list))
36 39
37 40
38 customProDB <- function( 41 customProDB <- function(
39 bam_file = GalaxyInputFile(required=TRUE), 42 bam_file = GalaxyInputFile(required=TRUE),
40 bai_file = GalaxyInputFile(required=TRUE), 43 bai_file = GalaxyInputFile(required=TRUE),
41 vcf_file = GalaxyInputFile(required=TRUE), 44 vcf_file = GalaxyInputFile(required=TRUE),
42 exon_anno_file = GalaxyInputFile(required=TRUE), 45 exon_anno_file = GalaxyInputFile(required=TRUE),
43 proteinseq_file = GalaxyInputFile(required=TRUE), 46 proteinseq_file = GalaxyInputFile(required=TRUE),
44 procodingseq_file = GalaxyInputFile(required=TRUE), 47 procodingseq_file = GalaxyInputFile(required=TRUE),
45 ids_file = GalaxyInputFile(required=TRUE), 48 ids_file = GalaxyInputFile(required=TRUE),
46 dbsnpinCoding_file = GalaxyInputFile(required=FALSE), 49 dbsnpinCoding_file = GalaxyInputFile(required=FALSE),
47 cosmic_file = GalaxyInputFile(required=FALSE), 50 cosmic_file = GalaxyInputFile(required=FALSE),
48 annotationFromHistory = GalaxyLogicalParam(required=FALSE), 51 annotationFromHistory = GalaxyLogicalParam(required=FALSE),
49 rpkmCutoff = GalaxyNumericParam(required=TRUE), 52 rpkmCutoff = GalaxyNumericParam(required=TRUE),
50 #outputIndels = GalaxyLogicalParam(required=FALSE), 53 outputIndels = GalaxyLogicalParam(required=FALSE),
51 #outputNovelJunctions = GalaxyLogicalParam(required=FALSE), 54 outputRData = GalaxyLogicalParam(required=FALSE),
52 outputFile = GalaxyOutput("FASTA","fasta")) 55 outputSQLite = GalaxyLogicalParam(required=FALSE)
56 #,outputNovelJunctions = GalaxyLogicalParam(required=FALSE)
57 #,bedFile = GalaxyInputFile(required=FALSE)
58 #,bsGenome = GalaxyCharacterParam(required=FALSE)
59 )
53 { 60 {
61 old <- options(stringsAsFactors = FALSE, gsubfn.engine = "R")
62 on.exit(options(old), add = TRUE)
63
54 file.symlink(exon_anno_file, paste(getwd(), "exon_anno.RData", sep="/")) 64 file.symlink(exon_anno_file, paste(getwd(), "exon_anno.RData", sep="/"))
55 file.symlink(proteinseq_file, paste(getwd(), "proseq.RData", sep="/")) 65 file.symlink(proteinseq_file, paste(getwd(), "proseq.RData", sep="/"))
56 file.symlink(procodingseq_file, paste(getwd(), "procodingseq.RData", sep="/")) 66 file.symlink(procodingseq_file, paste(getwd(), "procodingseq.RData", sep="/"))
57 file.symlink(ids_file, paste(getwd(), "ids.RData", sep="/")) 67 file.symlink(ids_file, paste(getwd(), "ids.RData", sep="/"))
58 68
69 load(exon_anno_file)
70 load(proteinseq_file)
71 load(procodingseq_file)
72 load(ids_file)
73
59 if (length(dbsnpinCoding_file) > 0) 74 if (length(dbsnpinCoding_file) > 0)
60 { 75 {
61 file.symlink(dbsnpinCoding_file, paste(getwd(), "dbsnpinCoding.RData", sep="/")) 76 file.symlink(dbsnpinCoding_file, paste(getwd(), "dbsnpinCoding.RData", sep="/"))
62 labelrsid = T 77 labelrsid = TRUE
78 load(dbsnpinCoding_file)
63 } 79 }
64 else 80 else
65 { 81 {
66 labelrsid = F 82 dbsnpinCoding = NULL
83 labelrsid = FALSE
67 } 84 }
68 85
69 if (length(cosmic_file) > 0) 86 if (length(cosmic_file) > 0)
70 { 87 {
71 file.symlink(cosmic_file, paste(getwd(), "cosmic.RData", sep="/")) 88 file.symlink(cosmic_file, paste(getwd(), "cosmic.RData", sep="/"))
72 cosmic = T 89 use_cosmic = TRUE
90 load(cosmic_file)
73 } 91 }
74 else 92 else
75 { 93 {
76 cosmic = F 94 cosmic = NULL
95 use_cosmic = FALSE
77 } 96 }
78 97
79 bamLink = "input.bam" 98 bamLink = "input.bam"
80 file.symlink(bam_file, bamLink) 99 file.symlink(bam_file, bamLink)
81 file.symlink(bai_file, paste(bamLink, ".bai", sep="")) 100 file.symlink(bai_file, paste(bamLink, ".bai", sep=""))
82 101
83 suppressPackageStartupMessages(library(customProDB)) 102 # load from GitHub until conda package is available
103 download.file("https://github.com/ggrothendieck/sqldf/archive/master.zip", "sqldf.zip", quiet=TRUE)
104 unzip("sqldf.zip")
105 devtools::load_all("sqldf-master")
106
107 # load customProDB from GitHub (NOTE: downloading the zip is faster than cloning the repo with git2r or devtools::install_github)
108 download.file("https://github.com/chambm/customProDB/archive/master.zip", "customProDB.zip", quiet=TRUE)
109 unzip("customProDB.zip")
110 devtools::load_all("customProDB-master")
84 111
85 easyRun(bamFile=bamLink, vcfFile=vcf_file, annotation_path=getwd(), 112 easyRun(bamFile=bamLink, vcfFile=vcf_file, annotation_path=getwd(),
86 rpkm_cutoff=rpkmCutoff, outfile_path=".", outfile_name="output", 113 rpkm_cutoff=rpkmCutoff, outfile_path=".", outfile_name="output",
87 nov_junction=F, INDEL=T, lablersid=labelrsid, COSMIC=cosmic) 114 nov_junction=FALSE, INDEL=outputIndels,
115 lablersid=labelrsid, COSMIC=use_cosmic)
116
117 # save variant annotations to an RData file (needed by proBAMr)
118 if (outputRData || outputSQLite)
119 {
120 variantAnnotation = getVariantAnnotation(vcf_file, ids, exon, proteinseq, procodingseq, dbsnpinCoding, cosmic)
121 if (outputRData) save(variantAnnotation, file="output.rdata")
122 }
123
124 if (outputSQLite)
125 {
126 # create protein-centric variant annotation table (needed by Galaxy-P viewer MVP)
127 varproseq = unique(rbind(variantAnnotation$snvproseq, variantAnnotation$indelproseq))
128 ref_vs_var_seq = sqldf::sqldf("SELECT reference.pro_name, variant.pro_name AS var_pro_name, reference.peptide AS ref_seq, variant.peptide AS var_seq
129 FROM proteinseq reference, varproseq variant
130 WHERE reference.tx_name=variant.tx_name
131 GROUP BY variant.pro_name")
132 getCigarishString = function(ref, var)
133 {
134 a = Biostrings::pairwiseAlignment(ref, var)
135 d = gsub("[A-Z]", "=", Biostrings::compareStrings(a@pattern, a@subject))
136 r = rle(strsplit(d, "")[[1]])
137 gsub("-", "D", gsub("\\+", "I", gsub("\\?", "X", paste0(r$lengths, r$values, collapse=""))))
138 }
139 ref_vs_var_seq$cigar = mapply(FUN=getCigarishString, ref_vs_var_seq$ref_seq, ref_vs_var_seq$var_seq, USE.NAMES=FALSE)
140 ref_vs_var_seq$annotation = substring(ref_vs_var_seq$var_pro_name, stringr::str_length(ref_vs_var_seq$pro_name)+2)
141
142 variant_annotation_sqlite = dbConnect(RSQLite::SQLite(), "output_variant_annotation.sqlite")
143 dbWriteTable(variant_annotation_sqlite,
144 "variant_annotation",
145 sqldf::sqldf("SELECT var_pro_name, pro_name, cigar, annotation FROM ref_vs_var_seq"))
146 DBI::dbExecute(variant_annotation_sqlite, "CREATE INDEX variant_annotation_var_pro_name ON variant_annotation (var_pro_name)")
147
148 # save genomic mapping to a SQLite file (needed by Galaxy-P viewer MVP)
149 exon$cds_start = as.integer(exon$cds_start)
150 exon$cds_end = as.integer(exon$cds_end)
151 genomic_mapping_sqlite = dbConnect(RSQLite::SQLite(), "output_genomic_mapping.sqlite")
152 varprocoding = unique(rbind(variantAnnotation$snvprocoding, variantAnnotation$indelprocoding))
153 dbWriteTable(genomic_mapping_sqlite,
154 "genomic_mapping",
155 sqldf::sqldf("SELECT exon.gene_name, exon.tx_name, varprocoding.pro_name, cds_start, cds_end,
156 chromosome_name AS chr_name, cds_chr_start, cds_chr_end, exon.strand
157 FROM exon, varprocoding
158 WHERE exon.tx_id=varprocoding.tx_id AND cds_chr_start > 0
159 GROUP BY exon.tx_id, rank
160 UNION
161 SELECT gene_name, tx_name, pro_name, cds_start, cds_end,
162 chromosome_name AS chr_name, cds_chr_start, cds_chr_end, exon.strand
163 FROM exon
164 WHERE cds_chr_start > 0
165 GROUP BY tx_id, rank"))
166 DBI::dbExecute(genomic_mapping_sqlite, "CREATE INDEX genomic_mapping_pro_name ON genomic_mapping (pro_name)")
167 }
168
169 invisible(NULL)
88 } 170 }
89 171
90 172
91 params <- list() 173 params <- list()
92 for(param in names(opt)) 174 for(param in names(opt))