diff customProDB.R @ 1:ad130eaa3a05 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit e025f5b4d590c44537cf0702e2fb040a28f98fec
author galaxyp
date Fri, 12 May 2017 13:17:40 -0400
parents 8ccfff69dd57
children 2cba79e6037e
line wrap: on
line diff
--- a/customProDB.R	Tue Mar 14 14:14:38 2017 -0400
+++ b/customProDB.R	Fri May 12 13:17:40 2017 -0400
@@ -27,64 +27,146 @@
 option_list$cosmic <- make_option('--cosmic', type='character')
 option_list$annotationFromHistory <- make_option('--annotationFromHistory', type='logical', action="store_true", default=FALSE)
 option_list$rpkmCutoff <- make_option('--rpkmCutoff', type='character')
-#option_list$outputIndels <- make_option('--outputIndels', type='logical', action="store_true", default=FALSE)
+option_list$outputIndels <- make_option('--outputIndels', type='logical', action="store_true", default=FALSE)
 #option_list$outputNovelJunctions <- make_option('--outputNovelJunctions', type='logical', action="store_true", default=FALSE)
-option_list$outputFile <- make_option('--outputFile', type='character')
+#option_list$bedFile <- make_option('--bedFile', type='character')
+#option_list$bsGenome <- make_option('--bsGenome', type='character')
+option_list$outputRData <- make_option('--outputRData', type='logical', action="store_true", default=FALSE)
+option_list$outputSQLite <- make_option('--outputSQLite', type='logical', action="store_true", default=FALSE)
 
 
 opt <- parse_args(OptionParser(option_list=option_list))
 
 
 customProDB <- function(
-	bam_file = GalaxyInputFile(required=TRUE), 
-	bai_file = GalaxyInputFile(required=TRUE), 
-	vcf_file = GalaxyInputFile(required=TRUE), 
-	exon_anno_file = GalaxyInputFile(required=TRUE),
-	proteinseq_file = GalaxyInputFile(required=TRUE),
-	procodingseq_file = GalaxyInputFile(required=TRUE),
-	ids_file = GalaxyInputFile(required=TRUE),
-	dbsnpinCoding_file = GalaxyInputFile(required=FALSE),
-	cosmic_file = GalaxyInputFile(required=FALSE),
-	annotationFromHistory = GalaxyLogicalParam(required=FALSE),
-	rpkmCutoff = GalaxyNumericParam(required=TRUE),
-	#outputIndels = GalaxyLogicalParam(required=FALSE),
-	#outputNovelJunctions = GalaxyLogicalParam(required=FALSE),
-	outputFile = GalaxyOutput("FASTA","fasta"))
+    bam_file = GalaxyInputFile(required=TRUE), 
+    bai_file = GalaxyInputFile(required=TRUE), 
+    vcf_file = GalaxyInputFile(required=TRUE), 
+    exon_anno_file = GalaxyInputFile(required=TRUE),
+    proteinseq_file = GalaxyInputFile(required=TRUE),
+    procodingseq_file = GalaxyInputFile(required=TRUE),
+    ids_file = GalaxyInputFile(required=TRUE),
+    dbsnpinCoding_file = GalaxyInputFile(required=FALSE),
+    cosmic_file = GalaxyInputFile(required=FALSE),
+    annotationFromHistory = GalaxyLogicalParam(required=FALSE),
+    rpkmCutoff = GalaxyNumericParam(required=TRUE),
+    outputIndels = GalaxyLogicalParam(required=FALSE),
+    outputRData = GalaxyLogicalParam(required=FALSE),
+    outputSQLite = GalaxyLogicalParam(required=FALSE)
+    #,outputNovelJunctions = GalaxyLogicalParam(required=FALSE)
+    #,bedFile = GalaxyInputFile(required=FALSE)
+    #,bsGenome = GalaxyCharacterParam(required=FALSE)
+    )
 {
+    old <- options(stringsAsFactors = FALSE, gsubfn.engine = "R")
+    on.exit(options(old), add = TRUE)
+
     file.symlink(exon_anno_file, paste(getwd(), "exon_anno.RData", sep="/"))
     file.symlink(proteinseq_file, paste(getwd(), "proseq.RData", sep="/"))
     file.symlink(procodingseq_file, paste(getwd(), "procodingseq.RData", sep="/"))
     file.symlink(ids_file, paste(getwd(), "ids.RData", sep="/"))
 
+    load(exon_anno_file)
+    load(proteinseq_file)
+    load(procodingseq_file)
+    load(ids_file)
+
     if (length(dbsnpinCoding_file) > 0)
     {
         file.symlink(dbsnpinCoding_file, paste(getwd(), "dbsnpinCoding.RData", sep="/"))
-        labelrsid = T
+        labelrsid = TRUE
+        load(dbsnpinCoding_file)
     }
     else
     {
-        labelrsid = F
+        dbsnpinCoding = NULL
+        labelrsid = FALSE
     }
 
     if (length(cosmic_file) > 0)
     {
         file.symlink(cosmic_file, paste(getwd(), "cosmic.RData", sep="/"))
-        cosmic = T
+        use_cosmic = TRUE
+        load(cosmic_file)
     }
     else
     {
-        cosmic = F
+        cosmic = NULL
+        use_cosmic = FALSE
     }
 
     bamLink = "input.bam"
     file.symlink(bam_file, bamLink)
     file.symlink(bai_file, paste(bamLink, ".bai", sep=""))
 
-    suppressPackageStartupMessages(library(customProDB))
+    # load from GitHub until conda package is available
+    download.file("https://github.com/ggrothendieck/sqldf/archive/master.zip", "sqldf.zip", quiet=TRUE)
+    unzip("sqldf.zip")
+    devtools::load_all("sqldf-master")
+
+    # load customProDB from GitHub (NOTE: downloading the zip is faster than cloning the repo with git2r or devtools::install_github)
+    download.file("https://github.com/chambm/customProDB/archive/master.zip", "customProDB.zip", quiet=TRUE)
+    unzip("customProDB.zip")
+    devtools::load_all("customProDB-master")
 
     easyRun(bamFile=bamLink, vcfFile=vcf_file, annotation_path=getwd(),
             rpkm_cutoff=rpkmCutoff, outfile_path=".", outfile_name="output",
-            nov_junction=F, INDEL=T, lablersid=labelrsid, COSMIC=cosmic)
+            nov_junction=FALSE, INDEL=outputIndels,
+            lablersid=labelrsid, COSMIC=use_cosmic)
+
+    # save variant annotations to an RData file (needed by proBAMr)
+    if (outputRData || outputSQLite)
+    {
+        variantAnnotation = getVariantAnnotation(vcf_file, ids, exon, proteinseq, procodingseq, dbsnpinCoding, cosmic)
+        if (outputRData) save(variantAnnotation, file="output.rdata")
+    }
+
+    if (outputSQLite)
+    {
+        # create protein-centric variant annotation table (needed by Galaxy-P viewer MVP)
+        varproseq = unique(rbind(variantAnnotation$snvproseq, variantAnnotation$indelproseq))
+        ref_vs_var_seq = sqldf::sqldf("SELECT reference.pro_name, variant.pro_name AS var_pro_name, reference.peptide AS ref_seq, variant.peptide AS var_seq
+                                       FROM proteinseq reference, varproseq variant
+                                       WHERE reference.tx_name=variant.tx_name
+                                       GROUP BY variant.pro_name")
+        getCigarishString = function(ref, var)
+        {
+            a = Biostrings::pairwiseAlignment(ref, var)
+            d = gsub("[A-Z]", "=", Biostrings::compareStrings(a@pattern, a@subject))
+            r = rle(strsplit(d, "")[[1]])
+            gsub("-", "D", gsub("\\+", "I", gsub("\\?", "X", paste0(r$lengths, r$values, collapse=""))))
+        }
+        ref_vs_var_seq$cigar =  mapply(FUN=getCigarishString, ref_vs_var_seq$ref_seq, ref_vs_var_seq$var_seq, USE.NAMES=FALSE)
+        ref_vs_var_seq$annotation = substring(ref_vs_var_seq$var_pro_name, stringr::str_length(ref_vs_var_seq$pro_name)+2)
+
+        variant_annotation_sqlite = dbConnect(RSQLite::SQLite(), "output_variant_annotation.sqlite")
+        dbWriteTable(variant_annotation_sqlite,
+                     "variant_annotation",
+                     sqldf::sqldf("SELECT var_pro_name, pro_name, cigar, annotation FROM ref_vs_var_seq"))
+        DBI::dbExecute(variant_annotation_sqlite, "CREATE INDEX variant_annotation_var_pro_name ON variant_annotation (var_pro_name)")
+
+        # save genomic mapping to a SQLite file (needed by Galaxy-P viewer MVP)
+        exon$cds_start = as.integer(exon$cds_start)
+        exon$cds_end = as.integer(exon$cds_end)
+        genomic_mapping_sqlite = dbConnect(RSQLite::SQLite(), "output_genomic_mapping.sqlite")
+        varprocoding = unique(rbind(variantAnnotation$snvprocoding, variantAnnotation$indelprocoding))
+        dbWriteTable(genomic_mapping_sqlite,
+                     "genomic_mapping",
+                     sqldf::sqldf("SELECT exon.gene_name, exon.tx_name, varprocoding.pro_name, cds_start, cds_end,
+                                          chromosome_name AS chr_name, cds_chr_start, cds_chr_end, exon.strand
+                                  FROM exon, varprocoding
+                                  WHERE exon.tx_id=varprocoding.tx_id AND cds_chr_start > 0
+                                  GROUP BY exon.tx_id, rank
+                                  UNION
+                                  SELECT gene_name, tx_name, pro_name, cds_start, cds_end,
+                                         chromosome_name AS chr_name, cds_chr_start, cds_chr_end, exon.strand
+                                  FROM exon
+                                  WHERE cds_chr_start > 0
+                                  GROUP BY tx_id, rank"))
+        DBI::dbExecute(genomic_mapping_sqlite, "CREATE INDEX genomic_mapping_pro_name ON genomic_mapping (pro_name)")
+    }
+
+    invisible(NULL)
 }