diff data_manager/customProDB_annotation.R @ 1:9b4ee836e35b draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
author galaxyp
date Thu, 08 Jun 2017 10:55:08 -0400
parents 45755942ae7b
children 0a9ffebba65d
line wrap: on
line diff
--- a/data_manager/customProDB_annotation.R	Tue Mar 14 14:11:55 2017 -0400
+++ b/data_manager/customProDB_annotation.R	Thu Jun 08 10:55:08 2017 -0400
@@ -12,10 +12,13 @@
 
 suppressPackageStartupMessages(library("optparse"))
 suppressPackageStartupMessages(library("RGalaxy"))
+suppressPackageStartupMessages(library("GetoptLong"))
 
 
 option_list <- list()
 option_list$dbkey <- make_option('--dbkey', type='character')
+option_list$ensembl_host <- make_option('--ensembl_host', type='character')
+option_list$ensembl_dataset <- make_option('--ensembl_dataset', type='character')
 option_list$dbsnp <- make_option('--dbsnp', type='character')
 option_list$cosmic <- make_option('--cosmic', type='logical')
 option_list$outputFile <- make_option('--outputFile', type='character')
@@ -25,20 +28,29 @@
 
 
 customProDB_annotation <- function(
-	dbkey = GalaxyCharacterParam(required=TRUE), 
+	dbkey = GalaxyCharacterParam(required=FALSE), 
+	ensembl_host = GalaxyCharacterParam(required=FALSE), 
+	ensembl_dataset = GalaxyCharacterParam(required=FALSE), 
 	dbsnp_str = GalaxyCharacterParam(required=FALSE), 
 	cosmic = GalaxyLogicalParam(required=FALSE), 
 	dbkey_description = GalaxyCharacterParam(required=FALSE), 
 	outputFile = GalaxyOutput("output","json"))
 {
+    options(stringsAsFactors = FALSE, gsubfn.engine = "R")
+
     if (!file.exists(outputFile))
     {
         gstop("json params file does not exist")
     }
 
-    if (length(dbkey_description) < 1)
+    if (length(dbkey)+length(ensembl_dataset)+length(ensembl_host) == 0)
     {
-        dbkey_description = dbkey
+        gstop("one of the genome annotation sources must be specified; either dbkey or host and dataset")
+    }
+    else if (length(dbkey) > 0 &&
+             (length(ensembl_dataset) > 0 || length(ensembl_host) > 0))
+    {
+        gstop("only one genome annotation source can be specified; either dbkey or host and dataset")
     }
 
     if (length(dbsnp_str) > 0)
@@ -53,7 +65,8 @@
     use_cosmic = FALSE
     if (length(cosmic) > 0)
     {
-        if (grepl("^hg", dbkey))
+        if (length(dbkey) > 0 && grepl("^hg", dbkey) ||
+            length(ensembl_dataset) > 0 && grepl("^hsapiens", ensembl_dataset))
         {
             use_cosmic = TRUE
         }
@@ -76,26 +89,96 @@
         gstop("failed to remove json params file after reading")
     })
 
-    ucscTableCodingFastaURL = paste("http://genome.ucsc.edu/cgi-bin/hgTables?db=", dbkey, "&hgSeq.cdsExon=on&hgSeq.granularity=gene&hgSeq.casing=exon&hgSeq.repMasking=lower&hgta_doGenomicDna=get+sequence&hgta_group=genes&hgta_track=refGene&hgta_table=refGene&hgta_regionType=genome", sep="")
-    ucscTableProteinFastaURL = paste("http://genome.ucsc.edu/cgi-bin/hgTables?db=", dbkey, "&hgta_geneSeqType=protein&hgta_doGenePredSequence=submit&hgta_track=refGene&hgta_table=refGene", sep="")
-    codingFastaFilepath = paste(target_directory, "/", dbkey, ".cds.fa", sep="")
-    proteinFastaFilepath = paste(target_directory, "/", dbkey, ".protein.fa", sep="")
+    # load customProDB from GitHub (NOTE: downloading the zip is faster than cloning the repo with git2r or devtools::install_github)
+    download.file("https://github.com/chambm/customProDB/archive/c57e5498392197bc598a18c26acb70d7530a921cc57e5498.zip", "customProDB.zip", quiet=TRUE)
+    unzip("customProDB.zip")
+    devtools::load_all("customProDB-c57e5498392197bc598a18c26acb70d7530a921c")
 
-    suppressPackageStartupMessages(library(customProDB))
+    #suppressPackageStartupMessages(library(customProDB))
     options(timeout=3600)
 
-    cat(paste("Downloading coding FASTA from:", ucscTableCodingFastaURL, "\n"))
-    download.file(ucscTableCodingFastaURL, codingFastaFilepath, quiet=T, mode='wb')
+    # download protein and coding sequences for UCSC annotation
+    if (length(dbkey) > 0)
+    {
+        proteinFastaFilepath = paste(dbkey, ".protein.fa", sep="")
+
+        cat(paste("Downloading protein FASTA from:", getProteinFastaUrlFromUCSC(dbkey), "\n"))
+        download.file(getProteinFastaUrlFromUCSC(dbkey), proteinFastaFilepath, quiet=T, mode='wb')
+
+        local_cache_path = paste0("customProDB_annotation_", dbkey, "-", tools::md5sum(proteinFastaFilepath)[[1]])
+        codingFastaFilepath = paste0(local_cache_path, "/", dbkey, ".cds.fa")
+        dir.create(local_cache_path, showWarnings=FALSE)
+
+        if (!file.exists(codingFastaFilepath)) {
+            cat(paste("Downloading coding FASTA from:", getCodingFastaUrlFromUCSC(dbkey), "\n"))
+            download.file(getCodingFastaUrlFromUCSC(dbkey), codingFastaFilepath, quiet=T, mode='wb')
+        }
+
+        cat(paste("Preparing Refseq annotation files\n"))
+        PrepareAnnotationRefseq(genome=dbkey, CDSfasta=codingFastaFilepath, pepfasta=proteinFastaFilepath, annotation_path=target_directory, dbsnp=dbsnp, COSMIC=use_cosmic, local_cache_path=local_cache_path)
 
-    cat(paste("Downloading protein FASTA from:", ucscTableProteinFastaURL, "\n"))
-    download.file(ucscTableProteinFastaURL, proteinFastaFilepath, quiet=T, mode='wb')
+        if (length(dbkey_description) < 1)
+        {
+            dbkey_description = dbkey
+        }
+    }
+    else
+    {
+        local_cache_path = paste0("customProDB_annotation_", ensembl_dataset, "_", ensembl_host)
+
+        suppressPackageStartupMessages(library(biomaRt))
+        cat(paste("Preparing Ensembl annotation files\n"))
+        ensembl_mart = useMart("ENSEMBL_MART_ENSEMBL", dataset=ensembl_dataset, host=ensembl_host)
+        PrepareAnnotationEnsembl(mart=ensembl_mart, annotation_path=target_directory, dbsnp=dbsnp, COSMIC=use_cosmic, local_cache_path=local_cache_path)
+
+        metadata = sqldf::sqldf("SELECT value FROM metadata WHERE name='BioMart database version' OR name='BioMart dataset description' OR name='BioMart dataset version'",
+                                dbname=file.path(target_directory, "txdb.sqlite"))
+        version = metadata$value[1] # Ensembl Genes 87
+        assembly = metadata$value[3]
+        dbkey = paste0(ensembl_dataset, "_", sub(".*?(\\d+)", "\\1", version, perl=TRUE))
 
-    cat(paste("Preparing Refseq annotation files\n"))
-    customProDB::PrepareAnnotationRefseq(genome=dbkey, CDSfasta=codingFastaFilepath, pepfasta=proteinFastaFilepath, annotation_path=target_directory, dbsnp=dbsnp, COSMIC=use_cosmic)
-    
-    outputPath = paste(dbkey, "/customProDB", sep="")
+        # convert Ensembl chromosome names to UCSC for Galaxy compatibility
+        chromosomeMappingsBaseUrl = "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master"
+        assemblyNoGrcPatch = sub("(\\S+?)(\\.p\\S+)?$", "\\1", assembly, perl=TRUE)
+        chromosomeMappingsUrl = qq("@{chromosomeMappingsBaseUrl}/@{assemblyNoGrcPatch}_ensembl2UCSC.txt")
+        if (RCurl::url.exists(chromosomeMappingsUrl))
+        {
+            cat(qq("Converting Ensembl chromosome names from: @{chromosomeMappingsUrl}\n"))
+            e2u = read.delim(chromosomeMappingsUrl, header=FALSE, col.names=c("ensembl", "ucsc"))
+            e2u = setNames(as.list(e2u$ucsc), e2u$ensembl)
+            load(file.path(target_directory, "exon_anno.RData"))
+            exon$chromosome_name = sapply(exon$chromosome_name, function(x) e2u[[as.character(x)]])
+            exon = exon[nzchar(exon$chromosome_name), ] # omit genome patches with no mapping
+            save(exon, file=file.path(target_directory, "exon_anno.RData"))
+        }
+        else
+        {
+            gwarning(qq("unable to convert Ensembl chromosome names to UCSC; mapping file @{assemblyNoGrcPatch}_ensembl2UCSC.txt does not exist"))
+        }
+
+        if (length(dbkey_description) < 1)
+        {
+            dbkey_description = qq("@{ensembl_dataset} (@{version}) (@{assembly})")
+        }
+    }
+
+    qualified_dbkey = dbkey
+
+    if (length(dbsnp_str) > 0 && nzchar(dbsnp_str))
+    {
+        qualified_dbkey = qq("@{qualified_dbkey}_db@{dbsnp_str}")
+        dbkey_description = qq("@{dbkey_description} (db@{dbsnp_str})")
+    }
+
+    if (length(cosmic) > 0)
+    {
+        qualified_dbkey = qq("@{qualified_dbkey}_cosmic")
+        dbkey_description = qq("@{dbkey_description} (COSMIC)")
+    }
+
+    outputPath = paste0(qualified_dbkey, "/customProDB")
     output = list(data_tables = list())
-    output[["data_tables"]][["customProDB"]]=c(path=outputPath, name=dbkey_description, dbkey=dbkey, value=dbkey)
+    output[["data_tables"]][["customProDB"]]=c(path=outputPath, name=dbkey_description, dbkey=qualified_dbkey, value=qualified_dbkey)
     write(toJSON(output), file=outputFile)
 }