Mercurial > repos > galaxyp > custom_pro_db_annotation_data_manager
changeset 1:9b4ee836e35b draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
author | galaxyp |
---|---|
date | Thu, 08 Jun 2017 10:55:08 -0400 |
parents | 45755942ae7b |
children | 0a9ffebba65d |
files | data_manager/customProDB_annotation.R data_manager/customProDB_annotation.xml ensembl_datasets.loc.sample tool-data/update_ensembl_datasets.R |
diffstat | 4 files changed, 331 insertions(+), 29 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/customProDB_annotation.R Tue Mar 14 14:11:55 2017 -0400 +++ b/data_manager/customProDB_annotation.R Thu Jun 08 10:55:08 2017 -0400 @@ -12,10 +12,13 @@ suppressPackageStartupMessages(library("optparse")) suppressPackageStartupMessages(library("RGalaxy")) +suppressPackageStartupMessages(library("GetoptLong")) option_list <- list() option_list$dbkey <- make_option('--dbkey', type='character') +option_list$ensembl_host <- make_option('--ensembl_host', type='character') +option_list$ensembl_dataset <- make_option('--ensembl_dataset', type='character') option_list$dbsnp <- make_option('--dbsnp', type='character') option_list$cosmic <- make_option('--cosmic', type='logical') option_list$outputFile <- make_option('--outputFile', type='character') @@ -25,20 +28,29 @@ customProDB_annotation <- function( - dbkey = GalaxyCharacterParam(required=TRUE), + dbkey = GalaxyCharacterParam(required=FALSE), + ensembl_host = GalaxyCharacterParam(required=FALSE), + ensembl_dataset = GalaxyCharacterParam(required=FALSE), dbsnp_str = GalaxyCharacterParam(required=FALSE), cosmic = GalaxyLogicalParam(required=FALSE), dbkey_description = GalaxyCharacterParam(required=FALSE), outputFile = GalaxyOutput("output","json")) { + options(stringsAsFactors = FALSE, gsubfn.engine = "R") + if (!file.exists(outputFile)) { gstop("json params file does not exist") } - if (length(dbkey_description) < 1) + if (length(dbkey)+length(ensembl_dataset)+length(ensembl_host) == 0) { - dbkey_description = dbkey + gstop("one of the genome annotation sources must be specified; either dbkey or host and dataset") + } + else if (length(dbkey) > 0 && + (length(ensembl_dataset) > 0 || length(ensembl_host) > 0)) + { + gstop("only one genome annotation source can be specified; either dbkey or host and dataset") } if (length(dbsnp_str) > 0) @@ -53,7 +65,8 @@ use_cosmic = FALSE if (length(cosmic) > 0) { - if (grepl("^hg", dbkey)) + if (length(dbkey) > 0 && grepl("^hg", dbkey) || + length(ensembl_dataset) > 0 
&& grepl("^hsapiens", ensembl_dataset)) { use_cosmic = TRUE } @@ -76,26 +89,96 @@ gstop("failed to remove json params file after reading") }) - ucscTableCodingFastaURL = paste("http://genome.ucsc.edu/cgi-bin/hgTables?db=", dbkey, "&hgSeq.cdsExon=on&hgSeq.granularity=gene&hgSeq.casing=exon&hgSeq.repMasking=lower&hgta_doGenomicDna=get+sequence&hgta_group=genes&hgta_track=refGene&hgta_table=refGene&hgta_regionType=genome", sep="") - ucscTableProteinFastaURL = paste("http://genome.ucsc.edu/cgi-bin/hgTables?db=", dbkey, "&hgta_geneSeqType=protein&hgta_doGenePredSequence=submit&hgta_track=refGene&hgta_table=refGene", sep="") - codingFastaFilepath = paste(target_directory, "/", dbkey, ".cds.fa", sep="") - proteinFastaFilepath = paste(target_directory, "/", dbkey, ".protein.fa", sep="") + # load customProDB from GitHub (NOTE: downloading the zip is faster than cloning the repo with git2r or devtools::install_github) + download.file("https://github.com/chambm/customProDB/archive/c57e5498392197bc598a18c26acb70d7530a921cc57e5498.zip", "customProDB.zip", quiet=TRUE) + unzip("customProDB.zip") + devtools::load_all("customProDB-c57e5498392197bc598a18c26acb70d7530a921c") - suppressPackageStartupMessages(library(customProDB)) + #suppressPackageStartupMessages(library(customProDB)) options(timeout=3600) - cat(paste("Downloading coding FASTA from:", ucscTableCodingFastaURL, "\n")) - download.file(ucscTableCodingFastaURL, codingFastaFilepath, quiet=T, mode='wb') + # download protein and coding sequences for UCSC annotation + if (length(dbkey) > 0) + { + proteinFastaFilepath = paste(dbkey, ".protein.fa", sep="") + + cat(paste("Downloading protein FASTA from:", getProteinFastaUrlFromUCSC(dbkey), "\n")) + download.file(getProteinFastaUrlFromUCSC(dbkey), proteinFastaFilepath, quiet=T, mode='wb') + + local_cache_path = paste0("customProDB_annotation_", dbkey, "-", tools::md5sum(proteinFastaFilepath)[[1]]) + codingFastaFilepath = paste0(local_cache_path, "/", dbkey, ".cds.fa") + 
dir.create(local_cache_path, showWarnings=FALSE) + + if (!file.exists(codingFastaFilepath)) { + cat(paste("Downloading coding FASTA from:", getCodingFastaUrlFromUCSC(dbkey), "\n")) + download.file(getCodingFastaUrlFromUCSC(dbkey), codingFastaFilepath, quiet=T, mode='wb') + } + + cat(paste("Preparing Refseq annotation files\n")) + PrepareAnnotationRefseq(genome=dbkey, CDSfasta=codingFastaFilepath, pepfasta=proteinFastaFilepath, annotation_path=target_directory, dbsnp=dbsnp, COSMIC=use_cosmic, local_cache_path=local_cache_path) - cat(paste("Downloading protein FASTA from:", ucscTableProteinFastaURL, "\n")) - download.file(ucscTableProteinFastaURL, proteinFastaFilepath, quiet=T, mode='wb') + if (length(dbkey_description) < 1) + { + dbkey_description = dbkey + } + } + else + { + local_cache_path = paste0("customProDB_annotation_", ensembl_dataset, "_", ensembl_host) + + suppressPackageStartupMessages(library(biomaRt)) + cat(paste("Preparing Ensembl annotation files\n")) + ensembl_mart = useMart("ENSEMBL_MART_ENSEMBL", dataset=ensembl_dataset, host=ensembl_host) + PrepareAnnotationEnsembl(mart=ensembl_mart, annotation_path=target_directory, dbsnp=dbsnp, COSMIC=use_cosmic, local_cache_path=local_cache_path) + + metadata = sqldf::sqldf("SELECT value FROM metadata WHERE name='BioMart database version' OR name='BioMart dataset description' OR name='BioMart dataset version'", + dbname=file.path(target_directory, "txdb.sqlite")) + version = metadata$value[1] # Ensembl Genes 87 + assembly = metadata$value[3] + dbkey = paste0(ensembl_dataset, "_", sub(".*?(\\d+)", "\\1", version, perl=TRUE)) - cat(paste("Preparing Refseq annotation files\n")) - customProDB::PrepareAnnotationRefseq(genome=dbkey, CDSfasta=codingFastaFilepath, pepfasta=proteinFastaFilepath, annotation_path=target_directory, dbsnp=dbsnp, COSMIC=use_cosmic) - - outputPath = paste(dbkey, "/customProDB", sep="") + # convert Ensembl chromosome names to UCSC for Galaxy compatibility + chromosomeMappingsBaseUrl = 
"https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master" + assemblyNoGrcPatch = sub("(\\S+?)(\\.p\\S+)?$", "\\1", assembly, perl=TRUE) + chromosomeMappingsUrl = qq("@{chromosomeMappingsBaseUrl}/@{assemblyNoGrcPatch}_ensembl2UCSC.txt") + if (RCurl::url.exists(chromosomeMappingsUrl)) + { + cat(qq("Converting Ensembl chromosome names from: @{chromosomeMappingsUrl}\n")) + e2u = read.delim(chromosomeMappingsUrl, header=FALSE, col.names=c("ensembl", "ucsc")) + e2u = setNames(as.list(e2u$ucsc), e2u$ensembl) + load(file.path(target_directory, "exon_anno.RData")) + exon$chromosome_name = sapply(exon$chromosome_name, function(x) e2u[[as.character(x)]]) + exon = exon[nzchar(exon$chromosome_name), ] # omit genome patches with no mapping + save(exon, file=file.path(target_directory, "exon_anno.RData")) + } + else + { + gwarning(qq("unable to convert Ensembl chromosome names to UCSC; mapping file @{assemblyNoGrcPatch}_ensembl2UCSC.txt does not exist")) + } + + if (length(dbkey_description) < 1) + { + dbkey_description = qq("@{ensembl_dataset} (@{version}) (@{assembly})") + } + } + + qualified_dbkey = dbkey + + if (length(dbsnp_str) > 0 && nzchar(dbsnp_str)) + { + qualified_dbkey = qq("@{qualified_dbkey}_db@{dbsnp_str}") + dbkey_description = qq("@{dbkey_description} (db@{dbsnp_str})") + } + + if (length(cosmic) > 0) + { + qualified_dbkey = qq("@{qualified_dbkey}_cosmic") + dbkey_description = qq("@{dbkey_description} (COSMIC)") + } + + outputPath = paste0(qualified_dbkey, "/customProDB") output = list(data_tables = list()) - output[["data_tables"]][["customProDB"]]=c(path=outputPath, name=dbkey_description, dbkey=dbkey, value=dbkey) + output[["data_tables"]][["customProDB"]]=c(path=outputPath, name=dbkey_description, dbkey=qualified_dbkey, value=qualified_dbkey) write(toJSON(output), file=outputFile) }
--- a/data_manager/customProDB_annotation.xml Tue Mar 14 14:11:55 2017 -0400 +++ b/data_manager/customProDB_annotation.xml Thu Jun 08 10:55:08 2017 -0400 @@ -1,7 +1,25 @@ -<tool id="custom_pro_db_annotation_data_manager" name="CustomProDB Annotation" tool_type="manage_data" version="0.0.1"> +<tool id="custom_pro_db_annotation_data_manager" name="CustomProDB Annotation" tool_type="manage_data" version="1.16.1.0"> <description>builder</description> <requirements> - <requirement type="package" version="1.14.0">bioconductor-customprodb</requirement> + <requirement type="package" version="3.3.1">r-base</requirement> + <!--<requirement type="package" version="1.14.0">bioconductor-customprodb</requirement>--> + <requirement type="package" version="1.18.0">bioconductor-rgalaxy</requirement> + <requirement type="package" version="1.21.0">bioconductor-biocinstaller</requirement> + <requirement type="package" version="1.20.3">bioconductor-variantannotation</requirement> + <requirement type="package" version="1.26.4">bioconductor-genomicfeatures</requirement> + <requirement type="package" version="1.11.1">r-devtools</requirement> + <requirement type="package" version="3.98_1.4">r-xml</requirement> + <requirement type="package" version="0.10.11">r-rmysql</requirement> + <requirement type="package" version="1.0.2">r-testthat</requirement> + <requirement type="package" version="0.1.0">r-getoptlong</requirement> + <requirement type="package" version="1.1.2">r-stringi</requirement> + <requirement type="package" version="1.1.0">r-stringr</requirement> + <requirement type="package" version="1.10.0">r-data.table</requirement> + <requirement type="package" version="0.4_10">r-sqldf</requirement> + <requirement type="package" version="0.6_6">r-gsubfn</requirement> + <requirement type="package" version="2.3_47">r-chron</requirement> + <requirement type="package" version="0.3_10">r-proto</requirement> + <requirement type="package" version="1.8.4">r-plyr</requirement> </requirements> 
<stdio> <exit_code range=":-1" /> @@ -9,17 +27,43 @@ </stdio> <command><![CDATA[ Rscript --vanilla '$__tool_directory__/customProDB_annotation.R' - --outputFile '${out_file}' - --dbkey '${dbkey}' - --dbsnp '${dbsnp}' - $cosmic - --dbkey_description '${ dbkey.get_display_text() }' - 2>1 + --outputFile '${out_file}' + + #if str($transcriptome_annotation.source) == 'refseq': + --dbkey '${transcriptome_annotation.dbkey}' + --dbkey_description '${ transcriptome_annotation.dbkey.get_display_text().strip("\"'") }' + #else: + --ensembl_dataset '${transcriptome_annotation.ensembl_dataset.fields.dataset}' + --ensembl_host '${transcriptome_annotation.ensembl_dataset.fields.host}' + --dbkey_description '${transcriptome_annotation.ensembl_dataset.fields.name}' + #end if + + --dbsnp '${dbsnp}' + $cosmic + 2>&1 ]]> </command> <inputs> - <param type="genomebuild" name="dbkey" value="" label="UCSC dbKey for reference genome" /> - <param type="text" name="dbsnp" value="" label="dbSNP identifier currently available from UCSC" help="e.g. 'snp142'" /> + <conditional name="transcriptome_annotation"> + <param name="source" type="select" label="What source do you want to use for mapping between genes, transcripts, and proteins?" help="RefSeq transcripts are like NM_xxxx, Ensembl transcripts are like ENSTxxxx"> + <option value="refseq">Annotate transcriptome with RefSeq (from NCBI/UCSC)</option> + <option value="ensembl">Annotate transcriptome with Ensembl (from Biomart)</option> + </param> + <when value="refseq"> + <param type="genomebuild" name="dbkey" value="" label="UCSC dbKey for reference genome" help="e.g. 
hg38, hg19, mm10, canFam31" /> + </when> + <when value="ensembl"> + <param type="select" name="ensembl_dataset" value="" label="Ensembl reference genome identifier"> + <options from_file="ensembl_datasets.loc"> + <column name="value" index="2" /> + <column name="dataset" index="0" /> + <column name="host" index="1" /> + <validator type="no_options" message="Ensembl dataset list not loaded"/> + </options> + </param> + </when> + </conditional> + <param type="text" name="dbsnp" value="" label="dbSNP identifier (select organisms only, e.g. human, mouse, cow)" help="e.g. snp142" /> <param type="boolean" name="cosmic" truevalue="--cosmic true" falsevalue="" label="Annotate somatic SNPs from COSMIC (human only)" /> </inputs> <outputs> @@ -29,8 +73,12 @@ .. class:: infomark -**Notice:** If you leave name, description, or id blank, it will be generated automatically. +This data manager creates the transcriptome annotation in the RData format needed by customProDB. +Two annotation sources are supported: UCSC and Ensembl. +Note that because UCSC's table browser only provides current gene annotations for a given genome assembly, +only the Ensembl annotation is entirely reproducible, i.e. running again with the same settings next month will create the same annotation. +Ensembl chromosome names (1,2, ...) are converted to UCSC format (chr1,chr2, ...) to ease integration with other Galaxy tools. </help> <citations> <citation type="doi">10.1093/bioinformatics/btt543</citation>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ensembl_datasets.loc.sample Thu Jun 08 10:55:08 2017 -0400 @@ -0,0 +1,113 @@ +aaegypti_gene_ensembl may2009.archive.ensembl.org A.aegypti genes (Ensembl 54 aaegypti) (AaegL1) +amelanoleuca_gene_ensembl may2017.archive.ensembl.org Panda genes (Ensembl 89 amelanoleuca) (ailMel1) +aplatyrhynchos_gene_ensembl may2017.archive.ensembl.org Duck genes (Ensembl 89 aplatyrhynchos) (BGI_duck_1.0) +acarolinensis_gene_ensembl may2017.archive.ensembl.org Anole lizard genes (Ensembl 89 acarolinensis) (AnoCar2.0) +acarolinensis_gene_ensembl may2009.archive.ensembl.org Anole lizard genes (Ensembl 54 acarolinensis) (AnoCar1.0) +agambiae_gene_ensembl may2009.archive.ensembl.org Mosquito genes (Ensembl 54 agambiae) (AgamP3) +amexicanus_gene_ensembl may2017.archive.ensembl.org Cave fish genes (Ensembl 89 amexicanus) (AstMex102) +btaurus_gene_ensembl may2017.archive.ensembl.org Cow genes (Ensembl 89 btaurus) (UMD3.1) +btaurus_gene_ensembl may2009.archive.ensembl.org Cow genes (Ensembl 54 btaurus) (Btau_4.0) +celegans_gene_ensembl may2017.archive.ensembl.org C.elegans genes (Ensembl 89 celegans) (WBcel235) +celegans_gene_ensembl may2012.archive.ensembl.org C.elegans genes (Ensembl 67 celegans) (WBcel215) +celegans_gene_ensembl may2009.archive.ensembl.org C.elegans genes (Ensembl 54 celegans) (WS190) +cjacchus_gene_ensembl may2017.archive.ensembl.org Marmoset genes (Ensembl 89 cjacchus) (C_jacchus3.2.1) +cfamiliaris_gene_ensembl may2017.archive.ensembl.org Dog genes (Ensembl 89 cfamiliaris) (CanFam3.1) +cfamiliaris_gene_ensembl may2012.archive.ensembl.org Dog genes (Ensembl 67 cfamiliaris) (CanFam 2.0) +tsyrichta_gene_ensembl may2017.archive.ensembl.org Tarsier genes (Ensembl 89 tsyrichta) (tarSyr1) +cporcellus_gene_ensembl may2017.archive.ensembl.org Guinea Pig genes (Ensembl 89 cporcellus) (cavPor3) +csabaeus_gene_ensembl may2017.archive.ensembl.org Vervet-AGM genes (Ensembl 89 csabaeus) (ChlSab1.1) +choffmanni_gene_ensembl 
may2017.archive.ensembl.org Sloth genes (Ensembl 89 choffmanni) (choHof1) +cintestinalis_gene_ensembl may2017.archive.ensembl.org C.intestinalis genes (Ensembl 89 cintestinalis) (KH) +cintestinalis_gene_ensembl may2009.archive.ensembl.org C.intestinalis genes (Ensembl 54 cintestinalis) (JGI 2) +csavignyi_gene_ensembl may2017.archive.ensembl.org C.savignyi genes (Ensembl 89 csavignyi) (CSAV 2.0) +drerio_gene_ensembl may2017.archive.ensembl.org Zebrafish genes (Ensembl 89 drerio) (GRCz10) +drerio_gene_ensembl mar2015.archive.ensembl.org Zebrafish genes (Ensembl 79 drerio) (Zv9) +drerio_gene_ensembl may2009.archive.ensembl.org Zebrafish genes (Ensembl 54 drerio) (Zv8) +dnovemcinctus_gene_ensembl may2017.archive.ensembl.org Armadillo genes (Ensembl 89 dnovemcinctus) (Dasnov3.0) +dnovemcinctus_gene_ensembl may2012.archive.ensembl.org Armadillo genes (Ensembl 67 dnovemcinctus) (dasNov2) +dnovemcinctus_gene_ensembl may2009.archive.ensembl.org Armadillo genes (Ensembl 54 dnovemcinctus) (ARMA) +dordii_gene_ensembl may2017.archive.ensembl.org Kangaroo rat genes (Ensembl 89 dordii) (dipOrd1) +dmelanogaster_gene_ensembl may2017.archive.ensembl.org Fly genes (Ensembl 89 dmelanogaster) (BDGP6) +dmelanogaster_gene_ensembl dec2014.archive.ensembl.org Fly genes (Ensembl 78 dmelanogaster) (BDGP 5) +dmelanogaster_gene_ensembl may2009.archive.ensembl.org Fly genes (Ensembl 54 dmelanogaster) (BDGP 5.4) +etelfairi_gene_ensembl may2017.archive.ensembl.org Tenrec genes (Ensembl 89 etelfairi) (TENREC) +ecaballus_gene_ensembl may2017.archive.ensembl.org Horse genes (Ensembl 89 ecaballus) (Equ Cab 2) +eeuropaeus_gene_ensembl may2017.archive.ensembl.org Hedgehog genes (Ensembl 89 eeuropaeus) (eriEur1) +fcatus_gene_ensembl may2017.archive.ensembl.org Cat genes (Ensembl 89 fcatus) (Felis_catus_6.2) +fcatus_gene_ensembl may2012.archive.ensembl.org Cat genes (Ensembl 67 fcatus) (CAT) +falbicollis_gene_ensembl may2017.archive.ensembl.org Collared flycatcher genes (Ensembl 89 falbicollis) 
(FicAlb_1.4) +gmorhua_gene_ensembl may2017.archive.ensembl.org Cod genes (Ensembl 89 gmorhua) (gadMor1) +ggallus_gene_ensembl may2017.archive.ensembl.org Chicken genes (Ensembl 89 ggallus) (Gallus_gallus-5.0) +ggallus_gene_ensembl jul2016.archive.ensembl.org Chicken genes (Ensembl 85 ggallus) (Galgal4) +ggallus_gene_ensembl may2012.archive.ensembl.org Chicken genes (Ensembl 67 ggallus) (WASHUC2) +gaculeatus_gene_ensembl may2017.archive.ensembl.org Stickleback genes (Ensembl 89 gaculeatus) (BROAD S1) +ggorilla_gene_ensembl may2017.archive.ensembl.org Gorilla genes (Ensembl 89 ggorilla) (gorGor3.1) +ggorilla_gene_ensembl may2009.archive.ensembl.org Gorilla genes (Ensembl 54 ggorilla) (gorGor1) +hsapiens_gene_ensembl may2017.archive.ensembl.org Human genes (Ensembl 89 hsapiens) (GRCh38.p10) +hsapiens_gene_ensembl mar2017.archive.ensembl.org Human genes (Ensembl 88 hsapiens) (GRCh38.p7) +hsapiens_gene_ensembl mar2016.archive.ensembl.org Human genes (Ensembl 84 hsapiens) (GRCh38.p5) +hsapiens_gene_ensembl sep2015.archive.ensembl.org Human genes (Ensembl 82 hsapiens) (GRCh38.p3) +hsapiens_gene_ensembl may2015.archive.ensembl.org Human genes (Ensembl 80 hsapiens) (GRCh38.p2) +hsapiens_gene_ensembl dec2014.archive.ensembl.org Human genes (Ensembl 78 hsapiens) (GRCh38) +hsapiens_gene_ensembl feb2014.archive.ensembl.org Human genes (Ensembl 75 hsapiens) (GRCh37.p13) +hsapiens_gene_ensembl may2012.archive.ensembl.org Human genes (Ensembl 67 hsapiens) (GRCh37.p7) +hsapiens_gene_ensembl may2009.archive.ensembl.org Human genes (Ensembl 54 hsapiens) (NCBI 36) +itridecemlineatus_gene_ensembl may2017.archive.ensembl.org Squirrel genes (Ensembl 89 itridecemlineatus) (spetri2) +itridecemlineatus_gene_ensembl may2009.archive.ensembl.org Squirrel genes (Ensembl 54 itridecemlineatus) (speTri1) +lchalumnae_gene_ensembl may2017.archive.ensembl.org Coelacanth genes (Ensembl 89 lchalumnae) (LatCha1) +loculatus_gene_ensembl may2017.archive.ensembl.org Spotted gar genes (Ensembl 89 loculatus) 
(LepOcu1) +lafricana_gene_ensembl may2017.archive.ensembl.org Elephant genes (Ensembl 89 lafricana) (Loxafr3.0) +lafricana_gene_ensembl may2009.archive.ensembl.org Elephant genes (Ensembl 54 lafricana) (BROAD E1) +mmulatta_gene_ensembl may2017.archive.ensembl.org Macaque genes (Ensembl 89 mmulatta) (Mmul_8.0.1) +mmulatta_gene_ensembl jul2016.archive.ensembl.org Macaque genes (Ensembl 85 mmulatta) (MMUL 1.0) +mgallopavo_gene_ensembl may2017.archive.ensembl.org Turkey genes (Ensembl 89 mgallopavo) (Turkey_2.01) +mmurinus_gene_ensembl may2017.archive.ensembl.org Mouse lemur genes (Ensembl 89 mmurinus) (Mmur_2.0) +mmurinus_gene_ensembl jul2016.archive.ensembl.org Mouse lemur genes (Ensembl 85 mmurinus) (micMur1) +mdomestica_gene_ensembl may2017.archive.ensembl.org Opossum genes (Ensembl 89 mdomestica) (monDom5) +mmusculus_gene_ensembl may2017.archive.ensembl.org Mouse genes (Ensembl 89 mmusculus) (GRCm38.p5) +mmusculus_gene_ensembl oct2016.archive.ensembl.org Mouse genes (Ensembl 86 mmusculus) (GRCm38.p4) +mmusculus_gene_ensembl may2015.archive.ensembl.org Mouse genes (Ensembl 80 mmusculus) (GRCm38.p3) +mmusculus_gene_ensembl oct2014.archive.ensembl.org Mouse genes (Ensembl 77 mmusculus) (GRCm38.p2) +mmusculus_gene_ensembl may2012.archive.ensembl.org Mouse genes (Ensembl 67 mmusculus) (NCBI m37) +mfuro_gene_ensembl may2017.archive.ensembl.org Domestic ferret genes (Ensembl 89 mfuro) (MusPutFur1.0) +mlucifugus_gene_ensembl may2017.archive.ensembl.org Microbat genes (Ensembl 89 mlucifugus) (Myoluc2.0) +mlucifugus_gene_ensembl may2009.archive.ensembl.org Microbat genes (Ensembl 54 mlucifugus) (myoLuc1) +nleucogenys_gene_ensembl may2017.archive.ensembl.org Gibbon genes (Ensembl 89 nleucogenys) (Nleu1.0) +meugenii_gene_ensembl may2017.archive.ensembl.org Wallaby genes (Ensembl 89 meugenii) (Meug_1.0) +oprinceps_gene_ensembl may2017.archive.ensembl.org Pika genes (Ensembl 89 oprinceps) (OchPri2.0-Ens) +oprinceps_gene_ensembl mar2017.archive.ensembl.org Pika genes (Ensembl 88 
oprinceps) (OchPri2.0) +oniloticus_gene_ensembl may2017.archive.ensembl.org Tilapia genes (Ensembl 89 oniloticus) (Orenil1.0) +oanatinus_gene_ensembl may2017.archive.ensembl.org Platypus genes (Ensembl 89 oanatinus) (OANA5) +ocuniculus_gene_ensembl may2017.archive.ensembl.org Rabbit genes (Ensembl 89 ocuniculus) (OryCun2.0) +ocuniculus_gene_ensembl may2009.archive.ensembl.org Rabbit genes (Ensembl 54 ocuniculus) (RABBIT) +olatipes_gene_ensembl may2017.archive.ensembl.org Medaka genes (Ensembl 89 olatipes) (HdrR) +ogarnettii_gene_ensembl may2017.archive.ensembl.org Bushbaby genes (Ensembl 89 ogarnettii) (OtoGar3) +ogarnettii_gene_ensembl may2009.archive.ensembl.org Bushbaby genes (Ensembl 54 ogarnettii) (otoGar1) +oaries_gene_ensembl may2017.archive.ensembl.org Sheep genes (Ensembl 89 oaries) (Oar_v3.1) +ptroglodytes_gene_ensembl may2017.archive.ensembl.org Chimp genes (Ensembl 89 ptroglodytes) (CHIMP2.1.4) +ptroglodytes_gene_ensembl may2009.archive.ensembl.org Chimp genes (Ensembl 54 ptroglodytes) (CHIMP2.1) +panubis_gene_ensembl may2017.archive.ensembl.org Olive Baboon genes (Ensembl 89 panubis) (PapAnu2.0) +psinensis_gene_ensembl may2017.archive.ensembl.org Chinese softshell turtle genes (Ensembl 89 psinensis) (PelSin_1.0) +pmarinus_gene_ensembl may2017.archive.ensembl.org Lamprey genes (Ensembl 89 pmarinus) (Pmarinus_7.0) +pformosa_gene_ensembl may2017.archive.ensembl.org Amazon molly genes (Ensembl 89 pformosa) (Poecilia_formosa-5.1.2) +pabelii_gene_ensembl may2017.archive.ensembl.org Orangutan genes (Ensembl 89 pabelii) (PPYG2) +pcapensis_gene_ensembl may2017.archive.ensembl.org Rock hyrax genes (Ensembl 89 pcapensis) (proCap1) +pvampyrus_gene_ensembl may2017.archive.ensembl.org Megabat genes (Ensembl 89 pvampyrus) (pteVam1) +rnorvegicus_gene_ensembl may2017.archive.ensembl.org Rat genes (Ensembl 89 rnorvegicus) (Rnor_6.0) +rnorvegicus_gene_ensembl mar2015.archive.ensembl.org Rat genes (Ensembl 79 rnorvegicus) (Rnor_5.0) +rnorvegicus_gene_ensembl 
may2012.archive.ensembl.org Rat genes (Ensembl 67 rnorvegicus) (RGSC 3.4) +scerevisiae_gene_ensembl may2017.archive.ensembl.org S.cerevisiae genes (Ensembl 89 scerevisiae) (R64-1-1) +scerevisiae_gene_ensembl dec2013.archive.ensembl.org S.cerevisiae genes (Ensembl 74 scerevisiae) (EF 4) +scerevisiae_gene_ensembl may2009.archive.ensembl.org S.cerevisiae genes (Ensembl 54 scerevisiae) (SGD1.01) +sharrisii_gene_ensembl may2017.archive.ensembl.org Tasmanian Devil genes (Ensembl 89 sharrisii) (Devil_ref v7.0) +saraneus_gene_ensembl may2017.archive.ensembl.org Shrew genes (Ensembl 89 saraneus) (sorAra1) +sscrofa_gene_ensembl may2017.archive.ensembl.org Pig genes (Ensembl 89 sscrofa) (Sscrofa10.2) +tguttata_gene_ensembl may2017.archive.ensembl.org Zebra finch genes (Ensembl 89 tguttata) (taeGut3.2.4) +trubripes_gene_ensembl may2017.archive.ensembl.org Fugu genes (Ensembl 89 trubripes) (FUGU 4.0) +tnigroviridis_gene_ensembl may2017.archive.ensembl.org Tetraodon genes (Ensembl 89 tnigroviridis) (TETRAODON 8.0) +tbelangeri_gene_ensembl may2017.archive.ensembl.org Tree Shrew genes (Ensembl 89 tbelangeri) (tupBel1) +ttruncatus_gene_ensembl may2017.archive.ensembl.org Dolphin genes (Ensembl 89 ttruncatus) (turTru1) +vpacos_gene_ensembl may2017.archive.ensembl.org Alpaca genes (Ensembl 89 vpacos) (vicPac1) +xtropicalis_gene_ensembl may2017.archive.ensembl.org Xenopus genes (Ensembl 89 xtropicalis) (JGI 4.2) +xtropicalis_gene_ensembl may2009.archive.ensembl.org Xenopus genes (Ensembl 54 xtropicalis) (JGI 4.1) +xmaculatus_gene_ensembl may2017.archive.ensembl.org Platyfish genes (Ensembl 89 xmaculatus) (Xipmac4.4.2)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/update_ensembl_datasets.R Thu Jun 08 10:55:08 2017 -0400
@@ -0,0 +1,74 @@
+##
+## Run this script to update the table of Ensembl assemblies available in the customProDB annotation data manager (ensembl_datasets.loc)
+##
+## It needs network access: it queries the public Ensembl MySQL server, resolves the archive host of each
+## release through HTTP redirects and lists the BioMart datasets of every archive host it finds.
+##
+
+library(RMySQL)
+library(httr)
+library(biomaRt)
+library(stringdist)
+
+## NOTE ## Make sure the following line is updated to the latest Ensembl mirror
+## (it replaces the "www." prefix of redirect targets; presumably only the current release redirects there)
+latestArchivePrefix = "may2017.archive."
+
+# List the ensembl_archive_<release> databases on the public server
+con = dbConnect(MySQL(), host="ensembldb.ensembl.org", user="anonymous")
+archives = dbGetQuery(con, "SHOW DATABASES LIKE 'ensembl_archive_%'")
+dbDisconnect(con)
+
+# Use the archive with the highest release number. Comparing the numeric suffix avoids depending on the
+# order in which the server lists the names (as text, ensembl_archive_100 sorts before ensembl_archive_99).
+archiveNames = archives[,1]
+archiveReleases = suppressWarnings(as.integer(sub("ensembl_archive_", "", archiveNames, fixed=TRUE)))
+latestArchive = archiveNames[which.max(archiveReleases)]
+if (length(latestArchive) != 1) stop("no ensembl_archive_<release> database found on ensembldb.ensembl.org")
+
+# Collect, per assembly, the latest online release that provides it
+con = dbConnect(MySQL(), host="ensembldb.ensembl.org", user="anonymous", dbname=latestArchive)
+assemblies = dbGetQuery(con, "SELECT s.name, s.common_name, rs.assembly_name, MAX(rs.release_id) AS latest_release, r.date
+                              FROM species as s, release_species as rs, ens_release as r
+                              WHERE s.species_id = rs.species_id AND r.release_id = rs.release_id AND r.online = 'Y'
+                                AND r.release_id < 10000 -- ignore 10075 (the special GRCh37 site)
+                              GROUP BY rs.assembly_name
+                              ORDER BY s.common_name, rs.release_id")
+dbDisconnect(con)
+allReleases = assemblies$latest_release
+uniqueReleases = unique(allReleases)
+
+# Get the <MMMYYYY> style archive link for each Ensembl release
+releaseHosts = paste0("e", uniqueReleases, ".ensembl.org")
+urlRedirectMap = vapply(releaseHosts, function(url) XML::parseURI(HEAD(url)$url)$server, character(1))
+assemblies$url = sub("www.", latestArchivePrefix, urlRedirectMap[paste0("e", allReleases, ".ensembl.org")], fixed=TRUE)
+
+# Get all datasets from the archives
+datasets = unique(unlist(lapply(unique(assemblies$url), function(archive) {
+  as.character(listDatasets(useMart("ensembl", host=archive))$dataset)
+})))
+datasets = sub("_gene_ensembl", "", datasets, fixed=TRUE)
+
+# Match the assembly species names to the datasets (using amatch() because of cases like Mustela_putorius_furo -> mfuro)
+assemblies$dataset_id = datasets[amatch(tolower(assemblies$name), datasets, maxDist=3, method="osa", weight=c(0.1, 1, 1, 1))]
+
+# Remove mouse strains (would need to add these from ENSEMBL_MOUSE_MART).
+# A logical mask is used because a negative index built from an empty match would select no rows at all.
+assemblies = assemblies[!grepl("Mus_musculus_\\S+", assemblies$name, perl=TRUE),]
+
+# Remove unmatched assemblies (e.g. Mus spretus)
+assemblies = assemblies[!is.na(assemblies$dataset_id),]
+
+# Replace underscores in scientific name
+assemblies$name = gsub("_", " ", assemblies$name, fixed=TRUE)
+
+# Sort assemblies first by scientific name, then descending by latest release for that assembly
+assemblies = assemblies[order(assemblies$name, -assemblies$latest_release),]
+
+# Write dataset table (3 tab-separated columns: dataset_id, host, description), one plain line per assembly.
+# writeLines() is used because write.csv() would add a header line, row names and quotes around each row.
+dataset_id = paste0(assemblies$dataset_id, "_gene_ensembl")
+host = paste0(assemblies$url)
+description = paste0(assemblies$common_name, " genes (Ensembl ", assemblies$latest_release, " ", assemblies$dataset_id,
+                     ") (", assemblies$assembly_name, ")")
+writeLines(paste(dataset_id, host, description, sep="\t"), "ensembl_datasets.loc.sample")