Mercurial > repos > galaxyp > custom_pro_db_annotation_data_manager
annotate tool-data/update_ensembl_datasets.R @ 3:9ee512decde8 draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit d4b5497065b853ed094aebc9e4185e9995c5e0e0
author | galaxyp |
---|---|
date | Wed, 01 Nov 2017 19:34:06 -0400 |
parents | 9b4ee836e35b |
children |
rev | line source |
---|---|
1
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
1 ## |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
2 ## Run this script to update the table of Ensembl assemblies available in the customProDB annotation data manager (ensembl_datasets.loc) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
3 ## |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
4 |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
5 library(RMySQL) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
6 library(httr) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
7 library(biomaRt) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
8 library(stringdist) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
9 |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
10 con = dbConnect(MySQL(), host="ensembldb.ensembl.org", user="anonymous") |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
11 archives = dbGetQuery(con, "SHOW DATABASES LIKE 'ensembl_archive_%'") |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
12 dbDisconnect(con) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
13 |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
14 latestArchive = archives[which.max(as.integer(sub("ensembl_archive_", "", archives[,1], fixed=TRUE))), 1]  # choose the highest release number numerically; relying on row order breaks if names are sorted as text ("_100" before "_99") |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
15 con = dbConnect(MySQL(), host="ensembldb.ensembl.org", user="anonymous", dbname=latestArchive) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
16 assemblies = dbGetQuery(con, "SELECT s.name, s.common_name, rs.assembly_name, MAX(rs.release_id) AS latest_release, r.date |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
17 FROM species as s, release_species as rs, ens_release as r |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
18 WHERE s.species_id = rs.species_id AND r.release_id = rs.release_id AND r.online = 'Y' |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
19 AND r.release_id < 10000 -- ignore 10075 (the special GRCh37 site) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
20 GROUP BY rs.assembly_name |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
21 ORDER BY s.common_name, rs.release_id") |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
22 allReleases = assemblies$latest_release |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
23 uniqueReleases = unique(allReleases) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
24 |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
25 # Get the <MMMYYYY> style archive link for each Ensembl release |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
26 urlRedirectMap = sapply(paste0("e", uniqueReleases, ".ensembl.org"), function(url){XML::parseURI(HEAD(url)$url)$server}) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
27 |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
28 ## NOTE ## Before running, update the "may2017.archive." prefix on the following line to the latest Ensembl archive mirror |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
29 assemblies$url = sub("www.", "may2017.archive.", urlRedirectMap[paste0("e", allReleases, ".ensembl.org")], fixed=TRUE) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
30 |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
31 # Get all datasets from the archives |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
32 datasets = c() |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
33 for (archive in unique(assemblies$url)) { |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
34 datasets = unique(c(datasets, listDatasets(useMart("ensembl", host=archive))$dataset)) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
35 } |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
36 datasets = sub("_gene_ensembl", "", datasets, fixed=TRUE) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
37 |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
38 # Match the assembly species names to the datasets (using amatch() because of cases like Mustela_putorius_furo -> mfuro) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
39 assemblies$dataset_id = datasets[amatch(tolower(assemblies$name), datasets, maxDist=3, method="osa", weight=c(0.1, 1, 1, 1))] |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
40 |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
41 # Remove mouse strains (would need to add these from ENSEMBL_MOUSE_MART) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
42 assemblies = assemblies[!grepl("Mus_musculus_\\S+", assemblies$name, perl=TRUE),]  # logical mask; -grep() with no matches would select zero rows |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
43 |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
44 # Remove unmatched assemblies (e.g. Mus spretus) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
45 assemblies = assemblies[!is.na(assemblies$dataset_id),]  # logical mask; -which() with no NA entries would select zero rows |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
46 |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
47 # Replace underscores in scientific name |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
48 assemblies$name = gsub("_", " ", assemblies$name, fixed=TRUE) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
49 |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
50 # Sort assemblies first by scientific name, then descending by latest release for that assembly |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
51 assemblies = assemblies[order(assemblies$name, -assemblies$latest_release),] |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
52 |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
53 # Write dataset table (3 columns: dataset_id, host, description) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
54 dataset_id = paste0(assemblies$dataset_id, "_gene_ensembl") |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
55 host = paste0(assemblies$url) |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
56 description = paste0(assemblies$common_name, " genes (Ensembl ", assemblies$latest_release, " ", assemblies$dataset_id, |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
57 ") (", assemblies$assembly_name, ")") |
9b4ee836e35b
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6
galaxyp
parents:
diff
changeset
|
58 writeLines(paste(dataset_id, host, description, sep="\t"), "ensembl_datasets.loc.sample")  # one plain tab-separated row per dataset; write.csv added a header, row names and quotes |