annotate ete_lineage_generator.py @ 6:f1eca1158f21 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
author earlhaminst
date Wed, 10 Oct 2018 05:24:04 -0400
parents 87b6de3ef63e
children b29ee6a16524
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
1 import optparse
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
2 import sys
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
3
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
4 from ete3 import NCBITaxa
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
5
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
# Taxonomic ranks that can appear as output columns, listed from the most
# general rank to the most specific one.
# - compared to gi2taxonomy the root is excluded, since
#   the value is always "root", i.e. useless information
# - additional levels that appear in the ncbi taxdb have
#   been added
#   (order from https://en.wikipedia.org/wiki/Taxonomic_rank#All_ranks)
# TODO the full list of ranks could be derived from the input DB
LONG_RANKS = [u"superkingdom", u"kingdom", u"subkingdom",
              u"superphylum", u"phylum", u"subphylum",
              u"superclass", u"class", u"subclass", u"infraclass",
              u"cohort",
              u"superorder", u"order", u"suborder", u"infraorder", u"parvorder",
              u"superfamily", u"family", u"subfamily",
              u"tribe", u"subtribe",
              u"genus", u"subgenus",
              u"species group", u"species subgroup", u"species", u"subspecies",
              u"varietas", u"forma"]

# Primary ranks only; this is the default column set when neither -f nor -r
# is given. Every entry also occurs in LONG_RANKS, in the same relative order.
SHORT_RANKS = [u"kingdom",
               u"phylum",
               u"class",
               u"order",
               u"family",
               u"genus",
               u"species"]
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
30
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
31
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
def process_taxid(ncbi, taxid, ranks, RANK_IDX, lower=False):
    """
    Fill the still-empty slots of ``ranks`` with names from the lineage of
    one taxid.

    Steps:
    - get the lineage (as list of taxids) plus the rank and name of each entry
    - walk the lineage in the returned order, or in reverse order if lower
      ranks are to be preferred for filling
    - for every lineage entry whose rank is a key of RANK_IDX, write its name
      into the matching slot of ``ranks`` unless that slot is already filled

    ncbi: ete NCBITaxa object (only get_lineage, get_rank and
        get_taxid_translator are used)
    taxid: a taxid (int)
    ranks: list of ranks (should be initialized with "NA" x number of levels
        of interest); modified in place
    RANK_IDX: mapping from rank names to indices into ``ranks``; several rank
        names may share one index
    lower: use lower taxa for filling "NA"s (i.e. iterate the lineage in
        reverse; presumably the lineage is returned root first -- confirm
        against the ete3 documentation)

    Returns None; the result is delivered through ``ranks``.
    """
    lineage = ncbi.get_lineage(taxid)
    # NOTE(review): ete3's get_lineage appears to return None for a falsy
    # taxid (e.g. 0); without this guard the calls below fail on None.
    if not lineage:
        return
    lineage_ranks = ncbi.get_rank(lineage)
    lineage_names = ncbi.get_taxid_translator(lineage, try_synonyms=True)
    # reversed() leaves the list obtained from ncbi untouched
    ordered = reversed(lineage) if lower else lineage
    for node in ordered:
        rank = lineage_ranks[node]
        if rank not in RANK_IDX:
            continue
        slot = RANK_IDX[rank]
        # the first entry that reaches a slot wins; later ones never overwrite
        if ranks[slot] != "NA":
            continue
        ranks[slot] = lineage_names[node]
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
55
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
56
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
# Command line driver: read species names / taxids (one per line), look up
# the lineage of each in the ETE NCBI taxonomy database and write one
# tab-separated row per query with the names of the selected ranks.

# get command line options
parser = optparse.OptionParser()
parser.add_option('-s', '--species', dest="input_species_filename",
                  help='Species/taxid list in text format one species in each line')
parser.add_option('-d', '--database', dest="database", default=None,
                  help='ETE sqlite data base to use (default: ~/.etetoolkit/taxa.sqlite)')
parser.add_option('-o', '--output', dest="output", help='output file name (default: stdout)')
parser.add_option('-f', dest="full", action="store_true", default=False,
                  help='Show all available (named) taxonomic ranks (default: only primary levels)')
parser.add_option('-c', dest="compress", action="store_true", default=False,
                  help='Fill unnamed ranks with super/sub ranks (see -l)')
parser.add_option('-l', dest="lower", action="store_true", default=False,
                  help='Prefer lower levels when compressed')
parser.add_option('-r', '--rank', dest='ranks', action="append",
                  help='include rank - multiple ones can be specified')
parser.add_option('-i', '--includeid', dest="addid", action="store_true", default=False,
                  help='add taxid column')

options, args = parser.parse_args()
# check command line options
if options.input_species_filename is None:
    parser.error("-s option must be specified, Species list in text format one species in each line")
if options.full and options.ranks:
    parser.error("-f and -r can not be used at the same time")

if options.ranks:
    for r in options.ranks:
        if r not in LONG_RANKS:
            parser.error("unknown rank %s" % r)
# setup output
if not options.output:  # if filename is not given
    of = sys.stdout
else:
    of = open(options.output, "w")
try:
    # load NCBI taxonomy DB
    ncbi = NCBITaxa(dbfile=options.database)
    # get list of ranks that are of interest
    # (explicitly requested ranks are emitted in LONG_RANKS order, not in the
    # order in which they were given on the command line)
    if options.ranks:
        RANKS = [r for r in LONG_RANKS if r in options.ranks]
    elif options.full:
        RANKS = LONG_RANKS
    else:
        RANKS = SHORT_RANKS
    # exact mapping: rank name -> output column
    RANK_IDX = {item: index for index, item in enumerate(RANKS)}
    # Compressed mapping: additionally sends every long rank whose name
    # contains a selected rank name (e.g. "superfamily"/"subfamily" for
    # "family") to that rank's column. It must be a copy: if it aliased
    # RANK_IDX, the exact pass below would already use the compressed mapping
    # and a super-rank could occupy a column whose exact rank is available.
    COMP_RANK_IDX = dict(RANK_IDX)
    if options.compress:
        for ir, rank in enumerate(RANKS):
            for long_rank in LONG_RANKS:
                # substring match on the rank name; a later selected rank
                # overrides an earlier one for the same long rank
                if rank in long_rank:
                    COMP_RANK_IDX[long_rank] = ir

    # write header
    of.write("# query")
    if options.addid:
        of.write("\ttaxid")
    of.write("\t%s\n" % ("\t".join(RANKS)))

    # get and write data
    with open(options.input_species_filename) as f:
        for line in f:
            # underscores are treated as spaces in species names
            line = line.strip().replace('_', ' ')
            if not line:
                # nothing to look up on an empty line
                continue
            try:
                taxid = int(line)
            except ValueError:
                # TODO: one could use fuzzy name lookup (i.e. accept typos in the species names),
                # but then a pysqlite version that supports this is needed (needs to be enabled
                # during compilation)
                name2tax = ncbi.get_name_translator([line])
                if line in name2tax:
                    # a name may map to several taxids; the first one is used
                    taxid = name2tax[line][0]
                else:
                    sys.stderr.write("[%s] could not be translated into a taxid!\n" % line)
                    continue
            ranks = ["NA"] * len(RANKS)
            try:
                # first pass: exact rank matches only
                process_taxid(ncbi, taxid, ranks, RANK_IDX)
                if options.compress:
                    # second pass: fill the remaining "NA" columns from
                    # related super/sub ranks
                    process_taxid(ncbi, taxid, ranks, COMP_RANK_IDX, options.lower)
            except ValueError:
                # NOTE(review): ete3's get_lineage is expected to raise
                # ValueError for a taxid missing from the database -- confirm.
                # Report it like an untranslatable name instead of aborting.
                sys.stderr.write("[%s] taxid not found in the taxonomy database!\n" % line)
                continue
            of.write("%s" % line)
            if options.addid:
                of.write("\t%d" % taxid)
            of.write("\t%s\n" % "\t".join(ranks))
finally:
    # release the output file even on errors, but leave sys.stdout open
    if of is not sys.stdout:
        of.close()