annotate ete_lineage_generator.py @ 11:2db72467da51 draft

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 67e136d433c0d925db362342919305b46fdffecd"
author earlhaminst
date Thu, 10 Mar 2022 14:01:44 +0000
parents b29ee6a16524
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
1 import optparse
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
2 import sys
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
3
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
4 from ete3 import NCBITaxa
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
5
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
# Rank tables used to lay out the output columns.
# - compared to gi2taxonomy the root is excluded, since
#   the value is always "root", i.e. useless information
# - additional levels that appear in the ncbi taxdb have
#   been added
#   (order from https://en.wikipedia.org/wiki/Taxonomic_rank#All_ranks)
# TODO the full list of ranks could be derived from the input DB

# All named ranks, ordered from the most general to the most specific one.
# Selected with -f; also the set of values accepted by -r/--rank.
LONG_RANKS = [
    "superkingdom", "kingdom", "subkingdom",
    "superphylum", "phylum", "subphylum",
    "superclass", "class", "subclass", "infraclass",
    "cohort",
    "superorder", "order", "suborder", "infraorder", "parvorder",
    "superfamily", "family", "subfamily",
    "tribe", "subtribe",
    "genus", "subgenus",
    "species group", "species subgroup", "species", "subspecies",
    "varietas", "forma",
]

# Primary ranks only; used when neither -f nor -r is given.
SHORT_RANKS = [
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species",
]
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
30
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
31
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
def process_taxid(ncbi, taxid, ranks, RANK_IDX, lower=False):
    """
    Fill ``ranks`` in place with the names found in the lineage of one taxid.

    Steps:
    - get the lineage (list of taxids) and look up rank and name of each entry
    - walk the lineage in the returned order, or in reverse order if ``lower``
      is set
    - for every lineage entry whose rank is a key of ``RANK_IDX`` write its
      name into the corresponding slot of ``ranks`` unless that slot already
      holds something other than "NA" (i.e. the first entry visited wins)

    ncbi: ete NCBITaxa object (only get_lineage, get_rank and
        get_taxid_translator are used)
    taxid: a taxid (int)
    ranks: list of ranks (should be initialized with "NA" x number of levels
        of interest); modified in place
    RANK_IDX: mapping from rank names to indices into ``ranks``
    lower: walk the lineage in reverse order, so that the entries at the end
        of the lineage are preferred for filling "NA"s

    Returns None. If the lineage can not be determined (get_lineage raises
    ValueError or returns None) a message is written to stderr and ``ranks``
    is left untouched.
    """
    try:
        lineage_taxids = ncbi.get_lineage(taxid)
    except ValueError:
        lineage_taxids = None
    if lineage_taxids is None:
        sys.stderr.write("[%s] could not determine lineage!\n" % taxid)
        return

    lineage_ranks = ncbi.get_rank(lineage_taxids)
    lineage_names = ncbi.get_taxid_translator(lineage_taxids, try_synonyms=True)

    # Iterate over a reversed view instead of reversing in place: the list
    # belongs to whoever produced it and must not be modified here.
    ordered_taxids = reversed(lineage_taxids) if lower else lineage_taxids

    for parent_taxid in ordered_taxids:
        # .get(): a lineage entry without a rank record is skipped instead of
        # raising KeyError (None is never a key of RANK_IDX).
        parent_rank = lineage_ranks.get(parent_taxid)
        if parent_rank not in RANK_IDX:
            continue
        parent_rank_index = RANK_IDX[parent_rank]
        if ranks[parent_rank_index] != "NA":
            # slot already filled by an entry visited earlier
            continue
        parent_name = lineage_names.get(parent_taxid)
        if parent_name is None:
            # no name known for this entry: keep the "NA" placeholder
            continue
        ranks[parent_rank_index] = parent_name
2
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
64
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
65
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
# get command line options
parser = optparse.OptionParser()
parser.add_option('-s', '--species', dest="input_species_filename",
                  help='Species/taxid list in text format one species in each line')
parser.add_option('-d', '--database', dest="database", default=None,
                  help='ETE sqlite data base to use (default: ~/.etetoolkit/taxa.sqlite)')
parser.add_option('-o', '--output', dest="output", help='output file name (default: stdout)')
parser.add_option('-f', dest="full", action="store_true", default=False,
                  help='Show all available (named) taxonomic ranks (default: only primary levels)')
parser.add_option('-c', dest="compress", action="store_true", default=False,
                  help='Fill unnamed ranks with super/sub ranks (see -l)')
parser.add_option('-l', dest="lower", action="store_true", default=False,
                  help='Prefer lower levels when compressed')
parser.add_option('-r', '--rank', dest='ranks', action="append",
                  help='include rank - multiple ones can be specified')
parser.add_option('-i', '--includeid', dest="addid", action="store_true", default=False,
                  help='add taxid column')

options, args = parser.parse_args()

# check command line options (parser.error prints the message and exits)
if options.input_species_filename is None:
    parser.error("-s option must be specified, Species list in text format one species in each line")
if options.full and options.ranks:
    parser.error("-f and -r can not be used at the same time")

if options.ranks:
    for r in options.ranks:
        if r not in LONG_RANKS:
            parser.error("unknown rank %s" % r)

# setup output
if not options.output:  # if filename is not given
    of = sys.stdout
else:
    of = open(options.output, "w")

# load NCBI taxonomy DB
ncbi = NCBITaxa(dbfile=options.database)

# get list of ranks that are of interest:
# explicit -r ranks (kept in LONG_RANKS order), else all ranks (-f),
# else only the primary ranks
if options.ranks:
    RANKS = [r for r in LONG_RANKS if r in options.ranks]
elif options.full:
    RANKS = LONG_RANKS
else:
    RANKS = SHORT_RANKS

# rank name -> output column, for the ranks that are reported
RANK_IDX = {item: index for index, item in enumerate(RANKS)}

# Mapping used for compression. It has to be a copy: assigning RANK_IDX
# directly would make both names refer to the same dict, so the loop below
# would also rewrite RANK_IDX (e.g. with -f -c "superkingdom" would end up
# pointing at the "kingdom" column and column 0 would have no key left).
# NOTE(review): confirm that the code consuming RANK_IDX / COMP_RANK_IDX
# further down does not rely on the two names being the same object.
COMP_RANK_IDX = dict(RANK_IDX)
if options.compress:
    for ir, rank in enumerate(RANKS):
        for long_rank in LONG_RANKS:
            # substring test: e.g. "class" also matches "superclass",
            # "subclass" and "infraclass", which then share the column
            # of "class"
            if rank in long_rank:
                COMP_RANK_IDX[long_rank] = ir

# write header
of.write("# query")
if options.addid:
    of.write("\ttaxid")
of.write("\t%s\n" % ("\t".join(RANKS)))
f1eca1158f21 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents: 4
diff changeset
126
4
87b6de3ef63e planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 41e40314a9b25a9f3c06a13422d367b68334f593
earlhaminst
parents: 2
diff changeset
127 # get and write data
2
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
128 with open(options.input_species_filename) as f:
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
129 for line in f.readlines():
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
130 line = line.strip().replace('_', ' ')
11
2db72467da51 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 67e136d433c0d925db362342919305b46fdffecd"
earlhaminst
parents: 9
diff changeset
131 if line == "":
2db72467da51 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 67e136d433c0d925db362342919305b46fdffecd"
earlhaminst
parents: 9
diff changeset
132 continue
2
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
133 try:
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
134 taxid = int(line)
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
135 except ValueError:
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
136 # TODO: one could use fuzzy name lookup (i.e. accept typos in the species names),
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
137 # but then a pysqlite version that supports this is needed (needs to be enabled
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
138 # during compilation)
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
139 name2tax = ncbi.get_name_translator([line])
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
140 if line in name2tax:
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
141 taxid = name2tax[line][0]
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
142 else:
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
143 sys.stderr.write("[%s] could not be translated into a taxid!\n" % line)
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
144 continue
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
145 ranks = ["NA"] * len(RANKS)
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
146 process_taxid(ncbi, taxid, ranks, RANK_IDX)
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
147 if options.compress:
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
148 process_taxid(ncbi, taxid, ranks, COMP_RANK_IDX, options.lower)
6
f1eca1158f21 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents: 4
diff changeset
149 of.write("%s" % line)
f1eca1158f21 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents: 4
diff changeset
150 if options.addid:
f1eca1158f21 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents: 4
diff changeset
151 of.write("\t%d" % taxid)
f1eca1158f21 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents: 4
diff changeset
152 of.write("\t%s\n" % "\t".join(ranks))
2
03c10736e497 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff changeset
153 of.close()