Mercurial > repos > earlhaminst > ete
comparison ete_lineage_generator.py @ 2:03c10736e497 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
author | earlhaminst |
---|---|
date | Tue, 07 Nov 2017 11:45:13 -0500 |
parents | |
children | 87b6de3ef63e |
comparison
equal
deleted
inserted
replaced
1:a4ba317fc713 | 2:03c10736e497 |
---|---|
1 import optparse | |
2 import sys | |
3 | |
4 from ete3 import NCBITaxa | |
5 | |
6 # - compared to gi2taxonomy the root is excluded, since | |
7 # the value is always "root", i.e. useless information | |
8 # - additional levels that appear in the ncbi taxdb have | |
9 # been added | |
10 # (order from https://en.wikipedia.org/wiki/Taxonomic_rank#All_ranks) | |
11 # TODO the full list of ranks could be derived from the input DB | |
12 LONG_RANKS = [u"superkingdom", u"kingdom", u"subkingdom", | |
13 u"superphylum", u"phylum", u"subphylum", | |
14 u"superclass", u"class", u"subclass", "infraclass", | |
15 u"cohort", | |
16 u"superorder", u"order", u"suborder", u"infraorder", u"parvorder", | |
17 u"superfamily", u"family", u"subfamily", | |
18 u"tribe", u"subtribe", | |
19 u"genus", u"subgenus", | |
20 u"species group", u"species subgroup", u"species", u"subspecies", | |
21 u"varietas", "forma"] | |
22 | |
23 SHORT_RANKS = [u"kingdom", | |
24 u"phylum", | |
25 u"class", | |
26 u"order", | |
27 u"family", | |
28 u"genus", | |
29 u"species"] | |
30 | |
31 | |
32 def process_taxid(ncbi, taxid, ranks, RANK_IDX, lower=False): | |
33 """ | |
34 process one taxid: | |
35 - get lineage (as list of taxids, ranks, and names) | |
36 - reverse the lineage if lower ranks are to be used for filling | |
37 - fill the ranks with the data from the lineage | |
38 ncbi: ete NCBITaxa object | |
39 taxid: a taxid (int) | |
40 ranks: list of ranks (should be initialized with "NA" x number of levels of interest) | |
41 RANK_IDX: mapping from rank names to indices (distance to root/leaf?) | |
42 lower: use lower taxa for filling "NA"s | |
43 """ | |
44 lineage = ncbi.get_lineage(taxid) | |
45 lineage_ranks = ncbi.get_rank(lineage) | |
46 lineage_names = ncbi.get_taxid_translator(lineage, try_synonyms=True) | |
47 if lower: | |
48 lineage.reverse() | |
49 for l in lineage: | |
50 if not lineage_ranks[l] in RANK_IDX: | |
51 continue | |
52 if ranks[RANK_IDX[lineage_ranks[l]]] != "NA": | |
53 continue | |
54 ranks[RANK_IDX[lineage_ranks[l]]] = lineage_names[l] | |
55 | |
56 | |
57 # get command line options | |
58 parser = optparse.OptionParser() | |
59 parser.add_option('-s', '--species', dest="input_species_filename", | |
60 help='Species/taxid list in text format one species in each line') | |
61 parser.add_option('-d', '--database', dest="database", default=None, | |
62 help='ETE sqlite data base to use (default: ~/.etetoolkit/taxa.sqlite)') | |
63 parser.add_option('-o', '--output', dest="output", help='output file name (default: stdout)') | |
64 parser.add_option('-f', dest="full", action="store_true", default=False, | |
65 help='Show all available (named) taxonomic ranks (default: only primary levels)') | |
66 parser.add_option('-c', dest="compress", action="store_true", default=False, | |
67 help='Fill unnamed ranks with super/sub ranks (see -l)') | |
68 parser.add_option('-l', dest="lower", action="store_true", default=False, | |
69 help='Prefer lower levels when compressed') | |
70 parser.add_option('-r', '--rank', dest='ranks', action="append", | |
71 help='include rank - multiple ones can be specified') | |
72 | |
73 options, args = parser.parse_args() | |
74 # check command line options | |
75 if options.input_species_filename is None: | |
76 parser.error("-s option must be specified, Species list in text format one species in each line") | |
77 if options.full and options.ranks: | |
78 parser.error("-f and -r can not be used at the same time") | |
79 if options.ranks: | |
80 for r in options.ranks: | |
81 if r not in LONG_RANKS: | |
82 parser.error("unknown rank %s" % r) | |
83 # setup output | |
84 if not options.output: # if filename is not given | |
85 of = sys.stdout | |
86 else: | |
87 of = open(options.output, "w") | |
88 # load NCBI taxonomy DB | |
89 ncbi = NCBITaxa(dbfile=options.database) | |
90 # get list of ranks that are of interest | |
91 if options.ranks: | |
92 RANKS = [] | |
93 for r in LONG_RANKS: | |
94 if r in options.ranks: | |
95 RANKS.append(r) | |
96 else: | |
97 if options.full: | |
98 RANKS = LONG_RANKS | |
99 else: | |
100 RANKS = SHORT_RANKS | |
101 RANK_IDX = {item: index for index, item in enumerate(RANKS)} | |
102 COMP_RANK_IDX = RANK_IDX | |
103 if options.compress: | |
104 for ir in range(len(RANKS)): | |
105 for ilr in range(len(LONG_RANKS)): | |
106 if RANKS[ir] in LONG_RANKS[ilr]: | |
107 COMP_RANK_IDX[LONG_RANKS[ilr]] = ir | |
108 with open(options.input_species_filename) as f: | |
109 for line in f.readlines(): | |
110 line = line.strip().replace('_', ' ') | |
111 try: | |
112 taxid = int(line) | |
113 except ValueError: | |
114 # TODO: one could use fuzzy name lookup (i.e. accept typos in the species names), | |
115 # but then a pysqlite version that supports this is needed (needs to be enabled | |
116 # during compilation) | |
117 name2tax = ncbi.get_name_translator([line]) | |
118 if line in name2tax: | |
119 taxid = name2tax[line][0] | |
120 else: | |
121 sys.stderr.write("[%s] could not be translated into a taxid!\n" % line) | |
122 continue | |
123 ranks = ["NA"] * len(RANKS) | |
124 process_taxid(ncbi, taxid, ranks, RANK_IDX) | |
125 if options.compress: | |
126 process_taxid(ncbi, taxid, ranks, COMP_RANK_IDX, options.lower) | |
127 of.write("%s\t%s\n" % (line, "\t".join(ranks))) | |
128 of.close() |