Mercurial > repos > earlhaminst > ete
annotate ete_lineage_generator.py @ 12:dc32007a6b36 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit c568584f1eaa1366603b89db7e52994812f5d387
author | earlhaminst |
---|---|
date | Tue, 07 Jun 2022 08:58:05 +0000 |
parents | 2db72467da51 |
children |
rev | line source |
---|---|
2
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
1 import optparse |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
2 import sys |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
3 |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
4 from ete3 import NCBITaxa |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
5 |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
6 # - compared to gi2taxonomy the root is excluded, since |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
7 # the value is always "root", i.e. useless information |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
8 # - additional levels that appear in the ncbi taxdb have |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
9 # been added |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
10 # (order from https://en.wikipedia.org/wiki/Taxonomic_rank#All_ranks) |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
11 # TODO the full list of ranks could be derived from the input DB |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
12 LONG_RANKS = [u"superkingdom", u"kingdom", u"subkingdom", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
13 u"superphylum", u"phylum", u"subphylum", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
14 u"superclass", u"class", u"subclass", "infraclass", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
15 u"cohort", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
16 u"superorder", u"order", u"suborder", u"infraorder", u"parvorder", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
17 u"superfamily", u"family", u"subfamily", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
18 u"tribe", u"subtribe", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
19 u"genus", u"subgenus", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
20 u"species group", u"species subgroup", u"species", u"subspecies", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
21 u"varietas", "forma"] |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
22 |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
23 SHORT_RANKS = [u"kingdom", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
24 u"phylum", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
25 u"class", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
26 u"order", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
27 u"family", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
28 u"genus", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
29 u"species"] |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
30 |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
31 |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
32 def process_taxid(ncbi, taxid, ranks, RANK_IDX, lower=False): |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
33 """ |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
34 process one taxid: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
35 - get lineage (as list of taxids, ranks, and names) |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
36 - reverse the lineage if lower ranks are to be used for filling |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
37 - fill the ranks with the data from the lineage |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
38 ncbi: ete NCBITaxa object |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
39 taxid: a taxid (int) |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
40 ranks: list of ranks (should be initialized with "NA" x number of levels of interest) |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
41 RANK_IDX: mapping from rank names to indices (distance to root/leaf?) |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
42 lower: use lower taxa for filling "NA"s |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
43 """ |
11
2db72467da51
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 67e136d433c0d925db362342919305b46fdffecd"
earlhaminst
parents:
9
diff
changeset
|
44 try: |
2db72467da51
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 67e136d433c0d925db362342919305b46fdffecd"
earlhaminst
parents:
9
diff
changeset
|
45 lineage_taxids = ncbi.get_lineage(taxid) |
2db72467da51
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 67e136d433c0d925db362342919305b46fdffecd"
earlhaminst
parents:
9
diff
changeset
|
46 except ValueError: |
2db72467da51
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 67e136d433c0d925db362342919305b46fdffecd"
earlhaminst
parents:
9
diff
changeset
|
47 sys.stderr.write("[%s] could not determine lineage!\n" % taxid) |
2db72467da51
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 67e136d433c0d925db362342919305b46fdffecd"
earlhaminst
parents:
9
diff
changeset
|
48 return |
2db72467da51
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 67e136d433c0d925db362342919305b46fdffecd"
earlhaminst
parents:
9
diff
changeset
|
49 if lineage_taxids is None: |
2db72467da51
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 67e136d433c0d925db362342919305b46fdffecd"
earlhaminst
parents:
9
diff
changeset
|
50 sys.stderr.write("[%s] could not determine lineage!\n" % taxid) |
2db72467da51
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 67e136d433c0d925db362342919305b46fdffecd"
earlhaminst
parents:
9
diff
changeset
|
51 return |
9
b29ee6a16524
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 17c65045b726d0695814bfe761e534f6521786f1"
earlhaminst
parents:
6
diff
changeset
|
52 lineage_ranks = ncbi.get_rank(lineage_taxids) |
b29ee6a16524
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 17c65045b726d0695814bfe761e534f6521786f1"
earlhaminst
parents:
6
diff
changeset
|
53 lineage_names = ncbi.get_taxid_translator(lineage_taxids, try_synonyms=True) |
2
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
54 if lower: |
9
b29ee6a16524
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 17c65045b726d0695814bfe761e534f6521786f1"
earlhaminst
parents:
6
diff
changeset
|
55 lineage_taxids.reverse() |
b29ee6a16524
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 17c65045b726d0695814bfe761e534f6521786f1"
earlhaminst
parents:
6
diff
changeset
|
56 for parent_taxid in lineage_taxids: |
b29ee6a16524
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 17c65045b726d0695814bfe761e534f6521786f1"
earlhaminst
parents:
6
diff
changeset
|
57 parent_rank = lineage_ranks[parent_taxid] |
b29ee6a16524
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 17c65045b726d0695814bfe761e534f6521786f1"
earlhaminst
parents:
6
diff
changeset
|
58 if parent_rank not in RANK_IDX: |
2
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
59 continue |
9
b29ee6a16524
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 17c65045b726d0695814bfe761e534f6521786f1"
earlhaminst
parents:
6
diff
changeset
|
60 parent_rank_index = RANK_IDX[parent_rank] |
b29ee6a16524
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 17c65045b726d0695814bfe761e534f6521786f1"
earlhaminst
parents:
6
diff
changeset
|
61 if ranks[parent_rank_index] != "NA": |
2
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
62 continue |
9
b29ee6a16524
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 17c65045b726d0695814bfe761e534f6521786f1"
earlhaminst
parents:
6
diff
changeset
|
63 ranks[parent_rank_index] = lineage_names[parent_taxid] |
2
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
64 |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
65 |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
66 # get command line options |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
67 parser = optparse.OptionParser() |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
68 parser.add_option('-s', '--species', dest="input_species_filename", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
69 help='Species/taxid list in text format one species in each line') |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
70 parser.add_option('-d', '--database', dest="database", default=None, |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
71 help='ETE sqlite data base to use (default: ~/.etetoolkit/taxa.sqlite)') |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
72 parser.add_option('-o', '--output', dest="output", help='output file name (default: stdout)') |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
73 parser.add_option('-f', dest="full", action="store_true", default=False, |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
74 help='Show all available (named) taxonomic ranks (default: only primary levels)') |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
75 parser.add_option('-c', dest="compress", action="store_true", default=False, |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
76 help='Fill unnamed ranks with super/sub ranks (see -l)') |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
77 parser.add_option('-l', dest="lower", action="store_true", default=False, |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
78 help='Prefer lower levels when compressed') |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
79 parser.add_option('-r', '--rank', dest='ranks', action="append", |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
80 help='include rank - multiple ones can be specified') |
6
f1eca1158f21
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents:
4
diff
changeset
|
81 parser.add_option('-i', '--includeid', dest="addid", action="store_true", default=False, |
f1eca1158f21
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents:
4
diff
changeset
|
82 help='add taxid column') |
2
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
83 |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
84 options, args = parser.parse_args() |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
85 # check command line options |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
86 if options.input_species_filename is None: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
87 parser.error("-s option must be specified, Species list in text format one species in each line") |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
88 if options.full and options.ranks: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
89 parser.error("-f and -r can not be used at the same time") |
4
87b6de3ef63e
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 41e40314a9b25a9f3c06a13422d367b68334f593
earlhaminst
parents:
2
diff
changeset
|
90 |
2
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
91 if options.ranks: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
92 for r in options.ranks: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
93 if r not in LONG_RANKS: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
94 parser.error("unknown rank %s" % r) |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
95 # setup output |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
96 if not options.output: # if filename is not given |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
97 of = sys.stdout |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
98 else: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
99 of = open(options.output, "w") |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
100 # load NCBI taxonomy DB |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
101 ncbi = NCBITaxa(dbfile=options.database) |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
102 # get list of ranks that are of interest |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
103 if options.ranks: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
104 RANKS = [] |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
105 for r in LONG_RANKS: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
106 if r in options.ranks: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
107 RANKS.append(r) |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
108 else: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
109 if options.full: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
110 RANKS = LONG_RANKS |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
111 else: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
112 RANKS = SHORT_RANKS |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
113 RANK_IDX = {item: index for index, item in enumerate(RANKS)} |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
114 COMP_RANK_IDX = RANK_IDX |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
115 if options.compress: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
116 for ir in range(len(RANKS)): |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
117 for ilr in range(len(LONG_RANKS)): |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
118 if RANKS[ir] in LONG_RANKS[ilr]: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
119 COMP_RANK_IDX[LONG_RANKS[ilr]] = ir |
6
f1eca1158f21
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents:
4
diff
changeset
|
120 |
4
87b6de3ef63e
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 41e40314a9b25a9f3c06a13422d367b68334f593
earlhaminst
parents:
2
diff
changeset
|
121 # write header |
6
f1eca1158f21
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents:
4
diff
changeset
|
122 of.write("# query") |
f1eca1158f21
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents:
4
diff
changeset
|
123 if options.addid: |
f1eca1158f21
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents:
4
diff
changeset
|
124 of.write("\ttaxid") |
f1eca1158f21
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents:
4
diff
changeset
|
125 of.write("\t%s\n" % ("\t".join(RANKS))) |
f1eca1158f21
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents:
4
diff
changeset
|
126 |
4
87b6de3ef63e
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 41e40314a9b25a9f3c06a13422d367b68334f593
earlhaminst
parents:
2
diff
changeset
|
127 # get and write data |
2
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
128 with open(options.input_species_filename) as f: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
129 for line in f.readlines(): |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
130 line = line.strip().replace('_', ' ') |
11
2db72467da51
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 67e136d433c0d925db362342919305b46fdffecd"
earlhaminst
parents:
9
diff
changeset
|
131 if line == "": |
2db72467da51
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 67e136d433c0d925db362342919305b46fdffecd"
earlhaminst
parents:
9
diff
changeset
|
132 continue |
2
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
133 try: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
134 taxid = int(line) |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
135 except ValueError: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
136 # TODO: one could use fuzzy name lookup (i.e. accept typos in the species names), |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
137 # but then a pysqlite version that supports this is needed (needs to be enabled |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
138 # during compilation) |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
139 name2tax = ncbi.get_name_translator([line]) |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
140 if line in name2tax: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
141 taxid = name2tax[line][0] |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
142 else: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
143 sys.stderr.write("[%s] could not be translated into a taxid!\n" % line) |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
144 continue |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
145 ranks = ["NA"] * len(RANKS) |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
146 process_taxid(ncbi, taxid, ranks, RANK_IDX) |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
147 if options.compress: |
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
148 process_taxid(ncbi, taxid, ranks, COMP_RANK_IDX, options.lower) |
6
f1eca1158f21
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents:
4
diff
changeset
|
149 of.write("%s" % line) |
f1eca1158f21
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents:
4
diff
changeset
|
150 if options.addid: |
f1eca1158f21
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents:
4
diff
changeset
|
151 of.write("\t%d" % taxid) |
f1eca1158f21
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
earlhaminst
parents:
4
diff
changeset
|
152 of.write("\t%s\n" % "\t".join(ranks)) |
2
03c10736e497
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 91b634b8f9b131045bbbbf43cc8edbea59ac686b-dirty
earlhaminst
parents:
diff
changeset
|
153 of.close() |