Mercurial > repos > iuc > kraken_taxonomy_report
comparison kraken_taxonomy_report.py @ 1:b97694b21bc3 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/kraken_taxonomy_report/ commit 3265247e909410db2a6d6087a2c0d3a9885c120c
author | iuc |
---|---|
date | Wed, 23 Nov 2016 03:27:33 -0500 |
parents | 3f1a0d47ea8d |
children | 528a1d91b066 |
comparison
equal
deleted
inserted
replaced
0:3f1a0d47ea8d | 1:b97694b21bc3 |
---|---|
4 # and optionally creates a newick Tree | 4 # and optionally creates a newick Tree |
5 # Copyright (c) 2016 Daniel Blankenberg | 5 # Copyright (c) 2016 Daniel Blankenberg |
6 # Licensed under the Academic Free License version 3.0 | 6 # Licensed under the Academic Free License version 3.0 |
7 # https://github.com/blankenberg/Kraken-Taxonomy-Report | 7 # https://github.com/blankenberg/Kraken-Taxonomy-Report |
8 | 8 |
9 from __future__ import print_function | |
10 | |
9 import sys | 11 import sys |
10 import os | 12 import os |
11 import optparse | 13 import optparse |
12 import re | 14 import re |
13 | 15 |
14 __VERSION__ = '0.0.1' | 16 __VERSION__ = '0.0.2' |
15 | 17 |
16 __URL__ = "https://github.com/blankenberg/Kraken-Taxonomy-Report" | 18 __URL__ = "https://github.com/blankenberg/Kraken-Taxonomy-Report" |
17 | 19 |
18 # Rank names were pulled from ncbi nodes.dmp on 02/02/2016 | 20 # Rank names were pulled from ncbi nodes.dmp on 02/02/2016 |
19 # cat nodes.dmp | cut -f 5 | sort | uniq | 21 # cat nodes.dmp | cut -f 5 | sort | uniq |
80 | 82 |
81 def load_taxonomy( db_path, sanitize_names=False ): | 83 def load_taxonomy( db_path, sanitize_names=False ): |
82 child_lists = {} | 84 child_lists = {} |
83 name_map = {} | 85 name_map = {} |
84 rank_map = {} | 86 rank_map = {} |
87 names = {} # Store names here to look for duplicates (id, True/False name fixed) | |
85 with open( os.path.join( db_path, "taxonomy/names.dmp" ) ) as fh: | 88 with open( os.path.join( db_path, "taxonomy/names.dmp" ) ) as fh: |
86 for line in fh: | 89 for line in fh: |
87 line = line.rstrip( "\n\r" ) | 90 line = line.rstrip( "\n\r" ) |
88 if line.endswith( "\t|" ): | 91 if line.endswith( "\t|" ): |
89 line = line[:-2] | 92 line = line[:-2] |
92 name = fields[1] | 95 name = fields[1] |
93 if sanitize_names: | 96 if sanitize_names: |
94 name = NAME_RE.sub( NAME_REPL, name ) | 97 name = NAME_RE.sub( NAME_REPL, name ) |
95 name_type = fields[3] | 98 name_type = fields[3] |
96 if name_type == "scientific name": | 99 if name_type == "scientific name": |
100 if name in names: | |
101 print( 'Warning: name "%s" found at node "%s" but already exists originally for node "%s".' % ( name, node_id, names[name][0] ), file=sys.stderr ) | |
102 new_name = "%s_%s" % ( name, node_id ) | |
103 print( 'Transforming node "%s" named "%s" to "%s".' % ( node_id, name, new_name ), file=sys.stderr ) | |
104 assert new_name not in names, 'Transformed Name "%s" already exists. Cannot recover at this time.' % new_name | |
105 if not names[name][1]: | |
106 orig_new_name = "%s_%s" % ( name, names[name][0] ) | |
107 print( 'Transforming node "%s" named "%s" to "%s".' % ( names[name][0], name, orig_new_name ), file=sys.stderr ) | |
108 assert orig_new_name not in names, 'Transformed Name "%s" already exists. Cannot recover at this time.' % orig_new_name | |
109 name_map[names[name][0]] = orig_new_name | |
110 names[name] = ( names[name][0], True ) | |
111 name = new_name | |
112 else: | |
113 names[name] = ( node_id, False ) | |
97 name_map[ node_id ] = name | 114 name_map[ node_id ] = name |
98 | 115 |
99 with open( os.path.join( db_path, "taxonomy/nodes.dmp" ) ) as fh: | 116 with open( os.path.join( db_path, "taxonomy/nodes.dmp" ) ) as fh: |
100 for line in fh: | 117 for line in fh: |
101 line = line.rstrip( "\n\r" ) | 118 line = line.rstrip( "\n\r" ) |
103 node_id = fields[0] | 120 node_id = fields[0] |
104 parent_id = fields[1] | 121 parent_id = fields[1] |
105 rank = RANK_NAME_TO_INTS.get( fields[2].lower(), None ) | 122 rank = RANK_NAME_TO_INTS.get( fields[2].lower(), None ) |
106 if rank is None: | 123 if rank is None: |
107 # This should never happen, unless new taxonomy ranks are created | 124 # This should never happen, unless new taxonomy ranks are created |
108 print >> sys.stderr, 'Unrecognized rank: Node "%s" is "%s", setting to "%s"' % ( node_id, fields[2], NO_RANK_NAME ) | 125 print( 'Unrecognized rank: Node "%s" is "%s", setting to "%s"' % ( node_id, fields[2], NO_RANK_NAME ), file=sys.stderr ) |
109 rank = NO_RANK_INT | 126 rank = NO_RANK_INT |
110 if node_id == '1': | 127 if node_id == '1': |
111 parent_id = '0' | 128 parent_id = '0' |
112 if parent_id not in child_lists: | 129 if parent_id not in child_lists: |
113 child_lists[ parent_id ] = [] | 130 child_lists[ parent_id ] = [] |
123 dfs_summation( child, counts, child_lists ) | 140 dfs_summation( child, counts, child_lists ) |
124 counts[ node ] = counts.get( node, 0 ) + counts.get( child, 0 ) | 141 counts[ node ] = counts.get( node, 0 ) + counts.get( child, 0 ) |
125 | 142 |
126 | 143 |
127 def dfs_report( node, file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=None, tax=None ): | 144 def dfs_report( node, file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=None, tax=None ): |
128 if not options.summation and ( not options.show_zeros and node not in hit_taxa ): | |
129 return | |
130 rank_int = rank_map[node] | 145 rank_int = rank_map[node] |
131 code = RANK_INT_TO_CODE.get( rank_int, NO_RANK_CODE ) | 146 code = RANK_INT_TO_CODE.get( rank_int, NO_RANK_CODE ) |
132 if ( code != NO_RANK_CODE or options.intermediate ) and ( options.show_zeros or node in hit_taxa): | 147 if ( code != NO_RANK_CODE or options.intermediate ) and ( options.show_zeros or node in hit_taxa): |
133 if name is None: | 148 if name is None: |
134 name = "" | 149 name = "" |
208 parser.add_option( '', '--db', dest='db', action='store', type="string", default=None, help='Name of Kraken database' ) | 223 parser.add_option( '', '--db', dest='db', action='store', type="string", default=None, help='Name of Kraken database' ) |
209 parser.add_option( '', '--output', dest='output', action='store', type="string", default=None, help='Name of output file' ) | 224 parser.add_option( '', '--output', dest='output', action='store', type="string", default=None, help='Name of output file' ) |
210 parser.add_option( '', '--output-tree', dest='output_tree', action='store', type="string", default=None, help='Name of output file to place newick tree' ) | 225 parser.add_option( '', '--output-tree', dest='output_tree', action='store', type="string", default=None, help='Name of output file to place newick tree' ) |
211 (options, args) = parser.parse_args() | 226 (options, args) = parser.parse_args() |
212 if options.version: | 227 if options.version: |
213 print >> sys.stderr, "Kraken Taxonomy Report (%s) version %s" % ( __URL__, __VERSION__ ) | 228 print( "Kraken Taxonomy Report (%s) version %s" % ( __URL__, __VERSION__ ), file=sys.stderr ) |
214 sys.exit() | 229 sys.exit() |
215 if not args: | 230 if not args: |
216 print >> sys.stderr, parser.get_usage() | 231 print( parser.get_usage(), file=sys.stderr ) |
217 sys.exit() | 232 sys.exit() |
218 | 233 |
219 if options.cluster: | 234 if options.cluster: |
220 cluster_name = options.cluster.lower() | 235 cluster_name = options.cluster.lower() |
221 cluster = RANK_NAME_TO_INTS.get( cluster_name, None ) | 236 cluster = RANK_NAME_TO_INTS.get( cluster_name, None ) |