comparison kraken_taxonomy_report.py @ 1:b97694b21bc3 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/kraken_taxonomy_report/ commit 3265247e909410db2a6d6087a2c0d3a9885c120c
author iuc
date Wed, 23 Nov 2016 03:27:33 -0500
parents 3f1a0d47ea8d
children 528a1d91b066
comparison
equal deleted inserted replaced
0:3f1a0d47ea8d 1:b97694b21bc3
4 # and optionally creates a newick Tree 4 # and optionally creates a newick Tree
5 # Copyright (c) 2016 Daniel Blankenberg 5 # Copyright (c) 2016 Daniel Blankenberg
6 # Licensed under the Academic Free License version 3.0 6 # Licensed under the Academic Free License version 3.0
7 # https://github.com/blankenberg/Kraken-Taxonomy-Report 7 # https://github.com/blankenberg/Kraken-Taxonomy-Report
8 8
9 from __future__ import print_function
10
9 import sys 11 import sys
10 import os 12 import os
11 import optparse 13 import optparse
12 import re 14 import re
13 15
14 __VERSION__ = '0.0.1' 16 __VERSION__ = '0.0.2'
15 17
16 __URL__ = "https://github.com/blankenberg/Kraken-Taxonomy-Report" 18 __URL__ = "https://github.com/blankenberg/Kraken-Taxonomy-Report"
17 19
18 # Rank names were pulled from ncbi nodes.dmp on 02/02/2016 20 # Rank names were pulled from ncbi nodes.dmp on 02/02/2016
19 # cat nodes.dmp | cut -f 5 | sort | uniq 21 # cat nodes.dmp | cut -f 5 | sort | uniq
80 82
81 def load_taxonomy( db_path, sanitize_names=False ): 83 def load_taxonomy( db_path, sanitize_names=False ):
82 child_lists = {} 84 child_lists = {}
83 name_map = {} 85 name_map = {}
84 rank_map = {} 86 rank_map = {}
87 names = {} # Store names here to look for duplicates (id, True/False name fixed)
85 with open( os.path.join( db_path, "taxonomy/names.dmp" ) ) as fh: 88 with open( os.path.join( db_path, "taxonomy/names.dmp" ) ) as fh:
86 for line in fh: 89 for line in fh:
87 line = line.rstrip( "\n\r" ) 90 line = line.rstrip( "\n\r" )
88 if line.endswith( "\t|" ): 91 if line.endswith( "\t|" ):
89 line = line[:-2] 92 line = line[:-2]
92 name = fields[1] 95 name = fields[1]
93 if sanitize_names: 96 if sanitize_names:
94 name = NAME_RE.sub( NAME_REPL, name ) 97 name = NAME_RE.sub( NAME_REPL, name )
95 name_type = fields[3] 98 name_type = fields[3]
96 if name_type == "scientific name": 99 if name_type == "scientific name":
100 if name in names:
101 print( 'Warning: name "%s" found at node "%s" but already exists originally for node "%s".' % ( name, node_id, names[name][0] ), file=sys.stderr )
102 new_name = "%s_%s" % ( name, node_id )
103 print( 'Transforming node "%s" named "%s" to "%s".' % ( node_id, name, new_name ), file=sys.stderr )
104 assert new_name not in names, 'Transformed Name "%s" already exists. Cannot recover at this time.' % new_name
105 if not names[name][1]:
106 orig_new_name = "%s_%s" % ( name, names[name][0] )
107 print( 'Transforming node "%s" named "%s" to "%s".' % ( names[name][0], name, orig_new_name ), file=sys.stderr )
108 assert orig_new_name not in names, 'Transformed Name "%s" already exists. Cannot recover at this time.' % orig_new_name
109 name_map[names[name][0]] = orig_new_name
110 names[name] = ( names[name][0], True )
111 name = new_name
112 else:
113 names[name] = ( node_id, False )
97 name_map[ node_id ] = name 114 name_map[ node_id ] = name
98 115
99 with open( os.path.join( db_path, "taxonomy/nodes.dmp" ) ) as fh: 116 with open( os.path.join( db_path, "taxonomy/nodes.dmp" ) ) as fh:
100 for line in fh: 117 for line in fh:
101 line = line.rstrip( "\n\r" ) 118 line = line.rstrip( "\n\r" )
103 node_id = fields[0] 120 node_id = fields[0]
104 parent_id = fields[1] 121 parent_id = fields[1]
105 rank = RANK_NAME_TO_INTS.get( fields[2].lower(), None ) 122 rank = RANK_NAME_TO_INTS.get( fields[2].lower(), None )
106 if rank is None: 123 if rank is None:
107 # This should never happen, unless new taxonomy ranks are created 124 # This should never happen, unless new taxonomy ranks are created
108 print >> sys.stderr, 'Unrecognized rank: Node "%s" is "%s", setting to "%s"' % ( node_id, fields[2], NO_RANK_NAME ) 125 print( 'Unrecognized rank: Node "%s" is "%s", setting to "%s"' % ( node_id, fields[2], NO_RANK_NAME ), file=sys.stderr )
109 rank = NO_RANK_INT 126 rank = NO_RANK_INT
110 if node_id == '1': 127 if node_id == '1':
111 parent_id = '0' 128 parent_id = '0'
112 if parent_id not in child_lists: 129 if parent_id not in child_lists:
113 child_lists[ parent_id ] = [] 130 child_lists[ parent_id ] = []
123 dfs_summation( child, counts, child_lists ) 140 dfs_summation( child, counts, child_lists )
124 counts[ node ] = counts.get( node, 0 ) + counts.get( child, 0 ) 141 counts[ node ] = counts.get( node, 0 ) + counts.get( child, 0 )
125 142
126 143
127 def dfs_report( node, file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=None, tax=None ): 144 def dfs_report( node, file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=None, tax=None ):
128 if not options.summation and ( not options.show_zeros and node not in hit_taxa ):
129 return
130 rank_int = rank_map[node] 145 rank_int = rank_map[node]
131 code = RANK_INT_TO_CODE.get( rank_int, NO_RANK_CODE ) 146 code = RANK_INT_TO_CODE.get( rank_int, NO_RANK_CODE )
132 if ( code != NO_RANK_CODE or options.intermediate ) and ( options.show_zeros or node in hit_taxa): 147 if ( code != NO_RANK_CODE or options.intermediate ) and ( options.show_zeros or node in hit_taxa):
133 if name is None: 148 if name is None:
134 name = "" 149 name = ""
208 parser.add_option( '', '--db', dest='db', action='store', type="string", default=None, help='Name of Kraken database' ) 223 parser.add_option( '', '--db', dest='db', action='store', type="string", default=None, help='Name of Kraken database' )
209 parser.add_option( '', '--output', dest='output', action='store', type="string", default=None, help='Name of output file' ) 224 parser.add_option( '', '--output', dest='output', action='store', type="string", default=None, help='Name of output file' )
210 parser.add_option( '', '--output-tree', dest='output_tree', action='store', type="string", default=None, help='Name of output file to place newick tree' ) 225 parser.add_option( '', '--output-tree', dest='output_tree', action='store', type="string", default=None, help='Name of output file to place newick tree' )
211 (options, args) = parser.parse_args() 226 (options, args) = parser.parse_args()
212 if options.version: 227 if options.version:
213 print >> sys.stderr, "Kraken Taxonomy Report (%s) version %s" % ( __URL__, __VERSION__ ) 228 print( "Kraken Taxonomy Report (%s) version %s" % ( __URL__, __VERSION__ ), file=sys.stderr )
214 sys.exit() 229 sys.exit()
215 if not args: 230 if not args:
216 print >> sys.stderr, parser.get_usage() 231 print( parser.get_usage(), file=sys.stderr )
217 sys.exit() 232 sys.exit()
218 233
219 if options.cluster: 234 if options.cluster:
220 cluster_name = options.cluster.lower() 235 cluster_name = options.cluster.lower()
221 cluster = RANK_NAME_TO_INTS.get( cluster_name, None ) 236 cluster = RANK_NAME_TO_INTS.get( cluster_name, None )