diff kraken_taxonomy_report.py @ 1:b97694b21bc3 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/kraken_taxonomy_report/ commit 3265247e909410db2a6d6087a2c0d3a9885c120c
author iuc
date Wed, 23 Nov 2016 03:27:33 -0500
parents 3f1a0d47ea8d
children 528a1d91b066
line wrap: on
line diff
--- a/kraken_taxonomy_report.py	Wed Jun 01 17:25:40 2016 -0400
+++ b/kraken_taxonomy_report.py	Wed Nov 23 03:27:33 2016 -0500
@@ -6,12 +6,14 @@
 # Licensed under the Academic Free License version 3.0
 # https://github.com/blankenberg/Kraken-Taxonomy-Report
 
+from __future__ import print_function
+
 import sys
 import os
 import optparse
 import re
 
-__VERSION__ = '0.0.1'
+__VERSION__ = '0.0.2'
 
 __URL__ = "https://github.com/blankenberg/Kraken-Taxonomy-Report"
 
@@ -82,6 +84,7 @@
     child_lists = {}
     name_map = {}
     rank_map = {}
+    names = {}  # Store names here to look for duplicates (id, True/False name fixed)
     with open( os.path.join( db_path, "taxonomy/names.dmp" ) ) as fh:
         for line in fh:
             line = line.rstrip( "\n\r" )
@@ -94,6 +97,20 @@
                 name = NAME_RE.sub( NAME_REPL, name )
             name_type = fields[3]
             if name_type == "scientific name":
+                if name in names:
+                    print( 'Warning: name "%s" found at node "%s" but already exists originally for node "%s".' % ( name, node_id, names[name][0] ), file=sys.stderr )
+                    new_name = "%s_%s" % ( name, node_id )
+                    print( 'Transforming node "%s" named "%s" to "%s".' % ( node_id, name, new_name ), file=sys.stderr )
+                    assert new_name not in names, 'Transformed Name "%s" already exists. Cannot recover at this time.' % new_name
+                    if not names[name][1]:
+                        orig_new_name = "%s_%s" % ( name, names[name][0] )
+                        print( 'Transforming node "%s" named "%s" to "%s".' % ( names[name][0], name, orig_new_name ), file=sys.stderr )
+                        assert orig_new_name not in names, 'Transformed Name "%s" already exists. Cannot recover at this time.' % orig_new_name
+                        name_map[names[name][0]] = orig_new_name
+                        names[name] = ( names[name][0], True )
+                    name = new_name
+                else:
+                    names[name] = ( node_id, False )
                 name_map[ node_id ] = name
 
     with open( os.path.join( db_path, "taxonomy/nodes.dmp" ) ) as fh:
@@ -105,7 +122,7 @@
             rank = RANK_NAME_TO_INTS.get( fields[2].lower(), None )
             if rank is None:
                 # This should never happen, unless new taxonomy ranks are created
-                print >> sys.stderr, 'Unrecognized rank: Node "%s" is "%s", setting to "%s"' % ( node_id, fields[2], NO_RANK_NAME )
+                print( 'Unrecognized rank: Node "%s" is "%s", setting to "%s"' % ( node_id, fields[2], NO_RANK_NAME ), file=sys.stderr )
                 rank = NO_RANK_INT
             if node_id == '1':
                 parent_id = '0'
@@ -125,8 +142,6 @@
 
 
 def dfs_report( node, file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=None, tax=None ):
-    if not options.summation and ( not options.show_zeros and node not in hit_taxa ):
-        return
     rank_int = rank_map[node]
     code = RANK_INT_TO_CODE.get( rank_int, NO_RANK_CODE )
     if ( code != NO_RANK_CODE or options.intermediate ) and ( options.show_zeros or node in hit_taxa):
@@ -210,10 +225,10 @@
     parser.add_option( '', '--output-tree', dest='output_tree', action='store', type="string", default=None, help='Name of output file to place newick tree' )
     (options, args) = parser.parse_args()
     if options.version:
-        print >> sys.stderr, "Kraken Taxonomy Report (%s) version %s" % ( __URL__, __VERSION__ )
+        print( "Kraken Taxonomy Report (%s) version %s" % ( __URL__, __VERSION__ ), file=sys.stderr )
         sys.exit()
     if not args:
-        print >> sys.stderr, parser.get_usage()
+        print( parser.get_usage(), file=sys.stderr )
         sys.exit()
 
     if options.cluster: