diff tools/maf/maf_to_interval.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_to_interval.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+
+"""
+Read a maf and output intervals for a specified list of species.
+"""
+import sys, os
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.align import maf
+from galaxy.tools.util import maf_utilities
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():
+    """
+    Write one tab-separated interval file per requested species.
+
+    Positional command-line arguments (sys.argv[1:]):
+      1. input_filename   -- MAF file to read
+      2. output_filename  -- output file for the primary species
+      3. output_id        -- id embedded in the additional output file names
+      4. database_tmp_dir -- directory where additional output files are created
+      5. primary_spec     -- primary species; added to both species lists when missing
+      6. species          -- comma-separated species that each get an output file;
+                             the list is emptied if "None" is one of its entries
+      7. all_species      -- comma-separated species whose sequences form the
+                             trailing columns, in sorted order
+      8. partial          -- "partial_disallowed" skips blocks with fewer
+                             components than there are entries in all_species
+      9. keep_gaps        -- "remove_gaps" strips '-' from the sequence columns;
+                             any other value leaves the text untouched
+
+    Every output file starts with a '#'-prefixed header line. For each split
+    block, a line (chrom, start, end, strand, score, name, sequence columns) is
+    written to the file of every requested species for which the block returns
+    a component; a sequence column is empty when the block has no text for it.
+
+    All files are closed even if processing fails part way through.
+    """
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    output_id = sys.argv[3]
+    #where to store files that become additional output
+    database_tmp_dir = sys.argv[4]
+    primary_spec = sys.argv[5]
+    species = sys.argv[6].split( ',' )
+    all_species = sys.argv[7].split( ',' )
+    #hoisted: these two flags never change while the blocks are processed
+    skip_partial = ( sys.argv[8] == "partial_disallowed" )
+    remove_gaps = ( sys.argv[9] == 'remove_gaps' )
+    out_files = {}
+    file_in = None
+
+    if "None" in species:
+        species = []
+
+    if primary_spec not in species:
+        species.append( primary_spec )
+    if primary_spec not in all_species:
+        all_species.append( primary_spec )
+
+    all_species.sort()
+    num_species = len( all_species )
+
+    #try/finally (not 'with') keeps the script usable on Python 2.4
+    try:
+        for spec in species:
+            if spec == primary_spec:
+                out_files[ spec ] = open( output_filename, 'wb+' )
+            else:
+                out_files[ spec ] = open( os.path.join( database_tmp_dir, 'primary_%s_%s_visible_interval_%s' % ( output_id, spec, spec ) ), 'wb+' )
+            out_files[ spec ].write( '#chrom\tstart\tend\tstrand\tscore\tname\t%s\n' % ( '\t'.join( all_species ) ) )
+
+        file_in = open( input_filename, 'r' )
+        maf_reader = maf.Reader( file_in )
+
+        for i, m in enumerate( maf_reader ):
+            for j, block in enumerate( maf_utilities.iter_blocks_split_by_species( m ) ):
+                if skip_partial and len( block.components ) < num_species:
+                    continue
+                text_by_spec = {}
+                for c in block.components:
+                    spec, chrom = maf_utilities.src_split( c.src )
+                    if remove_gaps:
+                        text_by_spec[ spec ] = c.text.replace( '-', '' )
+                    else:
+                        text_by_spec[ spec ] = c.text
+                sequences = '\t'.join( [ text_by_spec.get( spec, '' ) for spec in all_species ] )
+                for spec in species:
+                    c = block.get_component_by_src_start( spec )
+                    if c is None:
+                        continue
+                    spec2, chrom = maf_utilities.src_split( c.src )
+                    #explicit raise: a bare assert would be stripped by 'python -O'
+                    if spec2 != spec:
+                        raise AssertionError( 'Species name inconsistancy found in component: %s != %s' % ( spec, spec2 ) )
+                    out_files[ spec ].write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( chrom, c.forward_strand_start, c.forward_strand_end, c.strand, m.score, "%s_%s_%s" % (spec, i, j), sequences ) )
+    finally:
+        if file_in is not None:
+            file_in.close()
+        for file_out in out_files.values():
+            file_out.close()
+
+if __name__ == "__main__": __main__()