diff tools/maf/vcf_to_maf_customtrack.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/vcf_to_maf_customtrack.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,151 @@
+#Dan Blankenberg
+from optparse import OptionParser
+import sys
+import galaxy_utils.sequence.vcf
+
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+import bx.align.maf
+
+UNKNOWN_NUCLEOTIDE = '*'
+
+class PopulationVCFParser( object ):
+    def __init__( self, reader, name ):
+        self.reader = reader
+        self.name = name
+        self.counter = 0
+    def next( self ):
+        rval = []
+        vc = self.reader.next()
+        for i, allele in enumerate( vc.alt ):
+            rval.append( ( '%s_%i.%i' % ( self.name, i + 1, self.counter + 1 ), allele ) )
+        self.counter += 1
+        return ( vc, rval )
+    def __iter__( self ):
+        while True:
+            yield self.next()
+
+class SampleVCFParser( object ):
+    def __init__( self, reader ):
+        self.reader = reader
+        self.counter = 0
+    def next( self ):
+        rval = []
+        vc = self.reader.next()
+        alleles = [ vc.ref ] + vc.alt
+        
+        if 'GT' in vc.format:
+            gt_index = vc.format.index( 'GT' )
+            for sample_name, sample_value in zip( vc.sample_names, vc.sample_values ):
+                gt_indexes = []
+                for i in sample_value[ gt_index ].replace( '|', '/' ).replace( '\\', '/' ).split( '/' ): #Do we need to consider phase here?
+                    try:
+                        gt_indexes.append( int( i ) )
+                    except:
+                        gt_indexes.append( None )
+                for i, allele_i in enumerate( gt_indexes ):
+                    if allele_i is not None:
+                        rval.append( ( '%s_%i.%i' % ( sample_name, i + 1, self.counter + 1 ), alleles[ allele_i ] ) )
+        self.counter += 1
+        return ( vc, rval )
+    def __iter__( self ):
+        while True:
+            yield self.next()
+
+def main():
+    usage = "usage: %prog [options] output_file dbkey inputfile pop_name"
+    parser = OptionParser( usage=usage )
+    parser.add_option( "-p", "--population", action="store_true", dest="population", default=False, help="Create MAF on a per population basis")
+    parser.add_option( "-s", "--sample", action="store_true", dest="sample", default=False, help="Create MAF on a per sample basis")
+    parser.add_option( "-n", "--name", dest="name", default='Unknown Custom Track', help="Name for Custom Track")
+    parser.add_option( "-g", "--galaxy", action="store_true", dest="galaxy", default=False, help="Tool is being executed by Galaxy (adds extra error messaging).")
+    
+
+    ( options, args ) = parser.parse_args()
+    
+    if len ( args ) < 3:
+        if options.galaxy:
+            print >>sys.stderr, "It appears that you forgot to specify an input VCF file, click 'Add new VCF...' to add at least input.\n"
+        parser.error( "Need to specify an output file, a dbkey and at least one input file" )
+    
+    if not ( options.population ^ options.sample ):
+        parser.error( 'You must specify either a per population conversion or a per sample conversion, but not both' )
+    
+    out = open( args.pop(0), 'wb' )
+    out.write( 'track name="%s" visibility=pack\n' %  options.name.replace( "\"", "'" ) )
+    
+    maf_writer = bx.align.maf.Writer( out )
+    
+    dbkey = args.pop(0)
+    
+    vcf_files = []
+    if options.population:
+        i = 0
+        while args:
+            filename = args.pop( 0 )
+            pop_name = args.pop( 0 ).replace( ' ', '_' )
+            if not pop_name:
+                pop_name = 'population_%i' % ( i + 1 )
+            vcf_files.append( PopulationVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ), pop_name  ) )
+            i += 1
+    else:
+        while args:
+            filename = args.pop( 0 )
+            vcf_files.append( SampleVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ) ) )
+    
+    non_spec_skipped = 0
+    for vcf_file in vcf_files:
+        for vc, variants in vcf_file:
+            num_ins = 0
+            num_dels = 0
+            for variant_name, variant_text in variants:
+                if 'D' in variant_text:
+                    num_dels = max( num_dels, int( variant_text[1:] ) )
+                elif 'I' in variant_text:
+                    num_ins = max( num_ins, len( variant_text ) - 1 )
+            
+            alignment = bx.align.maf.Alignment()
+            ref_text = vc.ref + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) )
+            start_pos = vc.pos - 1
+            if num_dels and start_pos:
+                ref_text = UNKNOWN_NUCLEOTIDE + ref_text
+                start_pos -= 1
+            alignment.add_component( bx.align.maf.Component( src='%s.%s%s' % (
+                 dbkey, ("chr" if not vc.chrom.startswith("chr") else ""), vc.chrom ),
+                 start = start_pos, size = len( ref_text.replace( '-', '' ) ),
+                 strand = '+', src_size = start_pos + len( ref_text ),
+                 text = ref_text ) )
+            for variant_name, variant_text in variants:
+                #FIXME:
+                ## skip non-spec. compliant data, see: http://1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3 for format spec
+                ## this check is due to data having indels not represented in the published format spec, 
+                ## e.g. 1000 genomes pilot 1 indel data: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/indels/CEU.SRP000031.2010_03.indels.sites.vcf.gz
+                if variant_text and variant_text[0] in [ '-', '+' ]:
+                    non_spec_skipped += 1
+                    continue
+                
+                #do we need a left padding unknown nucleotide (do we have deletions)?
+                if num_dels and start_pos:
+                    var_text = UNKNOWN_NUCLEOTIDE
+                else:
+                    var_text = ''
+                if 'D' in variant_text:
+                    cur_num_del = int( variant_text[1:] )
+                    pre_del = min( len( vc.ref ), cur_num_del )
+                    post_del = cur_num_del - pre_del
+                    var_text = var_text + '-' * pre_del + '-' * num_ins + '-' * post_del
+                    var_text = var_text + UNKNOWN_NUCLEOTIDE * ( len( ref_text ) - len( var_text ) )
+                elif 'I' in variant_text:
+                    cur_num_ins = len( variant_text ) - 1
+                    var_text = var_text + vc.ref + variant_text[1:] + '-' * ( num_ins - cur_num_ins ) + UNKNOWN_NUCLEOTIDE * max( 0, ( num_dels - 1 ) )
+                else:
+                    var_text = var_text + variant_text + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) )    
+                alignment.add_component( bx.align.maf.Component( src=variant_name, start = 0, size = len( var_text.replace( '-', '' ) ), strand = '+', src_size = len( var_text.replace( '-', '' ) ), text = var_text ) )
+            maf_writer.write( alignment )
+
+    maf_writer.close()
+    
+    if non_spec_skipped:
+        print 'Skipped %i non-specification compliant indels.' % non_spec_skipped
+
+if __name__ == "__main__": main()