Mercurial > repos > xuebing > sharplabtool
comparison tools/maf/vcf_to_maf_customtrack.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #Dan Blankenberg | |
2 from optparse import OptionParser | |
3 import sys | |
4 import galaxy_utils.sequence.vcf | |
5 | |
6 from galaxy import eggs | |
7 import pkg_resources; pkg_resources.require( "bx-python" ) | |
8 import bx.align.maf | |
9 | |
10 UNKNOWN_NUCLEOTIDE = '*' | |
11 | |
12 class PopulationVCFParser( object ): | |
13 def __init__( self, reader, name ): | |
14 self.reader = reader | |
15 self.name = name | |
16 self.counter = 0 | |
17 def next( self ): | |
18 rval = [] | |
19 vc = self.reader.next() | |
20 for i, allele in enumerate( vc.alt ): | |
21 rval.append( ( '%s_%i.%i' % ( self.name, i + 1, self.counter + 1 ), allele ) ) | |
22 self.counter += 1 | |
23 return ( vc, rval ) | |
24 def __iter__( self ): | |
25 while True: | |
26 yield self.next() | |
27 | |
28 class SampleVCFParser( object ): | |
29 def __init__( self, reader ): | |
30 self.reader = reader | |
31 self.counter = 0 | |
32 def next( self ): | |
33 rval = [] | |
34 vc = self.reader.next() | |
35 alleles = [ vc.ref ] + vc.alt | |
36 | |
37 if 'GT' in vc.format: | |
38 gt_index = vc.format.index( 'GT' ) | |
39 for sample_name, sample_value in zip( vc.sample_names, vc.sample_values ): | |
40 gt_indexes = [] | |
41 for i in sample_value[ gt_index ].replace( '|', '/' ).replace( '\\', '/' ).split( '/' ): #Do we need to consider phase here? | |
42 try: | |
43 gt_indexes.append( int( i ) ) | |
44 except: | |
45 gt_indexes.append( None ) | |
46 for i, allele_i in enumerate( gt_indexes ): | |
47 if allele_i is not None: | |
48 rval.append( ( '%s_%i.%i' % ( sample_name, i + 1, self.counter + 1 ), alleles[ allele_i ] ) ) | |
49 self.counter += 1 | |
50 return ( vc, rval ) | |
51 def __iter__( self ): | |
52 while True: | |
53 yield self.next() | |
54 | |
55 def main(): | |
56 usage = "usage: %prog [options] output_file dbkey inputfile pop_name" | |
57 parser = OptionParser( usage=usage ) | |
58 parser.add_option( "-p", "--population", action="store_true", dest="population", default=False, help="Create MAF on a per population basis") | |
59 parser.add_option( "-s", "--sample", action="store_true", dest="sample", default=False, help="Create MAF on a per sample basis") | |
60 parser.add_option( "-n", "--name", dest="name", default='Unknown Custom Track', help="Name for Custom Track") | |
61 parser.add_option( "-g", "--galaxy", action="store_true", dest="galaxy", default=False, help="Tool is being executed by Galaxy (adds extra error messaging).") | |
62 | |
63 | |
64 ( options, args ) = parser.parse_args() | |
65 | |
66 if len ( args ) < 3: | |
67 if options.galaxy: | |
68 print >>sys.stderr, "It appears that you forgot to specify an input VCF file, click 'Add new VCF...' to add at least input.\n" | |
69 parser.error( "Need to specify an output file, a dbkey and at least one input file" ) | |
70 | |
71 if not ( options.population ^ options.sample ): | |
72 parser.error( 'You must specify either a per population conversion or a per sample conversion, but not both' ) | |
73 | |
74 out = open( args.pop(0), 'wb' ) | |
75 out.write( 'track name="%s" visibility=pack\n' % options.name.replace( "\"", "'" ) ) | |
76 | |
77 maf_writer = bx.align.maf.Writer( out ) | |
78 | |
79 dbkey = args.pop(0) | |
80 | |
81 vcf_files = [] | |
82 if options.population: | |
83 i = 0 | |
84 while args: | |
85 filename = args.pop( 0 ) | |
86 pop_name = args.pop( 0 ).replace( ' ', '_' ) | |
87 if not pop_name: | |
88 pop_name = 'population_%i' % ( i + 1 ) | |
89 vcf_files.append( PopulationVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ), pop_name ) ) | |
90 i += 1 | |
91 else: | |
92 while args: | |
93 filename = args.pop( 0 ) | |
94 vcf_files.append( SampleVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ) ) ) | |
95 | |
96 non_spec_skipped = 0 | |
97 for vcf_file in vcf_files: | |
98 for vc, variants in vcf_file: | |
99 num_ins = 0 | |
100 num_dels = 0 | |
101 for variant_name, variant_text in variants: | |
102 if 'D' in variant_text: | |
103 num_dels = max( num_dels, int( variant_text[1:] ) ) | |
104 elif 'I' in variant_text: | |
105 num_ins = max( num_ins, len( variant_text ) - 1 ) | |
106 | |
107 alignment = bx.align.maf.Alignment() | |
108 ref_text = vc.ref + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) ) | |
109 start_pos = vc.pos - 1 | |
110 if num_dels and start_pos: | |
111 ref_text = UNKNOWN_NUCLEOTIDE + ref_text | |
112 start_pos -= 1 | |
113 alignment.add_component( bx.align.maf.Component( src='%s.%s%s' % ( | |
114 dbkey, ("chr" if not vc.chrom.startswith("chr") else ""), vc.chrom ), | |
115 start = start_pos, size = len( ref_text.replace( '-', '' ) ), | |
116 strand = '+', src_size = start_pos + len( ref_text ), | |
117 text = ref_text ) ) | |
118 for variant_name, variant_text in variants: | |
119 #FIXME: | |
120 ## skip non-spec. compliant data, see: http://1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3 for format spec | |
121 ## this check is due to data having indels not represented in the published format spec, | |
122 ## e.g. 1000 genomes pilot 1 indel data: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/indels/CEU.SRP000031.2010_03.indels.sites.vcf.gz | |
123 if variant_text and variant_text[0] in [ '-', '+' ]: | |
124 non_spec_skipped += 1 | |
125 continue | |
126 | |
127 #do we need a left padding unknown nucleotide (do we have deletions)? | |
128 if num_dels and start_pos: | |
129 var_text = UNKNOWN_NUCLEOTIDE | |
130 else: | |
131 var_text = '' | |
132 if 'D' in variant_text: | |
133 cur_num_del = int( variant_text[1:] ) | |
134 pre_del = min( len( vc.ref ), cur_num_del ) | |
135 post_del = cur_num_del - pre_del | |
136 var_text = var_text + '-' * pre_del + '-' * num_ins + '-' * post_del | |
137 var_text = var_text + UNKNOWN_NUCLEOTIDE * ( len( ref_text ) - len( var_text ) ) | |
138 elif 'I' in variant_text: | |
139 cur_num_ins = len( variant_text ) - 1 | |
140 var_text = var_text + vc.ref + variant_text[1:] + '-' * ( num_ins - cur_num_ins ) + UNKNOWN_NUCLEOTIDE * max( 0, ( num_dels - 1 ) ) | |
141 else: | |
142 var_text = var_text + variant_text + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) ) | |
143 alignment.add_component( bx.align.maf.Component( src=variant_name, start = 0, size = len( var_text.replace( '-', '' ) ), strand = '+', src_size = len( var_text.replace( '-', '' ) ), text = var_text ) ) | |
144 maf_writer.write( alignment ) | |
145 | |
146 maf_writer.close() | |
147 | |
148 if non_spec_skipped: | |
149 print 'Skipped %i non-specification compliant indels.' % non_spec_skipped | |
150 | |
151 if __name__ == "__main__": main() |