Mercurial > repos > xuebing > sharplabtool
diff tools/indels/indel_sam2interval.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/indels/indel_sam2interval.py Fri Mar 09 19:37:19 2012 -0500 @@ -0,0 +1,161 @@ +#!/usr/bin/env python + +""" +Allows user to filter out non-indels from SAM. + +usage: %prog [options] + -i, --input=i: The input SAM file + -u, --include_base=u: Whether or not to include the base for insertions + -c, --collapse=c: Wheter to collapse multiple occurrences of a location with counts shown + -o, --int_out=o: The interval output file for the converted SAM file + -b, --bed_ins_out=b: The bed output file with insertions only for the converted SAM file + -d, --bed_del_out=d: The bed output file with deletions only for the converted SAM file +""" + +import re, sys +from galaxy import eggs +import pkg_resources; pkg_resources.require( "bx-python" ) +from bx.cookbook import doc_optparse + + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def numeric_sort( text1, text2 ): + """ + For two items containing space-separated text, compares equivalent pieces + numerically if both numeric or as text otherwise + """ + pieces1 = text1.split() + pieces2 = text2.split() + if len( pieces1 ) == 0: + return 1 + if len( pieces2 ) == 0: + return -1 + for i, pc1 in enumerate( pieces1 ): + if i == len( pieces2 ): + return 1 + if not pieces2[i].isdigit(): + if pc1.isdigit(): + return -1 + else: + if pc1 > pieces2[i]: + return 1 + elif pc1 < pieces2[i]: + return -1 + else: + if not pc1.isdigit(): + return 1 + else: + if int( pc1 ) > int( pieces2[i] ): + return 1 + elif int( pc1 ) < int( pieces2[i] ): + return -1 + if i < len( pieces2 ) - 1: + return -1 + return 0 + +def __main__(): + #Parse Command Line + options, args = doc_optparse.parse( __doc__ ) + + # open up output files + output = open( options.int_out, 'wb' ) + if options.bed_ins_out != 'None': + output_bed_ins = open( options.bed_ins_out, 'wb' ) + else: + output_bed_ins = None + if options.bed_del_out != 'None': + output_bed_del = open( options.bed_del_out, 'wb' ) + else: + output_bed_del = None + + # the pattern to match, assuming just one indel per cigar string + pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' ) + pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) + + # go through all lines in input file + out_data = {} + multi_indel_lines = 0 + for line in open( options.input, 'rb' ): + if line and not line.startswith( '#' ) and not line.startswith( '@' ) : + split_line = line.split( '\t' ) + if split_line < 12: + continue + # grab relevant pieces + cigar = split_line[5].strip() + pos = int( split_line[3] ) + chr = split_line[2] + base_string = split_line[9] + # parse cigar string + m = pat_indel.match( cigar ) + if not m: + m = pat_multi.match( cigar ) + # skip this line if no match + if not m: + continue + # account for multiple indels or operations we don't process + else: + multi_indel_lines += 1 + continue + else: + match = m.groupdict() + left = int( match[ 'lmatch' ] ) + middle = int( match[ 'ins_del_width' ] ) + middle_type = match[ 'ins_del' ] + bases = base_string[ left : left + middle ] + # calculate start and end positions, and output to insertion or deletion file + start = left + pos + if middle_type == 'D': + end = start + middle + data = [ chr, start, end, 'D' ] + if options.include_base == "true": + data.append( '-' ) + else: + end = start + 1 + data = [ chr, start, end, 'I' ] + if options.include_base == "true": + data.append( bases ) + location = '\t'.join( [ '%s' % d for d in data ] ) + try: + out_data[ location ] += 1 + except KeyError: + out_data[ location ] = 1 + # output to interval file + # get all locations and sort + locations = out_data.keys() + locations.sort( numeric_sort ) + last_line = '' + # output each location, either with counts or each occurrence + for loc in locations: + sp_loc = loc.split( '\t' ) + cur_line = '\t'.join( sp_loc[:3] ) + if options.collapse == 'true': + output.write( '%s\t%s\n' % ( loc, out_data[ loc ] ) ) + if output_bed_del and sp_loc[3] == 'D': + output_bed_del.write( '%s\n' % cur_line ) + if output_bed_ins and sp_loc[3] == 'I' and last_line != cur_line: + output_bed_ins.write( '%s\n' % cur_line ) + last_line = cur_line + else: + for i in range( out_data[ loc ] ): + output.write( '%s\n' % loc ) + if output_bed_del or output_bed_ins: + if output_bed_del and sp_loc[3] == 'D': + output_bed_del.write( '%s\n' % cur_line ) + if output_bed_ins and sp_loc[3] == 'I': + output_bed_ins.write( '%s\n' % cur_line ) + + # cleanup, close files + if output_bed_ins: + output_bed_ins.close() + if output_bed_del: + output_bed_del.close() + output.close() + + # if skipped lines because of more than one indel, output message + if multi_indel_lines > 0: + sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) + +if __name__=="__main__": __main__()