Mercurial > repos > devteam > sam2interval
diff sam2interval.py @ 0:8c737b8ddc45 draft
Uploaded tool tarball.
author | devteam |
---|---|
date | Mon, 26 Aug 2013 15:12:38 -0400 |
parents | |
children | 75557c0908a9 |
line wrap: on
line diff
#!/usr/bin/env python
"""Convert SAM alignment records into tab-separated interval lines.

Each mapped record becomes ``chrom, start, end, strand, name`` where
``start`` is the 0-based leftmost position and ``end`` is ``start`` plus the
number of reference bases covered by the CIGAR string.
"""

import sys
import optparse
import re

# CIGAR operations that advance along the reference sequence. Per the SAM
# specification these are M, D, N, = and X; P (padding), I, S and H do not
# consume reference bases and therefore must not extend the interval.
_REF_CONSUMING_OP = re.compile(r'(\d+)[MDN=X]')

# Bit of the SAM FLAG field that marks a read aligned to the reverse strand.
_REVERSE_STRAND_FLAG = 0x0010


def stop_err(msg):
    """Write *msg* to stderr and terminate with a non-zero exit status."""
    sys.stderr.write(msg)
    sys.exit(1)


def _reference_span(cigar_string):
    """Return how many reference bases a CIGAR string covers.

    A CIGAR of ``*`` (unavailable) or an empty string yields 0.
    """
    return sum(int(length) for length in _REF_CONSUMING_OP.findall(cigar_string))


def main(argv=None):
    """Read SAM records and print one interval line per mapped read.

    argv -- optional list of command-line arguments (without the program
            name). When omitted, ``sys.argv[1:]`` is parsed as before.

    Input comes from the file named by ``-f`` or from stdin. Lines that are
    empty or start with ``#`` or ``@`` are skipped, as are records whose
    reference name is ``*`` (unmapped reads). A record with too few columns
    or a non-integer flag/position aborts the run via ``stop_err``.
    """
    usage = """%prog [options]

options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-f', '--input_sam_file',
        metavar="INPUT_SAM_FILE",
        dest='input_sam',
        default=False,
        help='Name of the SAM file to be filtered. STDIN is default')

    parser.add_option(
        '-c', '--flag_column',
        dest='flag_col',
        default='2',
        help='Column containing SAM bitwise flag. 1-based')

    parser.add_option(
        '-s', '--start_column',
        dest='start_col',
        default='4',
        help='Column containing position. 1-based')

    parser.add_option(
        '-g', '--cigar_column',
        dest='cigar_col',
        default='6',
        help='Column containing CIGAR or extended CIGAR string')

    parser.add_option(
        '-r', '--ref_column',
        dest='ref_col',
        default='3',
        help='Column containing name of the reference sequence coordinate. 1-based')

    parser.add_option(
        '-e', '--read_column',
        dest='read_col',
        default='1',
        help='Column containing read name. 1-based')

    parser.add_option(
        '-p', '--print_all',
        dest='prt_all',
        action='store_true',
        default=False,
        help='Print coordinates and original SAM?')

    options, _ = parser.parse_args(argv)

    # Column options are 1-based on the command line; convert them to 0-based
    # list indices once instead of on every input line.
    flag_idx = int(options.flag_col) - 1
    start_idx = int(options.start_col) - 1
    cigar_idx = int(options.cigar_col) - 1
    ref_idx = int(options.ref_col) - 1
    read_idx = int(options.read_col) - 1

    if options.input_sam:
        infile = open(options.input_sam, 'r')
    else:
        infile = sys.stdin

    write = sys.stdout.write

    try:
        # provide a (partial) header so that strand is automatically set in metadata
        write('#chrom\tstart\tend\tstrand\tread_name\n')

        for line_number, line in enumerate(infile, 1):
            line = line.rstrip('\r\n')
            if not line or line.startswith(('#', '@')):
                continue

            fields = line.split('\t')
            try:
                # SAM positions are 1-based; intervals are emitted 0-based.
                start = int(fields[start_idx]) - 1
                end = start + _reference_span(fields[cigar_idx])
                is_reverse = int(fields[flag_idx]) & _REVERSE_STRAND_FLAG
                read_name = fields[read_idx]
                ref_name = fields[ref_idx]
            except (IndexError, ValueError):
                stop_err('Malformed SAM record on line %d: %s\n' % (line_number, line))

            # Do not print lines with unmapped reads that contain '*' instead
            # of chromosome name
            if ref_name == '*':
                continue

            strand = '-' if is_reverse else '+'
            # With --print_all the whole original SAM line replaces the read name.
            last_column = line if options.prt_all else read_name
            write('%s\t%d\t%d\t%s\t%s\n' % (ref_name, start, end, strand, last_column))
    finally:
        # Only close what this function opened; stdin belongs to the caller.
        if infile is not sys.stdin:
            infile.close()


if __name__ == "__main__":
    main()