diff sam2interval.py @ 0:8c737b8ddc45 draft

Uploaded tool tarball.
author devteam
date Mon, 26 Aug 2013 15:12:38 -0400
parents
children 75557c0908a9
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sam2interval.py	Mon Aug 26 15:12:38 2013 -0400
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+
+import sys
+import optparse
+import re
+
+def stop_err( msg ):  # fatal-error helper: report msg on stderr, then terminate
+    sys.stderr.write( msg )  # written verbatim; no newline is appended
+    sys.exit( 1 )  # non-zero status so the calling process can detect the failure
+
+def main():
+    """Convert SAM records read from a file or stdin into interval lines.
+
+    Every record whose reference name is not '*' is printed as tab-separated
+    chrom, start, end, strand and read name (or, with --print_all, the whole
+    original SAM line). Lines that are empty or begin with '#' or '@' are
+    skipped. Start is the position column minus one; end is start plus the
+    summed lengths of the reference-consuming CIGAR operations.
+    """
+    usage = """%prog [options]
+    
+options (listed below) default to 'None' if omitted
+    """
+    parser = optparse.OptionParser(usage=usage)
+
+    parser.add_option(
+        '-f', '--input_sam_file',
+        metavar="INPUT_SAM_FILE",
+        dest='input_sam',
+        default=False,
+        help='Name of the SAM file to be filtered. STDIN is default')
+
+    # Column options share one shape; type='int' makes optparse reject bad values.
+    column_options = (
+        ('-c', '--flag_column', 'flag_col', 2,
+         'Column containing SAM bitwise flag. 1-based'),
+        ('-s', '--start_column', 'start_col', 4,
+         'Column containing position. 1-based'),
+        ('-g', '--cigar_column', 'cigar_col', 6,
+         'Column containing CIGAR or extended CIGAR string'),
+        ('-r', '--ref_column', 'ref_col', 3,
+         'Column containing name of the reference sequence coordinate. 1-based'),
+        ('-e', '--read_column', 'read_col', 1,
+         'Column containing read name. 1-based'),
+    )
+    for short_flag, long_flag, dest, default, help_text in column_options:
+        parser.add_option(
+            short_flag, long_flag, dest=dest, type='int', default=default,
+            help=help_text)
+
+    parser.add_option(
+        '-p', '--print_all',
+        dest='prt_all',
+        action='store_true',
+        default=False,
+        help='Print coordinates and original SAM?')
+
+    options, args = parser.parse_args()
+
+    # Convert the 1-based column numbers to list indexes once, outside the loop.
+    flag_idx = options.flag_col - 1
+    start_idx = options.start_col - 1
+    cigar_idx = options.cigar_col - 1
+    ref_idx = options.ref_col - 1
+    read_idx = options.read_col - 1
+
+    # Per the SAM spec only M, D, N, = and X advance along the reference;
+    # P (padding), like I, S and H, does not and must not extend the interval.
+    ref_consuming_op = re.compile(r'(\d+)[MDN=X]')
+
+    infile = open(options.input_sam, 'r') if options.input_sam else sys.stdin
+    try:
+        # provide a (partial) header so that strand is automatically set in metadata
+        sys.stdout.write('#chrom\tstart\tend\tstrand\tread_name\n')
+        for line in infile:
+            line = line.rstrip('\r\n')
+            if not line or line.startswith(('#', '@')):
+                continue
+            fields = line.split('\t')
+            ref_name = fields[ref_idx]
+            if ref_name == '*':
+                # Do not print lines with unmapped reads that contain '*' instead of chromosome name
+                continue
+            start = int(fields[start_idx]) - 1
+            span = sum(int(count) for count in ref_consuming_op.findall(fields[cigar_idx]))
+            # Flag bit 0x10 marks a read aligned to the reverse strand.
+            strand = '-' if int(fields[flag_idx]) & 0x0010 else '+'
+            label = line if options.prt_all else fields[read_idx]
+            sys.stdout.write('%s\t%d\t%d\t%s\t%s\n' % (ref_name, start, start + span, strand, label))
+    finally:
+        # Close only a file we opened ourselves; leave stdin to the interpreter.
+        if infile is not sys.stdin:
+            infile.close()
+
+if __name__ == "__main__":
+    main()  # run the converter only when executed as a script, not on import