Mercurial > repos > devteam > sam2interval
view sam2interval.py @ 1:75557c0908a9 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/sam2interval commit 206cd8245e7619b0e924c5066d0172129222993d"
author | devteam |
---|---|
date | Wed, 05 Feb 2020 06:58:53 -0500 |
parents | 8c737b8ddc45 |
children |
line wrap: on
line source
#!/usr/bin/env python
"""Convert SAM alignment records into tab-separated interval records.

Each mapped SAM record becomes one output line holding the reference name,
the 0-based start, the end (start plus the number of reference bases the
alignment covers), the strand, and either the read name or the whole
original SAM line.
"""
import optparse
import re
import sys

# CIGAR operations that consume reference bases according to the SAM
# specification: M (match/mismatch), D (deletion), N (skipped region),
# = (sequence match) and X (sequence mismatch).  I, S, H and P do not advance
# the reference position, so they must not contribute to the interval length.
_REF_CONSUMING_OP = re.compile(r'(\d+)[MDN=X]')

# SAM bitwise flag: the read is aligned to the reverse strand.
_FLAG_REVERSE = 0x0010


def stop_err(msg):
    """Terminate the program with a non-zero status, printing *msg* to stderr."""
    sys.exit(msg)


def _reference_span(cigar):
    """Return the number of reference bases covered by a CIGAR string."""
    return sum(int(length) for length in _REF_CONSUMING_OP.findall(cigar))


def _build_parser():
    """Create the command-line option parser for the tool."""
    usage = """%prog [options]

options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)
    parser.add_option(
        '-f', '--input_sam_file',
        metavar="INPUT_SAM_FILE",
        dest='input_sam',
        default=None,
        help='Name of the SAM file to be filtered. STDIN is default')
    parser.add_option(
        '-c', '--flag_column',
        dest='flag_col',
        type='int',
        default=2,
        help='Column containing SAM bitwise flag. 1-based')
    parser.add_option(
        '-s', '--start_column',
        dest='start_col',
        type='int',
        default=4,
        help='Column containing position. 1-based')
    parser.add_option(
        '-g', '--cigar_column',
        dest='cigar_col',
        type='int',
        default=6,
        help='Column containing CIGAR or extended CIGAR string')
    parser.add_option(
        '-r', '--ref_column',
        dest='ref_col',
        type='int',
        default=3,
        help='Column containing name of the reference sequence coordinate. 1-based')
    parser.add_option(
        '-e', '--read_column',
        dest='read_col',
        type='int',
        default=1,
        help='Column containing read name. 1-based')
    parser.add_option(
        '-p', '--print_all',
        dest='prt_all',
        action='store_true',
        default=False,
        help='Print coordinates and original SAM?')
    return parser


def main(argv=None):
    """Read SAM records and print one interval line per mapped read.

    Args:
        argv: Optional list of command-line arguments (without the program
            name).  When omitted, ``sys.argv[1:]`` is used.

    Exits with an error message (via ``stop_err``) when a record is missing
    one of the configured columns or holds a non-integer flag or position.
    Exits through the option parser when a column option is not a positive
    integer.
    """
    parser = _build_parser()
    options, _args = parser.parse_args(argv)

    # Validate the column numbers once, up front: a value below 1 would
    # otherwise turn into a negative index and silently read the wrong field.
    column_options = (
        ('--flag_column', options.flag_col),
        ('--start_column', options.start_col),
        ('--cigar_column', options.cigar_col),
        ('--ref_column', options.ref_col),
        ('--read_column', options.read_col),
    )
    for opt_name, column in column_options:
        if column < 1:
            parser.error('%s must be a 1-based column number, got %d' % (opt_name, column))

    # Convert the 1-based column numbers to list indices once, not per record.
    flag_idx = options.flag_col - 1
    start_idx = options.start_col - 1
    cigar_idx = options.cigar_col - 1
    ref_idx = options.ref_col - 1
    read_idx = options.read_col - 1

    if options.input_sam:
        infile = open(options.input_sam, 'r')
    else:
        infile = sys.stdin

    try:
        # provide a (partial) header so that strand is automatically set in metadata
        print('#chrom\tstart\tend\tstrand\tread_name')
        for line_no, line in enumerate(infile, 1):
            line = line.rstrip('\r\n')
            # Skip blank lines, comments and SAM header lines.
            if not line or line.startswith(('#', '@')):
                continue
            fields = line.split('\t')
            try:
                ref_name = fields[ref_idx]
                read_name = fields[read_idx]
                # SAM positions are 1-based; intervals are 0-based, half-open.
                start = int(fields[start_idx]) - 1
                end = start + _reference_span(fields[cigar_idx])
                is_reverse = bool(int(fields[flag_idx]) & _FLAG_REVERSE)
            except (IndexError, ValueError) as err:
                stop_err('Malformed SAM record on line %d: %s' % (line_no, err))
            # Do not print lines with unmapped reads that contain '*' instead of chromosome name
            if ref_name == '*':
                continue
            strand = '-' if is_reverse else '+'
            last_column = line if options.prt_all else read_name
            print('\t'.join((ref_name, str(start), str(end), strand, last_column)))
    finally:
        # Only close what this function opened; leave STDIN to the interpreter.
        if infile is not sys.stdin:
            infile.close()


if __name__ == "__main__":
    main()