view sam2interval.py @ 1:75557c0908a9 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/sam2interval commit 206cd8245e7619b0e924c5066d0172129222993d"
author devteam
date Wed, 05 Feb 2020 06:58:53 -0500
parents 8c737b8ddc45
children
line wrap: on
line source

#!/usr/bin/env python

import sys
import optparse
import re


def stop_err(msg):
    """Terminate the program, reporting *msg* as the exit status.

    Raises ``SystemExit`` carrying *msg*; when uncaught, the interpreter
    prints a string message to stderr and exits with a non-zero status.
    """
    raise SystemExit(msg)


def main():
    """Convert SAM alignment records into interval lines on stdout.

    SAM text is read from the file named by ``-f`` (standard input when the
    option is omitted). Blank lines and lines starting with ``#`` or ``@``
    are skipped. After a header line, one tab-separated line is printed for
    every record whose reference name is not ``*``::

        chrom  start  end  strand  read_name

    ``start`` is the position column minus one, ``end`` is ``start`` plus the
    summed lengths of the counted CIGAR operations, and ``strand`` is ``-``
    when bit 0x10 of the flag column is set, ``+`` otherwise. With ``-p`` the
    complete original SAM line is printed in place of the read name.

    Raises:
        ValueError: if a column option, or a position/flag field of a
            record, is not an integer.
        IndexError: if a record has fewer tab-separated fields than a
            requested column.
    """
    usage = """%prog [options]

options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-f', '--input_sam_file',
        metavar="INPUT_SAM_FILE",
        dest='input_sam',
        default=False,
        help='Name of the SAM file to be filtered. STDIN is default')

    parser.add_option(
        '-c', '--flag_column',
        dest='flag_col',
        default='2',
        help='Column containing SAM bitwise flag. 1-based')

    parser.add_option(
        '-s', '--start_column',
        dest='start_col',
        default='4',
        help='Column containing position. 1-based')

    parser.add_option(
        '-g', '--cigar_column',
        dest='cigar_col',
        default='6',
        help='Column containing CIGAR or extended CIGAR string')

    parser.add_option(
        '-r', '--ref_column',
        dest='ref_col',
        default='3',
        help='Column containing name of the reference sequence coordinate. 1-based')

    parser.add_option(
        '-e', '--read_column',
        dest='read_col',
        default='1',
        help='Column containing read name. 1-based')

    parser.add_option(
        '-p', '--print_all',
        dest='prt_all',
        action='store_true',
        default=False,
        help='Print coordinates and original SAM?')

    options, _ = parser.parse_args()

    # The column options are 1-based strings; convert them to 0-based list
    # indexes once instead of on every input line.
    flag_idx = int(options.flag_col) - 1
    start_idx = int(options.start_col) - 1
    cigar_idx = int(options.cigar_col) - 1
    ref_idx = int(options.ref_col) - 1
    read_idx = int(options.read_col) - 1

    # Lengths of the CIGAR operations that extend the alignment along the
    # reference: M, D, N plus the '=' / 'X' match/mismatch operators.
    # NOTE(review): 'P' (padding) does not consume reference bases per the
    # SAM specification, but it is still counted here so that existing
    # output does not change - confirm whether it should be dropped.
    cigar = re.compile(r'(\d+)[MDN=XP]')

    infile = open(options.input_sam, 'r') if options.input_sam else sys.stdin
    try:
        print('#chrom\tstart\tend\tstrand\tread_name')  # provide a (partial) header so that strand is automatically set in metadata

        for line in infile:
            line = line.rstrip('\r\n')
            if not line or line.startswith(('#', '@')):
                continue

            fields = line.split('\t')
            start = int(fields[start_idx]) - 1  # SAM positions are 1-based
            span = sum(int(length) for length in cigar.findall(fields[cigar_idx]))
            # Flag bit 0x10 marks a read aligned to the reverse strand.
            strand = '-' if int(fields[flag_idx]) & 0x0010 else '+'
            read_name = fields[read_idx]
            ref_name = fields[ref_idx]

            # Do not print lines with unmapped reads that contain '*' instead of chromosome name
            if ref_name == '*':
                continue

            last_column = line if options.prt_all else read_name
            print('%s\t%s\t%s\t%s\t%s' % (ref_name, str(start), str(start + span), strand, last_column))
    finally:
        # Close a file opened here; never close the caller's stdin.
        if infile is not sys.stdin:
            infile.close()

# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()