Mercurial > repos > greg > bam_to_psl
comparison bam_to_psl.py @ 0:b6794f4cb1c6 draft default tip
Uploaded
| author | greg |
|---|---|
| date | Mon, 12 Dec 2022 14:42:58 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b6794f4cb1c6 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 # This code is very loosely based on | |
| 4 # the FusionCatcher sam2psl script here: | |
| 5 # https://github.com/ndaniel/fusioncatcher/blob/master/bin/sam2psl.py | |
| 6 | |
| 7 import argparse | |
| 8 import sys | |
| 9 | |
| 10 | |
| 11 def parse_cigar(c): | |
| 12 r = [] | |
| 13 d = '' | |
| 14 mismatches_x = 0 | |
| 15 c = c.upper() | |
| 16 for a in c: | |
| 17 if a.isdigit(): | |
| 18 d = d + a | |
| 19 elif a in ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X']: | |
| 20 dd = int(d) | |
| 21 r.append((a, dd)) | |
| 22 if a == 'X': | |
| 23 mismatches_x = mismatches_x + dd | |
| 24 d = '' | |
| 25 else: | |
| 26 msg = "Error: unknown CIGAR: %s\n" % str(c) | |
| 27 sys.exit(msg) | |
| 28 return (r, mismatches_x) | |
| 29 | |
| 30 | |
| 31 def blocks(cigar, ig=0): | |
| 32 # Returns block of matches. Hard clipping is | |
| 33 # converted to soft clipping index on read. | |
| 34 ir = 0 | |
| 35 rr = [] | |
| 36 rg = [] | |
| 37 match = 0 | |
| 38 mismatch = 0 | |
| 39 mismatch_x = 0 | |
| 40 mismatch_clip = 0 | |
| 41 insert_query = 0 | |
| 42 insert_query_count = 0 | |
| 43 insert_ref = 0 | |
| 44 insert_ref_count = 0 | |
| 45 # Sum of lengths of the M/I/S/=/X operations | |
| 46 # will equal the length of SEQ. | |
| 47 seq_len = 0 | |
| 48 (cig, mismatch_x) = parse_cigar(cigar) | |
| 49 mismatch = mismatch_x | |
| 50 for e in cig: | |
| 51 if e[0] in ('S', 'H'): | |
| 52 ir = ir + e[1] | |
| 53 mismatch_clip = mismatch_clip + e[1] | |
| 54 seq_len = seq_len + e[1] | |
| 55 elif e[0] in ('I',): | |
| 56 ir = ir + e[1] | |
| 57 mismatch = mismatch + e[1] | |
| 58 insert_query = insert_query + e[1] | |
| 59 insert_query_count = insert_query_count + 1 | |
| 60 seq_len = seq_len + e[1] | |
| 61 elif e[0] in ('X'): | |
| 62 ir = ir + e[1] | |
| 63 ig = ig + e[1] | |
| 64 mismatch = mismatch + e[1] | |
| 65 mismatch_x = mismatch_x + e[1] | |
| 66 seq_len = seq_len + e[1] | |
| 67 elif e[0] in ('M', '='): | |
| 68 rr.append((ir, ir + e[1])) | |
| 69 rg.append((ig, ig + e[1])) | |
| 70 ir = ir + e[1] | |
| 71 ig = ig + e[1] | |
| 72 match = match + e[1] | |
| 73 seq_len = seq_len + e[1] | |
| 74 elif e[0] in ('D', 'N', 'P'): | |
| 75 ig = ig + e[1] | |
| 76 insert_ref = insert_ref + e[1] | |
| 77 insert_ref_count = insert_ref_count + 1 | |
| 78 return (rr, rg, match, mismatch, mismatch_clip, mismatch_x, insert_ref, insert_ref_count, insert_query, insert_query_count, seq_len) | |
| 79 | |
| 80 | |
| 81 def get_psl(sam, lens): | |
| 82 # Returns PSL coordinates. | |
| 83 if sam and sam[1].isdigit(): | |
| 84 unmapped = True if int(sam[1]) & 0x4 else False | |
| 85 if (not unmapped) and sam[2] != '*' and sam[5] != '*' and sam[0] != '*': | |
| 86 # Initialize psl_items to those | |
| 87 # that constitute a PSL empty line. | |
| 88 psl_items = ['0', '0', '0', '0', '0', '0', '0', '0', '+', 's', '0', '0', '0', 'r', '0', '0', '0', '0', ',', ',', ','] | |
| 89 # Read sequence length. | |
| 90 psl_items[14] = lens.get(sam[2], 0) | |
| 91 # Reference name. | |
| 92 psl_items[13] = sam[2] | |
| 93 # Read name. | |
| 94 psl_items[9] = sam[0] | |
| 95 # Strand. | |
| 96 psl_items[8] = "-" if int(sam[1]) & 0x10 else '+' | |
| 97 # Start position. | |
| 98 psl_items[15] = int(sam[3]) - 1 | |
| 99 (interval_query, interval_ref, match, mismatch, mismatch_clip, mismatch_x, insert_ref, insert_ref_count, insert_query, insert_query_count, seq_len) = blocks(sam[5], ig=psl_items[15]) | |
| 100 # Read sequence length. | |
| 101 if sam[9] != '*' and sam[5].find('H') == -1: | |
| 102 psl_items[10] = len(sam[9]) | |
| 103 else: | |
| 104 # The length of SEQ is the sum of the | |
| 105 # lengths of the M/I/S/=/X operations. | |
| 106 psl_items[10] = seq_len | |
| 107 psl_items[4] = insert_query_count | |
| 108 psl_items[5] = insert_query | |
| 109 psl_items[6] = insert_ref_count | |
| 110 psl_items[7] = insert_ref | |
| 111 # Extract the mismatches using tag | |
| 112 # NM:i (NM is mismatches per reads). | |
| 113 tag_nm_i = [e.partition("NM:i:")[2] for e in sam[11:] if e.startswith('NM:i:')] | |
| 114 if not tag_nm_i: | |
| 115 # NM is not ideal because it is mismatches | |
| 116 # per # fragment and not per read, but is | |
| 117 # etter than nothing. | |
| 118 tag_nm_i = [e.partition("nM:i:")[2] for e in sam[11:] if e.startswith('nM:i:')] | |
| 119 tag_nm_i = int(tag_nm_i[0]) if tag_nm_i else 0 | |
| 120 if tag_nm_i > float(0.90) * seq_len: | |
| 121 tag_nm_i = 0 | |
| 122 # Compute the matches and mismatches (include the | |
| 123 # clipping as mismatches). | |
| 124 psl_items[0] = match | |
| 125 psl_items[1] = mismatch | |
| 126 if interval_query: | |
| 127 psl_items[11] = interval_query[0][0] | |
| 128 psl_items[12] = interval_query[-1][1] | |
| 129 psl_items[16] = interval_ref[-1][1] | |
| 130 psl_items[17] = len(interval_query) | |
| 131 # BLAT always gives the coordinates as | |
| 132 # everything is mapped on the forwward | |
| 133 # strand. | |
| 134 psl_items[18] = ','.join([str(e[1] - e[0]) for e in interval_query]) + ',' | |
| 135 psl_items[19] = ','.join([str(e[0]) for e in interval_query]) + ',' | |
| 136 psl_items[20] = ','.join([str(e[0]) for e in interval_ref]) + ',' | |
| 137 return map(str, psl_items) | |
| 138 else: | |
| 139 return None | |
| 140 | |
| 141 | |
| 142 def to_psl(input_file, psl_file): | |
| 143 # Convert a SAM file to PSL format. | |
| 144 header = dict() | |
| 145 with open(input_file, 'r') as fin, open(psl_file, 'w') as fou: | |
| 146 for i, sam_line in enumerate(fin): | |
| 147 sam_line = sam_line.rstrip('\r\n') | |
| 148 if sam_line: | |
| 149 sam_items = sam_line.split('\t') | |
| 150 if sam_items[0].startswith('@'): | |
| 151 if sam_items[0].startswith('@SQ') and sam_items[1].startswith('SN:') and sam_items[2].startswith('LN:'): | |
| 152 k = sam_items[1][3:] | |
| 153 v = int(sam_items[2][3:]) | |
| 154 header[k] = v | |
| 155 else: | |
| 156 psl_items = get_psl(sam_items, header) | |
| 157 if psl_items is not None: | |
| 158 fou.write('%s\n' % '\t'.join(psl_items)) | |
| 159 | |
| 160 | |
| 161 if __name__ == '__main__': | |
| 162 | |
| 163 parser = argparse.ArgumentParser() | |
| 164 | |
| 165 parser.add_argument("--input_file", action="store", dest="input_file", help="Input file in SAM format.") | |
| 166 parser.add_argument("--output_file", action="store", dest="output_file", help="Output file in PSL format.") | |
| 167 | |
| 168 args = parser.parse_args() | |
| 169 | |
| 170 to_psl(args.input_file, args.output_file) |
