Mercurial > repos > greg > bam_to_psl
comparison bam_to_psl.py @ 0:b6794f4cb1c6 draft default tip
Uploaded
author | greg |
---|---|
date | Mon, 12 Dec 2022 14:42:58 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b6794f4cb1c6 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # This code is very loosely based on | |
4 # the FusionCatcher sam2psl script here: | |
5 # https://github.com/ndaniel/fusioncatcher/blob/master/bin/sam2psl.py | |
6 | |
7 import argparse | |
8 import sys | |
9 | |
10 | |
11 def parse_cigar(c): | |
12 r = [] | |
13 d = '' | |
14 mismatches_x = 0 | |
15 c = c.upper() | |
16 for a in c: | |
17 if a.isdigit(): | |
18 d = d + a | |
19 elif a in ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X']: | |
20 dd = int(d) | |
21 r.append((a, dd)) | |
22 if a == 'X': | |
23 mismatches_x = mismatches_x + dd | |
24 d = '' | |
25 else: | |
26 msg = "Error: unknown CIGAR: %s\n" % str(c) | |
27 sys.exit(msg) | |
28 return (r, mismatches_x) | |
29 | |
30 | |
31 def blocks(cigar, ig=0): | |
32 # Returns block of matches. Hard clipping is | |
33 # converted to soft clipping index on read. | |
34 ir = 0 | |
35 rr = [] | |
36 rg = [] | |
37 match = 0 | |
38 mismatch = 0 | |
39 mismatch_x = 0 | |
40 mismatch_clip = 0 | |
41 insert_query = 0 | |
42 insert_query_count = 0 | |
43 insert_ref = 0 | |
44 insert_ref_count = 0 | |
45 # Sum of lengths of the M/I/S/=/X operations | |
46 # will equal the length of SEQ. | |
47 seq_len = 0 | |
48 (cig, mismatch_x) = parse_cigar(cigar) | |
49 mismatch = mismatch_x | |
50 for e in cig: | |
51 if e[0] in ('S', 'H'): | |
52 ir = ir + e[1] | |
53 mismatch_clip = mismatch_clip + e[1] | |
54 seq_len = seq_len + e[1] | |
55 elif e[0] in ('I',): | |
56 ir = ir + e[1] | |
57 mismatch = mismatch + e[1] | |
58 insert_query = insert_query + e[1] | |
59 insert_query_count = insert_query_count + 1 | |
60 seq_len = seq_len + e[1] | |
61 elif e[0] in ('X'): | |
62 ir = ir + e[1] | |
63 ig = ig + e[1] | |
64 mismatch = mismatch + e[1] | |
65 mismatch_x = mismatch_x + e[1] | |
66 seq_len = seq_len + e[1] | |
67 elif e[0] in ('M', '='): | |
68 rr.append((ir, ir + e[1])) | |
69 rg.append((ig, ig + e[1])) | |
70 ir = ir + e[1] | |
71 ig = ig + e[1] | |
72 match = match + e[1] | |
73 seq_len = seq_len + e[1] | |
74 elif e[0] in ('D', 'N', 'P'): | |
75 ig = ig + e[1] | |
76 insert_ref = insert_ref + e[1] | |
77 insert_ref_count = insert_ref_count + 1 | |
78 return (rr, rg, match, mismatch, mismatch_clip, mismatch_x, insert_ref, insert_ref_count, insert_query, insert_query_count, seq_len) | |
79 | |
80 | |
81 def get_psl(sam, lens): | |
82 # Returns PSL coordinates. | |
83 if sam and sam[1].isdigit(): | |
84 unmapped = True if int(sam[1]) & 0x4 else False | |
85 if (not unmapped) and sam[2] != '*' and sam[5] != '*' and sam[0] != '*': | |
86 # Initialize psl_items to those | |
87 # that constitute a PSL empty line. | |
88 psl_items = ['0', '0', '0', '0', '0', '0', '0', '0', '+', 's', '0', '0', '0', 'r', '0', '0', '0', '0', ',', ',', ','] | |
89 # Read sequence length. | |
90 psl_items[14] = lens.get(sam[2], 0) | |
91 # Reference name. | |
92 psl_items[13] = sam[2] | |
93 # Read name. | |
94 psl_items[9] = sam[0] | |
95 # Strand. | |
96 psl_items[8] = "-" if int(sam[1]) & 0x10 else '+' | |
97 # Start position. | |
98 psl_items[15] = int(sam[3]) - 1 | |
99 (interval_query, interval_ref, match, mismatch, mismatch_clip, mismatch_x, insert_ref, insert_ref_count, insert_query, insert_query_count, seq_len) = blocks(sam[5], ig=psl_items[15]) | |
100 # Read sequence length. | |
101 if sam[9] != '*' and sam[5].find('H') == -1: | |
102 psl_items[10] = len(sam[9]) | |
103 else: | |
104 # The length of SEQ is the sum of the | |
105 # lengths of the M/I/S/=/X operations. | |
106 psl_items[10] = seq_len | |
107 psl_items[4] = insert_query_count | |
108 psl_items[5] = insert_query | |
109 psl_items[6] = insert_ref_count | |
110 psl_items[7] = insert_ref | |
111 # Extract the mismatches using tag | |
112 # NM:i (NM is mismatches per reads). | |
113 tag_nm_i = [e.partition("NM:i:")[2] for e in sam[11:] if e.startswith('NM:i:')] | |
114 if not tag_nm_i: | |
115 # NM is not ideal because it is mismatches | |
116 # per # fragment and not per read, but is | |
117 # etter than nothing. | |
118 tag_nm_i = [e.partition("nM:i:")[2] for e in sam[11:] if e.startswith('nM:i:')] | |
119 tag_nm_i = int(tag_nm_i[0]) if tag_nm_i else 0 | |
120 if tag_nm_i > float(0.90) * seq_len: | |
121 tag_nm_i = 0 | |
122 # Compute the matches and mismatches (include the | |
123 # clipping as mismatches). | |
124 psl_items[0] = match | |
125 psl_items[1] = mismatch | |
126 if interval_query: | |
127 psl_items[11] = interval_query[0][0] | |
128 psl_items[12] = interval_query[-1][1] | |
129 psl_items[16] = interval_ref[-1][1] | |
130 psl_items[17] = len(interval_query) | |
131 # BLAT always gives the coordinates as | |
132 # everything is mapped on the forwward | |
133 # strand. | |
134 psl_items[18] = ','.join([str(e[1] - e[0]) for e in interval_query]) + ',' | |
135 psl_items[19] = ','.join([str(e[0]) for e in interval_query]) + ',' | |
136 psl_items[20] = ','.join([str(e[0]) for e in interval_ref]) + ',' | |
137 return map(str, psl_items) | |
138 else: | |
139 return None | |
140 | |
141 | |
142 def to_psl(input_file, psl_file): | |
143 # Convert a SAM file to PSL format. | |
144 header = dict() | |
145 with open(input_file, 'r') as fin, open(psl_file, 'w') as fou: | |
146 for i, sam_line in enumerate(fin): | |
147 sam_line = sam_line.rstrip('\r\n') | |
148 if sam_line: | |
149 sam_items = sam_line.split('\t') | |
150 if sam_items[0].startswith('@'): | |
151 if sam_items[0].startswith('@SQ') and sam_items[1].startswith('SN:') and sam_items[2].startswith('LN:'): | |
152 k = sam_items[1][3:] | |
153 v = int(sam_items[2][3:]) | |
154 header[k] = v | |
155 else: | |
156 psl_items = get_psl(sam_items, header) | |
157 if psl_items is not None: | |
158 fou.write('%s\n' % '\t'.join(psl_items)) | |
159 | |
160 | |
161 if __name__ == '__main__': | |
162 | |
163 parser = argparse.ArgumentParser() | |
164 | |
165 parser.add_argument("--input_file", action="store", dest="input_file", help="Input file in SAM format.") | |
166 parser.add_argument("--output_file", action="store", dest="output_file", help="Output file in PSL format.") | |
167 | |
168 args = parser.parse_args() | |
169 | |
170 to_psl(args.input_file, args.output_file) |