Mercurial > repos > mheinzl > variant_analyzer2
comparison mut2sscs.py @ 78:fdfe9a919ff7 draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
author | mheinzl |
---|---|
date | Fri, 22 Jul 2022 09:19:44 +0000 |
parents | 6ccff403db8a |
children |
comparison
equal
deleted
inserted
replaced
77:1797e461d674 | 78:fdfe9a919ff7 |
---|---|
21 from __future__ import division | 21 from __future__ import division |
22 | 22 |
23 import argparse | 23 import argparse |
24 import json | 24 import json |
25 import os | 25 import os |
26 import re | |
26 import sys | 27 import sys |
27 | 28 |
28 import numpy as np | 29 import numpy as np |
29 import pysam | 30 import pysam |
30 from cyvcf2 import VCF | 31 from cyvcf2 import VCF |
69 ref = variant.REF | 70 ref = variant.REF |
70 if len(variant.ALT) == 0: | 71 if len(variant.ALT) == 0: |
71 continue | 72 continue |
72 else: | 73 else: |
73 alt = variant.ALT[0] | 74 alt = variant.ALT[0] |
75 alt = alt.upper() | |
76 ref = ref.upper() | |
77 if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV) | |
78 continue | |
74 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt | 79 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt |
75 if len(ref) == len(alt): | 80 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): |
76 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): | 81 if pileupcolumn.reference_pos == stop_pos: |
77 if pileupcolumn.reference_pos == stop_pos: | 82 count_alt = 0 |
78 count_alt = 0 | 83 count_ref = 0 |
79 count_ref = 0 | 84 count_indel = 0 |
80 count_indel = 0 | 85 for pileupread in pileupcolumn.pileups: |
81 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), | 86 if not pileupread.is_refskip: |
82 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) | 87 tag = pileupread.alignment.query_name |
83 for pileupread in pileupcolumn.pileups: | 88 abba = tag[-2:] |
84 if not pileupread.is_del and not pileupread.is_refskip: | 89 if pileupread.is_del: |
85 tag = pileupread.alignment.query_name | 90 p = pileupread.query_position_or_next |
86 abba = tag[-2:] | 91 e = p + len(alt) - 1 |
87 # query position is None if is_del or is_refskip is set. | 92 else: |
88 if pileupread.alignment.query_sequence[pileupread.query_position] == alt: | 93 p = pileupread.query_position |
89 count_alt += 1 | 94 e = p + len(alt) |
90 if chrom_stop_pos in mut_pos_dict: | 95 tag = pileupread.alignment.query_name |
91 if abba in mut_pos_dict[chrom_stop_pos]: | 96 if "_" in tag: |
92 mut_pos_dict[chrom_stop_pos][abba] += 1 | 97 tag = re.split('_', tag)[0] |
98 s = p | |
99 split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring) | |
100 if len(ref) < len(alt): # insertion | |
101 if "I" in split_cigar: | |
102 all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] | |
103 for ai in all_insertions: # if multiple insertions in DCS | |
104 ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] | |
105 ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele | |
106 if "I" in split_cigar and sum(ins_index) == p + 1 and len(alt) - 1 == int(ins_count): | |
107 nuc = pileupread.alignment.query_sequence[s:e] | |
108 break | |
93 else: | 109 else: |
94 mut_pos_dict[chrom_stop_pos][abba] = 1 | 110 nuc = pileupread.alignment.query_sequence[s] |
111 else: | |
112 nuc = pileupread.alignment.query_sequence[s] | |
113 elif len(ref) > len(alt): # deletion | |
114 ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)] | |
115 if "D" in split_cigar: | |
116 all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] | |
117 for di, ai in enumerate(all_deletions): # if multiple insertions in DCS | |
118 if di > 0: # more than 1 deletion, don't count previous deletion to position | |
119 all_deletions_mod = split_cigar[:ai - 1] | |
120 prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] | |
121 split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] | |
122 del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] | |
123 else: # first deletion in read, sum all previous (mis)matches and insertions to position | |
124 del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] | |
125 del_count = split_cigar[ai - 1] # nr of deletions should match with ref allele | |
126 if "D" in split_cigar and sum(del_index) == p + 1 and len(ref) - 1 == int(del_count): | |
127 nuc = pileupread.alignment.query_sequence[s:e] | |
128 if nuc == "": | |
129 nuc = str(alt) | |
130 break | |
131 else: | |
132 nuc = pileupread.alignment.query_sequence[s:s + len(ref)] | |
133 elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there | |
134 nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)] | |
135 if nuc.upper() == ref[:len(nuc)]: | |
136 nuc = str(ref) | |
137 else: | |
138 nuc = pileupread.alignment.query_sequence[s:s + len(ref)] | |
139 else: # SNV: query position is None if is_del or is_refskip is set. | |
140 nuc = pileupread.alignment.query_sequence[s] | |
141 | |
142 nuc = nuc.upper() | |
143 | |
144 if nuc == alt: | |
145 count_alt += 1 | |
146 if chrom_stop_pos in mut_pos_dict: | |
147 if abba in mut_pos_dict[chrom_stop_pos]: | |
148 mut_pos_dict[chrom_stop_pos][abba] += 1 | |
95 else: | 149 else: |
96 mut_pos_dict[chrom_stop_pos] = {} | |
97 mut_pos_dict[chrom_stop_pos][abba] = 1 | 150 mut_pos_dict[chrom_stop_pos][abba] = 1 |
98 if chrom_stop_pos not in ref_pos_dict: | 151 else: |
99 ref_pos_dict[chrom_stop_pos] = {} | 152 mut_pos_dict[chrom_stop_pos] = {} |
100 ref_pos_dict[chrom_stop_pos][abba] = 0 | 153 mut_pos_dict[chrom_stop_pos][abba] = 1 |
101 | 154 if chrom_stop_pos not in ref_pos_dict: |
102 elif pileupread.alignment.query_sequence[pileupread.query_position] == ref: | 155 ref_pos_dict[chrom_stop_pos] = {} |
103 count_ref += 1 | 156 ref_pos_dict[chrom_stop_pos][abba] = 0 |
104 if chrom_stop_pos in ref_pos_dict: | 157 elif nuc == ref: |
105 if abba in ref_pos_dict[chrom_stop_pos]: | 158 count_ref += 1 |
106 ref_pos_dict[chrom_stop_pos][abba] += 1 | 159 if chrom_stop_pos in ref_pos_dict: |
107 else: | 160 if abba in ref_pos_dict[chrom_stop_pos]: |
108 ref_pos_dict[chrom_stop_pos][abba] = 1 | 161 ref_pos_dict[chrom_stop_pos][abba] += 1 |
109 else: | 162 else: |
110 ref_pos_dict[chrom_stop_pos] = {} | |
111 ref_pos_dict[chrom_stop_pos][abba] = 1 | 163 ref_pos_dict[chrom_stop_pos][abba] = 1 |
112 else: | 164 else: |
113 count_indel += 1 | 165 ref_pos_dict[chrom_stop_pos] = {} |
166 ref_pos_dict[chrom_stop_pos][abba] = 1 | |
167 else: | |
168 count_indel += 1 | |
169 print("coverage at pos %s = %s, ref = %s, alt = %s, other = %s,\n" % | |
170 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) | |
114 | 171 |
115 print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" % | 172 # if mutation is in DCS file but not in SSCS, then set counts to NA |
116 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) | 173 if chrom_stop_pos not in mut_pos_dict.keys(): |
117 | 174 mut_pos_dict[chrom_stop_pos] = {} |
118 # if mutation is in DCS file but not in SSCS, then set counts to NA | 175 mut_pos_dict[chrom_stop_pos]["ab"] = 0 |
119 if chrom_stop_pos not in mut_pos_dict.keys(): | 176 mut_pos_dict[chrom_stop_pos]["ba"] = 0 |
120 mut_pos_dict[chrom_stop_pos] = {} | 177 ref_pos_dict[chrom_stop_pos] = {} |
121 mut_pos_dict[chrom_stop_pos]["ab"] = 0 | 178 ref_pos_dict[chrom_stop_pos]["ab"] = 0 |
122 mut_pos_dict[chrom_stop_pos]["ba"] = 0 | 179 ref_pos_dict[chrom_stop_pos]["ba"] = 0 |
123 ref_pos_dict[chrom_stop_pos] = {} | |
124 ref_pos_dict[chrom_stop_pos]["ab"] = 0 | |
125 ref_pos_dict[chrom_stop_pos]["ba"] = 0 | |
126 else: | |
127 print("indels are currently not evaluated") | |
128 bam.close() | 180 bam.close() |
129 | 181 |
130 # save counts | 182 # save counts |
131 with open(sscs_counts_json, "w") as f: | 183 with open(sscs_counts_json, "w") as f: |
132 json.dump((mut_pos_dict, ref_pos_dict), f) | 184 json.dump((mut_pos_dict, ref_pos_dict), f) |