comparison mut2sscs.py @ 78:fdfe9a919ff7 draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
author mheinzl
date Fri, 22 Jul 2022 09:19:44 +0000
parents 6ccff403db8a
children
comparison
equal deleted inserted replaced
77:1797e461d674 78:fdfe9a919ff7
21 from __future__ import division 21 from __future__ import division
22 22
23 import argparse 23 import argparse
24 import json 24 import json
25 import os 25 import os
26 import re
26 import sys 27 import sys
27 28
28 import numpy as np 29 import numpy as np
29 import pysam 30 import pysam
30 from cyvcf2 import VCF 31 from cyvcf2 import VCF
69 ref = variant.REF 70 ref = variant.REF
70 if len(variant.ALT) == 0: 71 if len(variant.ALT) == 0:
71 continue 72 continue
72 else: 73 else:
73 alt = variant.ALT[0] 74 alt = variant.ALT[0]
75 alt = alt.upper()
76 ref = ref.upper()
77 if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV)
78 continue
74 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt 79 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
75 if len(ref) == len(alt): 80 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000):
76 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): 81 if pileupcolumn.reference_pos == stop_pos:
77 if pileupcolumn.reference_pos == stop_pos: 82 count_alt = 0
78 count_alt = 0 83 count_ref = 0
79 count_ref = 0 84 count_indel = 0
80 count_indel = 0 85 for pileupread in pileupcolumn.pileups:
81 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), 86 if not pileupread.is_refskip:
82 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) 87 tag = pileupread.alignment.query_name
83 for pileupread in pileupcolumn.pileups: 88 abba = tag[-2:]
84 if not pileupread.is_del and not pileupread.is_refskip: 89 if pileupread.is_del:
85 tag = pileupread.alignment.query_name 90 p = pileupread.query_position_or_next
86 abba = tag[-2:] 91 e = p + len(alt) - 1
87 # query position is None if is_del or is_refskip is set. 92 else:
88 if pileupread.alignment.query_sequence[pileupread.query_position] == alt: 93 p = pileupread.query_position
89 count_alt += 1 94 e = p + len(alt)
90 if chrom_stop_pos in mut_pos_dict: 95 tag = pileupread.alignment.query_name
91 if abba in mut_pos_dict[chrom_stop_pos]: 96 if "_" in tag:
92 mut_pos_dict[chrom_stop_pos][abba] += 1 97 tag = re.split('_', tag)[0]
98 s = p
99 split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring)
100 if len(ref) < len(alt): # insertion
101 if "I" in split_cigar:
102 all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"]
103 for ai in all_insertions: # if multiple insertions in DCS
104 ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
105 ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele
106 if "I" in split_cigar and sum(ins_index) == p + 1 and len(alt) - 1 == int(ins_count):
107 nuc = pileupread.alignment.query_sequence[s:e]
108 break
93 else: 109 else:
94 mut_pos_dict[chrom_stop_pos][abba] = 1 110 nuc = pileupread.alignment.query_sequence[s]
111 else:
112 nuc = pileupread.alignment.query_sequence[s]
113 elif len(ref) > len(alt): # deletion
114 ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)]
115 if "D" in split_cigar:
116 all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"]
117 for di, ai in enumerate(all_deletions): # if multiple insertions in DCS
118 if di > 0: # more than 1 deletion, don't count previous deletion to position
119 all_deletions_mod = split_cigar[:ai - 1]
120 prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")]
121 split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx]
122 del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()]
123 else: # first deletion in read, sum all previous (mis)matches and insertions to position
124 del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
125 del_count = split_cigar[ai - 1] # nr of deletions should match with ref allele
126 if "D" in split_cigar and sum(del_index) == p + 1 and len(ref) - 1 == int(del_count):
127 nuc = pileupread.alignment.query_sequence[s:e]
128 if nuc == "":
129 nuc = str(alt)
130 break
131 else:
132 nuc = pileupread.alignment.query_sequence[s:s + len(ref)]
133 elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there
134 nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)]
135 if nuc.upper() == ref[:len(nuc)]:
136 nuc = str(ref)
137 else:
138 nuc = pileupread.alignment.query_sequence[s:s + len(ref)]
139 else: # SNV: query position is None if is_del or is_refskip is set.
140 nuc = pileupread.alignment.query_sequence[s]
141
142 nuc = nuc.upper()
143
144 if nuc == alt:
145 count_alt += 1
146 if chrom_stop_pos in mut_pos_dict:
147 if abba in mut_pos_dict[chrom_stop_pos]:
148 mut_pos_dict[chrom_stop_pos][abba] += 1
95 else: 149 else:
96 mut_pos_dict[chrom_stop_pos] = {}
97 mut_pos_dict[chrom_stop_pos][abba] = 1 150 mut_pos_dict[chrom_stop_pos][abba] = 1
98 if chrom_stop_pos not in ref_pos_dict: 151 else:
99 ref_pos_dict[chrom_stop_pos] = {} 152 mut_pos_dict[chrom_stop_pos] = {}
100 ref_pos_dict[chrom_stop_pos][abba] = 0 153 mut_pos_dict[chrom_stop_pos][abba] = 1
101 154 if chrom_stop_pos not in ref_pos_dict:
102 elif pileupread.alignment.query_sequence[pileupread.query_position] == ref: 155 ref_pos_dict[chrom_stop_pos] = {}
103 count_ref += 1 156 ref_pos_dict[chrom_stop_pos][abba] = 0
104 if chrom_stop_pos in ref_pos_dict: 157 elif nuc == ref:
105 if abba in ref_pos_dict[chrom_stop_pos]: 158 count_ref += 1
106 ref_pos_dict[chrom_stop_pos][abba] += 1 159 if chrom_stop_pos in ref_pos_dict:
107 else: 160 if abba in ref_pos_dict[chrom_stop_pos]:
108 ref_pos_dict[chrom_stop_pos][abba] = 1 161 ref_pos_dict[chrom_stop_pos][abba] += 1
109 else: 162 else:
110 ref_pos_dict[chrom_stop_pos] = {}
111 ref_pos_dict[chrom_stop_pos][abba] = 1 163 ref_pos_dict[chrom_stop_pos][abba] = 1
112 else: 164 else:
113 count_indel += 1 165 ref_pos_dict[chrom_stop_pos] = {}
166 ref_pos_dict[chrom_stop_pos][abba] = 1
167 else:
168 count_indel += 1
169 print("coverage at pos %s = %s, ref = %s, alt = %s, other = %s,\n" %
170 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel))
114 171
115 print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" % 172 # if mutation is in DCS file but not in SSCS, then set counts to NA
116 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) 173 if chrom_stop_pos not in mut_pos_dict.keys():
117 174 mut_pos_dict[chrom_stop_pos] = {}
118 # if mutation is in DCS file but not in SSCS, then set counts to NA 175 mut_pos_dict[chrom_stop_pos]["ab"] = 0
119 if chrom_stop_pos not in mut_pos_dict.keys(): 176 mut_pos_dict[chrom_stop_pos]["ba"] = 0
120 mut_pos_dict[chrom_stop_pos] = {} 177 ref_pos_dict[chrom_stop_pos] = {}
121 mut_pos_dict[chrom_stop_pos]["ab"] = 0 178 ref_pos_dict[chrom_stop_pos]["ab"] = 0
122 mut_pos_dict[chrom_stop_pos]["ba"] = 0 179 ref_pos_dict[chrom_stop_pos]["ba"] = 0
123 ref_pos_dict[chrom_stop_pos] = {}
124 ref_pos_dict[chrom_stop_pos]["ab"] = 0
125 ref_pos_dict[chrom_stop_pos]["ba"] = 0
126 else:
127 print("indels are currently not evaluated")
128 bam.close() 180 bam.close()
129 181
130 # save counts 182 # save counts
131 with open(sscs_counts_json, "w") as f: 183 with open(sscs_counts_json, "w") as f:
132 json.dump((mut_pos_dict, ref_pos_dict), f) 184 json.dump((mut_pos_dict, ref_pos_dict), f)