Mercurial > repos > mheinzl > variant_analyzer2
comparison read2mut.py @ 78:fdfe9a919ff7 draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
author | mheinzl |
---|---|
date | Fri, 22 Jul 2022 09:19:44 +0000 |
parents | 1797e461d674 |
children | d7aea14291e8 |
comparison
equal
deleted
inserted
replaced
77:1797e461d674 | 78:fdfe9a919ff7 |
---|---|
85 sscs_json = args.sscsJson | 85 sscs_json = args.sscsJson |
86 outfile = args.outputFile | 86 outfile = args.outputFile |
87 outfile2 = args.outputFile2 | 87 outfile2 = args.outputFile2 |
88 outfile3 = args.outputFile3 | 88 outfile3 = args.outputFile3 |
89 outputFile_csv = args.outputFile_csv | 89 outputFile_csv = args.outputFile_csv |
90 | |
90 thresh = args.thresh | 91 thresh = args.thresh |
91 phred_score = args.phred | 92 phred_score = args.phred |
92 trim = args.trim | 93 trim = args.trim |
93 chimera_correction = args.chimera_correction | 94 chimera_correction = args.chimera_correction |
94 thr = args.softclipping_dist | 95 thr = args.softclipping_dist |
109 if thr <= 0: | 110 if thr <= 0: |
110 sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thr)) | 111 sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thr)) |
111 | 112 |
112 # load dicts | 113 # load dicts |
113 with open(json_file, "r") as f: | 114 with open(json_file, "r") as f: |
114 (tag_dict, cvrg_dict) = json.load(f) | 115 (tag_dict, cvrg_dict, tag_dict_ref) = json.load(f) |
115 | 116 |
116 with open(sscs_json, "r") as f: | 117 with open(sscs_json, "r") as f: |
117 (mut_pos_dict, ref_pos_dict) = json.load(f) | 118 (mut_pos_dict, ref_pos_dict) = json.load(f) |
118 | 119 |
119 # read bam file | 120 # read bam file |
136 ref = variant.REF | 137 ref = variant.REF |
137 if len(variant.ALT) == 0: | 138 if len(variant.ALT) == 0: |
138 continue | 139 continue |
139 else: | 140 else: |
140 alt = variant.ALT[0] | 141 alt = variant.ALT[0] |
142 | |
143 alt = alt.upper() | |
144 ref = ref.upper() | |
145 if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV) | |
146 continue | |
147 | |
141 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt | 148 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt |
142 | 149 mut_array.append([chrom, stop_pos, ref, alt]) |
143 if len(ref) == len(alt): | 150 i += 1 |
144 mut_array.append([chrom, stop_pos, ref, alt]) | 151 mut_dict[chrom_stop_pos] = {} |
145 i += 1 | 152 mut_read_pos_dict[chrom_stop_pos] = {} |
146 mut_dict[chrom_stop_pos] = {} | 153 reads_dict[chrom_stop_pos] = {} |
147 mut_read_pos_dict[chrom_stop_pos] = {} | 154 mut_read_cigar_dict[chrom_stop_pos] = {} |
148 reads_dict[chrom_stop_pos] = {} | 155 real_start_end[chrom_stop_pos] = {} |
149 mut_read_cigar_dict[chrom_stop_pos] = {} | 156 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): |
150 real_start_end[chrom_stop_pos] = {} | 157 if pileupcolumn.reference_pos == stop_pos: |
151 | 158 count_alt = 0 |
152 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): | 159 count_ref = 0 |
153 if pileupcolumn.reference_pos == stop_pos: | 160 count_indel = 0 |
154 count_alt = 0 | 161 count_n = 0 |
155 count_ref = 0 | 162 count_other = 0 |
156 count_indel = 0 | 163 count_lowq = 0 |
157 count_n = 0 | 164 n = 0 |
158 count_other = 0 | 165 for pileupread in pileupcolumn.pileups: |
159 count_lowq = 0 | 166 n += 1 |
160 n = 0 | 167 if not pileupread.is_refskip: |
161 for pileupread in pileupcolumn.pileups: | 168 if pileupread.is_del: |
162 n += 1 | 169 p = pileupread.query_position_or_next |
163 if not pileupread.is_del and not pileupread.is_refskip: | 170 e = p + len(alt) - 1 |
164 tag = pileupread.alignment.query_name | 171 else: |
165 nuc = pileupread.alignment.query_sequence[pileupread.query_position] | 172 p = pileupread.query_position |
166 phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33 | 173 e = p + len(alt) |
167 # if read is softclipped, store real position in reference | 174 s = p |
168 if "S" in pileupread.alignment.cigarstring: | 175 tag = pileupread.alignment.query_name |
169 # spftclipped at start | 176 split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring) |
170 if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring): | 177 |
171 start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0]) | 178 if len(ref) < len(alt): |
172 end = pileupread.alignment.reference_end | 179 if "I" in split_cigar: |
173 # softclipped at end | 180 all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] |
174 elif re.search(r"S$", pileupread.alignment.cigarstring): | 181 for ai in all_insertions: # if multiple insertions in DCS |
175 end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2]) | 182 ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] |
176 start = pileupread.alignment.reference_start | 183 ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele |
184 if "I" in split_cigar and sum(ins_index) == p + 1 and int(ins_count) >= len(alt) - 1: # if pe read matches exatcly to insertion | |
185 nuc = pileupread.alignment.query_sequence[s:e] | |
186 phred = ord(pileupread.alignment.qual[s]) - 33 | |
187 break | |
188 elif "I" in split_cigar and sum(ins_index) == p + 1 and int(ins_count) < len(alt) - 1: # if pe read has shorter insertion -- not alt | |
189 nuc = pileupread.alignment.query_sequence[s:s+int(ins_count)+1] | |
190 phred = ord(pileupread.alignment.qual[s]) - 33 | |
191 break | |
192 else: # insertion in pe reads but not at the desired position | |
193 nuc = pileupread.alignment.query_sequence[s] | |
194 phred = ord(pileupread.alignment.qual[s]) - 33 | |
195 elif "D" in split_cigar: # if deletion in pe read, don't count | |
196 all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] | |
197 for di, ai in enumerate(all_deletions): # if multiple insertions in DCS | |
198 if di > 0: # more than 1 deletion, don't count previous deletion to position | |
199 all_deletions_mod = split_cigar[:ai - 1] | |
200 prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] | |
201 split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] | |
202 del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] | |
203 else: # first deletion in read, sum all previous (mis)matches and insertions to position | |
204 del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] | |
205 if "D" in split_cigar and sum(del_index) == p + 1: # if deletion at that position | |
206 nuc = "D" | |
207 phred = ord(pileupread.alignment.qual[s]) - 33 | |
208 break | |
209 else: | |
210 nuc = pileupread.alignment.query_sequence[s] | |
211 phred = ord(pileupread.alignment.qual[s]) - 33 | |
212 else: # insertion in pe reads but not at the desired position | |
213 nuc = pileupread.alignment.query_sequence[s] | |
214 phred = ord(pileupread.alignment.qual[s]) - 33 | |
215 | |
216 elif len(ref) > len(alt): | |
217 ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)] | |
218 if "D" in split_cigar: | |
219 all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] | |
220 for di, ai in enumerate(all_deletions): # if multiple insertions in DCS | |
221 if di > 0: # more than 1 deletion, don't count previous deletion to position | |
222 all_deletions_mod = split_cigar[:ai - 1] | |
223 prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] | |
224 split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] | |
225 del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] | |
226 else: # first deletion in read, sum all previous (mis)matches and insertions to position | |
227 del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] | |
228 del_count = split_cigar[ai - 1] # deletion on that position but does not necesserily match in the length | |
229 if "D" in split_cigar and sum(del_index) == p + 1 and int(del_count) >= len(ref) - 1: # if pe read matches exatcly to deletion | |
230 nuc = pileupread.alignment.query_sequence[s:e] | |
231 phred = ord(pileupread.alignment.qual[s]) - 33 | |
232 if nuc == "": | |
233 nuc = str(alt) | |
234 break | |
235 elif "D" in split_cigar and sum(del_index) == p + 1 and int(del_count) < len(ref) - 1: # if pe read has shorter deletion --> not alt | |
236 nuc = str(ref)[:int(del_count)+1] | |
237 phred = ord(pileupread.alignment.qual[s]) - 33 | |
238 break | |
239 else: # deletion in pe reads but not at the desired position | |
240 nuc = pileupread.alignment.query_sequence[s:s + len(ref)] | |
241 phred = ord(pileupread.alignment.qual[s]) - 33 | |
242 elif "I" in split_cigar: # if pe read has insertion --> not count | |
243 all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] | |
244 for ai in all_insertions: # if multiple insertions in DCS | |
245 ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] | |
246 ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele | |
247 if "I" in split_cigar and sum(ins_index) == p + 1: | |
248 nuc = "I" | |
249 phred = ord(pileupread.alignment.qual[s]) - 33 | |
250 break | |
251 else: | |
252 nuc = pileupread.alignment.query_sequence[s] | |
253 phred = ord(pileupread.alignment.qual[s]) - 33 | |
254 elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there | |
255 nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)] | |
256 phred = ord(pileupread.alignment.qual[s]) - 33 | |
257 if nuc.upper() == ref[:len(nuc)]: | |
258 nuc = str(ref) | |
259 else: # deletion in pe reads but not at the desired position | |
260 nuc = pileupread.alignment.query_sequence[s:s + len(ref)] | |
261 phred = ord(pileupread.alignment.qual[s]) - 33 | |
262 else: # SNV: query position is None if is_del or is_refskip is set. | |
263 nuc = pileupread.alignment.query_sequence[s] | |
264 phred = ord(pileupread.alignment.qual[s]) - 33 | |
265 | |
266 nuc = nuc.upper() | |
267 | |
268 # if read is softclipped, store real position in reference | |
269 if "S" in pileupread.alignment.cigarstring: | |
270 # spftclipped at start | |
271 if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring): | |
272 start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0]) | |
273 end = pileupread.alignment.reference_end | |
274 # softclipped at end | |
275 elif re.search(r"S$", pileupread.alignment.cigarstring): | |
276 end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2]) | |
277 start = pileupread.alignment.reference_start | |
278 else: | |
279 end = pileupread.alignment.reference_end | |
280 start = pileupread.alignment.reference_start | |
281 if phred < phred_score: | |
282 nuc = "lowQ" | |
283 if tag not in mut_dict[chrom_stop_pos]: | |
284 mut_dict[chrom_stop_pos][tag] = {} | |
285 if nuc in mut_dict[chrom_stop_pos][tag]: | |
286 mut_dict[chrom_stop_pos][tag][nuc] += 1 | |
287 else: | |
288 mut_dict[chrom_stop_pos][tag][nuc] = 1 | |
289 if tag not in mut_read_pos_dict[chrom_stop_pos]: | |
290 mut_read_pos_dict[chrom_stop_pos][tag] = [s + 1] | |
291 reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] | |
292 mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] | |
293 real_start_end[chrom_stop_pos][tag] = [(start, end)] | |
294 else: | |
295 mut_read_pos_dict[chrom_stop_pos][tag].append(s + 1) | |
296 reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) | |
297 mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) | |
298 real_start_end[chrom_stop_pos][tag].append((start, end)) | |
299 if nuc == alt: | |
300 count_alt += 1 | |
301 if tag not in mut_read_dict: | |
302 mut_read_dict[tag] = {} | |
303 mut_read_dict[tag][chrom_stop_pos] = (alt, ref) | |
177 else: | 304 else: |
178 end = pileupread.alignment.reference_end | 305 mut_read_dict[tag][chrom_stop_pos] = (alt, ref) |
179 start = pileupread.alignment.reference_start | 306 elif nuc == ref: |
180 | 307 count_ref += 1 |
181 if phred < phred_score: | 308 elif nuc == "N": |
182 nuc = "lowQ" | 309 count_n += 1 |
183 if tag not in mut_dict[chrom_stop_pos]: | 310 elif nuc == "lowQ": |
184 mut_dict[chrom_stop_pos][tag] = {} | 311 count_lowq += 1 |
185 if nuc in mut_dict[chrom_stop_pos][tag]: | 312 else: |
186 mut_dict[chrom_stop_pos][tag][nuc] += 1 | 313 count_other += 1 |
187 else: | 314 else: |
188 mut_dict[chrom_stop_pos][tag][nuc] = 1 | 315 count_indel += 1 |
189 if tag not in mut_read_pos_dict[chrom_stop_pos]: | |
190 mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1] | |
191 reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] | |
192 mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] | |
193 real_start_end[chrom_stop_pos][tag] = [(start, end)] | |
194 else: | |
195 mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) | |
196 reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) | |
197 mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) | |
198 real_start_end[chrom_stop_pos][tag].append((start, end)) | |
199 if nuc == alt: | |
200 count_alt += 1 | |
201 if tag not in mut_read_dict: | |
202 mut_read_dict[tag] = {} | |
203 mut_read_dict[tag][chrom_stop_pos] = (alt, ref) | |
204 else: | |
205 mut_read_dict[tag][chrom_stop_pos] = (alt, ref) | |
206 elif nuc == ref: | |
207 count_ref += 1 | |
208 elif nuc == "N": | |
209 count_n += 1 | |
210 elif nuc == "lowQ": | |
211 count_lowq += 1 | |
212 else: | |
213 count_other += 1 | |
214 else: | |
215 count_indel += 1 | |
216 | 316 |
217 mut_array = np.array(mut_array) | 317 mut_array = np.array(mut_array) |
218 for read in bam.fetch(until_eof=True): | 318 for read in bam.fetch(until_eof=True): |
219 if read.is_unmapped: | 319 if read.is_unmapped: |
220 pure_tag = read.query_name[:-5] | 320 pure_tag = read.query_name[:-5] |
221 nuc = "na" | 321 nuc = "na" |
222 for key in tag_dict[pure_tag].keys(): | 322 if pure_tag in tag_dict.keys(): # stored all ref and alt reads --> get only alt reads |
223 if key not in mut_dict: | 323 for key in tag_dict[pure_tag].keys(): |
224 mut_dict[key] = {} | 324 if key not in mut_dict: |
225 if read.query_name not in mut_dict[key]: | 325 mut_dict[key] = {} |
226 mut_dict[key][read.query_name] = {} | 326 if read.query_name not in mut_dict[key]: |
227 if nuc in mut_dict[key][read.query_name]: | 327 mut_dict[key][read.query_name] = {} |
228 mut_dict[key][read.query_name][nuc] += 1 | 328 if nuc in mut_dict[key][read.query_name]: |
229 else: | 329 mut_dict[key][read.query_name][nuc] += 1 |
230 mut_dict[key][read.query_name][nuc] = 1 | 330 else: |
331 mut_dict[key][read.query_name][nuc] = 1 | |
231 bam.close() | 332 bam.close() |
232 | |
233 # create pure_tags_dict | 333 # create pure_tags_dict |
234 pure_tags_dict = {} | 334 pure_tags_dict = {} |
335 pure_tags_dict_ref = {} | |
235 for key1, value1 in sorted(mut_dict.items()): | 336 for key1, value1 in sorted(mut_dict.items()): |
236 i = np.where(np.array(['#'.join(str(i) for i in z) | 337 i = np.where(np.array(['#'.join(str(i) for i in z) |
237 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] | 338 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] |
238 ref = mut_array[i, 2] | 339 ref = mut_array[i, 2] |
239 alt = mut_array[i, 3] | 340 alt = mut_array[i, 3] |
240 pure_tags_dict[key1] = {} | 341 pure_tags_dict[key1] = {} |
342 pure_tags_dict_ref[key1] = {} | |
241 for key2, value2 in sorted(value1.items()): | 343 for key2, value2 in sorted(value1.items()): |
242 for key3, value3 in value2.items(): | 344 for key3, value3 in value2.items(): |
243 pure_tag = key2[:-5] | 345 pure_tag = key2[:-5] |
244 if key3 == alt: | 346 if key3 == alt: |
245 if pure_tag in pure_tags_dict[key1]: | 347 if pure_tag in pure_tags_dict[key1]: |
246 pure_tags_dict[key1][pure_tag] += 1 | 348 pure_tags_dict[key1][pure_tag] += 1 |
247 else: | 349 else: |
248 pure_tags_dict[key1][pure_tag] = 1 | 350 pure_tags_dict[key1][pure_tag] = 1 |
249 | 351 |
352 if key3 == ref: | |
353 if pure_tag in pure_tags_dict_ref[key1]: | |
354 pure_tags_dict_ref[key1][pure_tag] += 1 | |
355 else: | |
356 pure_tags_dict_ref[key1][pure_tag] = 1 | |
357 | |
250 # create pure_tags_dict_short with thresh | 358 # create pure_tags_dict_short with thresh |
251 if thresh > 0: | 359 if thresh > 0: |
252 pure_tags_dict_short = {} | 360 pure_tags_dict_short = {} |
253 for key, value in sorted(pure_tags_dict.items()): | 361 for key, value in sorted(pure_tags_dict.items()): |
254 if len(value) < thresh: | 362 if len(value) < thresh: |
277 | 385 |
278 format13 = workbook3.add_format({'bg_color': '#BCF5A9'}) # green | 386 format13 = workbook3.add_format({'bg_color': '#BCF5A9'}) # green |
279 format23 = workbook3.add_format({'bg_color': '#FFC7CE'}) # red | 387 format23 = workbook3.add_format({'bg_color': '#FFC7CE'}) # red |
280 format33 = workbook3.add_format({'bg_color': '#FACC2E'}) # yellow | 388 format33 = workbook3.add_format({'bg_color': '#FACC2E'}) # yellow |
281 | 389 |
282 header_line = ('variant ID', 'tier', 'tag', 'mate', 'read pos.ab', 'read pos.ba', 'read median length.ab', | 390 header_line = ('variant ID', 'tier', 'allele', 'tag', 'mate', 'read pos.ab', 'read pos.ba', 'read median length.ab', |
283 'read median length.ba', 'DCS median length', | 391 'read median length.ba', 'DCS median length', |
284 'FS.ab', 'FS.ba', 'FSqc.ab', 'FSqc.ba', 'ref.ab', 'ref.ba', 'alt.ab', 'alt.ba', | 392 'FS.ab', 'FS.ba', 'FSqc.ab', 'FSqc.ba', 'ref.ab', 'ref.ba', 'alt.ab', 'alt.ba', |
285 'rel. ref.ab', 'rel. ref.ba', 'rel. alt.ab', 'rel. alt.ba', | 393 'rel. ref.ab', 'rel. ref.ba', 'rel. alt.ab', 'rel. alt.ba', |
286 'na.ab', 'na.ba', 'lowq.ab', 'lowq.ba', 'trim.ab', 'trim.ba', | 394 'na.ab', 'na.ba', 'lowq.ab', 'lowq.ba', 'trim.ab', 'trim.ba', |
287 'SSCS alt.ab', 'SSCS alt.ba', 'SSCS ref.ab', 'SSCS ref.ba', | 395 'SSCS alt.ab', 'SSCS alt.ba', 'SSCS ref.ab', 'SSCS ref.ba', |
288 'in phase', 'chimeric tag') | 396 'in phase', 'chimeric tag') |
289 ws1.write_row(0, 0, header_line) | 397 ws1.write_row(0, 0, header_line) |
290 csv_writer.writerow(header_line) | 398 csv_writer.writerow(header_line) |
399 | |
291 counter_tier11 = 0 | 400 counter_tier11 = 0 |
292 counter_tier12 = 0 | 401 counter_tier12 = 0 |
293 counter_tier21 = 0 | 402 counter_tier21 = 0 |
294 counter_tier22 = 0 | 403 counter_tier22 = 0 |
295 counter_tier23 = 0 | 404 counter_tier23 = 0 |
306 counter_tier6 = 0 | 415 counter_tier6 = 0 |
307 counter_tier7 = 0 | 416 counter_tier7 = 0 |
308 | 417 |
309 row = 1 | 418 row = 1 |
310 tier_dict = {} | 419 tier_dict = {} |
420 tier_dict_ref = {} | |
311 chimera_dict = {} | 421 chimera_dict = {} |
312 for key1, value1 in sorted(mut_dict.items()): | 422 for key1, value1 in sorted(mut_dict.items()): |
313 counts_mut = 0 | 423 counts_mut = 0 |
314 chimeric_tag_list = [] | 424 chimeric_tag_list = [] |
315 chimeric_tag = {} | 425 chimeric_tag = {} |
316 if key1 in pure_tags_dict_short.keys(): | 426 if (key1 in pure_tags_dict_short.keys()) or (key1 in pure_tags_dict_ref.keys()): # ref or alt |
427 | |
317 change_tier_after_print = [] | 428 change_tier_after_print = [] |
318 i = np.where(np.array(['#'.join(str(i) for i in z) | 429 i = np.where(np.array(['#'.join(str(i) for i in z) |
319 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] | 430 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] |
320 ref = mut_array[i, 2] | 431 ref = mut_array[i, 2] |
321 alt = mut_array[i, 3] | 432 alt = mut_array[i, 3] |
322 dcs_median = cvrg_dict[key1][2] | 433 dcs_median = cvrg_dict[key1][2] |
323 whole_array = list(pure_tags_dict_short[key1].keys()) | 434 whole_array = list(pure_tags_dict_short[key1].keys()) |
324 | 435 |
325 tier_dict[key1] = {} | 436 tier_dict[key1] = {} |
437 tier_dict_ref[key1] = {} | |
326 values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 2.5", 0), | 438 values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 2.5", 0), |
327 ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0), | 439 ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0), |
328 ("tier 6", 0), ("tier 7", 0)] | 440 ("tier 6", 0), ("tier 7", 0)] |
329 for k, v in values_tier_dict: | 441 for k, v in values_tier_dict: |
330 tier_dict[key1][k] = v | 442 tier_dict[key1][k] = v |
443 tier_dict_ref[key1][k] = v | |
331 | 444 |
332 used_keys = [] | 445 used_keys = [] |
333 if 'ab' in mut_pos_dict[key1].keys(): | 446 if 'ab' in mut_pos_dict[key1].keys(): |
334 sscs_mut_ab = mut_pos_dict[key1]['ab'] | 447 sscs_mut_ab = mut_pos_dict[key1]['ab'] |
335 else: | 448 else: |
347 else: | 460 else: |
348 sscs_ref_ba = 0 | 461 sscs_ref_ba = 0 |
349 for key2, value2 in sorted(value1.items()): | 462 for key2, value2 in sorted(value1.items()): |
350 add_mut14 = "" | 463 add_mut14 = "" |
351 add_mut23 = "" | 464 add_mut23 = "" |
352 if (key2[:-5] in pure_tags_dict_short[key1].keys()) and (key2[:-5] not in used_keys) and (key1 in tag_dict[key2[:-5]].keys()): | 465 if key2[:-5] not in tag_dict.keys() and key2[:-5] not in tag_dict_ref.keys(): # skip reads that have not alt or ref |
466 continue | |
467 | |
468 if ((key2[:-5] in tag_dict.keys() and key2[:-5] in pure_tags_dict_short[key1].keys() and key1 in tag_dict[key2[:-5]].keys()) or (key2[:-5] in tag_dict_ref.keys() and key2[:-5] in pure_tags_dict_ref[key1].keys() and key1 in tag_dict_ref[key2[:-5]].keys())) and (key2[:-5] not in used_keys): | |
469 | |
470 if key2[:-5] in pure_tags_dict_short[key1].keys(): | |
471 variant_type = "alt" | |
472 elif key2[:-5] in pure_tags_dict_ref[key1].keys(): | |
473 variant_type = "ref" | |
474 | |
353 if key2[:-5] + '.ab.1' in mut_dict[key1].keys(): | 475 if key2[:-5] + '.ab.1' in mut_dict[key1].keys(): |
354 total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values()) | 476 total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values()) |
355 if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys(): | 477 if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys(): |
356 na1 = mut_dict[key1][key2[:-5] + '.ab.1']['na'] | 478 na1 = mut_dict[key1][key2[:-5] + '.ab.1']['na'] |
357 # na1f = na1/total1 | 479 # na1f = na1/total1 |
548 end_read4 = reads_dict[key1][key2[:-5] + '.ba.2'] | 670 end_read4 = reads_dict[key1][key2[:-5] + '.ba.2'] |
549 ref_positions4 = real_start_end[key1][key2[:-5] + '.ba.2'] | 671 ref_positions4 = real_start_end[key1][key2[:-5] + '.ba.2'] |
550 | 672 |
551 used_keys.append(key2[:-5]) | 673 used_keys.append(key2[:-5]) |
552 counts_mut += 1 | 674 counts_mut += 1 |
553 if (alt1f + alt2f + alt3f + alt4f) > 0.5: | 675 if (variant_type == "alt" and ((alt1f + alt2f + alt3f + alt4f) > 0.5)) or (variant_type == "ref" and ((ref1f + ref2f + ref3f + ref4f) > 0.5)): |
676 if variant_type == "alt": | |
677 tier1ff, tier2ff, tier3ff, tier4ff = alt1f, alt2f, alt3f, alt4f | |
678 tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim = alt1f, alt2f, alt3f, alt4f | |
679 elif variant_type == "ref": | |
680 tier1ff, tier2ff, tier3ff, tier4ff = ref1f, ref2f, ref3f, ref4f | |
681 tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim = ref1f, ref2f, ref3f, ref4f | |
682 | |
554 total1new_trim, total2new_trim, total3new_trim, total4new_trim = total1new, total2new, total3new, total4new | 683 total1new_trim, total2new_trim, total3new_trim, total4new_trim = total1new, total2new, total3new, total4new |
555 if total1new == 0: | 684 if total1new == 0: |
556 ref1f = alt1f = None | 685 ref1f = alt1f = None |
557 alt1ff = -1 | 686 alt1ff = -1 |
558 alt1ff_trim = -1 | 687 alt1ff_trim = -1 |
688 tier1ff = -1 | |
689 tier1ff_trim = -1 | |
559 else: | 690 else: |
560 alt1ff = alt1f | 691 alt1ff = alt1f |
561 alt1ff_trim = alt1f | 692 alt1ff_trim = alt1f |
693 tier1ff = tier1ff | |
694 tier1ff_trim = tier1ff_trim | |
562 if total2new == 0: | 695 if total2new == 0: |
563 ref2f = alt2f = None | 696 ref2f = alt2f = None |
564 alt2ff = -1 | 697 alt2ff = -1 |
565 alt2ff_trim = -1 | 698 alt2ff_trim = -1 |
699 tier2ff = -1 | |
700 tier2ff_trim = -1 | |
566 else: | 701 else: |
567 alt2ff = alt2f | 702 alt2ff = alt2f |
568 alt2ff_trim = alt2f | 703 alt2ff_trim = alt2f |
704 tier2ff = tier2ff | |
705 tier2ff_trim = tier2ff_trim | |
569 if total3new == 0: | 706 if total3new == 0: |
570 ref3f = alt3f = None | 707 ref3f = alt3f = None |
571 alt3ff = -1 | 708 alt3ff = -1 |
572 alt3ff_trim = -1 | 709 alt3ff_trim = -1 |
710 tier3ff = -1 | |
711 tier3ff_trim = -1 | |
573 else: | 712 else: |
574 alt3ff = alt3f | 713 alt3ff = alt3f |
575 alt3ff_trim = alt3f | 714 alt3ff_trim = alt3f |
715 tier3ff = tier3ff | |
716 tier3ff_trim = tier3ff_trim | |
576 if total4new == 0: | 717 if total4new == 0: |
577 ref4f = alt4f = None | 718 ref4f = alt4f = None |
578 alt4ff = -1 | 719 alt4ff = -1 |
579 alt4ff_trim = -1 | 720 alt4ff_trim = -1 |
721 tier4ff = -1 | |
722 tier4ff_trim = -1 | |
580 else: | 723 else: |
581 alt4ff = alt4f | 724 alt4ff = alt4f |
582 alt4ff_trim = alt4f | 725 alt4ff_trim = alt4f |
726 tier4ff = tier4ff | |
727 tier4ff_trim = tier4ff_trim | |
583 | 728 |
584 beg1 = beg4 = beg2 = beg3 = 0 | 729 beg1 = beg4 = beg2 = beg3 = 0 |
585 | 730 |
586 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) | 731 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) |
587 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) | 732 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) |
603 ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False | 748 ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False |
604 | 749 |
605 # mate 1 - SSCS ab | 750 # mate 1 - SSCS ab |
606 softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1] | 751 softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1] |
607 safe_div_result = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) | 752 safe_div_result = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) |
608 if (safe_div_result == None): | 753 if (safe_div_result is None): |
609 ratio1 = False | 754 ratio1 = False |
610 else: | 755 else: |
611 ratio1 = safe_div_result >= threshold_reads | 756 ratio1 = safe_div_result >= threshold_reads |
612 if any(ij is True for ij in softclipped_idx1): | 757 if any(ij is True for ij in softclipped_idx1): |
613 softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1] | 758 softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1] |
627 ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads | 772 ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads |
628 | 773 |
629 # mate 1 - SSCS ba | 774 # mate 1 - SSCS ba |
630 softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4] | 775 softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4] |
631 safe_div_result = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) | 776 safe_div_result = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) |
632 if (safe_div_result == None): | 777 if (safe_div_result is None): |
633 ratio4 = False | 778 ratio4 = False |
634 else: | 779 else: |
635 ratio4 = safe_div_result >= threshold_reads | 780 ratio4 = safe_div_result >= threshold_reads |
636 if any(ij is True for ij in softclipped_idx4): | 781 if any(ij is True for ij in softclipped_idx4): |
637 softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4] | 782 softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4] |
651 ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads | 796 ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads |
652 | 797 |
653 # mate 2 - SSCS ab | 798 # mate 2 - SSCS ab |
654 softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2] | 799 softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2] |
655 safe_div_result = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) | 800 safe_div_result = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) |
656 if (safe_div_result == None): | 801 if (safe_div_result is None): |
657 ratio2 = False | 802 ratio2 = False |
658 else: | 803 else: |
659 ratio2 = safe_div_result >= threshold_reads | 804 ratio2 = safe_div_result >= threshold_reads |
660 if any(ij is True for ij in softclipped_idx2): | 805 if any(ij is True for ij in softclipped_idx2): |
661 softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2] | 806 softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2] |
675 ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads | 820 ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads |
676 | 821 |
677 # mate 2 - SSCS ba | 822 # mate 2 - SSCS ba |
678 softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3] | 823 softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3] |
679 safe_div_result = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) | 824 safe_div_result = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) |
680 if (safe_div_result == None): | 825 if (safe_div_result is None): |
681 ratio3 = False | 826 ratio3 = False |
682 else: | 827 else: |
683 ratio3 = safe_div_result >= threshold_reads | 828 ratio3 = safe_div_result >= threshold_reads |
684 if any(ij is True for ij in softclipped_idx3): | 829 if any(ij is True for ij in softclipped_idx3): |
685 softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3] | 830 softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3] |
696 else: | 841 else: |
697 dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number | 842 dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number |
698 ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads | 843 ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads |
699 ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads | 844 ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads |
700 | 845 |
701 if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant | 846 if ((all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff]) & # contradictory variant |
702 all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | | 847 all(float(ij) == 0. for ij in [tier2ff, tier3ff])) | |
703 (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & | 848 (all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]) & |
704 all(float(ij) == 0. for ij in [alt1ff, alt4ff]))): | 849 all(float(ij) == 0. for ij in [tier1ff, tier4ff]))): |
705 alt1ff = 0 | 850 tier1ff = 0 |
706 alt4ff = 0 | 851 tier4ff = 0 |
707 alt2ff = 0 | 852 tier2ff = 0 |
708 alt3ff = 0 | 853 tier3ff = 0 |
709 trimmed = False | 854 trimmed = False |
710 contradictory = True | 855 contradictory = True |
711 # softclipping tiers | 856 # softclipping tiers |
712 # information of both mates available --> all reads for both mates and SSCS are softclipped | 857 # information of both mates available --> all reads for both mates and SSCS are softclipped |
713 elif (ratio1 & ratio4 & ratio2 & ratio3 & | 858 elif (ratio1 & ratio4 & ratio2 & ratio3 & |
714 (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & | 859 (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & |
715 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available | 860 all(float(ij) > 0. for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): # all mates available |
716 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 861 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
717 softclipped_mutation_allMates = True | 862 softclipped_mutation_allMates = True |
718 softclipped_mutation_oneOfTwoMates = False | 863 softclipped_mutation_oneOfTwoMates = False |
719 softclipped_mutation_oneOfTwoSSCS = False | 864 softclipped_mutation_oneOfTwoSSCS = False |
720 softclipped_mutation_oneOfTwoSSCS_diffMates = False | 865 softclipped_mutation_oneOfTwoSSCS_diffMates = False |
721 softclipped_mutation_oneMate = False | 866 softclipped_mutation_oneMate = False |
722 softclipped_mutation_oneMateOneSSCS = False | 867 softclipped_mutation_oneMateOneSSCS = False |
723 alt1ff = 0 | 868 tier1ff = 0 |
724 alt4ff = 0 | 869 tier4ff = 0 |
725 alt2ff = 0 | 870 tier2ff = 0 |
726 alt3ff = 0 | 871 tier3ff = 0 |
727 trimmed = False | 872 trimmed = False |
728 contradictory = False | 873 contradictory = False |
729 # information of both mates available --> only one mate softclipped | 874 # information of both mates available --> only one mate softclipped |
730 elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) | | 875 elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) | |
731 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) & | 876 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) & |
732 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available | 877 all(float(ij) > 0. for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): # all mates available |
733 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 878 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
734 min_start1 = min(min([ij[0] for ij in ref_positions1]), min([ij[0] for ij in ref_positions4])) # red | 879 min_start1 = min(min([ij[0] for ij in ref_positions1]), min([ij[0] for ij in ref_positions4])) # red |
735 min_start2 = min(min([ij[0] for ij in ref_positions2]), min([ij[0] for ij in ref_positions3])) # blue | 880 min_start2 = min(min([ij[0] for ij in ref_positions2]), min([ij[0] for ij in ref_positions3])) # blue |
736 max_end1 = max(max([ij[1] for ij in ref_positions1]), max([ij[1] for ij in ref_positions4])) # red | 881 max_end1 = max(max([ij[1] for ij in ref_positions1]), max([ij[1] for ij in ref_positions4])) # red |
737 max_end2 = max(max([ij[1] for ij in ref_positions2]), max([ij[1] for ij in ref_positions3])) # blue | 882 max_end2 = max(max([ij[1] for ij in ref_positions2]), max([ij[1] for ij in ref_positions3])) # blue |
761 n_spacer_barcode = max_end2 - max_end1 | 906 n_spacer_barcode = max_end2 - max_end1 |
762 read_len_median2 = read_len_median2 - n_spacer_barcode | 907 read_len_median2 = read_len_median2 - n_spacer_barcode |
763 read_len_median3 = read_len_median3 - n_spacer_barcode | 908 read_len_median3 = read_len_median3 - n_spacer_barcode |
764 else: | 909 else: |
765 softclipped_mutation_oneOfTwoMates = True | 910 softclipped_mutation_oneOfTwoMates = True |
766 alt1ff = 0 | 911 tier1ff = 0 |
767 alt4ff = 0 | 912 tier4ff = 0 |
768 alt2ff = 0 | 913 tier2ff = 0 |
769 alt3ff = 0 | 914 tier3ff = 0 |
770 trimmed = False | 915 trimmed = False |
771 contradictory = False | 916 contradictory = False |
772 softclipped_mutation_allMates = False | 917 softclipped_mutation_allMates = False |
773 softclipped_mutation_oneOfTwoSSCS = False | 918 softclipped_mutation_oneOfTwoSSCS = False |
774 softclipped_mutation_oneMate = False | 919 softclipped_mutation_oneMate = False |
778 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): | 923 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): |
779 beg1 = total1new | 924 beg1 = total1new |
780 total1new = 0 | 925 total1new = 0 |
781 alt1ff = 0 | 926 alt1ff = 0 |
782 alt1f = 0 | 927 alt1f = 0 |
928 tier1ff = 0 | |
783 trimmed = True | 929 trimmed = True |
784 | 930 |
785 if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))): | 931 if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))): |
786 beg4 = total4new | 932 beg4 = total4new |
787 total4new = 0 | 933 total4new = 0 |
788 alt4ff = 0 | 934 alt4ff = 0 |
789 alt4f = 0 | 935 alt4f = 0 |
936 tier4ff = 0 | |
790 trimmed = True | 937 trimmed = True |
791 | 938 |
792 if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))): | 939 if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))): |
793 beg2 = total2new | 940 beg2 = total2new |
794 total2new = 0 | 941 total2new = 0 |
795 alt2ff = 0 | 942 alt2ff = 0 |
796 alt2f = 0 | 943 alt2f = 0 |
944 tier2ff = 0 | |
797 trimmed = True | 945 trimmed = True |
798 | 946 |
799 if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))): | 947 if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))): |
800 beg3 = total3new | 948 beg3 = total3new |
801 total3new = 0 | 949 total3new = 0 |
802 alt3ff = 0 | 950 alt3ff = 0 |
803 alt3f = 0 | 951 alt3f = 0 |
952 tier3ff = 0 | |
804 trimmed = True | 953 trimmed = True |
805 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) | 954 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) |
806 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) | 955 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) |
807 | 956 |
808 # information of both mates available --> only one mate softclipped | 957 # information of both mates available --> only one mate softclipped |
809 elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & | 958 elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & |
810 ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & | 959 ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & |
811 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available | 960 all(float(ij) > 0. for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): # all mates available |
812 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 961 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
813 softclipped_mutation_allMates = False | 962 softclipped_mutation_allMates = False |
814 softclipped_mutation_oneOfTwoMates = False | 963 softclipped_mutation_oneOfTwoMates = False |
815 softclipped_mutation_oneOfTwoSSCS = True | 964 softclipped_mutation_oneOfTwoSSCS = True |
816 softclipped_mutation_oneOfTwoSSCS_diffMates = False | 965 softclipped_mutation_oneOfTwoSSCS_diffMates = False |
817 softclipped_mutation_oneMate = False | 966 softclipped_mutation_oneMate = False |
818 softclipped_mutation_oneMateOneSSCS = False | 967 softclipped_mutation_oneMateOneSSCS = False |
819 alt1ff = 0 | 968 tier1ff = 0 |
820 alt4ff = 0 | 969 tier4ff = 0 |
821 alt2ff = 0 | 970 tier2ff = 0 |
822 alt3ff = 0 | 971 tier3ff = 0 |
823 trimmed = False | 972 trimmed = False |
824 contradictory = False | 973 contradictory = False |
825 | 974 |
826 # information of one mate available --> all reads of one mate are softclipped | 975 # information of one mate available --> all reads of one mate are softclipped |
827 elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & | 976 elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & |
828 all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) | | 977 all(float(ij) < 0. for ij in [tier2ff, tier3ff]) & all(float(ij) > 0. for ij in [tier1ff, tier4ff])) | |
829 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & | 978 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & |
830 all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available | 979 all(float(ij) < 0. for ij in [tier1ff, tier4ff]) & all(float(ij) > 0. for ij in [tier2ff, tier3ff]))): # all mates available |
831 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 980 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
832 softclipped_mutation_allMates = False | 981 softclipped_mutation_allMates = False |
833 softclipped_mutation_oneOfTwoMates = False | 982 softclipped_mutation_oneOfTwoMates = False |
834 softclipped_mutation_oneOfTwoSSCS = False | 983 softclipped_mutation_oneOfTwoSSCS = False |
835 softclipped_mutation_oneOfTwoSSCS_diffMates = False | 984 softclipped_mutation_oneOfTwoSSCS_diffMates = False |
836 softclipped_mutation_oneMate = True | 985 softclipped_mutation_oneMate = True |
837 softclipped_mutation_oneMateOneSSCS = False | 986 softclipped_mutation_oneMateOneSSCS = False |
838 alt1ff = 0 | 987 tier1ff = 0 |
839 alt4ff = 0 | 988 tier4ff = 0 |
840 alt2ff = 0 | 989 tier2ff = 0 |
841 alt3ff = 0 | 990 tier3ff = 0 |
842 trimmed = False | 991 trimmed = False |
843 contradictory = False | 992 contradictory = False |
844 # information of one mate available --> only one SSCS is softclipped | 993 # information of one mate available --> only one SSCS is softclipped |
845 elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & | 994 elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & |
846 (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) | | 995 (all(float(ij) < 0. for ij in [tier2ff, tier3ff]) & all(float(ij) > 0. for ij in [tier1ff, tier4ff]))) | |
847 (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & | 996 (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & |
848 (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available | 997 (all(float(ij) < 0. for ij in [tier1ff, tier4ff]) & all(float(ij) < 0. for ij in [tier2ff, tier3ff])))): # all mates available |
849 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 998 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
850 softclipped_mutation_allMates = False | 999 softclipped_mutation_allMates = False |
851 softclipped_mutation_oneOfTwoMates = False | 1000 softclipped_mutation_oneOfTwoMates = False |
852 softclipped_mutation_oneOfTwoSSCS = False | 1001 softclipped_mutation_oneOfTwoSSCS = False |
853 softclipped_mutation_oneOfTwoSSCS_diffMates = False | 1002 softclipped_mutation_oneOfTwoSSCS_diffMates = False |
854 softclipped_mutation_oneMate = False | 1003 softclipped_mutation_oneMate = False |
855 softclipped_mutation_oneMateOneSSCS = True | 1004 softclipped_mutation_oneMateOneSSCS = True |
856 alt1ff = 0 | 1005 tier1ff = 0 |
857 alt4ff = 0 | 1006 tier4ff = 0 |
858 alt2ff = 0 | 1007 tier2ff = 0 |
859 alt3ff = 0 | 1008 tier3ff = 0 |
860 trimmed = False | 1009 trimmed = False |
861 contradictory = False | 1010 contradictory = False |
862 | 1011 |
863 else: | 1012 else: |
864 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): | 1013 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): |
865 beg1 = total1new | 1014 beg1 = total1new |
866 total1new = 0 | 1015 total1new = 0 |
867 alt1ff = 0 | 1016 alt1ff = 0 |
868 alt1f = 0 | 1017 alt1f = 0 |
1018 tier1ff = 0 | |
869 trimmed = True | 1019 trimmed = True |
870 | 1020 |
871 if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))): | 1021 if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))): |
872 beg4 = total4new | 1022 beg4 = total4new |
873 total4new = 0 | 1023 total4new = 0 |
874 alt4ff = 0 | 1024 alt4ff = 0 |
875 alt4f = 0 | 1025 alt4f = 0 |
1026 tier4ff = 0 | |
876 trimmed = True | 1027 trimmed = True |
877 | 1028 |
878 if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))): | 1029 if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))): |
879 beg2 = total2new | 1030 beg2 = total2new |
880 total2new = 0 | 1031 total2new = 0 |
881 alt2ff = 0 | 1032 alt2ff = 0 |
882 alt2f = 0 | 1033 alt2f = 0 |
1034 tier2ff = 0 | |
883 trimmed = True | 1035 trimmed = True |
884 | 1036 |
885 if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))): | 1037 if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))): |
886 beg3 = total3new | 1038 beg3 = total3new |
887 total3new = 0 | 1039 total3new = 0 |
888 alt3ff = 0 | 1040 alt3ff = 0 |
889 alt3f = 0 | 1041 alt3f = 0 |
1042 tier3ff = 0 | |
890 trimmed = True | 1043 trimmed = True |
891 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) | 1044 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) |
892 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) | 1045 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) |
893 | 1046 |
894 # assign tiers | 1047 # assign tiers |
895 if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & | 1048 if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & |
896 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | | 1049 all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) | |
897 (all(int(ij) >= 3 for ij in [total2new, total3new]) & | 1050 (all(int(ij) >= 3 for ij in [total2new, total3new]) & |
898 all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): | 1051 all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))): |
899 tier = "1.1" | 1052 tier = "1.1" |
900 counter_tier11 += 1 | 1053 counter_tier11 += 1 |
901 tier_dict[key1]["tier 1.1"] += 1 | 1054 if variant_type == "alt": |
1055 tier_dict[key1]["tier 1.1"] += 1 | |
1056 elif variant_type == "ref": | |
1057 tier_dict_ref[key1]["tier 1.1"] += 1 | |
902 | 1058 |
903 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & | 1059 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & |
904 any(int(ij) >= 3 for ij in [total1new, total4new]) & | 1060 any(int(ij) >= 3 for ij in [total1new, total4new]) & |
905 any(int(ij) >= 3 for ij in [total2new, total3new]) & | 1061 any(int(ij) >= 3 for ij in [total2new, total3new]) & |
906 all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): | 1062 all(float(ij) >= 0.75 for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): |
907 tier = "1.2" | 1063 tier = "1.2" |
908 counter_tier12 += 1 | 1064 counter_tier12 += 1 |
909 tier_dict[key1]["tier 1.2"] += 1 | 1065 if variant_type == "alt": |
1066 tier_dict[key1]["tier 1.2"] += 1 | |
1067 elif variant_type == "ref": | |
1068 tier_dict_ref[key1]["tier 1.2"] += 1 | |
910 | 1069 |
911 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & | 1070 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & |
912 any(int(ij) >= 3 for ij in [total1new, total4new]) & | 1071 any(int(ij) >= 3 for ij in [total1new, total4new]) & |
913 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | | 1072 all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) | |
914 (all(int(ij) >= 1 for ij in [total2new, total3new]) & | 1073 (all(int(ij) >= 1 for ij in [total2new, total3new]) & |
915 any(int(ij) >= 3 for ij in [total2new, total3new]) & | 1074 any(int(ij) >= 3 for ij in [total2new, total3new]) & |
916 all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): | 1075 all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))): |
917 tier = "2.1" | 1076 tier = "2.1" |
918 counter_tier21 += 1 | 1077 counter_tier21 += 1 |
919 tier_dict[key1]["tier 2.1"] += 1 | 1078 if variant_type == "alt": |
1079 tier_dict[key1]["tier 2.1"] += 1 | |
1080 elif variant_type == "ref": | |
1081 tier_dict_ref[key1]["tier 2.1"] += 1 | |
920 | 1082 |
921 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & | 1083 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & |
922 all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): | 1084 all(float(ij) >= 0.75 for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): |
923 tier = "2.2" | 1085 tier = "2.2" |
924 counter_tier22 += 1 | 1086 counter_tier22 += 1 |
925 tier_dict[key1]["tier 2.2"] += 1 | 1087 if variant_type == "alt": |
1088 tier_dict[key1]["tier 2.2"] += 1 | |
1089 elif variant_type == "ref": | |
1090 tier_dict_ref[key1]["tier 2.2"] += 1 | |
926 | 1091 |
927 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & | 1092 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & |
928 any(int(ij) >= 3 for ij in [total2new, total3new]) & | 1093 any(int(ij) >= 3 for ij in [total2new, total3new]) & |
929 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) & | 1094 all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff]) & |
930 any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff])) | | 1095 any(float(ij) >= 0.75 for ij in [tier2ff, tier3ff])) | |
931 (all(int(ij) >= 1 for ij in [total2new, total3new]) & | 1096 (all(int(ij) >= 1 for ij in [total2new, total3new]) & |
932 any(int(ij) >= 3 for ij in [total1new, total4new]) & | 1097 any(int(ij) >= 3 for ij in [total1new, total4new]) & |
933 all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) & | 1098 all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]) & |
934 any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))): | 1099 any(float(ij) >= 0.75 for ij in [tier1ff, tier4ff]))): |
935 tier = "2.3" | 1100 tier = "2.3" |
936 counter_tier23 += 1 | 1101 counter_tier23 += 1 |
937 tier_dict[key1]["tier 2.3"] += 1 | 1102 if variant_type == "alt": |
1103 tier_dict[key1]["tier 2.3"] += 1 | |
1104 elif variant_type == "ref": | |
1105 tier_dict_ref[key1]["tier 2.3"] += 1 | |
938 | 1106 |
939 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & | 1107 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & |
940 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | | 1108 all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) | |
941 (all(int(ij) >= 1 for ij in [total2new, total3new]) & | 1109 (all(int(ij) >= 1 for ij in [total2new, total3new]) & |
942 all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): | 1110 all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))): |
943 tier = "2.4" | 1111 tier = "2.4" |
944 counter_tier24 += 1 | 1112 counter_tier24 += 1 |
945 tier_dict[key1]["tier 2.4"] += 1 | 1113 if variant_type == "alt": |
1114 tier_dict[key1]["tier 2.4"] += 1 | |
1115 elif variant_type == "ref": | |
1116 tier_dict_ref[key1]["tier 2.4"] += 1 | |
946 | 1117 |
947 elif ((len(pure_tags_dict_short[key1]) > 1) & | 1118 elif ((len(pure_tags_dict_short[key1]) > 1) & |
948 (all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) | | 1119 (all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff]) | |
949 all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): | 1120 all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]))): |
950 tier = "3.1" | 1121 tier = "3.1" |
951 counter_tier31 += 1 | 1122 counter_tier31 += 1 |
952 tier_dict[key1]["tier 3.1"] += 1 | 1123 if variant_type == "alt": |
1124 tier_dict[key1]["tier 3.1"] += 1 | |
1125 elif variant_type == "ref": | |
1126 tier_dict_ref[key1]["tier 3.1"] += 1 | |
953 | 1127 |
954 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & | 1128 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & |
955 all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff])) | | 1129 all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff])) | |
956 (all(int(ij) >= 1 for ij in [total2new, total3new]) & | 1130 (all(int(ij) >= 1 for ij in [total2new, total3new]) & |
957 all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): | 1131 all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]))): |
958 tier = "3.2" | 1132 tier = "3.2" |
959 counter_tier32 += 1 | 1133 counter_tier32 += 1 |
960 tier_dict[key1]["tier 3.2"] += 1 | 1134 if variant_type == "alt": |
1135 tier_dict[key1]["tier 3.2"] += 1 | |
1136 elif variant_type == "ref": | |
1137 tier_dict_ref[key1]["tier 3.2"] += 1 | |
961 | 1138 |
962 elif (trimmed): | 1139 elif (trimmed): |
963 tier = "4" | 1140 tier = "4" |
964 counter_tier4 += 1 | 1141 counter_tier4 += 1 |
965 tier_dict[key1]["tier 4"] += 1 | 1142 if variant_type == "alt": |
1143 tier_dict[key1]["tier 4"] += 1 | |
1144 elif variant_type == "ref": | |
1145 tier_dict_ref[key1]["tier 4"] += 1 | |
966 | 1146 |
967 # assign tiers | 1147 # assign tiers |
968 if ((all(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & | 1148 if ((all(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & |
969 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | | 1149 all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) | |
970 (all(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & | 1150 (all(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & |
971 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): | 1151 all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))): |
972 trimmed_actual_high_tier = True | 1152 trimmed_actual_high_tier = True |
973 elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) & | 1153 elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) & |
974 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & | 1154 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & |
975 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & | 1155 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & |
976 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])): | 1156 all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim])): |
977 trimmed_actual_high_tier = True | 1157 trimmed_actual_high_tier = True |
978 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & | 1158 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & |
979 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & | 1159 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & |
980 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | | 1160 all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) | |
981 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & | 1161 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & |
982 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & | 1162 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & |
983 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): | 1163 all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))): |
984 trimmed_actual_high_tier = True | 1164 trimmed_actual_high_tier = True |
985 elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) & | 1165 elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) & |
986 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])): | 1166 all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim])): |
987 trimmed_actual_high_tier = True | 1167 trimmed_actual_high_tier = True |
988 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & | 1168 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & |
989 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & | 1169 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & |
990 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]) & | 1170 all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim]) & |
991 any(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim])) | | 1171 any(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim])) | |
992 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & | 1172 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & |
993 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & | 1173 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & |
994 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]) & | 1174 all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]) & |
995 any(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]))): | 1175 any(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim]))): |
996 trimmed_actual_high_tier = True | 1176 trimmed_actual_high_tier = True |
997 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & | 1177 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & |
998 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | | 1178 all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) | |
999 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & | 1179 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & |
1000 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): | 1180 all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))): |
1001 trimmed_actual_high_tier = True | 1181 trimmed_actual_high_tier = True |
1002 else: | 1182 else: |
1003 trimmed_actual_high_tier = False | 1183 trimmed_actual_high_tier = False |
1004 | 1184 |
1005 elif softclipped_mutation_allMates: | 1185 elif softclipped_mutation_allMates: |
1006 tier = "5.1" | 1186 tier = "5.1" |
1007 counter_tier51 += 1 | 1187 counter_tier51 += 1 |
1008 tier_dict[key1]["tier 5.1"] += 1 | 1188 if variant_type == "alt": |
1189 tier_dict[key1]["tier 5.1"] += 1 | |
1190 elif variant_type == "ref": | |
1191 tier_dict_ref[key1]["tier 5.1"] += 1 | |
1009 | 1192 |
1010 elif softclipped_mutation_oneOfTwoMates: | 1193 elif softclipped_mutation_oneOfTwoMates: |
1011 tier = "5.2" | 1194 tier = "5.2" |
1012 counter_tier52 += 1 | 1195 counter_tier52 += 1 |
1013 tier_dict[key1]["tier 5.2"] += 1 | 1196 if variant_type == "alt": |
1197 tier_dict[key1]["tier 5.2"] += 1 | |
1198 elif variant_type == "ref": | |
1199 tier_dict_ref[key1]["tier 5.2"] += 1 | |
1014 | 1200 |
1015 elif softclipped_mutation_oneOfTwoSSCS: | 1201 elif softclipped_mutation_oneOfTwoSSCS: |
1016 tier = "5.3" | 1202 tier = "5.3" |
1017 counter_tier53 += 1 | 1203 counter_tier53 += 1 |
1018 tier_dict[key1]["tier 5.3"] += 1 | 1204 if variant_type == "alt": |
1205 tier_dict[key1]["tier 5.3"] += 1 | |
1206 elif variant_type == "ref": | |
1207 tier_dict_ref[key1]["tier 5.3"] += 1 | |
1019 | 1208 |
1020 elif softclipped_mutation_oneMate: | 1209 elif softclipped_mutation_oneMate: |
1021 tier = "5.4" | 1210 tier = "5.4" |
1022 counter_tier54 += 1 | 1211 counter_tier54 += 1 |
1023 tier_dict[key1]["tier 5.4"] += 1 | 1212 if variant_type == "alt": |
1213 tier_dict[key1]["tier 5.4"] += 1 | |
1214 elif variant_type == "ref": | |
1215 tier_dict_ref[key1]["tier 5.4"] += 1 | |
1024 | 1216 |
1025 elif softclipped_mutation_oneMateOneSSCS: | 1217 elif softclipped_mutation_oneMateOneSSCS: |
1026 tier = "5.5" | 1218 tier = "5.5" |
1027 counter_tier55 += 1 | 1219 counter_tier55 += 1 |
1028 tier_dict[key1]["tier 5.5"] += 1 | 1220 if variant_type == "alt": |
1221 tier_dict[key1]["tier 5.5"] += 1 | |
1222 elif variant_type == "ref": | |
1223 tier_dict_ref[key1]["tier 5.5"] += 1 | |
1029 | 1224 |
1030 elif (contradictory): | 1225 elif (contradictory): |
1031 tier = "6" | 1226 tier = "6" |
1032 counter_tier6 += 1 | 1227 counter_tier6 += 1 |
1033 tier_dict[key1]["tier 6"] += 1 | 1228 if variant_type == "alt": |
1229 tier_dict[key1]["tier 6"] += 1 | |
1230 elif variant_type == "ref": | |
1231 tier_dict_ref[key1]["tier 6"] += 1 | |
1034 | 1232 |
1035 else: | 1233 else: |
1036 tier = "7" | 1234 tier = "7" |
1037 counter_tier7 += 1 | 1235 counter_tier7 += 1 |
1038 tier_dict[key1]["tier 7"] += 1 | 1236 if variant_type == "alt": |
1237 tier_dict[key1]["tier 7"] += 1 | |
1238 elif variant_type == "ref": | |
1239 tier_dict_ref[key1]["tier 7"] += 1 | |
1039 | 1240 |
1040 chrom, pos, ref_a, alt_a = re.split(r'\#', key1) | 1241 chrom, pos, ref_a, alt_a = re.split(r'\#', key1) |
1041 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) | 1242 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) |
1042 sample_tag = key2[:-5] | 1243 sample_tag = key2[:-5] |
1043 array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time | 1244 array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time |
1112 read_pos4 = read_len_median4 = None | 1313 read_pos4 = read_len_median4 = None |
1113 if (read_pos2 == -1): | 1314 if (read_pos2 == -1): |
1114 read_pos2 = read_len_median2 = None | 1315 read_pos2 = read_len_median2 = None |
1115 if (read_pos3 == -1): | 1316 if (read_pos3 == -1): |
1116 read_pos3 = read_len_median3 = None | 1317 read_pos3 = read_len_median3 = None |
1117 line = (var_id, tier, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera) | 1318 line = (var_id, tier, variant_type, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera) |
1118 # ws1.write_row(row, 0, line) | 1319 # ws1.write_row(row, 0, line) |
1119 # csv_writer.writerow(line) | 1320 # csv_writer.writerow(line) |
1120 line2 = ("", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera) | 1321 line2 = ("", "", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera) |
1121 # ws1.write_row(row + 1, 0, line2) | 1322 # ws1.write_row(row + 1, 0, line2) |
1122 # csv_writer.writerow(line2) | 1323 # csv_writer.writerow(line2) |
1123 | |
1124 # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), | 1324 # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), |
1125 # {'type': 'formula', | 1325 # {'type': 'formula', |
1126 # 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), | 1326 # 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), |
1127 # 'format': format1, | 1327 # 'format': format1, |
1128 # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) | 1328 # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) |
1153 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers) | 1353 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers) |
1154 | 1354 |
1155 # write to file | 1355 # write to file |
1156 # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4 | 1356 # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4 |
1157 sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]]) | 1357 sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]]) |
1358 sum_highTiers_ref = sum([tier_dict_ref[key1][ij] for ij in list(sorted(tier_dict_ref[key1].keys()))[:6]]) | |
1158 correct_tier = False | 1359 correct_tier = False |
1360 correct_tier_ref = False | |
1159 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: | 1361 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: |
1160 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] | 1362 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] |
1161 tier_dict[key1]["tier 4"] = 0 | 1363 tier_dict[key1]["tier 4"] = 0 |
1162 correct_tier = True | 1364 correct_tier = True |
1365 | |
1366 if tier_dict_ref[key1]["tier 4"] > 0 and sum_highTiers_ref > 0: | |
1367 tier_dict_ref[key1]["tier 2.5"] = tier_dict_ref[key1]["tier 4"] | |
1368 tier_dict_ref[key1]["tier 4"] = 0 | |
1369 correct_tier_ref = True | |
1163 | 1370 |
1164 for sample in change_tier_after_print: | 1371 for sample in change_tier_after_print: |
1165 row_number = sample[0] | 1372 row_number = sample[0] |
1166 line1 = sample[1] | 1373 line1 = sample[1] |
1167 line2 = sample[2] | 1374 line2 = sample[2] |
1168 actual_high_tier = sample[3] | 1375 actual_high_tier = sample[3] |
1169 current_tier = list(line1)[1] | 1376 current_tier = list(line1)[1] |
1170 | 1377 |
1171 if correct_tier and (current_tier == "4") and actual_high_tier: | 1378 if line1[2] == "alt" and correct_tier and (current_tier == "4") and actual_high_tier: |
1172 line1 = list(line1) | 1379 line1 = list(line1) |
1173 line1[1] = "2.5" | 1380 line1[1] = "2.5" |
1174 line1 = tuple(line1) | 1381 line1 = tuple(line1) |
1175 counter_tier25 += 1 | 1382 counter_tier25 += 1 |
1176 counter_tier4 -= 1 | 1383 counter_tier4 -= 1 |
1384 if line1[2] == "ref" and correct_tier_ref and (current_tier == "4") and actual_high_tier: | |
1385 line1 = list(line1) | |
1386 line1[1] = "2.5" | |
1387 line1 = tuple(line1) | |
1388 counter_tier25 += 1 | |
1389 counter_tier4 -= 1 | |
1390 | |
1177 ws1.write_row(row_number, 0, line1) | 1391 ws1.write_row(row_number, 0, line1) |
1178 csv_writer.writerow(line1) | 1392 csv_writer.writerow(line1) |
1179 ws1.write_row(row_number + 1, 0, line2) | 1393 ws1.write_row(row_number + 1, 0, line2) |
1180 csv_writer.writerow(line2) | 1394 csv_writer.writerow(line2) |
1181 | 1395 if line1[2] == "alt": |
1182 ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), | 1396 ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) |
1183 {'type': 'formula', | 1397 ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) |
1184 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), | 1398 ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) |
1185 'format': format1, | 1399 elif line1[2] == "ref": |
1186 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) | 1400 ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) |
1187 ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), | 1401 ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) |
1188 {'type': 'formula', | 1402 ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) |
1189 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), | 1403 |
1190 'format': format3, | |
1191 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) | |
1192 ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), | |
1193 {'type': 'formula', | |
1194 'criteria': '=$B${}>="3"'.format(row_number + 1), | |
1195 'format': format2, | |
1196 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) | |
1197 # sheet 2 | 1404 # sheet 2 |
1198 if chimera_correction: | 1405 if chimera_correction: |
1199 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', | 1406 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', |
1200 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', | 1407 'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)', |
1201 'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', | 1408 'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)', |
1202 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7') | 1409 'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)', |
1410 'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)' | |
1411 ) | |
1203 else: | 1412 else: |
1204 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', | 1413 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', |
1205 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', | 1414 'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)', |
1206 'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', | 1415 'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)', |
1207 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7') | 1416 'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)', |
1208 | 1417 'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)' |
1418 ) | |
1209 ws2.write_row(0, 0, header_line2) | 1419 ws2.write_row(0, 0, header_line2) |
1210 row = 0 | 1420 row = 0 |
1211 | 1421 |
1212 for key1, value1 in sorted(tier_dict.items()): | 1422 for key1, value1 in sorted(tier_dict.items()): |
1213 if key1 in pure_tags_dict_short.keys(): | 1423 if key1 in pure_tags_dict_short.keys(): |
1218 chrom, pos, ref_a, alt_a = re.split(r'\#', key1) | 1428 chrom, pos, ref_a, alt_a = re.split(r'\#', key1) |
1219 ref_count = cvrg_dict[key1][0] | 1429 ref_count = cvrg_dict[key1][0] |
1220 alt_count = cvrg_dict[key1][1] | 1430 alt_count = cvrg_dict[key1][1] |
1221 cvrg = ref_count + alt_count | 1431 cvrg = ref_count + alt_count |
1222 | 1432 |
1433 ref_tiers = tier_dict_ref[key1] | |
1434 | |
1223 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) | 1435 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) |
1224 lst = [var_id, cvrg] | 1436 lst = [var_id, cvrg] |
1225 used_tiers = [] | 1437 used_tiers = [] |
1438 used_tiers_ref = [t for k, t in sorted(ref_tiers.items())] | |
1226 cum_af = [] | 1439 cum_af = [] |
1227 for key2, value2 in sorted(value1.items()): | 1440 for key2, value2 in sorted(value1.items()): |
1228 # calculate cummulative AF | 1441 # calculate cummulative AF |
1229 used_tiers.append(value2) | 1442 used_tiers.append(value2) |
1230 if len(used_tiers) > 1: | 1443 if len(used_tiers) > 1: |
1231 cum = safe_div(sum(used_tiers), cvrg) | 1444 cum = safe_div(sum(used_tiers), cvrg) |
1232 cum_af.append(cum) | 1445 cum_af.append(cum) |
1233 if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place | 1446 if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place |
1234 continue | 1447 continue |
1235 lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)]) | 1448 lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)]) |
1236 lst.extend([(cvrg - sum(used_tiers[-10:])), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (cvrg - sum(used_tiers[-10:])))]) | 1449 lst.extend([(sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])), sum(used_tiers_ref[0:7]), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])))]) |
1237 if chimera_correction: | 1450 if chimera_correction: |
1238 chimeras_all = chimera_dict[key1][1] | 1451 chimeras_all = chimera_dict[key1][1] |
1239 new_alt = sum(used_tiers[0:7]) - chimeras_all | 1452 new_alt = sum(used_tiers[0:7]) - chimeras_all |
1240 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:7]))) | 1453 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:7]))) |
1241 if fraction_chimeras is None: | 1454 if fraction_chimeras is None: |
1242 fraction_chimeras = 0. | 1455 fraction_chimeras = 0. |
1243 new_cvrg = (cvrg - sum(used_tiers[-10:])) * (1. - fraction_chimeras) | 1456 new_cvrg = (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])) * (1. - fraction_chimeras) |
1244 lst.extend([new_cvrg, chimeras_all, safe_div(new_alt, new_cvrg)]) | 1457 lst.extend([new_cvrg, chimeras_all, safe_div(new_alt, new_cvrg)]) |
1245 lst.extend([alt_count, safe_div(alt_count, cvrg)]) | 1458 lst.extend([alt_count, safe_div(alt_count, cvrg)]) |
1246 lst.extend(used_tiers) | 1459 lst.extend(used_tiers) |
1247 lst.extend(cum_af) | 1460 lst.extend(used_tiers_ref) |
1461 # lst.extend(cum_af) | |
1248 lst = tuple(lst) | 1462 lst = tuple(lst) |
1249 ws2.write_row(row + 1, 0, lst) | 1463 ws2.write_row(row + 1, 0, lst) |
1250 if chimera_correction: | 1464 if chimera_correction: |
1251 ws2.conditional_format('M{}:N{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$M$1="tier 1.1"', 'format': format12, 'multi_range': 'M{}:N{} M1:N1'.format(row + 2, row + 2)}) | 1465 ws2.conditional_format('N{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$N$1="tier 1.1 (alt)"', 'format': format12, 'multi_range': 'N{}:O{} N1:O1 AE{}:AF{} AE1:AF1'.format(row + 2, row + 2, row + 2, row + 2)}) |
1252 ws2.conditional_format('O{}:S{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$O$1="tier 2.1"', 'format': format32, 'multi_range': 'O{}:S{} O1:S1'.format(row + 2, row + 2)}) | 1466 ws2.conditional_format('P{}:T{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 2.1 (alt)"', 'format': format32, 'multi_range': 'P{}:T{} P1:T1 AG{}:AK{} AG1:AK1'.format(row + 2, row + 2, row + 2, row + 2)}) |
1253 ws2.conditional_format('T{}:AC{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$T$1="tier 3.1"', 'format': format22, 'multi_range': 'T{}:AC{} T1:AC1'.format(row + 2, row + 2)}) | 1467 ws2.conditional_format('U{}:AD{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$U$1="tier 3.1 (alt)"', 'format': format22, 'multi_range': 'U{}:AD{} U1:AD1 AL{}:AU{} AL1:AU1'.format(row + 2, row + 2, row + 2, row + 2)}) |
1254 else: | 1468 else: |
1255 ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)}) | 1469 ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)}) |
1256 ws2.conditional_format('L{}:P{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format32, 'multi_range': 'L{}:P{} L1:P1'.format(row + 2, row + 2)}) | 1470 ws2.conditional_format('L{}:P{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format32, 'multi_range': 'L{}:P{} L1:P1'.format(row + 2, row + 2)}) |
1257 ws2.conditional_format('Q{}:Z{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$Q$1="tier 3.1"', 'format': format22, 'multi_range': 'Q{}:Z{} Q1:Z1'.format(row + 2, row + 2)}) | 1471 ws2.conditional_format('Q{}:Z{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$Q$1="tier 3.1"', 'format': format22, 'multi_range': 'Q{}:Z{} Q1:Z1'.format(row + 2, row + 2)}) |
1258 row += 1 | 1472 row += 1 |