comparison read2mut.py @ 63:f0fc93b7945c draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author mheinzl
date Thu, 18 Mar 2021 10:07:50 +0000
parents 66c1245436b9
children fd342f5a97d9
comparison
equal deleted inserted replaced
62:66c1245436b9 63:f0fc93b7945c
8 Looks for reads with mutation at known 8 Looks for reads with mutation at known
9 positions and calculates frequencies and stats. 9 positions and calculates frequencies and stats.
10 10
11 ======= ========== ================= ================================ 11 ======= ========== ================= ================================
12 Version Date Author Description 12 Version Date Author Description
13 0.2.1 2019-10-27 Gundula Povysil - 13 0.2.2 2019-10-27 Gundula Povysil -
14 ======= ========== ================= ================================ 14 ======= ========== ================= ================================
15 15
16 16
17 USAGE: python read2mut.py --mutFile DCS_Mutations.tabular --bamFile Interesting_Reads.trim.bam 17 USAGE: python read2mut.py --mutFile DCS_Mutations.tabular --bamFile Interesting_Reads.trim.bam
18 --inputJson tag_count_dict.json --sscsJson SSCS_counts.json 18 --inputJson tag_count_dict.json --sscsJson SSCS_counts.json
267 if len(value) < thresh: 267 if len(value) < thresh:
268 pure_tags_dict_short[key] = value 268 pure_tags_dict_short[key] = value
269 else: 269 else:
270 pure_tags_dict_short = pure_tags_dict 270 pure_tags_dict_short = pure_tags_dict
271 271
272 # whole_array = []
273 # for k in pure_tags_dict.values():
274 # if len(k) != 0:
275 # keys = k.keys()
276 # if len(keys) > 1:
277 # for k1 in keys:
278 # whole_array.append(k1)
279 # else:
280 # whole_array.append(keys[0])
281
282 csv_data = open(outputFile_csv, "wb") 272 csv_data = open(outputFile_csv, "wb")
283 csv_writer = csv.writer(csv_data, delimiter=",") 273 csv_writer = csv.writer(csv_data, delimiter=",")
284 274
285 # output summary with threshold 275 # output summary with threshold
286 workbook = xlsxwriter.Workbook(outfile) 276 workbook = xlsxwriter.Workbook(outfile)
319 counter_tier24 = 0 309 counter_tier24 = 0
320 counter_tier31 = 0 310 counter_tier31 = 0
321 counter_tier32 = 0 311 counter_tier32 = 0
322 counter_tier25 = 0 312 counter_tier25 = 0
323 counter_tier4 = 0 313 counter_tier4 = 0
324 # if chimera_correction:
325 # counter_tier43 = 0
326 counter_tier51 = 0 314 counter_tier51 = 0
327 counter_tier52 = 0 315 counter_tier52 = 0
328 counter_tier53 = 0 316 counter_tier53 = 0
329 counter_tier54 = 0 317 counter_tier54 = 0
330 counter_tier55 = 0 318 counter_tier55 = 0
332 counter_tier7 = 0 320 counter_tier7 = 0
333 321
334 row = 1 322 row = 1
335 tier_dict = {} 323 tier_dict = {}
336 chimera_dict = {} 324 chimera_dict = {}
337 #change_tier_after_print = {}
338 for key1, value1 in sorted(mut_dict.items()): 325 for key1, value1 in sorted(mut_dict.items()):
339 counts_mut = 0 326 counts_mut = 0
340 chimeric_tag_list = [] 327 chimeric_tag_list = []
341 chimeric_tag = {} 328 chimeric_tag = {}
342 if key1 in pure_tags_dict_short.keys(): 329 if key1 in pure_tags_dict_short.keys():
343 change_tier_after_print = [] 330 change_tier_after_print = []
344
345 i = np.where(np.array(['#'.join(str(i) for i in z) 331 i = np.where(np.array(['#'.join(str(i) for i in z)
346 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] 332 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
347 ref = mut_array[i, 2] 333 ref = mut_array[i, 2]
348 alt = mut_array[i, 3] 334 alt = mut_array[i, 3]
349 dcs_median = cvrg_dict[key1][2] 335 dcs_median = cvrg_dict[key1][2]
638 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] 624 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1]
639 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] 625 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)]
640 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] 626 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)]
641 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping 627 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
642 if any(ij is True for ij in softclipped_both_ends_idx1): 628 if any(ij is True for ij in softclipped_both_ends_idx1):
643 print(softclipped_both_ends_idx1)
644 for nr, indx in enumerate(softclipped_both_ends_idx1): 629 for nr, indx in enumerate(softclipped_both_ends_idx1):
645 if indx: 630 if indx:
646 if dist_start_read1[nr] <= dist_end_read1[nr]: 631 if dist_start_read1[nr] <= dist_end_read1[nr]:
647 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number 632 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number
648 else: 633 else:
659 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] 644 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4]
660 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] 645 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)]
661 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] 646 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)]
662 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping 647 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
663 if any(ij is True for ij in softclipped_both_ends_idx4): 648 if any(ij is True for ij in softclipped_both_ends_idx4):
664 print(softclipped_both_ends_idx4)
665 for nr, indx in enumerate(softclipped_both_ends_idx4): 649 for nr, indx in enumerate(softclipped_both_ends_idx4):
666 if indx: 650 if indx:
667 if dist_start_read4[nr] <= dist_end_read4[nr]: 651 if dist_start_read4[nr] <= dist_end_read4[nr]:
668 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number 652 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number
669 else: 653 else:
680 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2] 664 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2]
681 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)] 665 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)]
682 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] 666 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)]
683 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping 667 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
684 if any(ij is True for ij in softclipped_both_ends_idx2): 668 if any(ij is True for ij in softclipped_both_ends_idx2):
685 print(softclipped_both_ends_idx2)
686 for nr, indx in enumerate(softclipped_both_ends_idx2): 669 for nr, indx in enumerate(softclipped_both_ends_idx2):
687 if indx: 670 if indx:
688 if dist_start_read2[nr] <= dist_end_read2[nr]: 671 if dist_start_read2[nr] <= dist_end_read2[nr]:
689 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number 672 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number
690 else: 673 else:
701 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3] 684 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3]
702 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)] 685 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)]
703 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] 686 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)]
704 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping 687 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
705 if any(ij is True for ij in softclipped_both_ends_idx3): 688 if any(ij is True for ij in softclipped_both_ends_idx3):
706 print(softclipped_both_ends_idx3)
707 for nr, indx in enumerate(softclipped_both_ends_idx3): 689 for nr, indx in enumerate(softclipped_both_ends_idx3):
708 if indx: 690 if indx:
709 if dist_start_read3[nr] <= dist_end_read3[nr]: 691 if dist_start_read3[nr] <= dist_end_read3[nr]:
710 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh 692 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh
711 else: 693 else:
1089 min_tag_array2 = array2[min_index] # get whole tag with min HD 1071 min_tag_array2 = array2[min_index] # get whole tag with min HD
1090 min_value = dist.min() 1072 min_value = dist.min()
1091 # calculate HD of "b" to all "b's" or "a" to all "a's" 1073 # calculate HD of "b" to all "b's" or "a" to all "a's"
1092 dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e)) 1074 dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e))
1093 for e in min_tag_half2]) 1075 for e in min_tag_half2])
1094
1095 dist2 = dist_second_half.max() 1076 dist2 = dist_second_half.max()
1096 max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD 1077 max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD
1097 max_tag = min_tag_array2[max_index] 1078 max_tag = min_tag_array2[max_index]
1098 1079
1099 # tags which have identical parts: 1080 # tags which have identical parts:
1168 else: 1149 else:
1169 chimeric_dcs_high_tiers += high_tiers 1150 chimeric_dcs_high_tiers += high_tiers
1170 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers) 1151 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers)
1171 1152
1172 # write to file 1153 # write to file
1173
1174 # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4 1154 # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4
1175 sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]]) 1155 sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]])
1176
1177 correct_tier = False 1156 correct_tier = False
1178
1179 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: 1157 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0:
1180 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] 1158 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"]
1181 tier_dict[key1]["tier 4"] = 0 1159 tier_dict[key1]["tier 4"] = 0
1182 correct_tier = True 1160 correct_tier = True
1183 1161
1315 ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), 1293 ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"),
1316 ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), 1294 ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"),
1317 ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), 1295 ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"),
1318 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), 1296 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"),
1319 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), 1297 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"),
1320 ("Tier 2.5", "variants at the start or end of the read and recurring mutation on this position in tier 1.1-2.4"), 1298 ("Tier 2.5", "variants at the start or end of the read (ignoring variant position tier 1.1-2.4) and recurring mutation on this position in tier 1.1-2.4"),
1321 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), 1299 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"),
1322 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), 1300 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"),
1323 ("Tier 4", "variants at the start or end of the reads"), 1301 ("Tier 4", "variants at the start or end of the reads"),
1324 ("Tier 5.1", "variant is close to softclipping in both mates and SSCS"), 1302 ("Tier 5.1", "variant is close to softclipping in both mates and SSCS"),
1325 ("Tier 5.2", "variant is close to softclipping in one of the mates but both SSCS"), 1303 ("Tier 5.2", "variant is close to softclipping in one of the mates but both SSCS"),