Mercurial > repos > mheinzl > variant_analyzer2
diff read2mut.py @ 63:f0fc93b7945c draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author | mheinzl |
---|---|
date | Thu, 18 Mar 2021 10:07:50 +0000 |
parents | 66c1245436b9 |
children | fd342f5a97d9 |
line wrap: on
line diff
--- a/read2mut.py Wed Mar 17 13:24:40 2021 +0000 +++ b/read2mut.py Thu Mar 18 10:07:50 2021 +0000 @@ -10,7 +10,7 @@ ======= ========== ================= ================================ Version Date Author Description -0.2.1 2019-10-27 Gundula Povysil - +0.2.2 2019-10-27 Gundula Povysil - ======= ========== ================= ================================ @@ -269,16 +269,6 @@ else: pure_tags_dict_short = pure_tags_dict - # whole_array = [] - # for k in pure_tags_dict.values(): - # if len(k) != 0: - # keys = k.keys() - # if len(keys) > 1: - # for k1 in keys: - # whole_array.append(k1) - # else: - # whole_array.append(keys[0]) - csv_data = open(outputFile_csv, "wb") csv_writer = csv.writer(csv_data, delimiter=",") @@ -321,8 +311,6 @@ counter_tier32 = 0 counter_tier25 = 0 counter_tier4 = 0 - # if chimera_correction: - # counter_tier43 = 0 counter_tier51 = 0 counter_tier52 = 0 counter_tier53 = 0 @@ -334,14 +322,12 @@ row = 1 tier_dict = {} chimera_dict = {} - #change_tier_after_print = {} for key1, value1 in sorted(mut_dict.items()): counts_mut = 0 chimeric_tag_list = [] chimeric_tag = {} if key1 in pure_tags_dict_short.keys(): change_tier_after_print = [] - i = np.where(np.array(['#'.join(str(i) for i in z) for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] ref = mut_array[i, 2] @@ -640,7 +626,6 @@ dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping if any(ij is True for ij in softclipped_both_ends_idx1): - print(softclipped_both_ends_idx1) for nr, indx in enumerate(softclipped_both_ends_idx1): if indx: if dist_start_read1[nr] <= dist_end_read1[nr]: @@ -661,7 +646,6 @@ dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping if any(ij is True for ij in softclipped_both_ends_idx4): - print(softclipped_both_ends_idx4) for nr, indx in enumerate(softclipped_both_ends_idx4): if indx: if dist_start_read4[nr] <= dist_end_read4[nr]: @@ -682,7 +666,6 @@ dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping if any(ij is True for ij in softclipped_both_ends_idx2): - print(softclipped_both_ends_idx2) for nr, indx in enumerate(softclipped_both_ends_idx2): if indx: if dist_start_read2[nr] <= dist_end_read2[nr]: @@ -703,7 +686,6 @@ dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping if any(ij is True for ij in softclipped_both_ends_idx3): - print(softclipped_both_ends_idx3) for nr, indx in enumerate(softclipped_both_ends_idx3): if indx: if dist_start_read3[nr] <= dist_end_read3[nr]: @@ -1091,7 +1073,6 @@ # calculate HD of "b" to all "b's" or "a" to all "a's" dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e)) for e in min_tag_half2]) - dist2 = dist_second_half.max() max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD max_tag = min_tag_array2[max_index] @@ -1170,12 +1151,9 @@ chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers) # write to file - # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4 sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]]) - correct_tier = False - if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] tier_dict[key1]["tier 4"] = 0 @@ -1317,7 +1295,7 @@ ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), - ("Tier 2.5", "variants at the start or end of the read and recurring mutation on this position in tier 1.1-2.4"), + ("Tier 2.5", "variants at the start or end of the read (ignoring variant position tier 1.1-2.4) and recurring mutation on this position in tier 1.1-2.4"), ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), ("Tier 4", "variants at the start or end of the reads"),