# HG changeset patch
# User mheinzl
# Date 1658755444 0
# Node ID d7aea14291e8be9c63aeea91c16753d3a490f82b
# Parent fdfe9a919ff77fabf6f9001e95ff230a0c04025a
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
diff -r fdfe9a919ff7 -r d7aea14291e8 read2mut.py
--- a/read2mut.py Fri Jul 22 09:19:44 2022 +0000
+++ b/read2mut.py Mon Jul 25 13:24:04 2022 +0000
@@ -313,6 +313,8 @@
count_other += 1
else:
count_indel += 1
+ print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, low quality = %s\n" %
+ (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_lowq))
mut_array = np.array(mut_array)
for read in bam.fetch(until_eof=True):
@@ -425,6 +427,10 @@
chimeric_tag = {}
if (key1 in pure_tags_dict_short.keys()) or (key1 in pure_tags_dict_ref.keys()): # ref or alt
+ if key1 not in np.array(['#'.join(str(i) for i in z)
+ for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]):
+ continue
+
change_tier_after_print = []
i = np.where(np.array(['#'.join(str(i) for i in z)
for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
@@ -443,6 +449,7 @@
tier_dict_ref[key1][k] = v
used_keys = []
+ # used_keys_ref = []
if 'ab' in mut_pos_dict[key1].keys():
sscs_mut_ab = mut_pos_dict[key1]['ab']
else:
@@ -465,11 +472,10 @@
if key2[:-5] not in tag_dict.keys() and key2[:-5] not in tag_dict_ref.keys(): # skip reads that have not alt or ref
continue
- if ((key2[:-5] in tag_dict.keys() and key2[:-5] in pure_tags_dict_short[key1].keys() and key1 in tag_dict[key2[:-5]].keys()) or (key2[:-5] in tag_dict_ref.keys() and key2[:-5] in pure_tags_dict_ref[key1].keys() and key1 in tag_dict_ref[key2[:-5]].keys())) and (key2[:-5] not in used_keys):
-
- if key2[:-5] in pure_tags_dict_short[key1].keys():
+ if (((key2[:-5] in tag_dict.keys()) and (key2[:-5] in pure_tags_dict_short[key1].keys()) and (key1 in tag_dict[key2[:-5]].keys()) and (key2[:-5] not in used_keys)) or ((key2[:-5] in tag_dict_ref.keys()) and (key2[:-5] in pure_tags_dict_ref[key1].keys()) and (key1 in tag_dict_ref[key2[:-5]].keys()) and (key2[:-5] not in used_keys))):
+ if key2[:-5] in tag_dict.keys():
variant_type = "alt"
- elif key2[:-5] in pure_tags_dict_ref[key1].keys():
+ elif key2[:-5] in tag_dict_ref.keys():
variant_type = "ref"
if key2[:-5] + '.ab.1' in mut_dict[key1].keys():
@@ -670,7 +676,10 @@
end_read4 = reads_dict[key1][key2[:-5] + '.ba.2']
ref_positions4 = real_start_end[key1][key2[:-5] + '.ba.2']
+ # if variant_type == "alt":
used_keys.append(key2[:-5])
+ # elif variant_type == "ref":
+ # used_keys_ref.append(key2[:-5])
counts_mut += 1
if (variant_type == "alt" and ((alt1f + alt2f + alt3f + alt4f) > 0.5)) or (variant_type == "ref" and ((ref1f + ref2f + ref3f + ref4f) > 0.5)):
if variant_type == "alt":
@@ -1240,61 +1249,64 @@
chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
- sample_tag = key2[:-5]
- array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time
- # exclude identical tag from array2, to prevent comparison to itself
- same_tag = np.where(array2 == sample_tag)
- index_array2 = np.arange(0, len(array2), 1)
- index_withoutSame = np.delete(index_array2, same_tag) # delete identical tag from the data
- array2 = array2[index_withoutSame]
- if len(array2) != 0: # only perform chimera analysis if there is more than 1 variant
- array1_half = sample_tag[0:int(len(sample_tag) / 2)] # mate1 part1
- array1_half2 = sample_tag[int(len(sample_tag) / 2):int(len(sample_tag))] # mate1 part 2
- array2_half = np.array([ii[0:int(len(ii) / 2)] for ii in array2]) # mate2 part1
- array2_half2 = np.array([ii[int(len(ii) / 2):int(len(ii))] for ii in array2]) # mate2 part2
- min_tags_list_zeros = []
- chimera_tags = []
- for mate_b in [False, True]:
- i = 0 # counter, only used to see how many HDs of tags were already calculated
- if mate_b is False: # HD calculation for all a's
- half1_mate1 = array1_half
- half2_mate1 = array1_half2
- half1_mate2 = array2_half
- half2_mate2 = array2_half2
- elif mate_b is True: # HD calculation for all b's
- half1_mate1 = array1_half2
- half2_mate1 = array1_half
- half1_mate2 = array2_half2
- half2_mate2 = array2_half
- # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
- dist = np.array([sum(itertools.imap(operator.ne, half1_mate1, c)) for c in half1_mate2])
- min_index = np.where(dist == dist.min()) # get index of min HD
- # get all "b's" of the tag or all "a's" of the tag with minimum HD
- min_tag_half2 = half2_mate2[min_index]
- min_tag_array2 = array2[min_index] # get whole tag with min HD
- min_value = dist.min()
- # calculate HD of "b" to all "b's" or "a" to all "a's"
- dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e))
- for e in min_tag_half2])
- dist2 = dist_second_half.max()
- max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD
- max_tag = min_tag_array2[max_index]
-
- # tags which have identical parts:
- if min_value == 0 or dist2 == 0:
- min_tags_list_zeros.append(tag)
- chimera_tags.append(max_tag)
- i += 1
- chimera_tags = [x for x in chimera_tags if x != []]
- chimera_tags_new = []
- for i in chimera_tags:
- if len(i) > 1:
- for t in i:
- chimera_tags_new.append(t)
- else:
- chimera_tags_new.extend(i)
- chimera = ", ".join(chimera_tags_new)
+ if variant_type == "alt":
+ sample_tag = key2[:-5]
+ array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time
+ # exclude identical tag from array2, to prevent comparison to itself
+ same_tag = np.where(array2 == sample_tag)
+ index_array2 = np.arange(0, len(array2), 1)
+ index_withoutSame = np.delete(index_array2, same_tag) # delete identical tag from the data
+ array2 = array2[index_withoutSame]
+ if len(array2) != 0: # only perform chimera analysis if there is more than 1 variant
+ array1_half = sample_tag[0:int(len(sample_tag) / 2)] # mate1 part1
+ array1_half2 = sample_tag[int(len(sample_tag) / 2):int(len(sample_tag))] # mate1 part 2
+ array2_half = np.array([ii[0:int(len(ii) / 2)] for ii in array2]) # mate2 part1
+ array2_half2 = np.array([ii[int(len(ii) / 2):int(len(ii))] for ii in array2]) # mate2 part2
+ min_tags_list_zeros = []
+ chimera_tags = []
+ for mate_b in [False, True]:
+ i = 0 # counter, only used to see how many HDs of tags were already calculated
+ if mate_b is False: # HD calculation for all a's
+ half1_mate1 = array1_half
+ half2_mate1 = array1_half2
+ half1_mate2 = array2_half
+ half2_mate2 = array2_half2
+ elif mate_b is True: # HD calculation for all b's
+ half1_mate1 = array1_half2
+ half2_mate1 = array1_half
+ half1_mate2 = array2_half2
+ half2_mate2 = array2_half
+ # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
+ dist = np.array([sum(itertools.imap(operator.ne, half1_mate1, c)) for c in half1_mate2])
+ min_index = np.where(dist == dist.min()) # get index of min HD
+ # get all "b's" of the tag or all "a's" of the tag with minimum HD
+ min_tag_half2 = half2_mate2[min_index]
+ min_tag_array2 = array2[min_index] # get whole tag with min HD
+ min_value = dist.min()
+ # calculate HD of "b" to all "b's" or "a" to all "a's"
+ dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e))
+ for e in min_tag_half2])
+ dist2 = dist_second_half.max()
+ max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD
+ max_tag = min_tag_array2[max_index]
+ # tags which have identical parts:
+ if min_value == 0 or dist2 == 0:
+ min_tags_list_zeros.append(tag)
+ chimera_tags.append(max_tag)
+ i += 1
+ chimera_tags = [x for x in chimera_tags if x != []]
+ chimera_tags_new = []
+ for i in chimera_tags:
+ if len(i) > 1:
+ for t in i:
+ chimera_tags_new.append(t)
+ else:
+ chimera_tags_new.extend(i)
+ chimera = ", ".join(chimera_tags_new)
+ else:
+ chimera_tags_new = []
+ chimera = ""
else:
chimera_tags_new = []
chimera = ""
@@ -1316,27 +1328,23 @@
if (read_pos3 == -1):
read_pos3 = read_len_median3 = None
line = (var_id, tier, variant_type, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera)
- # ws1.write_row(row, 0, line)
- # csv_writer.writerow(line)
line2 = ("", "", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera)
- # ws1.write_row(row + 1, 0, line2)
- # csv_writer.writerow(line2)
- # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
- # {'type': 'formula',
- # 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1),
- # 'format': format1,
- # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
- # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
- # {'type': 'formula',
- # 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1),
- # 'format': format3,
- # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
- # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
- # {'type': 'formula',
- # 'criteria': '=$B${}>="3"'.format(row + 1),
- # 'format': format2,
- # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
- change_tier_after_print.append((row, line, line2, trimmed_actual_high_tier))
+ if tier != "4":
+ ws1.write_row(row, 0, line)
+ csv_writer.writerow(line)
+ ws1.write_row(row + 1, 0, line2)
+ csv_writer.writerow(line2)
+ if variant_type == "alt":
+ ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+ ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+ ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+ elif variant_type == "ref":
+ ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+ ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+ ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+ else:
+ change_tier_after_print.append((line, line2, trimmed_actual_high_tier))
+
row += 3
if chimera_correction:
@@ -1367,39 +1375,39 @@
tier_dict_ref[key1]["tier 2.5"] = tier_dict_ref[key1]["tier 4"]
tier_dict_ref[key1]["tier 4"] = 0
correct_tier_ref = True
-
- for sample in change_tier_after_print:
- row_number = sample[0]
- line1 = sample[1]
- line2 = sample[2]
- actual_high_tier = sample[3]
- current_tier = list(line1)[1]
-
- if line1[2] == "alt" and correct_tier and (current_tier == "4") and actual_high_tier:
- line1 = list(line1)
- line1[1] = "2.5"
- line1 = tuple(line1)
- counter_tier25 += 1
- counter_tier4 -= 1
- if line1[2] == "ref" and correct_tier_ref and (current_tier == "4") and actual_high_tier:
- line1 = list(line1)
- line1[1] = "2.5"
- line1 = tuple(line1)
- counter_tier25 += 1
- counter_tier4 -= 1
-
- ws1.write_row(row_number, 0, line1)
- csv_writer.writerow(line1)
- ws1.write_row(row_number + 1, 0, line2)
- csv_writer.writerow(line2)
- if line1[2] == "alt":
- ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
- ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
- ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
- elif line1[2] == "ref":
- ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
- ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
- ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
+ # print(key1, "change tiers from tier 4 to tier 2.5 for {} DCS ...".format(len(change_tier_after_print)))
+ if len(change_tier_after_print) > 0:
+ for sample in change_tier_after_print:
+ # row_number = sample[0]
+ line1 = sample[0]
+ line2 = sample[1]
+ actual_high_tier = sample[2]
+ current_tier = list(line1)[1]
+ if line1[2] == "alt" and correct_tier and (current_tier == "4") and actual_high_tier:
+ line1 = list(line1)
+ line1[1] = "2.5"
+ line1 = tuple(line1)
+ counter_tier25 += 1
+ counter_tier4 -= 1
+ if line1[2] == "ref" and correct_tier_ref and (current_tier == "4") and actual_high_tier:
+ line1 = list(line1)
+ line1[1] = "2.5"
+ line1 = tuple(line1)
+ counter_tier25 += 1
+ counter_tier4 -= 1
+ ws1.write_row(row, 0, line1)
+ csv_writer.writerow(line1)
+ ws1.write_row(row + 1, 0, line2)
+ csv_writer.writerow(line2)
+ if line1[2] == "alt":
+ ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+ ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+ ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+ elif line1[2] == "ref":
+ ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+ ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+ ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+ row += 3
# sheet 2
if chimera_correction:
@@ -1429,9 +1437,7 @@
ref_count = cvrg_dict[key1][0]
alt_count = cvrg_dict[key1][1]
cvrg = ref_count + alt_count
-
ref_tiers = tier_dict_ref[key1]
-
var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
lst = [var_id, cvrg]
used_tiers = []
@@ -1453,7 +1459,7 @@
fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:7])))
if fraction_chimeras is None:
fraction_chimeras = 0.
- new_cvrg = (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])) * (1. - fraction_chimeras)
+ new_cvrg = (cvrg - sum(used_tiers[-10:])) * (1. - fraction_chimeras)
lst.extend([new_cvrg, chimeras_all, safe_div(new_alt, new_cvrg)])
lst.extend([alt_count, safe_div(alt_count, cvrg)])
lst.extend(used_tiers)
diff -r fdfe9a919ff7 -r d7aea14291e8 read2mut.xml
--- a/read2mut.xml Fri Jul 22 09:19:44 2022 +0000
+++ b/read2mut.xml Mon Jul 25 13:24:04 2022 +0000
@@ -35,7 +35,7 @@
-
+