Mercurial > repos > mheinzl > variant_analyzer2

--- a/mut2read.xml	Sun Oct 04 19:09:51 2020 +0000
+++ b/mut2read.xml	Wed Oct 07 10:57:15 2020 +0000
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<tool id="mut2read" name="DCS mutations to tags/reads:" version="1.0.1" profile="17.01">
+<tool id="mut2read" name="DCS mutations to tags/reads:" version="1.0.1" profile="19.01">
     <description>Extracts all tags that carry a mutation in the duplex consensus sequence (DCS)</description>
     <macros>
         <import>va_macros.xml</import>
--- a/mut2sscs.xml	Sun Oct 04 19:09:51 2020 +0000
+++ b/mut2sscs.xml	Wed Oct 07 10:57:15 2020 +0000
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<tool id="mut2sscs" name="DCS mutations to SSCS stats:" version="1.0.1" profile="17.01">
+<tool id="mut2sscs" name="DCS mutations to SSCS stats:" version="1.0.1" profile="19.01">
     <description>Extracts all tags from the single stranded consensus sequence (SSCS) bam file that carry a mutation at the same position a mutation is called in the duplex consensus sequence (DCS) and calculates their frequencies</description>
     <macros>
         <import>va_macros.xml</import>
--- a/read2mut.py	Sun Oct 04 19:09:51 2020 +0000
+++ b/read2mut.py	Wed Oct 07 10:57:15 2020 +0000
@@ -23,6 +23,7 @@
 from __future__ import division

 import argparse
+import itertools
 import json
 import operator
 import os
@@ -33,7 +34,6 @@
 import pysam
 import xlsxwriter

-
 def make_argparser():
     parser = argparse.ArgumentParser(description='Takes a tabular file with mutations, a BAM file and JSON files as input and prints stats about variants to a user specified output file.')
     parser.add_argument('--mutFile',
@@ -53,8 +53,7 @@
     parser.add_argument('--trim', type=int, default=10,
                         help='Integer threshold for assigning mutations at start and end of reads to lower tier. Default 10.')
     parser.add_argument('--chimera_correction', action="store_true",
-                        help='Add additional tier for chimeric variants and correct the variant frequencies')
-
+                        help='Count chimeric variants and correct the variant frequencies.')
     return parser


@@ -90,18 +89,17 @@
     if trim < 0:
         sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thresh))

-    # 1. read mut file
     with open(file1, 'r') as mut:
         mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str)
-
-    # 2. load dicts
+
+    # load dicts
     with open(json_file, "r") as f:
         (tag_dict, cvrg_dict) = json.load(f)

     with open(sscs_json, "r") as f:
         (mut_pos_dict, ref_pos_dict) = json.load(f)

-    # 3. read bam file
+    # read bam file
     # pysam.index(file2)
     bam = pysam.AlignmentFile(file2, "rb")

@@ -115,7 +113,6 @@

     for m in range(0, len(mut_array[:, 0])):
         print(str(m + 1) + " of " + str(len(mut_array[:, 0])))
-        #    for m in range(0, 5):
         chrom = mut_array[m, 1]
         stop_pos = mut_array[m, 2].astype(int)
         chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
@@ -125,7 +122,7 @@
         mut_read_pos_dict[chrom_stop_pos] = {}
         reads_dict[chrom_stop_pos] = {}

-        for pileupcolumn in bam.pileup(chrom.tostring(), stop_pos - 2, stop_pos, max_depth=1000000000):
+        for pileupcolumn in bam.pileup(chrom.tostring(), stop_pos - 2, stop_pos, max_depth=100000000):
             if pileupcolumn.reference_pos == stop_pos - 1:
                 count_alt = 0
                 count_ref = 0
@@ -158,7 +155,6 @@
                                 mut_read_pos_dict[chrom_stop_pos][tag], pileupread.query_position + 1)
                             reads_dict[chrom_stop_pos][tag] = np.append(
                                 reads_dict[chrom_stop_pos][tag], len(pileupread.alignment.query_sequence))
-
                         if nuc == alt:
                             count_alt += 1
                             if tag not in mut_read_dict:
@@ -176,9 +172,8 @@
                             count_other += 1
                     else:
                         count_indel += 1
-
                 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq))
-
+
     for read in bam.fetch(until_eof=True):
         if read.is_unmapped:
             pure_tag = read.query_name[:-5]
@@ -220,10 +215,6 @@
     else:
         pure_tags_dict_short = pure_tags_dict

-    #whole_array = []
-    #for k in pure_tags_dict.values():
-    #    whole_array.extend(k.keys())
-
     # 7. output summary with threshold
     workbook = xlsxwriter.Workbook(outfile)
     ws1 = workbook.add_worksheet("Results")
@@ -253,16 +244,17 @@
     counter_tier32 = 0
     counter_tier41 = 0
     counter_tier42 = 0
-
-    if chimera_correction:
-        counter_tier43 = 0
-
+    #if chimera_correction:
+    #    counter_tier43 = 0
     counter_tier5 = 0

     row = 1
     tier_dict = {}
+    chimera_dict = {}
     for key1, value1 in sorted(mut_dict.items()):
         counts_mut = 0
+        chimeric_tag_list = []
+        chimeric_tag = {}
         if key1 in pure_tags_dict_short.keys():
             i = np.where(np.array(['#'.join(str(i) for i in z)
                                    for z in zip(mut_array[:, 1], mut_array[:, 2])]) == key1)[0][0]
@@ -272,11 +264,7 @@
             whole_array = pure_tags_dict_short[key1].keys()

             tier_dict[key1] = {}
-            if chimera_correction:
-                values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 4.3", 0), ("tier 5", 0)]
-            else:
-                values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 5", 0)]
-
+            values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 5", 0)]
             for k, v in values_tier_dict:
                 tier_dict[key1][k] = v

@@ -506,90 +494,11 @@
                         details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
                         details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)

-                        chrom, pos = re.split(r'\#', key1)
-                        var_id = '-'.join([chrom, pos, ref, alt])
-                        sample_tag = key2[:-5]
-                        array2 = np.unique(whole_array)  # remove duplicate sequences to decrease running time
-                        # exclude identical tag from array2, to prevent comparison to itself
-                        same_tag = np.where(array2 == sample_tag)
-                        index_array2 = np.arange(0, len(array2), 1)
-                        index_withoutSame = np.delete(index_array2, same_tag)  # delete identical tag from the data
-                        array2 = array2[index_withoutSame]
-                        if len(array2) != 0:  # only perform chimera analysis if there is more than 1 variant
-                            array1_half = sample_tag[0:int(len(sample_tag) / 2)]  # mate1 part1
-                            array1_half2 = sample_tag[int(len(sample_tag) / 2):int(len(sample_tag))]  # mate1 part 2
-                            array2_half = np.array([ii[0:int(len(ii) / 2)] for ii in array2])  # mate2 part1
-                            array2_half2 = np.array([ii[int(len(ii) / 2):int(len(ii))] for ii in array2])  # mate2 part2
-
-                            min_tags_list_zeros = []
-                            chimera_tags = []
-                            for mate_b in [False, True]:
-                                i = 0  # counter, only used to see how many HDs of tags were already calculated
-                                if mate_b is False:  # HD calculation for all a's
-                                    half1_mate1 = array1_half
-                                    half2_mate1 = array1_half2
-                                    half1_mate2 = array2_half
-                                    half2_mate2 = array2_half2
-                                elif mate_b is True:  # HD calculation for all b's
-                                    half1_mate1 = array1_half2
-                                    half2_mate1 = array1_half
-                                    half1_mate2 = array2_half2
-                                    half2_mate2 = array2_half
-                                # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
-                                dist = np.array([sum(map(operator.ne, half1_mate1, c)) for c in half1_mate2])
-                                min_index = np.where(dist == dist.min())  # get index of min HD
-                                # get all "b's" of the tag or all "a's" of the tag with minimum HD
-                                min_tag_half2 = half2_mate2[min_index]
-                                min_tag_array2 = array2[min_index]  # get whole tag with min HD
-                                min_value = dist.min()
-                                # calculate HD of "b" to all "b's" or "a" to all "a's"
-                                dist_second_half = np.array([sum(map(operator.ne, half2_mate1, e))
-                                                             for e in min_tag_half2])
-
-                                dist2 = dist_second_half.max()
-                                max_index = np.where(dist_second_half == dist_second_half.max())[0]  # get index of max HD
-                                max_tag = min_tag_array2[max_index]
-
-                                # tags which have identical parts:
-                                if min_value == 0 or dist2 == 0:
-                                    min_tags_list_zeros.append(tag)
-                                    chimera_tags.append(max_tag)
-                                    # chimeric = True
-                                # else:
-                                    # chimeric = False
-
-                                # if mate_b is False:
-                                #    text = "pos {}: sample tag: {}; HD a = {}; HD b' = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, min_value, dist2, list(max_tag), chimeric)
-                                # else:
-                                #     text = "pos {}: sample tag: {}; HD a' = {}; HD b = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, dist2, min_value, list(max_tag), chimeric)
-                                i += 1
-                            chimera_tags = [x for x in chimera_tags if x != []]
-                            chimera_tags_new = []
-                            for i in chimera_tags:
-                                if len(i) > 1:
-                                    for t in i:
-                                        chimera_tags_new.append(t)
-                                else:
-                                    chimera_tags_new.extend(i)
-                            chimera_tags_new = np.asarray(chimera_tags_new)
-                            chimera = ", ".join(chimera_tags_new)
-                        else:
-                            chimera_tags_new = []
-                            chimera = ""
-
+
                         trimmed = False
                         contradictory = False
-                        chimeric_dcs = False

-                        if chimera_correction and len(chimera_tags_new) > 0: # chimeras
-                            alt1ff = 0
-                            alt4ff = 0
-                            alt2ff = 0
-                            alt3ff = 0
-                            chimeric_dcs = True
-                            trimmed = False
-                            contradictory = False
-                        elif ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant
+                        if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant
                             all(float(ij) == 0. for ij in [alt2ff, alt3ff])) |
                             (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) &
                             all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
@@ -599,7 +508,6 @@
                             alt3ff = 0
                             trimmed = False
                             contradictory = True
-                            chimeric_dcs = False
                         else:
                             if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))):
                                 beg1 = total1new
@@ -709,16 +617,89 @@
                             counter_tier42 += 1
                             tier_dict[key1]["tier 4.2"] += 1

-                        elif chimera_correction and chimeric_dcs:
-                            tier = "4.3"
-                            counter_tier43 += 1
-                            tier_dict[key1]["tier 4.3"] += 1
-
                         else:
                             tier = "5"
                             counter_tier5 += 1
                             tier_dict[key1]["tier 5"] += 1

+                        chrom, pos = re.split(r'\#', key1)
+                        var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
+                        sample_tag = key2[:-5]
+                        array2 = np.unique(whole_array)  # remove duplicate sequences to decrease running time
+                        # exclude identical tag from array2, to prevent comparison to itself
+                        same_tag = np.where(array2 == sample_tag)
+                        index_array2 = np.arange(0, len(array2), 1)
+                        index_withoutSame = np.delete(index_array2, same_tag)  # delete identical tag from the data
+                        array2 = array2[index_withoutSame]
+                        if len(array2) != 0:  # only perform chimera analysis if there is more than 1 variant
+                            array1_half = sample_tag[0:int(len(sample_tag) / 2)]  # mate1 part1
+                            array1_half2 = sample_tag[int(len(sample_tag) / 2):int(len(sample_tag))]  # mate1 part 2
+                            array2_half = np.array([ii[0:int(len(ii) / 2)] for ii in array2])  # mate2 part1
+                            array2_half2 = np.array([ii[int(len(ii) / 2):int(len(ii))] for ii in array2])  # mate2 part2
+
+                            min_tags_list_zeros = []
+                            chimera_tags = []
+                            for mate_b in [False, True]:
+                                i = 0  # counter, only used to see how many HDs of tags were already calculated
+                                if mate_b is False:  # HD calculation for all a's
+                                    half1_mate1 = array1_half
+                                    half2_mate1 = array1_half2
+                                    half1_mate2 = array2_half
+                                    half2_mate2 = array2_half2
+                                elif mate_b is True:  # HD calculation for all b's
+                                    half1_mate1 = array1_half2
+                                    half2_mate1 = array1_half
+                                    half1_mate2 = array2_half2
+                                    half2_mate2 = array2_half
+                                # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
+                                dist = np.array([sum(itertools.imap(operator.ne, half1_mate1, c)) for c in half1_mate2])
+                                min_index = np.where(dist == dist.min())  # get index of min HD
+                                # get all "b's" of the tag or all "a's" of the tag with minimum HD
+                                min_tag_half2 = half2_mate2[min_index]
+                                min_tag_array2 = array2[min_index]  # get whole tag with min HD
+                                min_value = dist.min()
+                                # calculate HD of "b" to all "b's" or "a" to all "a's"
+                                dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e))
+                                                             for e in min_tag_half2])
+
+                                dist2 = dist_second_half.max()
+                                max_index = np.where(dist_second_half == dist_second_half.max())[0]  # get index of max HD
+                                max_tag = min_tag_array2[max_index]
+
+                                # tags which have identical parts:
+                                if min_value == 0 or dist2 == 0:
+                                    min_tags_list_zeros.append(tag)
+                                    chimera_tags.append(max_tag)
+                                    # chimeric = True
+                                # else:
+                                    # chimeric = False
+
+                                # if mate_b is False:
+                                #    text = "pos {}: sample tag: {}; HD a = {}; HD b' = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, min_value, dist2, list(max_tag), chimeric)
+                                # else:
+                                #     text = "pos {}: sample tag: {}; HD a' = {}; HD b = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, dist2, min_value, list(max_tag), chimeric)
+                                i += 1
+                            chimera_tags = [x for x in chimera_tags if x != []]
+                            chimera_tags_new = []
+                            for i in chimera_tags:
+                                if len(i) > 1:
+                                    for t in i:
+                                        chimera_tags_new.append(t)
+                                else:
+                                    chimera_tags_new.extend(i)
+                            chimera = ", ".join(chimera_tags_new)
+                        else:
+                            chimera_tags_new = []
+                            chimera = ""
+
+                        if len(chimera_tags_new) > 0:
+                            chimera_tags_new.append(sample_tag)
+                            key_chimera = ",".join(sorted(chimera_tags_new))
+                            if key_chimera in chimeric_tag.keys():
+                                chimeric_tag[key_chimera].append(float(tier))
+                            else:
+                                chimeric_tag[key_chimera] = [float(tier)]
+
                         if (read_pos1 == -1):
                             read_pos1 = read_len_median1 = None
                         if (read_pos4 == -1):
@@ -749,14 +730,26 @@
                                                 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})

                         row += 3
-
+            if chimera_correction:
+                chimeric_dcs_high_tiers = 0
+                chimeric_dcs = 0
+                for keys_chimera in chimeric_tag.keys():
+                    tiers = chimeric_tag[keys_chimera]
+                    chimeric_dcs += len(tiers) - 1
+                    high_tiers = sum(1 for t in tiers if t < 3.)
+                    if high_tiers == len(tiers):
+                        chimeric_dcs_high_tiers += high_tiers - 1
+                    else:
+                        chimeric_dcs_high_tiers += high_tiers
+                chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers)
+    # sheet 2
     if chimera_correction:
-        header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF  (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
+        header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'chimeras in AC alt (all tiers)', 'chimera-corrected AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'chimeras in AC alt (tiers 1.1-2.4)', 'chimera-corrected AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
                     'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
-                    'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 4.3', 'tier 5', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
-                    'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-4.3', 'AF 1.1-5')
+                    'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
+                    'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5')
     else:
-        header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF  (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
+        header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
                     'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
                     'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
                     'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5')
@@ -775,7 +768,7 @@
             alt_count = cvrg_dict[key1][1]
             cvrg = ref_count + alt_count

-            var_id = '-'.join([chrom, pos, ref, alt])
+            var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
             lst = [var_id, cvrg]
             used_tiers = []
             cum_af = []
@@ -787,35 +780,43 @@
                     cum_af.append(cum)
             lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)])
             if chimera_correction:
-                lst.extend([(cvrg - sum(used_tiers[-6:])), sum(used_tiers[0:6]), safe_div(sum(used_tiers[0:6]), (cvrg - sum(used_tiers[-6:]))), alt_count, safe_div(alt_count, cvrg)])
-            else:
-                lst.extend([(cvrg - sum(used_tiers[-5:])), sum(used_tiers[0:6]), safe_div(sum(used_tiers[0:6]), (cvrg - sum(used_tiers[-5:]))), alt_count, safe_div(alt_count, cvrg)])
+                chimeras_all = chimera_dict[key1][0]
+                new_alt = sum(used_tiers) - chimeras_all
+                fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers)))
+                if fraction_chimeras is None:
+                    fraction_chimeras = 0.
+                new_cvrg = cvrg * (1. - fraction_chimeras)
+                lst.extend([chimeras_all, safe_div(new_alt, new_cvrg)])
+            lst.extend([(cvrg - sum(used_tiers[-5:])), sum(used_tiers[0:6]), safe_div(sum(used_tiers[0:6]), (cvrg - sum(used_tiers[-5:])))])
+            if chimera_correction:
+                chimeras_all = chimera_dict[key1][1]
+                new_alt = sum(used_tiers[0:6]) - chimeras_all
+                fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:6])))
+                if fraction_chimeras is None:
+                    fraction_chimeras = 0.
+                new_cvrg = (cvrg - sum(used_tiers[-5:])) * (1. - fraction_chimeras)
+                lst.extend([chimeras_all, safe_div(new_alt, new_cvrg)])
+            lst.extend([alt_count, safe_div(alt_count, cvrg)])
             lst.extend(used_tiers)
             lst.extend(cum_af)
             lst = tuple(lst)
             ws2.write_row(row + 1, 0, lst)
-            ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format1, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)})
-            ws2.conditional_format('L{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format3, 'multi_range': 'L{}:O{} L1:O1'.format(row + 2, row + 2)})
             if chimera_correction:
-                ws2.conditional_format('P{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format2, 'multi_range': 'P{}:U{} P1:U1'.format(row + 2, row + 2)})
+                ws2.conditional_format('N{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$N$1="tier 1.1"', 'format': format1, 'multi_range': 'N{}:O{} N1:O1'.format(row + 2, row + 2)})
+                ws2.conditional_format('P{}:S{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 2.1"', 'format': format3, 'multi_range': 'P{}:S{} P1:S1'.format(row + 2, row + 2)})
+                ws2.conditional_format('T{}:X{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$T$1="tier 3.1"', 'format': format2, 'multi_range': 'T{}:X{} T1:X1'.format(row + 2, row + 2)})
             else:
+                ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format1, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)})
+                ws2.conditional_format('L{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format3, 'multi_range': 'L{}:O{} L1:O1'.format(row + 2, row + 2)})
                 ws2.conditional_format('P{}:T{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format2, 'multi_range': 'P{}:T{} P1:T1'.format(row + 2, row + 2)})
-
             row += 1

     # sheet 3
-    if chimera_correction:
-        sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21),
-                  ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24),
-                  ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4.1", counter_tier41),
-                  ("tier 4.2", counter_tier42), ("tier 4.3", counter_tier43), ("tier 5", counter_tier5)]
-    else:
-        sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21),
+    sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21),
                   ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24),
                   ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4.1", counter_tier41),
                   ("tier 4.2", counter_tier42), ("tier 5", counter_tier5)]

-
     header = ("tier", "count")
     ws3.write_row(0, 0, header)

@@ -833,11 +834,8 @@
                                {'type': 'formula',
                                 'criteria': '=$A${}>="3"'.format(i + 2),
                                 'format': format2})
-    if chimera_correction:
-        description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""), ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"), ("Tier 4.3", "variants that are chimeric"), ("Tier 5", "remaining variants")]
-    else:
-        description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""), ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"), ("Tier 5", "remaining variants")]

+    description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""), ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"), ("Tier 5", "remaining variants")]
     examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289",
                         "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
                         "4081", "4098", "5", "10", "", ""),
@@ -902,29 +900,13 @@
                         "0", "0", "0", "1", "1", "5348", "5350", "", ""),
                        ("", "", "TTTTTAAGAATAACCCACAC", "ab2.ba1", "100", "112", "140", "145", "263",
                         "7", "12", "7", "12", "7", "12", "0", "0", "1", "1", "0",
-                        "0", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]]
-
-    if chimera_correction:
-        examples_tiers.extend([[("Chr5:5-20000-13963-T-C", "4.3", "TTTTTAAGAAGCTATTTTTT", "ab1.ba2", "72", "72", "217", "288", "289",
-                        "1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "4081",
-                        "4098", "5", "10", "", "TTTTTAAGAATAACCCACAC"),
-                       ("", "", "TTTTTAAGAAGCTATTTTTT", "ab2.ba1", "153", "164", "217", "260", "289",
-                        "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "1", "0", "0", "0", "0", "4081",
-                        "4098", "5", "10", "", "TTTTTAAGAATAACCCACAC")],
-                      [("Chr5:5-20000-13983-G-C", "5", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
+                        "0", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
+                       [("Chr5:5-20000-13983-G-C", "5", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
                         "0", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "0",
                         "0", "1", "1", "5348", "5350", "", ""),
                        ("", "", "ATGTTGTGAATAACCCACAC", "ab2.ba1", None, None, None, None,
                         "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
-                        "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]])
-    else:
-        examples_tiers.extend([
-                      [("Chr5:5-20000-13983-G-C", "5", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
-                        "0", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "0",
-                        "0", "1", "1", "5348", "5350", "", ""),
-                       ("", "", "ATGTTGTGAATAACCCACAC", "ab2.ba1", None, None, None, None,
-                        "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
-                        "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]])
+                        "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]]

     start_row = 15
     ws3.write(start_row, 0, "Description of tiers with examples")
--- a/read2mut.xml	Sun Oct 04 19:09:51 2020 +0000
+++ b/read2mut.xml	Wed Oct 07 10:57:15 2020 +0000
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<tool id="read2mut" name="Call specific mutations in reads:" version="1.0.2" profile="17.01">
+<tool id="read2mut" name="Call specific mutations in reads:" version="1.0.3" profile="19.01">
     <description>Looks for reads with mutation at known positions and calculates frequencies and stats.</description>
     <macros>
         <import>va_macros.xml</import>
@@ -30,7 +30,7 @@
         <param name="thresh" type="integer" label="Tag count threshold" value="0" help="Integer threshold for displaying mutations. Only mutations occuring in DCS of less than thresh tags are displayed. Default of 0 displays all."/>
         <param name="phred" type="integer" label="Phred quality score threshold" min="0" max="41" value="20" help="Integer threshold for Phred quality score. Only reads higher than this threshold are considered. Default = 20."/>
         <param name="trim" type="integer" label="Trimming threshold" value="10" help="Integer threshold for assigning mutations at start and end of reads to lower tier. Default 10."/>
-        <param name="chimera_correction" type="boolean" label="Apply chimera correction?" truevalue="--chimera_correction" falsevalue="" checked="False" help="Add additional tier for chimeric variants and correct the variant frequencies."/>
+        <param name="chimera_correction" type="boolean" label="Apply chimera correction?" truevalue="--chimera_correction" falsevalue="" checked="False" help="Count chimeric variants and correct the variant frequencies."/>
     </inputs>
     <outputs>
         <data name="output_xlsx" format="xlsx" label="${tool.name} on ${on_string}: XLSX"/>