Mercurial > repos > mheinzl > variant_analyzer2

diff read2mut.py @ 78:fdfe9a919ff7 draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
author: mheinzl
date: Fri, 22 Jul 2022 09:19:44 +0000
parents: 1797e461d674
children: d7aea14291e8
--- a/read2mut.py	Mon Mar 29 09:22:57 2021 +0000
+++ b/read2mut.py	Fri Jul 22 09:19:44 2022 +0000
@@ -87,6 +87,7 @@
     outfile2 = args.outputFile2
     outfile3 = args.outputFile3
     outputFile_csv = args.outputFile_csv
+
     thresh = args.thresh
     phred_score = args.phred
     trim = args.trim
@@ -111,7 +112,7 @@
 
     # load dicts
     with open(json_file, "r") as f:
-        (tag_dict, cvrg_dict) = json.load(f)
+        (tag_dict, cvrg_dict, tag_dict_ref) = json.load(f)
 
     with open(sscs_json, "r") as f:
         (mut_pos_dict, ref_pos_dict) = json.load(f)
@@ -138,106 +139,207 @@
             continue
         else:
             alt = variant.ALT[0]
-        chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
+
+        alt = alt.upper()
+        ref = ref.upper()
+        if "N" in alt:  # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV)
+            continue
 
-        if len(ref) == len(alt):
-            mut_array.append([chrom, stop_pos, ref, alt])
-            i += 1
-            mut_dict[chrom_stop_pos] = {}
-            mut_read_pos_dict[chrom_stop_pos] = {}
-            reads_dict[chrom_stop_pos] = {}
-            mut_read_cigar_dict[chrom_stop_pos] = {}
-            real_start_end[chrom_stop_pos] = {}
+        chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
+        mut_array.append([chrom, stop_pos, ref, alt])
+        i += 1
+        mut_dict[chrom_stop_pos] = {}
+        mut_read_pos_dict[chrom_stop_pos] = {}
+        reads_dict[chrom_stop_pos] = {}
+        mut_read_cigar_dict[chrom_stop_pos] = {}
+        real_start_end[chrom_stop_pos] = {}
+        for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000):
+            if pileupcolumn.reference_pos == stop_pos:
+                count_alt = 0
+                count_ref = 0
+                count_indel = 0
+                count_n = 0
+                count_other = 0
+                count_lowq = 0
+                n = 0
+                for pileupread in pileupcolumn.pileups:
+                    n += 1
+                    if not pileupread.is_refskip:
+                        if pileupread.is_del:
+                            p = pileupread.query_position_or_next
+                            e = p + len(alt) - 1
+                        else:
+                            p = pileupread.query_position
+                            e = p + len(alt)
+                        s = p
+                        tag = pileupread.alignment.query_name
+                        split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring)
 
-            for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
-                if pileupcolumn.reference_pos == stop_pos:
-                    count_alt = 0
-                    count_ref = 0
-                    count_indel = 0
-                    count_n = 0
-                    count_other = 0
-                    count_lowq = 0
-                    n = 0
-                    for pileupread in pileupcolumn.pileups:
-                        n += 1
-                        if not pileupread.is_del and not pileupread.is_refskip:
-                            tag = pileupread.alignment.query_name
-                            nuc = pileupread.alignment.query_sequence[pileupread.query_position]
-                            phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33
-                            # if read is softclipped, store real position in reference
-                            if "S" in pileupread.alignment.cigarstring:
-                                # spftclipped at start
-                                if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring):
-                                    start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0])
-                                    end = pileupread.alignment.reference_end
-                                # softclipped at end
-                                elif re.search(r"S$", pileupread.alignment.cigarstring):
-                                    end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2])
-                                    start = pileupread.alignment.reference_start
-                            else:
+                        if len(ref) < len(alt):
+                            if "I" in split_cigar:
+                                all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"]
+                                for ai in all_insertions:  # if multiple insertions in DCS
+                                    ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
+                                    ins_count = split_cigar[ai - 1]  # nr of insertions should match with alt allele
+                                    if "I" in split_cigar and sum(ins_index) == p + 1 and int(ins_count) >= len(alt) - 1:  # if pe read matches exatcly to insertion
+                                        nuc = pileupread.alignment.query_sequence[s:e]
+                                        phred = ord(pileupread.alignment.qual[s]) - 33
+                                        break
+                                    elif "I" in split_cigar and sum(ins_index) == p + 1 and int(ins_count) < len(alt) - 1:  # if pe read has shorter insertion -- not alt
+                                        nuc = pileupread.alignment.query_sequence[s:s+int(ins_count)+1]
+                                        phred = ord(pileupread.alignment.qual[s]) - 33
+                                        break
+                                    else:  # insertion in pe reads but not at the desired position
+                                        nuc = pileupread.alignment.query_sequence[s]
+                                        phred = ord(pileupread.alignment.qual[s]) - 33
+                            elif "D" in split_cigar:  # if deletion in pe read, don't count
+                                all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"]
+                                for di, ai in enumerate(all_deletions):  # if multiple insertions in DCS
+                                    if di > 0:  # more than 1 deletion, don't count previous deletion to position
+                                        all_deletions_mod = split_cigar[:ai - 1]
+                                        prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")]
+                                        split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx]
+                                        del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()]
+                                    else:  # first deletion in read, sum all previous (mis)matches and insertions to position
+                                        del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
+                                    if "D" in split_cigar and sum(del_index) == p + 1:  # if deletion at that position
+                                        nuc = "D"
+                                        phred = ord(pileupread.alignment.qual[s]) - 33
+                                        break
+                                    else:
+                                        nuc = pileupread.alignment.query_sequence[s]
+                                        phred = ord(pileupread.alignment.qual[s]) - 33
+                            else:  # insertion in pe reads but not at the desired position
+                                nuc = pileupread.alignment.query_sequence[s]
+                                phred = ord(pileupread.alignment.qual[s]) - 33
+
+                        elif len(ref) > len(alt):
+                            ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)]
+                            if "D" in split_cigar:
+                                all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"]
+                                for di, ai in enumerate(all_deletions):  # if multiple insertions in DCS
+                                    if di > 0:  # more than 1 deletion, don't count previous deletion to position
+                                        all_deletions_mod = split_cigar[:ai - 1]
+                                        prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")]
+                                        split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx]
+                                        del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()]
+                                    else:  # first deletion in read, sum all previous (mis)matches and insertions to position
+                                        del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
+                                    del_count = split_cigar[ai - 1]  # deletion on that position but does not necesserily match in the length
+                                    if "D" in split_cigar and sum(del_index) == p + 1 and int(del_count) >= len(ref) - 1:  # if pe read matches exatcly to deletion
+                                        nuc = pileupread.alignment.query_sequence[s:e]
+                                        phred = ord(pileupread.alignment.qual[s]) - 33
+                                        if nuc == "":
+                                            nuc = str(alt)
+                                        break
+                                    elif "D" in split_cigar and sum(del_index) == p + 1 and int(del_count) < len(ref) - 1:  # if pe read has shorter deletion --> not alt
+                                        nuc = str(ref)[:int(del_count)+1]
+                                        phred = ord(pileupread.alignment.qual[s]) - 33
+                                        break
+                                    else:  # deletion in pe reads but not at the desired position
+                                        nuc = pileupread.alignment.query_sequence[s:s + len(ref)]
+                                        phred = ord(pileupread.alignment.qual[s]) - 33
+                            elif "I" in split_cigar:  # if pe read has insertion --> not count
+                                all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"]
+                                for ai in all_insertions:  # if multiple insertions in DCS
+                                    ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
+                                    ins_count = split_cigar[ai - 1]  # nr of insertions should match with alt allele
+                                    if "I" in split_cigar and sum(ins_index) == p + 1:
+                                        nuc = "I"
+                                        phred = ord(pileupread.alignment.qual[s]) - 33
+                                        break
+                                    else:
+                                        nuc = pileupread.alignment.query_sequence[s]
+                                        phred = ord(pileupread.alignment.qual[s]) - 33
+                            elif len(ref_positions) < len(ref):  # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there
+                                nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)]
+                                phred = ord(pileupread.alignment.qual[s]) - 33
+                                if nuc.upper() == ref[:len(nuc)]:
+                                    nuc = str(ref)
+                            else:  # deletion in pe reads but not at the desired position
+                                nuc = pileupread.alignment.query_sequence[s:s + len(ref)]
+                                phred = ord(pileupread.alignment.qual[s]) - 33
+                        else:  # SNV: query position is None if is_del or is_refskip is set.
+                            nuc = pileupread.alignment.query_sequence[s]
+                            phred = ord(pileupread.alignment.qual[s]) - 33
+
+                        nuc = nuc.upper()
+
+                        # if read is softclipped, store real position in reference
+                        if "S" in pileupread.alignment.cigarstring:
+                            # spftclipped at start
+                            if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring):
+                                start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0])
                                 end = pileupread.alignment.reference_end
+                            # softclipped at end
+                            elif re.search(r"S$", pileupread.alignment.cigarstring):
+                                end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2])
                                 start = pileupread.alignment.reference_start
-
-                            if phred < phred_score:
-                                nuc = "lowQ"
-                            if tag not in mut_dict[chrom_stop_pos]:
-                                mut_dict[chrom_stop_pos][tag] = {}
-                            if nuc in mut_dict[chrom_stop_pos][tag]:
-                                mut_dict[chrom_stop_pos][tag][nuc] += 1
-                            else:
-                                mut_dict[chrom_stop_pos][tag][nuc] = 1
-                            if tag not in mut_read_pos_dict[chrom_stop_pos]:
-                                mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1]
-                                reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)]
-                                mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring]
-                                real_start_end[chrom_stop_pos][tag] = [(start, end)]
+                        else:
+                            end = pileupread.alignment.reference_end
+                            start = pileupread.alignment.reference_start
+                        if phred < phred_score:
+                            nuc = "lowQ"
+                        if tag not in mut_dict[chrom_stop_pos]:
+                            mut_dict[chrom_stop_pos][tag] = {}
+                        if nuc in mut_dict[chrom_stop_pos][tag]:
+                            mut_dict[chrom_stop_pos][tag][nuc] += 1
+                        else:
+                            mut_dict[chrom_stop_pos][tag][nuc] = 1
+                        if tag not in mut_read_pos_dict[chrom_stop_pos]:
+                            mut_read_pos_dict[chrom_stop_pos][tag] = [s + 1]
+                            reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)]
+                            mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring]
+                            real_start_end[chrom_stop_pos][tag] = [(start, end)]
+                        else:
+                            mut_read_pos_dict[chrom_stop_pos][tag].append(s + 1)
+                            reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence))
+                            mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring)
+                            real_start_end[chrom_stop_pos][tag].append((start, end))
+                        if nuc == alt:
+                            count_alt += 1
+                            if tag not in mut_read_dict:
+                                mut_read_dict[tag] = {}
+                                mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
                             else:
-                                mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1)
-                                reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence))
-                                mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring)
-                                real_start_end[chrom_stop_pos][tag].append((start, end))
-                            if nuc == alt:
-                                count_alt += 1
-                                if tag not in mut_read_dict:
-                                    mut_read_dict[tag] = {}
-                                    mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
-                                else:
-                                    mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
-                            elif nuc == ref:
-                                count_ref += 1
-                            elif nuc == "N":
-                                count_n += 1
-                            elif nuc == "lowQ":
-                                count_lowq += 1
-                            else:
-                                count_other += 1
+                                mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
+                        elif nuc == ref:
+                            count_ref += 1
+                        elif nuc == "N":
+                            count_n += 1
+                        elif nuc == "lowQ":
+                            count_lowq += 1
                         else:
-                            count_indel += 1
+                            count_other += 1
+                    else:
+                        count_indel += 1
 
     mut_array = np.array(mut_array)
     for read in bam.fetch(until_eof=True):
         if read.is_unmapped:
             pure_tag = read.query_name[:-5]
             nuc = "na"
-            for key in tag_dict[pure_tag].keys():
-                if key not in mut_dict:
-                    mut_dict[key] = {}
-                if read.query_name not in mut_dict[key]:
-                    mut_dict[key][read.query_name] = {}
-                if nuc in mut_dict[key][read.query_name]:
-                    mut_dict[key][read.query_name][nuc] += 1
-                else:
-                    mut_dict[key][read.query_name][nuc] = 1
+            if pure_tag in tag_dict.keys():  # stored all ref and alt reads --> get only alt reads
+                for key in tag_dict[pure_tag].keys():
+                    if key not in mut_dict:
+                        mut_dict[key] = {}
+                    if read.query_name not in mut_dict[key]:
+                        mut_dict[key][read.query_name] = {}
+                    if nuc in mut_dict[key][read.query_name]:
+                        mut_dict[key][read.query_name][nuc] += 1
+                    else:
+                        mut_dict[key][read.query_name][nuc] = 1
     bam.close()
-
     # create pure_tags_dict
     pure_tags_dict = {}
+    pure_tags_dict_ref = {}
     for key1, value1 in sorted(mut_dict.items()):
         i = np.where(np.array(['#'.join(str(i) for i in z)
                                for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
         ref = mut_array[i, 2]
         alt = mut_array[i, 3]
         pure_tags_dict[key1] = {}
+        pure_tags_dict_ref[key1] = {}
         for key2, value2 in sorted(value1.items()):
             for key3, value3 in value2.items():
                 pure_tag = key2[:-5]
@@ -247,6 +349,12 @@
                     else:
                         pure_tags_dict[key1][pure_tag] = 1
 
+                if key3 == ref:
+                    if pure_tag in pure_tags_dict_ref[key1]:
+                        pure_tags_dict_ref[key1][pure_tag] += 1
+                    else:
+                        pure_tags_dict_ref[key1][pure_tag] = 1
+
     # create pure_tags_dict_short with thresh
     if thresh > 0:
         pure_tags_dict_short = {}
@@ -279,7 +387,7 @@
     format23 = workbook3.add_format({'bg_color': '#FFC7CE'})  # red
     format33 = workbook3.add_format({'bg_color': '#FACC2E'})  # yellow
 
-    header_line = ('variant ID', 'tier', 'tag', 'mate', 'read pos.ab', 'read pos.ba', 'read median length.ab',
+    header_line = ('variant ID', 'tier', 'allele', 'tag', 'mate', 'read pos.ab', 'read pos.ba', 'read median length.ab',
                    'read median length.ba', 'DCS median length',
                    'FS.ab', 'FS.ba', 'FSqc.ab', 'FSqc.ba', 'ref.ab', 'ref.ba', 'alt.ab', 'alt.ba',
                    'rel. ref.ab', 'rel. ref.ba', 'rel. alt.ab', 'rel. alt.ba',
@@ -288,6 +396,7 @@
                    'in phase', 'chimeric tag')
     ws1.write_row(0, 0, header_line)
     csv_writer.writerow(header_line)
+
     counter_tier11 = 0
     counter_tier12 = 0
     counter_tier21 = 0
@@ -308,12 +417,14 @@
 
     row = 1
     tier_dict = {}
+    tier_dict_ref = {}
     chimera_dict = {}
     for key1, value1 in sorted(mut_dict.items()):
         counts_mut = 0
         chimeric_tag_list = []
         chimeric_tag = {}
-        if key1 in pure_tags_dict_short.keys():
+        if (key1 in pure_tags_dict_short.keys()) or (key1 in pure_tags_dict_ref.keys()):  # ref or alt
+
             change_tier_after_print = []
             i = np.where(np.array(['#'.join(str(i) for i in z)
                                    for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
@@ -323,11 +434,13 @@
             whole_array = list(pure_tags_dict_short[key1].keys())
 
             tier_dict[key1] = {}
+            tier_dict_ref[key1] = {}
             values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 2.5", 0),
                                 ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0),
                                 ("tier 6", 0), ("tier 7", 0)]
             for k, v in values_tier_dict:
                 tier_dict[key1][k] = v
+                tier_dict_ref[key1][k] = v
 
             used_keys = []
             if 'ab' in mut_pos_dict[key1].keys():
@@ -349,7 +462,16 @@
             for key2, value2 in sorted(value1.items()):
                 add_mut14 = ""
                 add_mut23 = ""
-                if (key2[:-5] in pure_tags_dict_short[key1].keys()) and (key2[:-5] not in used_keys) and (key1 in tag_dict[key2[:-5]].keys()):
+                if key2[:-5] not in tag_dict.keys() and key2[:-5] not in tag_dict_ref.keys():  # skip reads that have not alt or ref
+                    continue
+
+                if ((key2[:-5] in tag_dict.keys() and key2[:-5] in pure_tags_dict_short[key1].keys() and key1 in tag_dict[key2[:-5]].keys()) or (key2[:-5] in tag_dict_ref.keys() and key2[:-5] in pure_tags_dict_ref[key1].keys() and key1 in tag_dict_ref[key2[:-5]].keys())) and (key2[:-5] not in used_keys):
+
+                    if key2[:-5] in pure_tags_dict_short[key1].keys():
+                        variant_type = "alt"
+                    elif key2[:-5] in pure_tags_dict_ref[key1].keys():
+                        variant_type = "ref"
+
                     if key2[:-5] + '.ab.1' in mut_dict[key1].keys():
                         total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values())
                         if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
@@ -550,36 +672,59 @@
 
                     used_keys.append(key2[:-5])
                     counts_mut += 1
-                    if (alt1f + alt2f + alt3f + alt4f) > 0.5:
+                    if (variant_type == "alt" and ((alt1f + alt2f + alt3f + alt4f) > 0.5)) or (variant_type == "ref" and ((ref1f + ref2f + ref3f + ref4f) > 0.5)):
+                        if variant_type == "alt":
+                            tier1ff, tier2ff, tier3ff, tier4ff = alt1f, alt2f, alt3f, alt4f
+                            tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim = alt1f, alt2f, alt3f, alt4f
+                        elif variant_type == "ref":
+                            tier1ff, tier2ff, tier3ff, tier4ff = ref1f, ref2f, ref3f, ref4f
+                            tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim = ref1f, ref2f, ref3f, ref4f
+
                         total1new_trim, total2new_trim, total3new_trim, total4new_trim = total1new, total2new, total3new, total4new
                         if total1new == 0:
                             ref1f = alt1f = None
                             alt1ff = -1
                             alt1ff_trim = -1
+                            tier1ff = -1
+                            tier1ff_trim = -1
                         else:
                             alt1ff = alt1f
                             alt1ff_trim = alt1f
+                            tier1ff = tier1ff
+                            tier1ff_trim = tier1ff_trim
                         if total2new == 0:
                             ref2f = alt2f = None
                             alt2ff = -1
                             alt2ff_trim = -1
+                            tier2ff = -1
+                            tier2ff_trim = -1
                         else:
                             alt2ff = alt2f
                             alt2ff_trim = alt2f
+                            tier2ff = tier2ff
+                            tier2ff_trim = tier2ff_trim
                         if total3new == 0:
                             ref3f = alt3f = None
                             alt3ff = -1
                             alt3ff_trim = -1
+                            tier3ff = -1
+                            tier3ff_trim = -1
                         else:
                             alt3ff = alt3f
                             alt3ff_trim = alt3f
+                            tier3ff = tier3ff
+                            tier3ff_trim = tier3ff_trim
                         if total4new == 0:
                             ref4f = alt4f = None
                             alt4ff = -1
                             alt4ff_trim = -1
+                            tier4ff = -1
+                            tier4ff_trim = -1
                         else:
                             alt4ff = alt4f
                             alt4ff_trim = alt4f
+                            tier4ff = tier4ff
+                            tier4ff_trim = tier4ff_trim
 
                         beg1 = beg4 = beg2 = beg3 = 0
 
@@ -605,7 +750,7 @@
                         # mate 1 - SSCS ab
                         softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1]
                         safe_div_result = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1)))
-                        if (safe_div_result == None):
+                        if (safe_div_result is None):
                             ratio1 = False
                         else:
                             ratio1 = safe_div_result >= threshold_reads
@@ -629,7 +774,7 @@
                         # mate 1 - SSCS ba
                         softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4]
                         safe_div_result = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4)))
-                        if (safe_div_result == None):
+                        if (safe_div_result is None):
                             ratio4 = False
                         else:
                             ratio4 = safe_div_result >= threshold_reads
@@ -653,7 +798,7 @@
                         # mate 2 - SSCS ab
                         softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2]
                         safe_div_result = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2)))
-                        if (safe_div_result == None):
+                        if (safe_div_result is None):
                             ratio2 = False
                         else:
                             ratio2 = safe_div_result >= threshold_reads
@@ -677,7 +822,7 @@
                         # mate 2 - SSCS ba
                         softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3]
                         safe_div_result = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3)))
-                        if (safe_div_result == None):
+                        if (safe_div_result is None):
                             ratio3 = False
                         else:
                             ratio3 = safe_div_result >= threshold_reads
@@ -698,21 +843,21 @@
                             ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads
                             ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads
 
-                        if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) &  # contradictory variant
-                            all(float(ij) == 0. for ij in [alt2ff, alt3ff])) |
-                            (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) &
-                             all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
-                            alt1ff = 0
-                            alt4ff = 0
-                            alt2ff = 0
-                            alt3ff = 0
+                        if ((all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff]) &  # contradictory variant
+                            all(float(ij) == 0. for ij in [tier2ff, tier3ff])) |
+                            (all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]) &
+                             all(float(ij) == 0. for ij in [tier1ff, tier4ff]))):
+                            tier1ff = 0
+                            tier4ff = 0
+                            tier2ff = 0
+                            tier3ff = 0
                             trimmed = False
                             contradictory = True
                         # softclipping tiers
                         # information of both mates available --> all reads for both mates and SSCS are softclipped
                         elif (ratio1 & ratio4 & ratio2 & ratio3 &
                               (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
-                              all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):  # all mates available
+                              all(float(ij) > 0. for ij in [tier1ff, tier2ff, tier3ff, tier4ff])):  # all mates available
                             # if distance between softclipping and mutation is at start or end of the read smaller than threshold
                             softclipped_mutation_allMates = True
                             softclipped_mutation_oneOfTwoMates = False
@@ -720,16 +865,16 @@
                             softclipped_mutation_oneOfTwoSSCS_diffMates = False
                             softclipped_mutation_oneMate = False
                             softclipped_mutation_oneMateOneSSCS = False
-                            alt1ff = 0
-                            alt4ff = 0
-                            alt2ff = 0
-                            alt3ff = 0
+                            tier1ff = 0
+                            tier4ff = 0
+                            tier2ff = 0
+                            tier3ff = 0
                             trimmed = False
                             contradictory = False
                         # information of both mates available --> only one mate softclipped
                         elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) |
                                (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) &
-                              all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):  # all mates available
+                              all(float(ij) > 0. for ij in [tier1ff, tier2ff, tier3ff, tier4ff])):  # all mates available
                             # if distance between softclipping and mutation is at start or end of the read smaller than threshold
                             min_start1 = min(min([ij[0] for ij in ref_positions1]), min([ij[0] for ij in ref_positions4]))  # red
                             min_start2 = min(min([ij[0] for ij in ref_positions2]), min([ij[0] for ij in ref_positions3]))  # blue
@@ -763,10 +908,10 @@
                                     read_len_median3 = read_len_median3 - n_spacer_barcode
                             else:
                                 softclipped_mutation_oneOfTwoMates = True
-                                alt1ff = 0
-                                alt4ff = 0
-                                alt2ff = 0
-                                alt3ff = 0
+                                tier1ff = 0
+                                tier4ff = 0
+                                tier2ff = 0
+                                tier3ff = 0
                                 trimmed = False
                                 contradictory = False
                             softclipped_mutation_allMates = False
@@ -780,6 +925,7 @@
                                     total1new = 0
                                     alt1ff = 0
                                     alt1f = 0
+                                    tier1ff = 0
                                     trimmed = True
 
                                 if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))):
@@ -787,6 +933,7 @@
                                     total4new = 0
                                     alt4ff = 0
                                     alt4f = 0
+                                    tier4ff = 0
                                     trimmed = True
 
                                 if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))):
@@ -794,6 +941,7 @@
                                     total2new = 0
                                     alt2ff = 0
                                     alt2f = 0
+                                    tier2ff = 0
                                     trimmed = True
 
                                 if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))):
@@ -801,6 +949,7 @@
                                     total3new = 0
                                     alt3ff = 0
                                     alt3f = 0
+                                    tier3ff = 0
                                     trimmed = True
                                 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
                                 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
@@ -808,7 +957,7 @@
                         # information of both mates available --> only one mate softclipped
                         elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
                               ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
-                              all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):  # all mates available
+                              all(float(ij) > 0. for ij in [tier1ff, tier2ff, tier3ff, tier4ff])):  # all mates available
                             # if distance between softclipping and mutation is at start or end of the read smaller than threshold
                             softclipped_mutation_allMates = False
                             softclipped_mutation_oneOfTwoMates = False
@@ -816,18 +965,18 @@
                             softclipped_mutation_oneOfTwoSSCS_diffMates = False
                             softclipped_mutation_oneMate = False
                             softclipped_mutation_oneMateOneSSCS = False
-                            alt1ff = 0
-                            alt4ff = 0
-                            alt2ff = 0
-                            alt3ff = 0
+                            tier1ff = 0
+                            tier4ff = 0
+                            tier2ff = 0
+                            tier3ff = 0
                             trimmed = False
                             contradictory = False
 
                         # information of one mate available --> all reads of one mate are softclipped
                         elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) &
-                              all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) |
+                              all(float(ij) < 0. for ij in [tier2ff, tier3ff]) & all(float(ij) > 0. for ij in [tier1ff, tier4ff])) |
                               (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
-                              all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))):  # all mates available
+                              all(float(ij) < 0. for ij in [tier1ff, tier4ff]) & all(float(ij) > 0. for ij in [tier2ff, tier3ff]))):  # all mates available
                             # if distance between softclipping and mutation is at start or end of the read smaller than threshold
                             softclipped_mutation_allMates = False
                             softclipped_mutation_oneOfTwoMates = False
@@ -835,17 +984,17 @@
                             softclipped_mutation_oneOfTwoSSCS_diffMates = False
                             softclipped_mutation_oneMate = True
                             softclipped_mutation_oneMateOneSSCS = False
-                            alt1ff = 0
-                            alt4ff = 0
-                            alt2ff = 0
-                            alt3ff = 0
+                            tier1ff = 0
+                            tier4ff = 0
+                            tier2ff = 0
+                            tier3ff = 0
                             trimmed = False
                             contradictory = False
                         # information of one mate available --> only one SSCS is softclipped
                         elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
-                              (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) |
+                              (all(float(ij) < 0. for ij in [tier2ff, tier3ff]) & all(float(ij) > 0. for ij in [tier1ff, tier4ff]))) |
                               (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
-                              (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))):  # all mates available
+                              (all(float(ij) < 0. for ij in [tier1ff, tier4ff]) & all(float(ij) < 0. for ij in [tier2ff, tier3ff])))):  # all mates available
                             # if distance between softclipping and mutation is at start or end of the read smaller than threshold
                             softclipped_mutation_allMates = False
                             softclipped_mutation_oneOfTwoMates = False
@@ -853,10 +1002,10 @@
                             softclipped_mutation_oneOfTwoSSCS_diffMates = False
                             softclipped_mutation_oneMate = False
                             softclipped_mutation_oneMateOneSSCS = True
-                            alt1ff = 0
-                            alt4ff = 0
-                            alt2ff = 0
-                            alt3ff = 0
+                            tier1ff = 0
+                            tier4ff = 0
+                            tier2ff = 0
+                            tier3ff = 0
                             trimmed = False
                             contradictory = False
 
@@ -866,6 +1015,7 @@
                                 total1new = 0
                                 alt1ff = 0
                                 alt1f = 0
+                                tier1ff = 0
                                 trimmed = True
 
                             if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))):
@@ -873,6 +1023,7 @@
                                 total4new = 0
                                 alt4ff = 0
                                 alt4f = 0
+                                tier4ff = 0
                                 trimmed = True
 
                             if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))):
@@ -880,6 +1031,7 @@
                                 total2new = 0
                                 alt2ff = 0
                                 alt2f = 0
+                                tier2ff = 0
                                 trimmed = True
 
                             if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))):
@@ -887,117 +1039,145 @@
                                 total3new = 0
                                 alt3ff = 0
                                 alt3f = 0
+                                tier3ff = 0
                                 trimmed = True
                             details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
                             details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
 
                         # assign tiers
                         if ((all(int(ij) >= 3 for ij in [total1new, total4new]) &
-                             all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
+                             all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) |
                             (all(int(ij) >= 3 for ij in [total2new, total3new]) &
-                             all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
+                             all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))):
                             tier = "1.1"
                             counter_tier11 += 1
-                            tier_dict[key1]["tier 1.1"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 1.1"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 1.1"] += 1
 
                         elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) &
                               any(int(ij) >= 3 for ij in [total1new, total4new]) &
                               any(int(ij) >= 3 for ij in [total2new, total3new]) &
-                              all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
+                              all(float(ij) >= 0.75 for ij in [tier1ff, tier2ff, tier3ff, tier4ff])):
                             tier = "1.2"
                             counter_tier12 += 1
-                            tier_dict[key1]["tier 1.2"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 1.2"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 1.2"] += 1
 
                         elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
                                any(int(ij) >= 3 for ij in [total1new, total4new]) &
-                               all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
+                               all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) |
                               (all(int(ij) >= 1 for ij in [total2new, total3new]) &
                                any(int(ij) >= 3 for ij in [total2new, total3new]) &
-                               all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
+                               all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))):
                             tier = "2.1"
                             counter_tier21 += 1
-                            tier_dict[key1]["tier 2.1"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 2.1"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 2.1"] += 1
 
                         elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) &
-                              all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
+                              all(float(ij) >= 0.75 for ij in [tier1ff, tier2ff, tier3ff, tier4ff])):
                             tier = "2.2"
                             counter_tier22 += 1
-                            tier_dict[key1]["tier 2.2"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 2.2"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 2.2"] += 1
 
                         elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
                                any(int(ij) >= 3 for ij in [total2new, total3new]) &
-                               all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) &
-                               any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff])) |
+                               all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff]) &
+                               any(float(ij) >= 0.75 for ij in [tier2ff, tier3ff])) |
                               (all(int(ij) >= 1 for ij in [total2new, total3new]) &
                                any(int(ij) >= 3 for ij in [total1new, total4new]) &
-                               all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) &
-                               any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))):
+                               all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]) &
+                               any(float(ij) >= 0.75 for ij in [tier1ff, tier4ff]))):
                             tier = "2.3"
                             counter_tier23 += 1
-                            tier_dict[key1]["tier 2.3"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 2.3"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 2.3"] += 1
 
                         elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
-                               all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
+                               all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) |
                               (all(int(ij) >= 1 for ij in [total2new, total3new]) &
-                               all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
+                               all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))):
                             tier = "2.4"
                             counter_tier24 += 1
-                            tier_dict[key1]["tier 2.4"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 2.4"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 2.4"] += 1
 
                         elif ((len(pure_tags_dict_short[key1]) > 1) &
-                              (all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) |
-                               all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
+                              (all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff]) |
+                               all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]))):
                             tier = "3.1"
                             counter_tier31 += 1
-                            tier_dict[key1]["tier 3.1"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 3.1"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 3.1"] += 1
 
                         elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
-                               all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff])) |
+                               all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff])) |
                               (all(int(ij) >= 1 for ij in [total2new, total3new]) &
-                               all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
+                               all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]))):
                             tier = "3.2"
                             counter_tier32 += 1
-                            tier_dict[key1]["tier 3.2"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 3.2"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 3.2"] += 1
 
                         elif (trimmed):
                             tier = "4"
                             counter_tier4 += 1
-                            tier_dict[key1]["tier 4"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 4"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 4"] += 1
 
                             # assign tiers
                             if ((all(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) &
-                                 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) |
+                                 all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) |
                                 (all(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) &
-                                 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))):
+                                 all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))):
                                 trimmed_actual_high_tier = True
                             elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) &
                                   any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) &
                                   any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) &
-                                  all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])):
+                                  all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim])):
                                 trimmed_actual_high_tier = True
                             elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) &
                                    any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) &
-                                   all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) |
+                                   all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) |
                                   (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) &
                                    any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) &
-                                   all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))):
+                                   all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))):
                                 trimmed_actual_high_tier = True
                             elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) &
-                                  all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])):
+                                  all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim])):
                                 trimmed_actual_high_tier = True
                             elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) &
                                    any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) &
-                                   all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]) &
-                                   any(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim])) |
+                                   all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim]) &
+                                   any(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim])) |
                                   (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) &
                                    any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) &
-                                   all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]) &
-                                   any(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]))):
+                                   all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]) &
+                                   any(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim]))):
                                 trimmed_actual_high_tier = True
                             elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) &
-                                   all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) |
+                                   all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) |
                                   (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) &
-                                   all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))):
+                                   all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))):
                                 trimmed_actual_high_tier = True
                             else:
                                 trimmed_actual_high_tier = False
@@ -1005,37 +1185,58 @@
                         elif softclipped_mutation_allMates:
                             tier = "5.1"
                             counter_tier51 += 1
-                            tier_dict[key1]["tier 5.1"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 5.1"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 5.1"] += 1
 
                         elif softclipped_mutation_oneOfTwoMates:
                             tier = "5.2"
                             counter_tier52 += 1
-                            tier_dict[key1]["tier 5.2"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 5.2"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 5.2"] += 1
 
                         elif softclipped_mutation_oneOfTwoSSCS:
                             tier = "5.3"
                             counter_tier53 += 1
-                            tier_dict[key1]["tier 5.3"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 5.3"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 5.3"] += 1
 
                         elif softclipped_mutation_oneMate:
                             tier = "5.4"
                             counter_tier54 += 1
-                            tier_dict[key1]["tier 5.4"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 5.4"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 5.4"] += 1
 
                         elif softclipped_mutation_oneMateOneSSCS:
                             tier = "5.5"
                             counter_tier55 += 1
-                            tier_dict[key1]["tier 5.5"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 5.5"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 5.5"] += 1
 
                         elif (contradictory):
                             tier = "6"
                             counter_tier6 += 1
-                            tier_dict[key1]["tier 6"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 6"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 6"] += 1
 
                         else:
                             tier = "7"
                             counter_tier7 += 1
-                            tier_dict[key1]["tier 7"] += 1
+                            if variant_type == "alt":
+                                tier_dict[key1]["tier 7"] += 1
+                            elif variant_type == "ref":
+                                tier_dict_ref[key1]["tier 7"] += 1
 
                         chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
                         var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
@@ -1114,13 +1315,12 @@
                             read_pos2 = read_len_median2 = None
                         if (read_pos3 == -1):
                             read_pos3 = read_len_median3 = None
-                        line = (var_id, tier, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera)
+                        line = (var_id, tier, variant_type, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera)
                         # ws1.write_row(row, 0, line)
                         # csv_writer.writerow(line)
-                        line2 = ("", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera)
+                        line2 = ("", "", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera)
                         # ws1.write_row(row + 1, 0, line2)
                         # csv_writer.writerow(line2)
-
                         # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
                         #                       {'type': 'formula',
                         #                        'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1),
@@ -1155,12 +1355,19 @@
             # write to file
             # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4
             sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]])
+            sum_highTiers_ref = sum([tier_dict_ref[key1][ij] for ij in list(sorted(tier_dict_ref[key1].keys()))[:6]])
             correct_tier = False
+            correct_tier_ref = False
             if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0:
                 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"]
                 tier_dict[key1]["tier 4"] = 0
                 correct_tier = True
 
+            if tier_dict_ref[key1]["tier 4"] > 0 and sum_highTiers_ref > 0:
+                tier_dict_ref[key1]["tier 2.5"] = tier_dict_ref[key1]["tier 4"]
+                tier_dict_ref[key1]["tier 4"] = 0
+                correct_tier_ref = True
+
             for sample in change_tier_after_print:
                 row_number = sample[0]
                 line1 = sample[1]
@@ -1168,44 +1375,47 @@
                 actual_high_tier = sample[3]
                 current_tier = list(line1)[1]
 
-                if correct_tier and (current_tier == "4") and actual_high_tier:
+                if line1[2] == "alt" and correct_tier and (current_tier == "4") and actual_high_tier:
+                    line1 = list(line1)
+                    line1[1] = "2.5"
+                    line1 = tuple(line1)
+                    counter_tier25 += 1
+                    counter_tier4 -= 1
+                if line1[2] == "ref" and correct_tier_ref and (current_tier == "4") and actual_high_tier:
                     line1 = list(line1)
                     line1[1] = "2.5"
                     line1 = tuple(line1)
                     counter_tier25 += 1
                     counter_tier4 -= 1
+
                 ws1.write_row(row_number, 0, line1)
                 csv_writer.writerow(line1)
                 ws1.write_row(row_number + 1, 0, line2)
                 csv_writer.writerow(line2)
+                if line1[2] == "alt":
+                    ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
+                    ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
+                    ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
+                elif line1[2] == "ref":
+                    ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
+                    ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
+                    ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
 
-                ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2),
-                                       {'type': 'formula',
-                                        'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1),
-                                        'format': format1,
-                                        'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
-                ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2),
-                                       {'type': 'formula',
-                                        'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1),
-                                        'format': format3,
-                                        'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
-                ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2),
-                                       {'type': 'formula',
-                                        'criteria': '=$B${}>="3"'.format(row_number + 1),
-                                        'format': format2,
-                                        'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
     # sheet 2
     if chimera_correction:
-        header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
-                        'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5',
-                        'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
-                        'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7')
+        header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
+                        'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)',
+                        'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)',
+                        'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)',
+                        'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)'
+                        )
     else:
-        header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
-                        'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5',
-                        'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
-                        'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7')
-
+        header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
+                        'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)',
+                        'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)',
+                        'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)',
+                        'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)'
+                        )
     ws2.write_row(0, 0, header_line2)
     row = 0
 
@@ -1220,9 +1430,12 @@
             alt_count = cvrg_dict[key1][1]
             cvrg = ref_count + alt_count
 
+            ref_tiers = tier_dict_ref[key1]
+
             var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
             lst = [var_id, cvrg]
             used_tiers = []
+            used_tiers_ref = [t for k, t in sorted(ref_tiers.items())]
             cum_af = []
             for key2, value2 in sorted(value1.items()):
                 # calculate cummulative AF
@@ -1233,24 +1446,25 @@
             if sum(used_tiers) == 0:  # skip mutations that are filtered by the VA in the first place
                 continue
             lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)])
-            lst.extend([(cvrg - sum(used_tiers[-10:])), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (cvrg - sum(used_tiers[-10:])))])
+            lst.extend([(sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])), sum(used_tiers_ref[0:7]), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])))])
             if chimera_correction:
                 chimeras_all = chimera_dict[key1][1]
                 new_alt = sum(used_tiers[0:7]) - chimeras_all
                 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:7])))
                 if fraction_chimeras is None:
                     fraction_chimeras = 0.
-                new_cvrg = (cvrg - sum(used_tiers[-10:])) * (1. - fraction_chimeras)
+                new_cvrg = (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])) * (1. - fraction_chimeras)
                 lst.extend([new_cvrg, chimeras_all, safe_div(new_alt, new_cvrg)])
             lst.extend([alt_count, safe_div(alt_count, cvrg)])
             lst.extend(used_tiers)
-            lst.extend(cum_af)
+            lst.extend(used_tiers_ref)
+            # lst.extend(cum_af)
             lst = tuple(lst)
             ws2.write_row(row + 1, 0, lst)
             if chimera_correction:
-                ws2.conditional_format('M{}:N{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$M$1="tier 1.1"', 'format': format12, 'multi_range': 'M{}:N{} M1:N1'.format(row + 2, row + 2)})
-                ws2.conditional_format('O{}:S{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$O$1="tier 2.1"', 'format': format32, 'multi_range': 'O{}:S{} O1:S1'.format(row + 2, row + 2)})
-                ws2.conditional_format('T{}:AC{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$T$1="tier 3.1"', 'format': format22, 'multi_range': 'T{}:AC{} T1:AC1'.format(row + 2, row + 2)})
+                ws2.conditional_format('N{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$N$1="tier 1.1 (alt)"', 'format': format12, 'multi_range': 'N{}:O{} N1:O1 AE{}:AF{} AE1:AF1'.format(row + 2, row + 2, row + 2, row + 2)})
+                ws2.conditional_format('P{}:T{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 2.1 (alt)"', 'format': format32, 'multi_range': 'P{}:T{} P1:T1 AG{}:AK{} AG1:AK1'.format(row + 2, row + 2, row + 2, row + 2)})
+                ws2.conditional_format('U{}:AD{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$U$1="tier 3.1 (alt)"', 'format': format22, 'multi_range': 'U{}:AD{} U1:AD1 AL{}:AU{} AL1:AU1'.format(row + 2, row + 2, row + 2, row + 2)})
             else:
                 ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)})
                 ws2.conditional_format('L{}:P{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format32, 'multi_range': 'L{}:P{} L1:P1'.format(row + 2, row + 2)})
author	mheinzl
date	Fri, 22 Jul 2022 09:19:44 +0000
parents	1797e461d674
children	d7aea14291e8