Mercurial > repos > mheinzl > variant_analyzer2

diff read2mut.py @ 6:11a2a34f8a2b draft
planemo upload for repository https://github.com/gpovysil/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author: mheinzl
date: Mon, 18 Jan 2021 09:49:15 +0000
parents: d9cbf833624e
children: ded0dc6a20d3
--- a/read2mut.py	Tue Oct 27 12:46:55 2020 +0000
+++ b/read2mut.py	Mon Jan 18 09:49:15 2021 +0000
@@ -33,11 +33,13 @@
 import numpy as np
 import pysam
 import xlsxwriter
+from cyvcf2 import VCF
+
 
 def make_argparser():
-    parser = argparse.ArgumentParser(description='Takes a tabular file with mutations, a BAM file and JSON files as input and prints stats about variants to a user specified output file.')
+    parser = argparse.ArgumentParser(description='Takes a VCF file with mutations, a BAM file and JSON files as input and prints stats about variants to a user specified output file.')
     parser.add_argument('--mutFile',
-                        help='TABULAR file with DCS mutations.')
+                        help='VCF file with DCS mutations.')
     parser.add_argument('--bamFile',
                         help='BAM file with aligned raw reads of selected tags (FASTQ created by mut2read.py - trimming with Trimmomatic - alignment with bwa).')
     parser.add_argument('--inputJson',
@@ -45,7 +47,11 @@
     parser.add_argument('--sscsJson',
                         help='JSON file with SSCS counts collected by mut2sscs.py.')
     parser.add_argument('--outputFile',
-                        help='Output xlsx file of mutation details.')
+                        help='Output xlsx file with summary of mutations.')
+    parser.add_argument('--outputFile2',
+                        help='Output xlsx file with allele frequencies of mutations.')
+    parser.add_argument('--outputFile3',
+                        help='Output xlsx file with examples of the tier classification.')
     parser.add_argument('--thresh', type=int, default=0,
                         help='Integer threshold for displaying mutations. Only mutations occuring less than thresh times are displayed. Default of 0 displays all.')
     parser.add_argument('--phred', type=int, default=20,
@@ -53,7 +59,11 @@
     parser.add_argument('--trim', type=int, default=10,
                         help='Integer threshold for assigning mutations at start and end of reads to lower tier. Default 10.')
     parser.add_argument('--chimera_correction', action="store_true",
-                        help='Count chimeric variants and correct the variant frequencies.')
+                        help='Count chimeric variants and correct the variant frequencies')
+    parser.add_argument('--softclipping_dist',  type=int, default=15,
+                        help='Count mutation as an artifact if mutation lies within this parameter away from the softclipping part of the read.')
+    parser.add_argument('--reads_threshold',  type=float, default=1.0,
+                        help='Float number which specifies the minimum percentage of softclipped reads in a family to be considered in the softclipping tiers. Default: 1.0, means all reads of a family have to be softclipped.')
     return parser
 
 
@@ -71,10 +81,14 @@
     json_file = args.inputJson
     sscs_json = args.sscsJson
     outfile = args.outputFile
+    outfile2 = args.outputFile2
+    outfile3 = args.outputFile3
     thresh = args.thresh
     phred_score = args.phred
     trim = args.trim
     chimera_correction = args.chimera_correction
+    thr = args.softclipping_dist
+    threshold_reads = args.reads_threshold
 
     if os.path.isfile(file1) is False:
         sys.exit("Error: Could not find '{}'".format(file1))
@@ -88,10 +102,9 @@
         sys.exit("Error: phred is '{}', but only non-negative integers allowed".format(phred_score))
     if trim < 0:
         sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thresh))
+    if thr <= 0:
+        sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thr))
 
-    with open(file1, 'r') as mut:
-        mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str)
-        
     # load dicts
     with open(json_file, "r") as f:
         (tag_dict, cvrg_dict) = json.load(f)
@@ -103,77 +116,88 @@
     # pysam.index(file2)
     bam = pysam.AlignmentFile(file2, "rb")
 
-    # 4. create mut_dict
+    # create mut_dict
     mut_dict = {}
     mut_read_pos_dict = {}
     mut_read_dict = {}
     reads_dict = {}
-    if mut_array.shape == (1, 13):
-        mut_array = mut_array.reshape((1, len(mut_array)))
+    mut_read_cigar_dict = {}
+    i = 0
+    mut_array = []
 
-    for m in range(0, len(mut_array[:, 0])):
-        print(str(m + 1) + " of " + str(len(mut_array[:, 0])))
-        chrom = mut_array[m, 1]
-        stop_pos = mut_array[m, 2].astype(int)
+    for count, variant in enumerate(VCF(file1)):
+    	#if count == 2000:
+        #    break
+        chrom = variant.CHROM
+        stop_pos = variant.start
         chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
-        ref = mut_array[m, 9]
-        alt = mut_array[m, 10]
-        mut_dict[chrom_stop_pos] = {}
-        mut_read_pos_dict[chrom_stop_pos] = {}
-        reads_dict[chrom_stop_pos] = {}
+        ref = variant.REF
+        alt = variant.ALT[0]
+#        nc = variant.format('NC')
+        ad = variant.format('AD')
+        if len(ref) == len(alt):
+            mut_array.append([chrom, stop_pos, ref, alt])
+            i += 1
+            mut_dict[chrom_stop_pos] = {}
+            mut_read_pos_dict[chrom_stop_pos] = {}
+            reads_dict[chrom_stop_pos] = {}
+            mut_read_cigar_dict[chrom_stop_pos] = {}
 
-        for pileupcolumn in bam.pileup(chrom.tostring(), stop_pos - 2, stop_pos, max_depth=100000000):
-            if pileupcolumn.reference_pos == stop_pos - 1:
-                count_alt = 0
-                count_ref = 0
-                count_indel = 0
-                count_n = 0
-                count_other = 0
-                count_lowq = 0
-                n = 0
-                print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
-                      "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
-                for pileupread in pileupcolumn.pileups:
-                    n += 1
-                    if not pileupread.is_del and not pileupread.is_refskip:
-                        tag = pileupread.alignment.query_name
-                        nuc = pileupread.alignment.query_sequence[pileupread.query_position]
-                        phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33
-                        if phred < phred_score:
-                            nuc = "lowQ"
-                        if tag not in mut_dict[chrom_stop_pos]:
-                            mut_dict[chrom_stop_pos][tag] = {}
-                        if nuc in mut_dict[chrom_stop_pos][tag]:
-                            mut_dict[chrom_stop_pos][tag][nuc] += 1
+            for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
+                if pileupcolumn.reference_pos == stop_pos:
+                    count_alt = 0
+                    count_ref = 0
+                    count_indel = 0
+                    count_n = 0
+                    count_other = 0
+                    count_lowq = 0
+                    n = 0
+                    #print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
+                    #      "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
+                    for pileupread in pileupcolumn.pileups:
+                        n += 1
+                        if not pileupread.is_del and not pileupread.is_refskip:
+                            tag = pileupread.alignment.query_name
+                            nuc = pileupread.alignment.query_sequence[pileupread.query_position]
+                            phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33
+                            if phred < phred_score:
+                                nuc = "lowQ"
+                            if tag not in mut_dict[chrom_stop_pos]:
+                                mut_dict[chrom_stop_pos][tag] = {}
+                            if nuc in mut_dict[chrom_stop_pos][tag]:
+                                mut_dict[chrom_stop_pos][tag][nuc] += 1
+                            else:
+                                mut_dict[chrom_stop_pos][tag][nuc] = 1
+                            if tag not in mut_read_pos_dict[chrom_stop_pos]:
+                                mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1]
+                                reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)]
+                                mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring]
+                            else:
+                            	mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1)
+                            	reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence))
+                            	mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring)
+                            if nuc == alt:
+                                count_alt += 1
+                                if tag not in mut_read_dict:
+                                    mut_read_dict[tag] = {}
+                                    mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
+                                else:
+                                    mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
+                            elif nuc == ref:
+                                count_ref += 1
+                            elif nuc == "N":
+                                count_n += 1
+                            elif nuc == "lowQ":
+                                count_lowq += 1
+                            else:
+                                count_other += 1
                         else:
-                            mut_dict[chrom_stop_pos][tag][nuc] = 1
-                        if tag not in mut_read_pos_dict[chrom_stop_pos]:
-                            mut_read_pos_dict[chrom_stop_pos][tag] = np.array(pileupread.query_position) + 1
-                            reads_dict[chrom_stop_pos][tag] = len(pileupread.alignment.query_sequence)
-                        else:
-                            mut_read_pos_dict[chrom_stop_pos][tag] = np.append(
-                                mut_read_pos_dict[chrom_stop_pos][tag], pileupread.query_position + 1)
-                            reads_dict[chrom_stop_pos][tag] = np.append(
-                                reads_dict[chrom_stop_pos][tag], len(pileupread.alignment.query_sequence))
-                        if nuc == alt:
-                            count_alt += 1
-                            if tag not in mut_read_dict:
-                                mut_read_dict[tag] = {}
-                                mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
-                            else:
-                                mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
-                        elif nuc == ref:
-                            count_ref += 1
-                        elif nuc == "N":
-                            count_n += 1
-                        elif nuc == "lowQ":
-                            count_lowq += 1
-                        else:
-                            count_other += 1
-                    else:
-                        count_indel += 1
-                print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq))
-       
+                            count_indel += 1
+
+                    #print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq))
+        #else:
+        #    print("indels are currently not evaluated")
+    mut_array = np.array(mut_array)
     for read in bam.fetch(until_eof=True):
         if read.is_unmapped:
             pure_tag = read.query_name[:-5]
@@ -189,13 +213,17 @@
                     mut_dict[key][read.query_name][nuc] = 1
     bam.close()
 
-    # 5. create pure_tags_dict
+    # create pure_tags_dict
     pure_tags_dict = {}
     for key1, value1 in sorted(mut_dict.items()):
+    	if len(np.where(np.array(['#'.join(str(i) for i in z)
+                               for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0]) == 0:
+    		continue
+
         i = np.where(np.array(['#'.join(str(i) for i in z)
-                               for z in zip(mut_array[:, 1], mut_array[:, 2])]) == key1)[0][0]
-        ref = mut_array[i, 9]
-        alt = mut_array[i, 10]
+                               for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0][0]
+        ref = mut_array[i, 2]
+        alt = mut_array[i, 3]
         pure_tags_dict[key1] = {}
         for key2, value2 in sorted(value1.items()):
             for key3, value3 in value2.items():
@@ -206,7 +234,7 @@
                     else:
                         pure_tags_dict[key1][pure_tag] = 1
 
-    # 6. create pure_tags_dict_short with thresh
+    # create pure_tags_dict_short with thresh
     if thresh > 0:
         pure_tags_dict_short = {}
         for key, value in sorted(pure_tags_dict.items()):
@@ -215,16 +243,36 @@
     else:
         pure_tags_dict_short = pure_tags_dict
 
-    # 7. output summary with threshold
+    # whole_array = []
+    # for k in pure_tags_dict.values():
+    #    if len(k) != 0:
+    #        keys = k.keys()
+    #        if len(keys) > 1:
+    #            for k1 in keys:
+    #                whole_array.append(k1)
+    #        else:
+    #            whole_array.append(keys[0])
+
+    # output summary with threshold
     workbook = xlsxwriter.Workbook(outfile)
+    workbook2 = xlsxwriter.Workbook(outfile2)
+    workbook3 = xlsxwriter.Workbook(outfile3)
     ws1 = workbook.add_worksheet("Results")
-    ws2 = workbook.add_worksheet("Allele frequencies")
-    ws3 = workbook.add_worksheet("Tiers")
+    ws2 = workbook2.add_worksheet("Allele frequencies")
+    ws3 = workbook3.add_worksheet("Tiers")
 
     format1 = workbook.add_format({'bg_color': '#BCF5A9'})  # green
     format2 = workbook.add_format({'bg_color': '#FFC7CE'})  # red
     format3 = workbook.add_format({'bg_color': '#FACC2E'})  # yellow
 
+    format12 = workbook2.add_format({'bg_color': '#BCF5A9'})  # green
+    format22 = workbook2.add_format({'bg_color': '#FFC7CE'})  # red
+    format32 = workbook2.add_format({'bg_color': '#FACC2E'})  # yellow
+
+    format13 = workbook3.add_format({'bg_color': '#BCF5A9'})  # green
+    format23 = workbook3.add_format({'bg_color': '#FFC7CE'})  # red
+    format33 = workbook3.add_format({'bg_color': '#FACC2E'})  # yellow
+
     header_line = ('variant ID', 'tier', 'tag', 'mate', 'read pos.ab', 'read pos.ba', 'read median length.ab',
                    'read median length.ba', 'DCS median length',
                    'FS.ab', 'FS.ba', 'FSqc.ab', 'FSqc.ba', 'ref.ab', 'ref.ba', 'alt.ab', 'alt.ba',
@@ -244,9 +292,14 @@
     counter_tier32 = 0
     counter_tier41 = 0
     counter_tier42 = 0
-    #if chimera_correction:
+    # if chimera_correction:
     #    counter_tier43 = 0
-    counter_tier5 = 0
+    counter_tier51 = 0
+    counter_tier52 = 0
+    counter_tier53 = 0
+    counter_tier54 = 0
+    counter_tier55 = 0
+    counter_tier6 = 0
 
     row = 1
     tier_dict = {}
@@ -257,14 +310,16 @@
         chimeric_tag = {}
         if key1 in pure_tags_dict_short.keys():
             i = np.where(np.array(['#'.join(str(i) for i in z)
-                                   for z in zip(mut_array[:, 1], mut_array[:, 2])]) == key1)[0][0]
-            ref = mut_array[i, 9]
-            alt = mut_array[i, 10]
+                                   for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0][0]
+            ref = mut_array[i, 2]
+            alt = mut_array[i, 3]
             dcs_median = cvrg_dict[key1][2]
             whole_array = pure_tags_dict_short[key1].keys()
 
             tier_dict[key1] = {}
-            values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 5", 0)]
+            values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0),
+                                ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0),
+                                ("tier 6", 0)]
             for k, v in values_tier_dict:
                 tier_dict[key1][k] = v
 
@@ -455,19 +510,38 @@
 
                     read_pos1 = read_pos2 = read_pos3 = read_pos4 = -1
                     read_len_median1 = read_len_median2 = read_len_median3 = read_len_median4 = 0
-
+                    cigars_dcs1 = cigars_dcs2 = cigars_dcs3 = cigars_dcs4 = []
+                    pos_read1 = pos_read2 = pos_read3 = pos_read4 = []
+                    end_read1 = end_read2 = end_read3 = end_read4 = []
                     if key2[:-5] + '.ab.1' in mut_read_pos_dict[key1].keys():
-                        read_pos1 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ab.1'])
-                        read_len_median1 = np.median(reads_dict[key1][key2[:-5] + '.ab.1'])
+                        read_pos1 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.1']))
+                        read_len_median1 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.1']))
+                        cigars_dcs1 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.1']
+                        #print(mut_read_cigar_dict[key1][key2[:-5] + '.ab.1'])
+                        pos_read1 = mut_read_pos_dict[key1][key2[:-5] + '.ab.1']
+                        #print(cigars_dcs1)
+                        end_read1 = reads_dict[key1][key2[:-5] + '.ab.1']
                     if key2[:-5] + '.ab.2' in mut_read_pos_dict[key1].keys():
-                        read_pos2 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ab.2'])
-                        read_len_median2 = np.median(reads_dict[key1][key2[:-5] + '.ab.2'])
+                        read_pos2 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.2']))
+                        read_len_median2 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.2']))
+                        cigars_dcs2 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.2']
+                        pos_read2 = mut_read_pos_dict[key1][key2[:-5] + '.ab.2']
+                        end_read2 = reads_dict[key1][key2[:-5] + '.ab.2']
                     if key2[:-5] + '.ba.1' in mut_read_pos_dict[key1].keys():
-                        read_pos3 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ba.1'])
-                        read_len_median3 = np.median(reads_dict[key1][key2[:-5] + '.ba.1'])
+                        read_pos3 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.1']))
+                        read_len_median3 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.1']))
+                        cigars_dcs3 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.1']
+                        pos_read3 = mut_read_pos_dict[key1][key2[:-5] + '.ba.1']
+                        end_read3 = reads_dict[key1][key2[:-5] + '.ba.1']
                     if key2[:-5] + '.ba.2' in mut_read_pos_dict[key1].keys():
-                        read_pos4 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ba.2'])
-                        read_len_median4 = np.median(reads_dict[key1][key2[:-5] + '.ba.2'])
+                        read_pos4 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.2']))
+                        read_len_median4 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.2']))
+                        #print(mut_read_cigar_dict[key1][key2[:-5] + '.ba.2'])
+                        cigars_dcs4 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.2']
+
+                        pos_read4 = mut_read_pos_dict[key1][key2[:-5] + '.ba.2']
+                        #print(cigars_dcs4)
+                        end_read4 = reads_dict[key1][key2[:-5] + '.ba.2']
 
                     used_keys.append(key2[:-5])
                     counts_mut += 1
@@ -497,21 +571,225 @@
 
                         details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
                         details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
-                        
-                        
+
                         trimmed = False
                         contradictory = False
+                        softclipped_mutation_allMates = False
+                        softclipped_mutation_oneOfTwoMates = False
+                        softclipped_mutation_oneOfTwoSSCS = False
+                        softclipped_mutation_oneMate = False
+                        softclipped_mutation_oneMateOneSSCS = False
+                        print()
+                        print(key1, cigars_dcs1, cigars_dcs4, cigars_dcs2, cigars_dcs3)
+                        dist_start_read1 = dist_start_read2 = dist_start_read3 = dist_start_read4 = []
+                        dist_end_read1 = dist_end_read2 = dist_end_read3 = dist_end_read4 = []
+                        ratio_dist_start1 = ratio_dist_start2 = ratio_dist_start3 = ratio_dist_start4 = False
+                        ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False
 
-                        if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant
+                        # mate 1 - SSCS ab
+                        softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1]
+                        ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads
+
+                        if any(ij is True for ij in softclipped_idx1):
+                        	softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1]
+                        	softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1]
+                        	softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1]
+                        	dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)]
+                        	dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)]
+                        	
+                        	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
+                        	if any(ij is True for ij in softclipped_both_ends_idx1):
+                        		print(softclipped_both_ends_idx1)
+                        		for nr, indx in enumerate(softclipped_both_ends_idx1):
+                        			if indx:
+                        				if dist_start_read1[nr] <= dist_end_read1[nr]:
+                        					dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number
+                        				else:
+                        					dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number
+                        	ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads
+                        	ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads
+                        print(key1, "mate1 ab", dist_start_read1, dist_end_read1, cigars_dcs1, ratio1, ratio_dist_start1, ratio_dist_end1)
+
+                        # mate 1 - SSCS ba
+                        softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4]
+                        ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads
+                        if any(ij is True for ij in softclipped_idx4):
+                        	softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4]
+                        	softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4]
+                        	softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4]
+                        	dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)]
+                        	dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)]
+                        	
+                        	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
+                        	if any(ij is True for ij in softclipped_both_ends_idx4):
+                        		print(softclipped_both_ends_idx4)
+                        		for nr, indx in enumerate(softclipped_both_ends_idx4):
+                        			if indx:
+                        				if dist_start_read4[nr] <= dist_end_read4[nr]:
+                        					dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number
+                        				else:
+                        					dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number
+                       		ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads
+                        	ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads
+                        print(key1, "mate1 ba", dist_start_read4, dist_end_read4,cigars_dcs4, ratio4, ratio_dist_start4, ratio_dist_end4)
+
+                        # mate 2 - SSCS ab
+                        softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2]
+                        #print(sum(softclipped_idx2))
+                        ratio2 = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) >= threshold_reads
+                        if any(ij is True for ij in softclipped_idx2):
+                            softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2]
+                            softclipped_start2 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs2]
+                            softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2]
+                            dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)]
+                            dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)]
+                            	
+                        	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
+                            if any(ij is True for ij in softclipped_both_ends_idx2):
+                            	print(softclipped_both_ends_idx2)
+                                for nr, indx in enumerate(softclipped_both_ends_idx2):
+                                    if indx:
+                                        if dist_start_read2[nr] <= dist_end_read2[nr]:
+                                            dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number
+                                        else:
+                                            dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number
+                            ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads
+                            #print(ratio_dist_end2)
+                            #print([True if x <= thr else False for x in ratio_dist_end2])
+                            ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads
+                        print(key1, "mate2 ab", dist_start_read2, dist_end_read2,cigars_dcs2, ratio2, ratio_dist_start2, ratio_dist_end2)
+
+                        # mate 2 - SSCS ba
+                        softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3]
+                        ratio3 = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) >= threshold_reads
+                        if any(ij is True for ij in softclipped_idx3):
+                            softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3]
+                            softclipped_start3 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs3]
+                            softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3]
+                            dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)]
+                            dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)]
+                            
+                        	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
+                            if any(ij is True for ij in softclipped_both_ends_idx3):
+                            	print(softclipped_both_ends_idx3)
+                                for nr, indx in enumerate(softclipped_both_ends_idx3):
+                                    if indx:
+                                        if dist_start_read3[nr] <= dist_end_read3[nr]:
+                                            dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh
+                                        else:
+                                            dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number
+                            #print([True if x <= thr else False for x in dist_start_read3])
+                            ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads
+                            ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads
+                        print(key1, "mate2 ba", dist_start_read3, dist_end_read3,cigars_dcs3, ratio3, ratio_dist_start3, ratio_dist_end3)
+
+                        if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) &  # contradictory variant
                             all(float(ij) == 0. for ij in [alt2ff, alt3ff])) |
                             (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) &
-                            all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
+                             all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
                             alt1ff = 0
                             alt4ff = 0
                             alt2ff = 0
                             alt3ff = 0
                             trimmed = False
                             contradictory = True
+                        # softclipping tiers
+                        # information of both mates available --> all reads for both mates and SSCS are softclipped
+                        elif (ratio1 & ratio4 & ratio2 & ratio3 & 
+                              (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
+                              all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
+                        	# if distance between softclipping and mutation is at start or end of the read smaller than threshold
+                            softclipped_mutation_allMates = True
+                            softclipped_mutation_oneOfTwoMates = False
+                            softclipped_mutation_oneOfTwoSSCS = False
+                            softclipped_mutation_oneMate = False
+                            softclipped_mutation_oneMateOneSSCS = False
+                            alt1ff = 0
+                            alt4ff = 0
+                            alt2ff = 0
+                            alt3ff = 0
+                            trimmed = False
+                            contradictory = False
+                            print(key1, "softclipped_mutation_allMates", softclipped_mutation_allMates)
+                        # information of both mates available --> only one mate softclipped
+                        elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) |
+                               (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) &
+                               all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
+                        	# if distance between softclipping and mutation is at start or end of the read smaller than threshold
+                            softclipped_mutation_allMates = False
+                            softclipped_mutation_oneOfTwoMates = True
+                            softclipped_mutation_oneOfTwoSSCS = False
+                            softclipped_mutation_oneMate = False
+                            softclipped_mutation_oneMateOneSSCS = False
+                            alt1ff = 0
+                            alt4ff = 0
+                            alt2ff = 0
+                            alt3ff = 0
+                            trimmed = False
+                            contradictory = False
+                            print(key1, "softclipped_mutation_oneOfTwoMates", softclipped_mutation_oneOfTwoMates)
+                        # information of both mates available --> only one mate softclipped
+                        elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
+                              ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
+                               all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
+                        	# if distance between softclipping and mutation is at start or end of the read smaller than threshold
+                            softclipped_mutation_allMates = False
+                            softclipped_mutation_oneOfTwoMates = False
+                            softclipped_mutation_oneOfTwoSSCS = True
+                            softclipped_mutation_oneMate = False
+                            softclipped_mutation_oneMateOneSSCS = False
+                            alt1ff = 0
+                            alt4ff = 0
+                            alt2ff = 0
+                            alt3ff = 0
+                            trimmed = False
+                            contradictory = False
+                            print(key1, "softclipped_mutation_oneOfTwoSSCS", softclipped_mutation_oneOfTwoSSCS, [alt1ff, alt2ff, alt3ff, alt4ff])
+                        # information of one mate available --> all reads of one mate are softclipped
+                        elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) &
+                              all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) |
+                        	  (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
+                        	  all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available
+                        	# if distance between softclipping and mutation is at start or end of the read smaller than threshold
+                            #if ((((len(dist_start_read1) > 0 | len(dist_end_read1) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1))) &
+                        #		((len(dist_start_read4) > 0 | len(dist_end_read4) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4)))) |
+                        #		(((len(dist_start_read2) > 0 | len(dist_end_read2) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2))) &
+                       # 		((len(dist_start_read3) > 0 | len(dist_end_read3) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3))))):
+                            softclipped_mutation_allMates = False
+                            softclipped_mutation_oneOfTwoMates = False
+                            softclipped_mutation_oneOfTwoSSCS = False
+                            softclipped_mutation_oneMate = True
+                            softclipped_mutation_oneMateOneSSCS = False
+                            alt1ff = 0
+                            alt4ff = 0
+                            alt2ff = 0
+                            alt3ff = 0
+                            trimmed = False
+                            contradictory = False
+                            print(key1, "softclipped_mutation_oneMate", softclipped_mutation_oneMate)
+                        # information of one mate available --> only one SSCS is softclipped
+                        elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
+                              (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) |
+                        	  (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
+                        	  (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available
+                        	# if distance between softclipping and mutation is at start or end of the read smaller than threshold
+                            #if ((all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1)) |
+                        	#	all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4))) |
+                        #		(all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2)) |
+                       # 		all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3)))):
+                            softclipped_mutation_allMates = False
+                            softclipped_mutation_oneOfTwoMates = False
+                            softclipped_mutation_oneOfTwoSSCS = False
+                            softclipped_mutation_oneMate = False
+                            softclipped_mutation_oneMateOneSSCS = True
+                            alt1ff = 0
+                            alt4ff = 0
+                            alt2ff = 0
+                            alt3ff = 0
+                            trimmed = False
+                            contradictory = False
+                            print(key1, "softclipped_mutation_oneMateOneSSCS", softclipped_mutation_oneMateOneSSCS)
+
                         else:
                             if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))):
                                 beg1 = total1new
@@ -526,14 +804,14 @@
                                 alt4ff = 0
                                 alt4f = 0
                                 trimmed = True
-                            
+
                             if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))):
                                 beg2 = total2new
                                 total2new = 0
                                 alt2ff = 0
                                 alt2f = 0
                                 trimmed = True
-                                
+
                             if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))):
                                 beg3 = total3new
                                 total3new = 0
@@ -621,10 +899,35 @@
                             counter_tier42 += 1
                             tier_dict[key1]["tier 4.2"] += 1
 
+                        elif softclipped_mutation_allMates:
+                            tier = "5.1"
+                            counter_tier51 += 1
+                            tier_dict[key1]["tier 5.1"] += 1
+
+                        elif softclipped_mutation_oneOfTwoMates:
+                            tier = "5.2"
+                            counter_tier52 += 1
+                            tier_dict[key1]["tier 5.2"] += 1
+
+                        elif softclipped_mutation_oneOfTwoSSCS:
+                            tier = "5.3"
+                            counter_tier53 += 1
+                            tier_dict[key1]["tier 5.3"] += 1
+
+                        elif softclipped_mutation_oneMate:
+                            tier = "5.4"
+                            counter_tier54 += 1
+                            tier_dict[key1]["tier 5.4"] += 1
+
+                        elif softclipped_mutation_oneMateOneSSCS:
+                            tier = "5.5"
+                            counter_tier55 += 1
+                            tier_dict[key1]["tier 5.5"] += 1
+
                         else:
-                            tier = "5"
-                            counter_tier5 += 1
-                            tier_dict[key1]["tier 5"] += 1
+                            tier = "6"
+                            counter_tier6 += 1
+                            tier_dict[key1]["tier 6"] += 1
 
                         chrom, pos = re.split(r'\#', key1)
                         var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
@@ -702,7 +1005,7 @@
                             if key_chimera in chimeric_tag.keys():
                                 chimeric_tag[key_chimera].append(float(tier))
                             else:
-                                chimeric_tag[key_chimera] = [float(tier)] 
+                                chimeric_tag[key_chimera] = [float(tier)]
 
                         if (read_pos1 == -1):
                             read_pos1 = read_len_median1 = None
@@ -750,13 +1053,13 @@
     if chimera_correction:
         header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'chimeras in AC alt (all tiers)', 'chimera-corrected cvrg', 'chimera-corrected AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'chimeras in AC alt (tiers 1.1-2.4)', 'chimera-corrected cvrg (tiers 1.1-2.4)', 'chimera-corrected AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
                     'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
-                    'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
-                    'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5')
+                    'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
+                    'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6')
     else:
         header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
-                    'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
-                    'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
-                    'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5')
+                        'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
+                        'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
+                        'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6')
 
     ws2.write_row(0, 0, header_line2)
     row = 0
@@ -764,9 +1067,9 @@
     for key1, value1 in sorted(tier_dict.items()):
         if key1 in pure_tags_dict_short.keys():
             i = np.where(np.array(['#'.join(str(i) for i in z)
-                                   for z in zip(mut_array[:, 1], mut_array[:, 2])]) == key1)[0][0]
-            ref = mut_array[i, 9]
-            alt = mut_array[i, 10]
+                                   for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0][0]
+            ref = mut_array[i, 2]
+            alt = mut_array[i, 3]
             chrom, pos = re.split(r'\#', key1)
             ref_count = cvrg_dict[key1][0]
             alt_count = cvrg_dict[key1][1]
@@ -782,6 +1085,8 @@
                 if len(used_tiers) > 1:
                     cum = safe_div(sum(used_tiers), cvrg)
                     cum_af.append(cum)
+            if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place
+            	continue
             lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)])
             if chimera_correction:
                 chimeras_all = chimera_dict[key1][0]
@@ -806,20 +1111,21 @@
             lst = tuple(lst)
             ws2.write_row(row + 1, 0, lst)
             if chimera_correction:
-                ws2.conditional_format('P{}:Q{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 1.1"', 'format': format1, 'multi_range': 'P{}:Q{} P1:Q1'.format(row + 2, row + 2)})
-                ws2.conditional_format('R{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$R$1="tier 2.1"', 'format': format3, 'multi_range': 'R{}:U{} R1:U1'.format(row + 2, row + 2)})
-                ws2.conditional_format('V{}:Z{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$V$1="tier 3.1"', 'format': format2, 'multi_range': 'V{}:Z{} V1:Z1'.format(row + 2, row + 2)})
+                ws2.conditional_format('P{}:Q{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 1.1"', 'format': format12, 'multi_range': 'P{}:Q{} P1:Q1'.format(row + 2, row + 2)})
+                ws2.conditional_format('R{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$R$1="tier 2.1"', 'format': format32, 'multi_range': 'R{}:U{} R1:U1'.format(row + 2, row + 2)})
+                ws2.conditional_format('V{}:AE{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$V$1="tier 3.1"', 'format': format22, 'multi_range': 'V{}:AE{} V1:AE1'.format(row + 2, row + 2)})
             else:
-                ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format1, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)})
-                ws2.conditional_format('L{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format3, 'multi_range': 'L{}:O{} L1:O1'.format(row + 2, row + 2)})
-                ws2.conditional_format('P{}:T{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format2, 'multi_range': 'P{}:T{} P1:T1'.format(row + 2, row + 2)})
+                ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)})
+                ws2.conditional_format('L{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format32, 'multi_range': 'L{}:O{} L1:O1'.format(row + 2, row + 2)})
+                ws2.conditional_format('P{}:Y{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format22, 'multi_range': 'P{}:Y{} P1:Y1'.format(row + 2, row + 2)})
             row += 1
 
     # sheet 3
     sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21),
-                  ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24),
-                  ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4.1", counter_tier41), 
-                  ("tier 4.2", counter_tier42), ("tier 5", counter_tier5)]
+              ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24),
+              ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4.1", counter_tier41),
+              ("tier 4.2", counter_tier42), ("tier 5.1", counter_tier51), ("tier 5.2", counter_tier52),
+              ("tier 5.3", counter_tier53), ("tier 5.4", counter_tier54), ("tier 5.5", counter_tier55), ("tier 6", counter_tier6)]
 
     header = ("tier", "count")
     ws3.write_row(0, 0, header)
@@ -839,7 +1145,21 @@
                                 'criteria': '=$A${}>="3"'.format(i + 2),
                                 'format': format2})
 
-    description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""), ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"), ("Tier 5", "remaining variants")]
+    description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""),
+                         ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"),
+                         ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"),
+                         ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"),
+                         ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"),
+                         ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"),
+                         ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"),
+                         ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"),
+                         ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"),
+                         ("Tier 5.1", "variants is close to softclipping in both mates"),
+                         ("Tier 5.2", "variants is close to softclipping in one of the mates"),
+                         ("Tier 5.3", "variants is close to softclipping in one of the SSCS of both mates"),
+                         ("Tier 5.4", "variants is close to softclipping in one mate (no information of second mate"),
+                         ("Tier 5.5", "variants is close to softclipping in one of the SSCS (no information of the second mate"),
+                         ("Tier 6", "remaining variants")]
     examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289",
                         "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
                         "4081", "4098", "5", "10", "", ""),
@@ -899,20 +1219,21 @@
                        ("", "", "AAAAAAAGAATAACCCACAC", "ab2.ba1", None, None, None, None,
                         "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
                         "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
-                       [("Chr5:5-20000-13963-T-C", "4.2", "TTTTTAAGAATAACCCACAC", "ab1.ba2", "38", "38", "240", "283", "263",
+                      [("Chr5:5-20000-13963-T-C", "4.2", "TTTTTAAGAATAACCCACAC", "ab1.ba2", "38", "38", "240", "283", "263",
                         "110", "54", "110", "54", "0", "0", "110", "54", "0", "0", "1", "1", "0", "0", "0",
                         "0", "0", "0", "1", "1", "5348", "5350", "", ""),
                        ("", "", "TTTTTAAGAATAACCCACAC", "ab2.ba1", "100", "112", "140", "145", "263",
                         "7", "12", "7", "12", "7", "12", "0", "0", "1", "1", "0",
                         "0", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
-                       [("Chr5:5-20000-13983-G-C", "5", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
+                      [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)],
+                      [("Chr5:5-20000-13983-G-C", "6", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
                         "0", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "0",
                         "0", "1", "1", "5348", "5350", "", ""),
                        ("", "", "ATGTTGTGAATAACCCACAC", "ab2.ba1", None, None, None, None,
                         "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
                         "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]]
 
-    start_row = 15
+    start_row = 20
     ws3.write(start_row, 0, "Description of tiers with examples")
     ws3.write_row(start_row + 1, 0, header_line)
     row = 0
@@ -921,19 +1242,22 @@
         ex = examples_tiers[i]
         for k in range(len(ex)):
             ws3.write_row(start_row + 2 + row + i + k + 2, 0, ex[k])
-        ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 'format': format1, 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
+        ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 'format': format13, 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
         ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3),
                                {'type': 'formula', 'criteria': '=OR($B${}="2.1",$B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2),
-                                'format': format3,
+                                'format': format33,
                                 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
         ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3),
                                {'type': 'formula',
                                 'criteria': '=$B${}>="3"'.format(start_row + 2 + row + i + k + 2),
-                                'format': format2,
+                                'format': format23,
                                 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
         row += 3
     workbook.close()
+    workbook2.close()
+    workbook3.close()
 
 
 if __name__ == '__main__':
     sys.exit(read2mut(sys.argv))
+
author	mheinzl
date	Mon, 18 Jan 2021 09:49:15 +0000
parents	d9cbf833624e
children	ded0dc6a20d3