variant_analyzer2: read2mut.py comparison

comparison read2mut.py @ 43:d21960b45a6b draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8

author	mheinzl
date	Tue, 02 Mar 2021 15:32:41 +0000
parents	da224c392a54
children	29226ceba7cd

comparison

equal deleted inserted replaced

-:da224c392a54
+:d21960b45a6b
 Looks for reads with mutation at known
 positions and calculates frequencies and stats.
 =======  ==========  =================  ================================
 Version  Date        Author             Description
-2.0.0    2020-10-30  Gundula Povysil    -
+0.2.1    2019-10-27  Gundula Povysil    -
 =======  ==========  =================  ================================
 USAGE: python read2mut.py --mutFile DCS_Mutations.tabular --bamFile Interesting_Reads.trim.bam
 --inputJson tag_count_dict.json --sscsJson SSCS_counts.json
---outputFile mutant_reads_summary_short_trim.xlsx --thresh 10 --phred 20 --trim5 10 --trim3 10 --chimera_correction
+--outputFile mutant_reads_summary_short_trim.xlsx --thresh 10 --phred 20 --trim 10 --chimera_correction
 """
 from __future__ import division
 import argparse
-import csv
+import itertools
 import json
 import operator
 import os
 import re
 import sys
 import numpy as np
 import pysam
 import xlsxwriter
 from cyvcf2 import VCF
 help='JSON file with data collected by mut2read.py.')
 parser.add_argument('--sscsJson',
 help='JSON file with SSCS counts collected by mut2sscs.py.')
 parser.add_argument('--outputFile',
 help='Output xlsx file with summary of mutations.')
-parser.add_argument('--outputFile_csv',
-help='Output csv file with summary of mutations.')
 parser.add_argument('--outputFile2',
 help='Output xlsx file with allele frequencies of mutations.')
 parser.add_argument('--outputFile3',
 help='Output xlsx file with examples of the tier classification.')
 parser.add_argument('--thresh', type=int, default=0,
 help='Integer threshold for displaying mutations. Only mutations occuring less than thresh times are displayed. Default of 0 displays all.')
 parser.add_argument('--phred', type=int, default=20,
 help='Integer threshold for Phred score. Only reads higher than this threshold are considered. Default 20.')
-parser.add_argument('--trim5', type=int, default=10,
+parser.add_argument('--trim', type=int, default=10,
-help='Integer threshold for assigning mutations at the beginning of the reads to lower tier. Default 10.')
+help='Integer threshold for assigning mutations at start and end of reads to lower tier. Default 10.')
-parser.add_argument('--trim3', type=int, default=10,
-help='Integer threshold for assigning mutations at the end of the reads to lower tier. Default 10.')
 parser.add_argument('--chimera_correction', action="store_true",
 help='Count chimeric variants and correct the variant frequencies')
+parser.add_argument('--softclipping_dist',  type=int, default=15,
+help='Count mutation as an artifact if mutation lies within this parameter away from the softclipping part of the read.')
+parser.add_argument('--reads_threshold',  type=float, default=1.0,
+help='Float number which specifies the minimum percentage of softclipped reads in a family to be considered in the softclipping tiers. Default: 1.0, means all reads of a family have to be softclipped.')
 return parser
 def safe_div(x, y):
 if y == 0:
 json_file = args.inputJson
 sscs_json = args.sscsJson
 outfile = args.outputFile
 outfile2 = args.outputFile2
 outfile3 = args.outputFile3
-outputFile_csv = args.outputFile_csv
 thresh = args.thresh
 phred_score = args.phred
-trim5 = args.trim5
+trim = args.trim
-trim3 = args.trim3
 chimera_correction = args.chimera_correction
+thr = args.softclipping_dist
+threshold_reads = args.reads_threshold
 if os.path.isfile(file1) is False:
 sys.exit("Error: Could not find '{}'".format(file1))
 if os.path.isfile(file2) is False:
 sys.exit("Error: Could not find '{}'".format(file2))
 sys.exit("Error: Could not find '{}'".format(json_file))
 if thresh < 0:
 sys.exit("Error: thresh is '{}', but only non-negative integers allowed".format(thresh))
 if phred_score < 0:
 sys.exit("Error: phred is '{}', but only non-negative integers allowed".format(phred_score))
-if trim5 < 0:
+if trim < 0:
-sys.exit("Error: trim5 is '{}', but only non-negative integers allowed".format(trim5))
+sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thresh))
-if trim3 < 0:
+if thr <= 0:
-sys.exit("Error: trim3 is '{}', but only non-negative integers allowed".format(trim3))
+sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thr))
 # load dicts
 with open(json_file, "r") as f:
 (tag_dict, cvrg_dict) = json.load(f)
 with open(sscs_json, "r") as f:
 (mut_pos_dict, ref_pos_dict) = json.load(f)
 # read bam file
+# pysam.index(file2)
 bam = pysam.AlignmentFile(file2, "rb")
 # create mut_dict
 mut_dict = {}
 mut_read_pos_dict = {}
 mut_read_dict = {}
 reads_dict = {}
+mut_read_cigar_dict = {}
 i = 0
 mut_array = []
-for variant in VCF(file1):
+for count, variant in enumerate(VCF(file1)):
+	#if count == 2000:
+#    break
 chrom = variant.CHROM
 stop_pos = variant.start
 #chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
 ref = variant.REF
-alt = variant.ALT[0]
+if len(variant.ALT) == 0:
+continue
+else:
+alt = variant.ALT[0]
 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
 if len(ref) == len(alt):
 mut_array.append([chrom, stop_pos, ref, alt])
 i += 1
 mut_dict[chrom_stop_pos] = {}
 mut_read_pos_dict[chrom_stop_pos] = {}
 reads_dict[chrom_stop_pos] = {}
+mut_read_cigar_dict[chrom_stop_pos] = {}
 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
 if pileupcolumn.reference_pos == stop_pos:
 count_alt = 0
 count_ref = 0
 count_indel = 0
 count_n = 0
 count_other = 0
 count_lowq = 0
 n = 0
-print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
+#print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
-"difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
+#      "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
 for pileupread in pileupcolumn.pileups:
 n += 1
 if not pileupread.is_del and not pileupread.is_refskip:
 tag = pileupread.alignment.query_name
 nuc = pileupread.alignment.query_sequence[pileupread.query_position]
 if nuc in mut_dict[chrom_stop_pos][tag]:
 mut_dict[chrom_stop_pos][tag][nuc] += 1
 else:
 mut_dict[chrom_stop_pos][tag][nuc] = 1
 if tag not in mut_read_pos_dict[chrom_stop_pos]:
-mut_read_pos_dict[chrom_stop_pos][tag] = np.array(pileupread.query_position) + 1
+mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1]
-reads_dict[chrom_stop_pos][tag] = len(pileupread.alignment.query_sequence)
+reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)]
+mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring]
+#alignedRefPositions = pileupread.get_reference_positions()[0]
 else:
-mut_read_pos_dict[chrom_stop_pos][tag] = np.append(
+	mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1)
-mut_read_pos_dict[chrom_stop_pos][tag], pileupread.query_position + 1)
+	reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence))
-reads_dict[chrom_stop_pos][tag] = np.append(
+	mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring)
-reads_dict[chrom_stop_pos][tag], len(pileupread.alignment.query_sequence))
 if nuc == alt:
 count_alt += 1
 if tag not in mut_read_dict:
 mut_read_dict[tag] = {}
 mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
 else:
 count_other += 1
 else:
 count_indel += 1
-print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq))
+#print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq))
-else:
+#else:
-print("indels are currently not evaluated")
+#    print("indels are currently not evaluated")
 mut_array = np.array(mut_array)
 for read in bam.fetch(until_eof=True):
 if read.is_unmapped:
 pure_tag = read.query_name[:-5]
 nuc = "na"
 bam.close()
 # create pure_tags_dict
 pure_tags_dict = {}
 for key1, value1 in sorted(mut_dict.items()):
+	#if len(np.where(np.array(['#'.join(str(i) for i in z)
+#                       for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0]) == 0:
+#		continue
 i = np.where(np.array(['#'.join(str(i) for i in z)
 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
 ref = mut_array[i, 2]
 alt = mut_array[i, 3]
 pure_tags_dict[key1] = {}
 if len(value) < thresh:
 pure_tags_dict_short[key] = value
 else:
 pure_tags_dict_short = pure_tags_dict
-csv_data = open(outputFile_csv, "wb")
+# whole_array = []
-csv_writer = csv.writer(csv_data, delimiter=",")
+# for k in pure_tags_dict.values():
+#    if len(k) != 0:
+#        keys = k.keys()
+#        if len(keys) > 1:
+#            for k1 in keys:
+#                whole_array.append(k1)
+#        else:
+#            whole_array.append(keys[0])
 # output summary with threshold
 workbook = xlsxwriter.Workbook(outfile)
 workbook2 = xlsxwriter.Workbook(outfile2)
 workbook3 = xlsxwriter.Workbook(outfile3)
 'rel. ref.ab', 'rel. ref.ba', 'rel. alt.ab', 'rel. alt.ba',
 'na.ab', 'na.ba', 'lowq.ab', 'lowq.ba', 'trim.ab', 'trim.ba',
 'SSCS alt.ab', 'SSCS alt.ba', 'SSCS ref.ab', 'SSCS ref.ba',
 'in phase', 'chimeric tag')
 ws1.write_row(0, 0, header_line)
-csv_writer.writerow(header_line)
 counter_tier11 = 0
 counter_tier12 = 0
 counter_tier21 = 0
 counter_tier22 = 0
 counter_tier23 = 0
 counter_tier24 = 0
 counter_tier31 = 0
 counter_tier32 = 0
-counter_tier41 = 0
+counter_tier33 = 0
-counter_tier42 = 0
+counter_tier4 = 0
-counter_tier5 = 0
+# if chimera_correction:
+#    counter_tier43 = 0
+counter_tier51 = 0
+counter_tier52 = 0
+counter_tier53 = 0
+counter_tier54 = 0
+counter_tier55 = 0
 counter_tier6 = 0
+counter_tier7 = 0
 row = 1
 tier_dict = {}
 chimera_dict = {}
 for key1, value1 in sorted(mut_dict.items()):
 counts_mut = 0
+chimeric_tag_list = []
 chimeric_tag = {}
 if key1 in pure_tags_dict_short.keys():
 i = np.where(np.array(['#'.join(str(i) for i in z)
 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
 ref = mut_array[i, 2]
 alt = mut_array[i, 3]
 dcs_median = cvrg_dict[key1][2]
-whole_array = list(pure_tags_dict_short[key1].keys())
+whole_array = pure_tags_dict_short[key1].keys()
 tier_dict[key1] = {}
-values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0),
+values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0),
-("tier 3.1", 0), ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 5", 0), ("tier 6", 0)]
+("tier 3.2", 0), ("tier 3.3", 0), ("tier 4", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0),
+("tier 6", 0), ("tier 7", 0)]
 for k, v in values_tier_dict:
 tier_dict[key1][k] = v
 used_keys = []
 if 'ab' in mut_pos_dict[key1].keys():
 if (key2[:-5] in pure_tags_dict_short[key1].keys()) and (key2[:-5] not in used_keys) and (key1 in tag_dict[key2[:-5]].keys()):
 if key2[:-5] + '.ab.1' in mut_dict[key1].keys():
 total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values())
 if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
 na1 = mut_dict[key1][key2[:-5] + '.ab.1']['na']
-else:
+# na1f = na1/total1
+else:
+# na1 = na1f = 0
 na1 = 0
 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
 lowq1 = mut_dict[key1][key2[:-5] + '.ab.1']['lowQ']
-else:
+# lowq1f = lowq1 / total1
+else:
+# lowq1 = lowq1f = 0
 lowq1 = 0
 if ref in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
 ref1 = mut_dict[key1][key2[:-5] + '.ab.1'][ref]
 ref1f = ref1 / (total1 - na1 - lowq1)
 else:
 if key2[:-5] + '.ab.2' in mut_dict[key1].keys():
 total2 = sum(mut_dict[key1][key2[:-5] + '.ab.2'].values())
 if 'na' in mut_dict[key1][key2[:-5] + '.ab.2'].keys():
 na2 = mut_dict[key1][key2[:-5] + '.ab.2']['na']
-else:
+# na2f = na2 / total2
+else:
+# na2 = na2f = 0
 na2 = 0
 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ab.2'].keys():
 lowq2 = mut_dict[key1][key2[:-5] + '.ab.2']['lowQ']
-else:
+# lowq2f = lowq2 / total2
+else:
+# lowq2 = lowq2f = 0
 lowq2 = 0
 if ref in mut_dict[key1][key2[:-5] + '.ab.2'].keys():
 ref2 = mut_dict[key1][key2[:-5] + '.ab.2'][ref]
 ref2f = ref2 / (total2 - na2 - lowq2)
 else:
 if key2[:-5] + '.ba.1' in mut_dict[key1].keys():
 total3 = sum(mut_dict[key1][key2[:-5] + '.ba.1'].values())
 if 'na' in mut_dict[key1][key2[:-5] + '.ba.1'].keys():
 na3 = mut_dict[key1][key2[:-5] + '.ba.1']['na']
-else:
+# na3f = na3 / total3
+else:
+# na3 = na3f = 0
 na3 = 0
 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ba.1'].keys():
 lowq3 = mut_dict[key1][key2[:-5] + '.ba.1']['lowQ']
-else:
+# lowq3f = lowq3 / total3
+else:
+# lowq3 = lowq3f = 0
 lowq3 = 0
 if ref in mut_dict[key1][key2[:-5] + '.ba.1'].keys():
 ref3 = mut_dict[key1][key2[:-5] + '.ba.1'][ref]
 ref3f = ref3 / (total3 - na3 - lowq3)
 else:
 if key2[:-5] + '.ba.2' in mut_dict[key1].keys():
 total4 = sum(mut_dict[key1][key2[:-5] + '.ba.2'].values())
 if 'na' in mut_dict[key1][key2[:-5] + '.ba.2'].keys():
 na4 = mut_dict[key1][key2[:-5] + '.ba.2']['na']
-else:
+# na4f = na4 / total4
+else:
+# na4 = na4f = 0
 na4 = 0
 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ba.2'].keys():
 lowq4 = mut_dict[key1][key2[:-5] + '.ba.2']['lowQ']
-else:
+# lowq4f = lowq4 / total4
+else:
+# lowq4 = lowq4f = 0
 lowq4 = 0
 if ref in mut_dict[key1][key2[:-5] + '.ba.2'].keys():
 ref4 = mut_dict[key1][key2[:-5] + '.ba.2'][ref]
 ref4f = ref4 / (total4 - na4 - lowq4)
 else:
 total4 = total4new = na4 = lowq4 = 0
 ref4 = alt4 = ref4f = alt4f = 0
 read_pos1 = read_pos2 = read_pos3 = read_pos4 = -1
 read_len_median1 = read_len_median2 = read_len_median3 = read_len_median4 = 0
+cigars_dcs1 = cigars_dcs2 = cigars_dcs3 = cigars_dcs4 = []
+pos_read1 = pos_read2 = pos_read3 = pos_read4 = []
+end_read1 = end_read2 = end_read3 = end_read4 = []
 if key2[:-5] + '.ab.1' in mut_read_pos_dict[key1].keys():
-read_pos1 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ab.1'])
+read_pos1 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.1']))
-read_len_median1 = np.median(reads_dict[key1][key2[:-5] + '.ab.1'])
+read_len_median1 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.1']))
+cigars_dcs1 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.1']
+#print(mut_read_cigar_dict[key1][key2[:-5] + '.ab.1'])
+pos_read1 = mut_read_pos_dict[key1][key2[:-5] + '.ab.1']
+#print(cigars_dcs1)
+end_read1 = reads_dict[key1][key2[:-5] + '.ab.1']
 if key2[:-5] + '.ab.2' in mut_read_pos_dict[key1].keys():
-read_pos2 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ab.2'])
+read_pos2 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.2']))
-read_len_median2 = np.median(reads_dict[key1][key2[:-5] + '.ab.2'])
+read_len_median2 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.2']))
+cigars_dcs2 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.2']
+pos_read2 = mut_read_pos_dict[key1][key2[:-5] + '.ab.2']
+end_read2 = reads_dict[key1][key2[:-5] + '.ab.2']
 if key2[:-5] + '.ba.1' in mut_read_pos_dict[key1].keys():
-read_pos3 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ba.1'])
+read_pos3 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.1']))
-read_len_median3 = np.median(reads_dict[key1][key2[:-5] + '.ba.1'])
+read_len_median3 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.1']))
+cigars_dcs3 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.1']
+pos_read3 = mut_read_pos_dict[key1][key2[:-5] + '.ba.1']
+end_read3 = reads_dict[key1][key2[:-5] + '.ba.1']
 if key2[:-5] + '.ba.2' in mut_read_pos_dict[key1].keys():
-read_pos4 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ba.2'])
+read_pos4 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.2']))
-read_len_median4 = np.median(reads_dict[key1][key2[:-5] + '.ba.2'])
+read_len_median4 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.2']))
+#print(mut_read_cigar_dict[key1][key2[:-5] + '.ba.2'])
+cigars_dcs4 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.2']
+pos_read4 = mut_read_pos_dict[key1][key2[:-5] + '.ba.2']
+#print(cigars_dcs4)
+end_read4 = reads_dict[key1][key2[:-5] + '.ba.2']
 used_keys.append(key2[:-5])
 counts_mut += 1
 if (alt1f + alt2f + alt3f + alt4f) > 0.5:
 if total1new == 0:
 beg1 = beg4 = beg2 = beg3 = 0
 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
-trimmed_five = False
+trimmed = False
-trimmed_three = False
 contradictory = False
+softclipped_mutation_allMates = False
-if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
+softclipped_mutation_oneOfTwoMates = False
+softclipped_mutation_oneOfTwoSSCS = False
+softclipped_mutation_oneMate = False
+softclipped_mutation_oneMateOneSSCS = False
+print()
+print(key1, cigars_dcs1, cigars_dcs4, cigars_dcs2, cigars_dcs3)
+dist_start_read1 = dist_start_read2 = dist_start_read3 = dist_start_read4 = []
+dist_end_read1 = dist_end_read2 = dist_end_read3 = dist_end_read4 = []
+ratio_dist_start1 = ratio_dist_start2 = ratio_dist_start3 = ratio_dist_start4 = False
+ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False
+# mate 1 - SSCS ab
+softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1]
+ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads
+if any(ij is True for ij in softclipped_idx1):
+	softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1]
+	softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1]
+	softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1]
+	dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)]
+	dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)]
+	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
+	if any(ij is True for ij in softclipped_both_ends_idx1):
+		print(softclipped_both_ends_idx1)
+		for nr, indx in enumerate(softclipped_both_ends_idx1):
+			if indx:
+				if dist_start_read1[nr] <= dist_end_read1[nr]:
+					dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number
+				else:
+					dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number
+	ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads
+	ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads
+print(key1, "mate1 ab", dist_start_read1, dist_end_read1, cigars_dcs1, ratio1, ratio_dist_start1, ratio_dist_end1)
+# mate 1 - SSCS ba
+softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4]
+ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads
+if any(ij is True for ij in softclipped_idx4):
+	softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4]
+	softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4]
+	softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4]
+	dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)]
+	dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)]
+	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
+	if any(ij is True for ij in softclipped_both_ends_idx4):
+		print(softclipped_both_ends_idx4)
+		for nr, indx in enumerate(softclipped_both_ends_idx4):
+			if indx:
+				if dist_start_read4[nr] <= dist_end_read4[nr]:
+					dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number
+				else:
+					dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number
+		ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads
+	ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads
+print(key1, "mate1 ba", dist_start_read4, dist_end_read4,cigars_dcs4, ratio4, ratio_dist_start4, ratio_dist_end4)
+# mate 2 - SSCS ab
+softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2]
+#print(sum(softclipped_idx2))
+ratio2 = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) >= threshold_reads
+if any(ij is True for ij in softclipped_idx2):
+softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2]
+softclipped_start2 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs2]
+softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2]
+dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)]
+dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)]
+	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
+if any(ij is True for ij in softclipped_both_ends_idx2):
+	print(softclipped_both_ends_idx2)
+for nr, indx in enumerate(softclipped_both_ends_idx2):
+if indx:
+if dist_start_read2[nr] <= dist_end_read2[nr]:
+dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number
+else:
+dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number
+ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads
+#print(ratio_dist_end2)
+#print([True if x <= thr else False for x in ratio_dist_end2])
+ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads
+print(key1, "mate2 ab", dist_start_read2, dist_end_read2,cigars_dcs2, ratio2, ratio_dist_start2, ratio_dist_end2)
+# mate 2 - SSCS ba
+softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3]
+ratio3 = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) >= threshold_reads
+if any(ij is True for ij in softclipped_idx3):
+softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3]
+softclipped_start3 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs3]
+softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3]
+dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)]
+dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)]
+	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
+if any(ij is True for ij in softclipped_both_ends_idx3):
+	print(softclipped_both_ends_idx3)
+for nr, indx in enumerate(softclipped_both_ends_idx3):
+if indx:
+if dist_start_read3[nr] <= dist_end_read3[nr]:
+dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh
+else:
+dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number
+#print([True if x <= thr else False for x in dist_start_read3])
+ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads
+ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads
+print(key1, "mate2 ba", dist_start_read3, dist_end_read3,cigars_dcs3, ratio3, ratio_dist_start3, ratio_dist_end3)
+if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) &  # contradictory variant
+all(float(ij) == 0. for ij in [alt2ff, alt3ff])) |
+(all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) &
+all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
 alt1ff = 0
 alt4ff = 0
 alt2ff = 0
 alt3ff = 0
-trimmed_five = False
+trimmed = False
-trimmed_three = False
 contradictory = True
-else:
+# softclipping tiers
-if ((read_pos1 >= 0) and (read_pos1 <= trim5)):
+# information of both mates available --> all reads for both mates and SSCS are softclipped
+elif (ratio1 & ratio4 & ratio2 & ratio3 &
+(ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
+all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
+	# if distance between softclipping and mutation is at start or end of the read smaller than threshold
+softclipped_mutation_allMates = True
+softclipped_mutation_oneOfTwoMates = False
+softclipped_mutation_oneOfTwoSSCS = False
+softclipped_mutation_oneMate = False
+softclipped_mutation_oneMateOneSSCS = False
+alt1ff = 0
+alt4ff = 0
+alt2ff = 0
+alt3ff = 0
+trimmed = False
+contradictory = False
+print(key1, "softclipped_mutation_allMates", softclipped_mutation_allMates)
+# information of both mates available --> only one mate softclipped
+elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) |
+(ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) &
+all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
+	# if distance between softclipping and mutation is at start or end of the read smaller than threshold
+softclipped_mutation_allMates = False
+softclipped_mutation_oneOfTwoMates = True
+softclipped_mutation_oneOfTwoSSCS = False
+softclipped_mutation_oneMate = False
+softclipped_mutation_oneMateOneSSCS = False
+alt1ff = 0
+alt4ff = 0
+alt2ff = 0
+alt3ff = 0
+trimmed = False
+contradictory = False
+print(key1, "softclipped_mutation_oneOfTwoMates", softclipped_mutation_oneOfTwoMates)
+# information of both mates available --> only one mate softclipped
+elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
+((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
+all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
+	# if distance between softclipping and mutation is at start or end of the read smaller than threshold
+softclipped_mutation_allMates = False
+softclipped_mutation_oneOfTwoMates = False
+softclipped_mutation_oneOfTwoSSCS = True
+softclipped_mutation_oneMate = False
+softclipped_mutation_oneMateOneSSCS = False
+alt1ff = 0
+alt4ff = 0
+alt2ff = 0
+alt3ff = 0
+trimmed = False
+contradictory = False
+print(key1, "softclipped_mutation_oneOfTwoSSCS", softclipped_mutation_oneOfTwoSSCS, [alt1ff, alt2ff, alt3ff, alt4ff])
+# information of one mate available --> all reads of one mate are softclipped
+elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) &
+all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) |
+	  (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
+	  all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available
+	# if distance between softclipping and mutation is at start or end of the read smaller than threshold
+#if ((((len(dist_start_read1) > 0 | len(dist_end_read1) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1))) &
+#		((len(dist_start_read4) > 0 | len(dist_end_read4) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4)))) |
+#		(((len(dist_start_read2) > 0 | len(dist_end_read2) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2))) &
+# 		((len(dist_start_read3) > 0 | len(dist_end_read3) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3))))):
+softclipped_mutation_allMates = False
+softclipped_mutation_oneOfTwoMates = False
+softclipped_mutation_oneOfTwoSSCS = False
+softclipped_mutation_oneMate = True
+softclipped_mutation_oneMateOneSSCS = False
+alt1ff = 0
+alt4ff = 0
+alt2ff = 0
+alt3ff = 0
+trimmed = False
+contradictory = False
+print(key1, "softclipped_mutation_oneMate", softclipped_mutation_oneMate)
+# information of one mate available --> only one SSCS is softclipped
+elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
+(all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) |
+	  (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
+	  (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available
+	# if distance between softclipping and mutation is at start or end of the read smaller than threshold
+#if ((all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1)) |
+	#	all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4))) |
+#		(all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2)) |
+# 		all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3)))):
+softclipped_mutation_allMates = False
+softclipped_mutation_oneOfTwoMates = False
+softclipped_mutation_oneOfTwoSSCS = False
+softclipped_mutation_oneMate = False
+softclipped_mutation_oneMateOneSSCS = True
+alt1ff = 0
+alt4ff = 0
+alt2ff = 0
+alt3ff = 0
+trimmed = False
+contradictory = False
+print(key1, "softclipped_mutation_oneMateOneSSCS", softclipped_mutation_oneMateOneSSCS)
+else:
+if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))):
 beg1 = total1new
 total1new = 0
 alt1ff = 0
 alt1f = 0
-trimmed_five = True
+trimmed = True
-if ((read_pos1 >= 0) and (abs(read_len_median1 - read_pos1) <= trim3)):
+if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))):
-beg1 = total1new
-total1new = 0
-alt1ff = 0
-alt1f = 0
-trimmed_three = True
-if ((read_pos4 >= 0) and (read_pos4 <= trim5)):
 beg4 = total4new
 total4new = 0
 alt4ff = 0
 alt4f = 0
-trimmed_five = True
+trimmed = True
-if ((read_pos4 >= 0) and (abs(read_len_median4 - read_pos4) <= trim3)):
+if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))):
-beg4 = total4new
-total4new = 0
-alt4ff = 0
-alt4f = 0
-trimmed_three = True
-if ((read_pos2 >= 0) and (read_pos2 <= trim5)):
 beg2 = total2new
 total2new = 0
 alt2ff = 0
 alt2f = 0
-trimmed_five = True
+trimmed = True
-if ((read_pos2 >= 0) and (abs(read_len_median2 - read_pos2) <= trim3)):
+if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))):
-beg2 = total2new
-total2new = 0
-alt2ff = 0
-alt2f = 0
-trimmed_three = True
-if ((read_pos3 >= 0) and (read_pos3 <= trim5)):
 beg3 = total3new
 total3new = 0
 alt3ff = 0
 alt3f = 0
-trimmed_five = True
+trimmed = True
-if ((read_pos3 >= 0) and (abs(read_len_median3 - read_pos3) <= trim3)):
-beg3 = total3new
-total3new = 0
-alt3ff = 0
-alt3f = 0
-trimmed_three = True
 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
 # assign tiers
-if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | (all(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
+if ((all(int(ij) >= 3 for ij in [total1new, total4new]) &
+all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
+(all(int(ij) >= 3 for ij in [total2new, total3new]) &
+all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
 tier = "1.1"
 counter_tier11 += 1
 tier_dict[key1]["tier 1.1"] += 1
-elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new])
+elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) &
-& any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
+any(int(ij) >= 3 for ij in [total1new, total4new]) &
+any(int(ij) >= 3 for ij in [total2new, total3new]) &
+all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
 tier = "1.2"
 counter_tier12 += 1
 tier_dict[key1]["tier 1.2"] += 1
-elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))
+elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
-| (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
+any(int(ij) >= 3 for ij in [total1new, total4new]) &
+all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
+(all(int(ij) >= 1 for ij in [total2new, total3new]) &
+any(int(ij) >= 3 for ij in [total2new, total3new]) &
+all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
 tier = "2.1"
 counter_tier21 += 1
 tier_dict[key1]["tier 2.1"] += 1
-elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
+elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) &
+all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
 tier = "2.2"
 counter_tier22 += 1
 tier_dict[key1]["tier 2.2"] += 1
-elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) & any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))
+elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
-| (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) & any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))):
+any(int(ij) >= 3 for ij in [total2new, total3new]) &
+all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) &
+any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff])) |
+(all(int(ij) >= 1 for ij in [total2new, total3new]) &
+any(int(ij) >= 3 for ij in [total1new, total4new]) &
+all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) &
+any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))):
 tier = "2.3"
 counter_tier23 += 1
 tier_dict[key1]["tier 2.3"] += 1
-elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))
+elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
-| (all(int(ij) >= 1 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
+all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
+(all(int(ij) >= 1 for ij in [total2new, total3new]) &
+all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
 tier = "2.4"
 counter_tier24 += 1
 tier_dict[key1]["tier 2.4"] += 1
-elif ((len(pure_tags_dict_short[key1]) > 1) & (all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) | all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
+elif ((len(pure_tags_dict_short[key1]) > 1) &
+(all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) |
+all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
 tier = "3.1"
 counter_tier31 += 1
 tier_dict[key1]["tier 3.1"] += 1
-elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]))
+elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
-| (all(int(ij) >= 1 for ij in [total2new, total3new]) & all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
+all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff])) |
+(all(int(ij) >= 1 for ij in [total2new, total3new]) &
+all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
 tier = "3.2"
 counter_tier32 += 1
 tier_dict[key1]["tier 3.2"] += 1
-elif trimmed_five:
+elif (trimmed) and (len(pure_tags_dict_short[key1]) > 1):
-tier = "4.1"
+tier = "3.3"
-counter_tier41 += 1
+counter_tier33 += 1
-tier_dict[key1]["tier 4.1"] += 1
+tier_dict[key1]["tier 3.3"] += 1
-elif trimmed_three:
+elif (trimmed):
-tier = "4.2"
+tier = "4"
-counter_tier42 += 1
+counter_tier4 += 1
-tier_dict[key1]["tier 4.2"] += 1
+tier_dict[key1]["tier 4"] += 1
-elif contradictory:
+elif softclipped_mutation_allMates:
-tier = "5"
+tier = "5.1"
-counter_tier5 += 1
+counter_tier51 += 1
-tier_dict[key1]["tier 5"] += 1
+tier_dict[key1]["tier 5.1"] += 1
-else:
+elif softclipped_mutation_oneOfTwoMates:
+tier = "5.2"
+counter_tier52 += 1
+tier_dict[key1]["tier 5.2"] += 1
+elif softclipped_mutation_oneOfTwoSSCS:
+tier = "5.3"
+counter_tier53 += 1
+tier_dict[key1]["tier 5.3"] += 1
+elif softclipped_mutation_oneMate:
+tier = "5.4"
+counter_tier54 += 1
+tier_dict[key1]["tier 5.4"] += 1
+elif softclipped_mutation_oneMateOneSSCS:
+tier = "5.5"
+counter_tier55 += 1
+tier_dict[key1]["tier 5.5"] += 1
+elif (contradictory):
 tier = "6"
 counter_tier6 += 1
 tier_dict[key1]["tier 6"] += 1
+else:
+tier = "7"
+counter_tier7 += 1
+tier_dict[key1]["tier 7"] += 1
 chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
-var_id = '-'.join([chrom, str(int(pos) + 1), ref, alt])
+var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
 sample_tag = key2[:-5]
 array2 = np.unique(whole_array)  # remove duplicate sequences to decrease running time
 # exclude identical tag from array2, to prevent comparison to itself
 same_tag = np.where(array2 == sample_tag)
 index_array2 = np.arange(0, len(array2), 1)
 half1_mate1 = array1_half2
 half2_mate1 = array1_half
 half1_mate2 = array2_half2
 half2_mate2 = array2_half
 # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
-dist = np.array([sum(map(operator.ne, half1_mate1, c)) for c in half1_mate2])
+dist = np.array([sum(itertools.imap(operator.ne, half1_mate1, c)) for c in half1_mate2])
 min_index = np.where(dist == dist.min())  # get index of min HD
 # get all "b's" of the tag or all "a's" of the tag with minimum HD
 min_tag_half2 = half2_mate2[min_index]
 min_tag_array2 = array2[min_index]  # get whole tag with min HD
 min_value = dist.min()
 # calculate HD of "b" to all "b's" or "a" to all "a's"
-dist_second_half = np.array([sum(map(operator.ne, half2_mate1, e))
+dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e))
 for e in min_tag_half2])
 dist2 = dist_second_half.max()
 max_index = np.where(dist_second_half == dist_second_half.max())[0]  # get index of max HD
 max_tag = min_tag_array2[max_index]
 # tags which have identical parts:
 if min_value == 0 or dist2 == 0:
 min_tags_list_zeros.append(tag)
 chimera_tags.append(max_tag)
+# chimeric = True
+# else:
+# chimeric = False
+# if mate_b is False:
+#    text = "pos {}: sample tag: {}; HD a = {}; HD b' = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, min_value, dist2, list(max_tag), chimeric)
+# else:
+#     text = "pos {}: sample tag: {}; HD a' = {}; HD b = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, dist2, min_value, list(max_tag), chimeric)
 i += 1
 chimera_tags = [x for x in chimera_tags if x != []]
 chimera_tags_new = []
 for i in chimera_tags:
 if len(i) > 1:
 read_pos2 = read_len_median2 = None
 if (read_pos3 == -1):
 read_pos3 = read_len_median3 = None
 line = (var_id, tier, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera)
 ws1.write_row(row, 0, line)
-csv_writer.writerow(line)
 line = ("", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera)
 ws1.write_row(row + 1, 0, line)
-csv_writer.writerow(line)
 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
 {'type': 'formula',
 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1),
 'format': format1,
-'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)})
+'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
 {'type': 'formula',
 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(row + 1, row + 1, row + 1, row + 1),
 'format': format3,
-'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)})
+'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
 {'type': 'formula',
 'criteria': '=$B${}>="3"'.format(row + 1),
 'format': format2,
-'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)})
+'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
 row += 3
 if chimera_correction:
 chimeric_dcs_high_tiers = 0
 chimeric_dcs = 0
 for keys_chimera in chimeric_tag.keys():
 tiers = chimeric_tag[keys_chimera]
 if high_tiers == len(tiers):
 chimeric_dcs_high_tiers += high_tiers - 1
 else:
 chimeric_dcs_high_tiers += high_tiers
 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers)
-#csv_data.close()
 # sheet 2
 if chimera_correction:
 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'chimeras in AC alt (all tiers)', 'chimera-corrected cvrg', 'chimera-corrected AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'chimeras in AC alt (tiers 1.1-2.4)', 'chimera-corrected cvrg (tiers 1.1-2.4)', 'chimera-corrected AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
-'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
+'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
-'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5', 'AF 1.1-6')
+'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6')
 else:
 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
-'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
+'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
-'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5', 'AF 1.1-6')
+'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6')
 ws2.write_row(0, 0, header_line2)
-#ws2.conditional_format('J1', {'type': 'formula', 'criteria': 'containing', 'value': 'tier 1.1', 'format': format1, 'multi_range': 'J1:K1'})
 row = 0
 for key1, value1 in sorted(tier_dict.items()):
 if key1 in pure_tags_dict_short.keys():
 i = np.where(np.array(['#'.join(str(i) for i in z)
 chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
 ref_count = cvrg_dict[key1][0]
 alt_count = cvrg_dict[key1][1]
 cvrg = ref_count + alt_count
-var_id = '-'.join([chrom, str(int(pos) + 1), ref, alt])
+var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
 lst = [var_id, cvrg]
 used_tiers = []
 cum_af = []
 for key2, value2 in sorted(value1.items()):
 # calculate cummulative AF
 used_tiers.append(value2)
 if len(used_tiers) > 1:
 cum = safe_div(sum(used_tiers), cvrg)
 cum_af.append(cum)
+if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place
+	continue
 lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)])
 if chimera_correction:
 chimeras_all = chimera_dict[key1][0]
 new_alt = sum(used_tiers) - chimeras_all
 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers)))
 if fraction_chimeras is None:
 fraction_chimeras = 0.
 new_cvrg = cvrg * (1. - fraction_chimeras)
 lst.extend([chimeras_all, new_cvrg, safe_div(new_alt, new_cvrg)])
-lst.extend([(cvrg - sum(used_tiers[-6:])), sum(used_tiers[0:6]), safe_div(sum(used_tiers[0:6]), (cvrg - sum(used_tiers[-6:])))])
+lst.extend([(cvrg - sum(used_tiers[-11:])), sum(used_tiers[0:6]), safe_div(sum(used_tiers[0:6]), (cvrg - sum(used_tiers[-11:])))])
 if chimera_correction:
 chimeras_all = chimera_dict[key1][1]
 new_alt = sum(used_tiers[0:6]) - chimeras_all
 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:6])))
 if fraction_chimeras is None:
 fraction_chimeras = 0.
-new_cvrg = (cvrg - sum(used_tiers[-6:])) * (1. - fraction_chimeras)
+new_cvrg = (cvrg - sum(used_tiers[-11:])) * (1. - fraction_chimeras)
 lst.extend([chimeras_all, new_cvrg, safe_div(new_alt, new_cvrg)])
 lst.extend([alt_count, safe_div(alt_count, cvrg)])
 lst.extend(used_tiers)
 lst.extend(cum_af)
 lst = tuple(lst)
 ws2.write_row(row + 1, 0, lst)
 if chimera_correction:
 ws2.conditional_format('P{}:Q{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 1.1"', 'format': format12, 'multi_range': 'P{}:Q{} P1:Q1'.format(row + 2, row + 2)})
 ws2.conditional_format('R{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$R$1="tier 2.1"', 'format': format32, 'multi_range': 'R{}:U{} R1:U1'.format(row + 2, row + 2)})
-ws2.conditional_format('V{}:AA{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$V$1="tier 3.1"', 'format': format22, 'multi_range': 'V{}:AA{} V1:AA1'.format(row + 2, row + 2)})
+ws2.conditional_format('V{}:AF{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$V$1="tier 3.1"', 'format': format22, 'multi_range': 'V{}:AF{} V1:AF1'.format(row + 2, row + 2)})
 else:
 ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)})
 ws2.conditional_format('L{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format32, 'multi_range': 'L{}:O{} L1:O1'.format(row + 2, row + 2)})
-ws2.conditional_format('P{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format22, 'multi_range': 'P{}:U{} P1:U1'.format(row + 2, row + 2)})
+ws2.conditional_format('P{}:Z{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format22, 'multi_range': 'P{}:Z{} P1:Z1'.format(row + 2, row + 2)})
 row += 1
 # sheet 3
 sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21),
 ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24),
-("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4.1", counter_tier41),
+("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 3.3", counter_tier33), ("tier 4", counter_tier),
-("tier 4.2", counter_tier42), ("tier 5", counter_tier5), ("tier 6", counter_tier6)]
+("tier 5.1", counter_tier51), ("tier 5.2", counter_tier52),
+("tier 5.3", counter_tier53), ("tier 5.4", counter_tier54), ("tier 5.5", counter_tier55), ("tier 6", counter_tier6), ("tier 7", counter_tier7)]
 header = ("tier", "count")
 ws3.write_row(0, 0, header)
 for i in range(len(sheet3)):
 ws3.write_row(i + 1, 0, sheet3[i])
 ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2),
 {'type': 'formula',
 'criteria': '=OR($A${}="tier 1.1", $A${}="tier 1.2")'.format(i + 2, i + 2),
-'format': format13})
+'format': format1})
 ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2),
 {'type': 'formula',
 'criteria': '=OR($A${}="tier 2.1", $A${}="tier 2.2", $A${}="tier 2.3", $A${}="tier 2.4")'.format(i + 2, i + 2, i + 2, i + 2),
-'format': format33})
+'format': format3})
 ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2),
 {'type': 'formula',
 'criteria': '=$A${}>="3"'.format(i + 2),
-'format': format23})
+'format': format2})
 description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""),
 ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"),
 ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"),
 ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"),
 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"),
 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"),
 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"),
 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"),
-("Tier 4.1", "variants at the beginning of the reads"),
+("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"),
-("Tier 4.2", "variants at the end of the reads"),
+("Tier 5.1", "variant is close to softclipping in both mates"),
-("Tier 5", "mates with contradictory information"),
+("Tier 5.2", "variant is close to softclipping in one of the mates"),
+("Tier 5.3", "variant is close to softclipping in one of the SSCS of both mates"),
+("Tier 5.4", "variant is close to softclipping in one mate (no information of second mate"),
+("Tier 5.5", "variant is close to softclipping in one of the SSCS (no information of the second mate"),
 ("Tier 6", "remaining variants")]
-examples_tiers = [[("chr5-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289",
+examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289",
 "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
 "4081", "4098", "5", "10", "", ""),
 ("", "", "AAAAAGATGCCGACTACCTT", "ab2.ba1", None, None, None, None,
 "289", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None,
 "0", "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", "")],
-[("chr5-11068-C-G", "1.1", "AAAAATGCGTAGAAATATGC", "ab1.ba2", "254", "228", "287", "288", "289",
+[("Chr5:5-20000-11068-C-G", "1.1", "AAAAATGCGTAGAAATATGC", "ab1.ba2", "254", "228", "287", "288", "289",
 "33", "43", "33", "43", "0", "0", "33", "43", "0", "0", "1", "1", "0", "0", "0", "0", "0",
 "0", "4081", "4098", "5", "10", "", ""),
 ("", "", "AAAAATGCGTAGAAATATGC", "ab2.ba1", "268", "268", "270", "288", "289",
 "11", "34", "10", "27", "0", "0", "10", "27", "0", "0", "1", "1", "0", "0", "1",
 "7", "0", "0", "4081", "4098", "5", "10", "", "")],
-[("chr5-10776-G-T", "1.2", "CTATGACCCGTGAGCCCATG", "ab1.ba2", "132", "132", "287", "288", "290",
+[("Chr5:5-20000-10776-G-T", "1.2", "CTATGACCCGTGAGCCCATG", "ab1.ba2", "132", "132", "287", "288", "290",
 "4", "1", "4", "1", "0", "0", "4", "1", "0", "0", "1", "1", "0", "0", "0", "0",
 "0", "0", "1", "6", "47170", "41149", "", ""),
 ("", "", "CTATGACCCGTGAGCCCATG", "ab2.ba1", "77", "132", "233", "200", "290",
 "4", "1", "4", "1", "0", "0", "4", "1", "0", "0", "1", "1", "0", "0", "0", "0",
 "0", "0", "1", "6", "47170", "41149", "", "")],
-[("chr5-11068-C-G", "2.1", "AAAAAAACATCATACACCCA", "ab1.ba2", "246", "244", "287", "288", "289",
+[("Chr5:5-20000-11068-C-G", "2.1", "AAAAAAACATCATACACCCA", "ab1.ba2", "246", "244", "287", "288", "289",
 "2", "8", "2", "8", "0", "0", "2", "8", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
 "4081", "4098", "5", "10", "", ""),
 ("", "", "AAAAAAACATCATACACCCA", "ab2.ba1", None, None, None, None,
 "289", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0", "0",
 "0", "0", "0", "0", "4081", "4098", "5", "10", "", "")],
-[("chr5-11068-C-G", "2.2", "ATCAGCCATGGCTATTATTG", "ab1.ba2", "72", "72", "217", "288", "289",
+[("Chr5:5-20000-11068-C-G", "2.2", "ATCAGCCATGGCTATTATTG", "ab1.ba2", "72", "72", "217", "288", "289",
 "1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
 "4081", "4098", "5", "10", "", ""),
 ("", "", "ATCAGCCATGGCTATTATTG", "ab2.ba1", "153", "164", "217", "260", "289",
 "1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
 "4081", "4098", "5", "10", "", "")],
-[("chr5-11068-C-G", "2.3", "ATCAATATGGCCTCGCCACG", "ab1.ba2", None, None, None, None,
+[("Chr5:5-20000-11068-C-G", "2.3", "ATCAATATGGCCTCGCCACG", "ab1.ba2", None, None, None, None,
 "289", "0", "5", "0", "5", "0", "0", "0", "5", None, None, None, "1", "0",
 "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", ""),
 ("", "", "ATCAATATGGCCTCGCCACG", "ab2.ba1", "202", "255", "277", "290", "289",
 "1", "3", "1", "3", "0", "0", "1", "3", "0", "0", "1", "1", "0", "0", "0", "0",
 "0", "0", "4081", "4098", "5", "10", "", "")],
-[("chr5-11068-C-G", "2.4", "ATCAGCCATGGCTATTTTTT", "ab1.ba2", "72", "72", "217", "288", "289",
+[("Chr5:5-20000-11068-C-G", "2.4", "ATCAGCCATGGCTATTTTTT", "ab1.ba2", "72", "72", "217", "288", "289",
 "1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "4081",
 "4098", "5", "10", "", ""),
 ("", "", "ATCAGCCATGGCTATTTTTT", "ab2.ba1", "153", "164", "217", "260", "289",
 "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "1", "0", "0", "0", "0", "4081",
 "4098", "5", "10", "", "")],
-[("chr5-10776-G-T", "3.1", "ATGCCTACCTCATTTGTCGT", "ab1.ba2", "46", "15", "287", "288", "290",
+[("Chr5:5-20000-10776-G-T", "3.1", "ATGCCTACCTCATTTGTCGT", "ab1.ba2", "46", "15", "287", "288", "290",
 "3", "3", "3", "2", "3", "1", "0", "1", "1", "0.5", "0", "0.5", "0", "0", "0", "1",
 "0", "0", "3", "3", "47170", "41149", "", ""),
 ("", "", "ATGCCTACCTCATTTGTCGT", "ab2.ba1", None, "274", None,
 "288", "290", "0", "3", "0", "2", "0", "1", "0", "1", None, "0.5", None, "0.5",
 "0", "0", "0", "1", "0", "0", "3", "3", "47170", "41149", "", "")],
-[("chr5-11315-C-T", "3.2", "ACAACATCACGTATTCAGGT", "ab1.ba2", "197", "197", "240", "255", "271",
+[("Chr5:5-20000-11315-C-T", "3.2", "ACAACATCACGTATTCAGGT", "ab1.ba2", "197", "197", "240", "255", "271",
 "2", "3", "2", "3", "0", "1", "2", "2", "0", "0.333333333333333", "1",
 "0.666666666666667", "0", "0", "0", "0", "0", "0", "1", "1", "6584", "6482", "", ""),
 ("", "", "ACAACATCACGTATTCAGGT", "ab2.ba1", "35", "35", "240", "258", "271",
 "2", "3", "2", "3", "0", "1", "2", "2", "0", "0.333333333333333", "1",
 "0.666666666666667", "0", "0", "0", "0", "0", "0", "1", "1", "6584", "6482", "", "")],
-[("chr5-13983-G-C", "4.1", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "1", "100", "255", "276", "269",
+[("Chr5:5-20000-13983-G-C", "4.1", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "0", "100", "255", "276", "269",
-"5", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "5", "0", "1", "1", "5348", "5350", "", ""),
+"5", "6", "0", "6", "0", "0", "5", "6", "0", "0", "0", "1", "0", "0", "0", "0", "5", "0", "1", "1", "5348", "5350", "", ""),
 ("", "", "AAAAAAAGAATAACCCACAC", "ab2.ba1", None, None, None, None,
 "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
 "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
-[("chr5-13983-G-C", "4.2", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "20", "270", "255", "276", "269",
+[("Chr5:5-20000-13963-T-C", "4.2", "TTTTTAAGAATAACCCACAC", "ab1.ba2", "38", "38", "240", "283", "263",
-"5", "6", "5", "0", "0", "0", "5", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "6", "1", "1", "5348", "5350", "", ""),
-("", "", "AAAAAAAGAATAACCCACAC", "ab2.ba1", None, None, None, None,
-"269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
-"0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
-[("chr5-13963-T-C", "5", "TTTTTAAGAATAACCCACAC", "ab1.ba2", "38", "38", "240", "283", "263",
 "110", "54", "110", "54", "0", "0", "110", "54", "0", "0", "1", "1", "0", "0", "0",
 "0", "0", "0", "1", "1", "5348", "5350", "", ""),
 ("", "", "TTTTTAAGAATAACCCACAC", "ab2.ba1", "100", "112", "140", "145", "263",
 "7", "12", "7", "12", "7", "12", "0", "0", "1", "1", "0",
 "0", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
-[("chr5-13983-G-C", "6", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
+[("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)],
+[("Chr5:5-20000-13983-G-C", "6", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
 "0", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "0",
 "0", "1", "1", "5348", "5350", "", ""),
 ("", "", "ATGTTGTGAATAACCCACAC", "ab2.ba1", None, None, None, None,
 "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
 "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]]
-start_row = 15
+start_row = 20
 ws3.write(start_row, 0, "Description of tiers with examples")
 ws3.write_row(start_row + 1, 0, header_line)
 row = 0
 for i in range(len(description_tiers)):
 ws3.write_row(start_row + 2 + row + i + 1, 0, description_tiers[i])
 ex = examples_tiers[i]
 for k in range(len(ex)):
 ws3.write_row(start_row + 2 + row + i + k + 2, 0, ex[k])
-ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 'format': format13, 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)})
+ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 'format': format13, 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3),
 {'type': 'formula', 'criteria': '=OR($B${}="2.1",$B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2),
 'format': format33,
-'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)})
+'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3),
 {'type': 'formula',
 'criteria': '=$B${}>="3"'.format(start_row + 2 + row + i + k + 2),
 'format': format23,
-'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)})
+'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
 row += 3
 workbook.close()
 workbook2.close()
 workbook3.close()
-csv_data.close()
 if __name__ == '__main__':
 sys.exit(read2mut(sys.argv))

Mercurial > repos > mheinzl > variant_analyzer2

comparison read2mut.py @ 43:d21960b45a6b draft