variant_analyzer2: read2mut.py comparison

comparison read2mut.py @ 78:fdfe9a919ff7 draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty

author	mheinzl
date	Fri, 22 Jul 2022 09:19:44 +0000
parents	1797e461d674
children	d7aea14291e8

comparison

equal deleted inserted replaced

-:1797e461d674
+:fdfe9a919ff7
 sscs_json = args.sscsJson
 outfile = args.outputFile
 outfile2 = args.outputFile2
 outfile3 = args.outputFile3
 outputFile_csv = args.outputFile_csv
 thresh = args.thresh
 phred_score = args.phred
 trim = args.trim
 chimera_correction = args.chimera_correction
 thr = args.softclipping_dist
 if thr <= 0:
 sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thr))
 # load dicts
 with open(json_file, "r") as f:
-(tag_dict, cvrg_dict) = json.load(f)
+(tag_dict, cvrg_dict, tag_dict_ref) = json.load(f)
 with open(sscs_json, "r") as f:
 (mut_pos_dict, ref_pos_dict) = json.load(f)
 # read bam file
 ref = variant.REF
 if len(variant.ALT) == 0:
 continue
 else:
 alt = variant.ALT[0]
+alt = alt.upper()
+ref = ref.upper()
+if "N" in alt:  # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV)
+continue
 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
+mut_array.append([chrom, stop_pos, ref, alt])
-if len(ref) == len(alt):
+i += 1
-mut_array.append([chrom, stop_pos, ref, alt])
+mut_dict[chrom_stop_pos] = {}
-i += 1
+mut_read_pos_dict[chrom_stop_pos] = {}
-mut_dict[chrom_stop_pos] = {}
+reads_dict[chrom_stop_pos] = {}
-mut_read_pos_dict[chrom_stop_pos] = {}
+mut_read_cigar_dict[chrom_stop_pos] = {}
-reads_dict[chrom_stop_pos] = {}
+real_start_end[chrom_stop_pos] = {}
-mut_read_cigar_dict[chrom_stop_pos] = {}
+for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000):
-real_start_end[chrom_stop_pos] = {}
+if pileupcolumn.reference_pos == stop_pos:
+count_alt = 0
-for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
+count_ref = 0
-if pileupcolumn.reference_pos == stop_pos:
+count_indel = 0
-count_alt = 0
+count_n = 0
-count_ref = 0
+count_other = 0
-count_indel = 0
+count_lowq = 0
-count_n = 0
+n = 0
-count_other = 0
+for pileupread in pileupcolumn.pileups:
-count_lowq = 0
+n += 1
-n = 0
+if not pileupread.is_refskip:
-for pileupread in pileupcolumn.pileups:
+if pileupread.is_del:
-n += 1
+p = pileupread.query_position_or_next
-if not pileupread.is_del and not pileupread.is_refskip:
+e = p + len(alt) - 1
-tag = pileupread.alignment.query_name
+else:
-nuc = pileupread.alignment.query_sequence[pileupread.query_position]
+p = pileupread.query_position
-phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33
+e = p + len(alt)
-# if read is softclipped, store real position in reference
+s = p
-if "S" in pileupread.alignment.cigarstring:
+tag = pileupread.alignment.query_name
-# spftclipped at start
+split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring)
-if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring):
-start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0])
+if len(ref) < len(alt):
-end = pileupread.alignment.reference_end
+if "I" in split_cigar:
-# softclipped at end
+all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"]
-elif re.search(r"S$", pileupread.alignment.cigarstring):
+for ai in all_insertions:  # if multiple insertions in DCS
-end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2])
+ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
-start = pileupread.alignment.reference_start
+ins_count = split_cigar[ai - 1]  # nr of insertions should match with alt allele
+if "I" in split_cigar and sum(ins_index) == p + 1 and int(ins_count) >= len(alt) - 1:  # if pe read matches exatcly to insertion
+nuc = pileupread.alignment.query_sequence[s:e]
+phred = ord(pileupread.alignment.qual[s]) - 33
+break
+elif "I" in split_cigar and sum(ins_index) == p + 1 and int(ins_count) < len(alt) - 1:  # if pe read has shorter insertion -- not alt
+nuc = pileupread.alignment.query_sequence[s:s+int(ins_count)+1]
+phred = ord(pileupread.alignment.qual[s]) - 33
+break
+else:  # insertion in pe reads but not at the desired position
+nuc = pileupread.alignment.query_sequence[s]
+phred = ord(pileupread.alignment.qual[s]) - 33
+elif "D" in split_cigar:  # if deletion in pe read, don't count
+all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"]
+for di, ai in enumerate(all_deletions):  # if multiple insertions in DCS
+if di > 0:  # more than 1 deletion, don't count previous deletion to position
+all_deletions_mod = split_cigar[:ai - 1]
+prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")]
+split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx]
+del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()]
+else:  # first deletion in read, sum all previous (mis)matches and insertions to position
+del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
+if "D" in split_cigar and sum(del_index) == p + 1:  # if deletion at that position
+nuc = "D"
+phred = ord(pileupread.alignment.qual[s]) - 33
+break
+else:
+nuc = pileupread.alignment.query_sequence[s]
+phred = ord(pileupread.alignment.qual[s]) - 33
+else:  # insertion in pe reads but not at the desired position
+nuc = pileupread.alignment.query_sequence[s]
+phred = ord(pileupread.alignment.qual[s]) - 33
+elif len(ref) > len(alt):
+ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)]
+if "D" in split_cigar:
+all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"]
+for di, ai in enumerate(all_deletions):  # if multiple insertions in DCS
+if di > 0:  # more than 1 deletion, don't count previous deletion to position
+all_deletions_mod = split_cigar[:ai - 1]
+prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")]
+split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx]
+del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()]
+else:  # first deletion in read, sum all previous (mis)matches and insertions to position
+del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
+del_count = split_cigar[ai - 1]  # deletion on that position but does not necesserily match in the length
+if "D" in split_cigar and sum(del_index) == p + 1 and int(del_count) >= len(ref) - 1:  # if pe read matches exatcly to deletion
+nuc = pileupread.alignment.query_sequence[s:e]
+phred = ord(pileupread.alignment.qual[s]) - 33
+if nuc == "":
+nuc = str(alt)
+break
+elif "D" in split_cigar and sum(del_index) == p + 1 and int(del_count) < len(ref) - 1:  # if pe read has shorter deletion --> not alt
+nuc = str(ref)[:int(del_count)+1]
+phred = ord(pileupread.alignment.qual[s]) - 33
+break
+else:  # deletion in pe reads but not at the desired position
+nuc = pileupread.alignment.query_sequence[s:s + len(ref)]
+phred = ord(pileupread.alignment.qual[s]) - 33
+elif "I" in split_cigar:  # if pe read has insertion --> not count
+all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"]
+for ai in all_insertions:  # if multiple insertions in DCS
+ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
+ins_count = split_cigar[ai - 1]  # nr of insertions should match with alt allele
+if "I" in split_cigar and sum(ins_index) == p + 1:
+nuc = "I"
+phred = ord(pileupread.alignment.qual[s]) - 33
+break
+else:
+nuc = pileupread.alignment.query_sequence[s]
+phred = ord(pileupread.alignment.qual[s]) - 33
+elif len(ref_positions) < len(ref):  # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there
+nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)]
+phred = ord(pileupread.alignment.qual[s]) - 33
+if nuc.upper() == ref[:len(nuc)]:
+nuc = str(ref)
+else:  # deletion in pe reads but not at the desired position
+nuc = pileupread.alignment.query_sequence[s:s + len(ref)]
+phred = ord(pileupread.alignment.qual[s]) - 33
+else:  # SNV: query position is None if is_del or is_refskip is set.
+nuc = pileupread.alignment.query_sequence[s]
+phred = ord(pileupread.alignment.qual[s]) - 33
+nuc = nuc.upper()
+# if read is softclipped, store real position in reference
+if "S" in pileupread.alignment.cigarstring:
+# spftclipped at start
+if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring):
+start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0])
+end = pileupread.alignment.reference_end
+# softclipped at end
+elif re.search(r"S$", pileupread.alignment.cigarstring):
+end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2])
+start = pileupread.alignment.reference_start
+else:
+end = pileupread.alignment.reference_end
+start = pileupread.alignment.reference_start
+if phred < phred_score:
+nuc = "lowQ"
+if tag not in mut_dict[chrom_stop_pos]:
+mut_dict[chrom_stop_pos][tag] = {}
+if nuc in mut_dict[chrom_stop_pos][tag]:
+mut_dict[chrom_stop_pos][tag][nuc] += 1
+else:
+mut_dict[chrom_stop_pos][tag][nuc] = 1
+if tag not in mut_read_pos_dict[chrom_stop_pos]:
+mut_read_pos_dict[chrom_stop_pos][tag] = [s + 1]
+reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)]
+mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring]
+real_start_end[chrom_stop_pos][tag] = [(start, end)]
+else:
+mut_read_pos_dict[chrom_stop_pos][tag].append(s + 1)
+reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence))
+mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring)
+real_start_end[chrom_stop_pos][tag].append((start, end))
+if nuc == alt:
+count_alt += 1
+if tag not in mut_read_dict:
+mut_read_dict[tag] = {}
+mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
 else:
-end = pileupread.alignment.reference_end
+mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
-start = pileupread.alignment.reference_start
+elif nuc == ref:
+count_ref += 1
-if phred < phred_score:
+elif nuc == "N":
-nuc = "lowQ"
+count_n += 1
-if tag not in mut_dict[chrom_stop_pos]:
+elif nuc == "lowQ":
-mut_dict[chrom_stop_pos][tag] = {}
+count_lowq += 1
-if nuc in mut_dict[chrom_stop_pos][tag]:
+else:
-mut_dict[chrom_stop_pos][tag][nuc] += 1
+count_other += 1
 else:
-mut_dict[chrom_stop_pos][tag][nuc] = 1
+count_indel += 1
-if tag not in mut_read_pos_dict[chrom_stop_pos]:
-mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1]
-reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)]
-mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring]
-real_start_end[chrom_stop_pos][tag] = [(start, end)]
-else:
-mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1)
-reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence))
-mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring)
-real_start_end[chrom_stop_pos][tag].append((start, end))
-if nuc == alt:
-count_alt += 1
-if tag not in mut_read_dict:
-mut_read_dict[tag] = {}
-mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
-else:
-mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
-elif nuc == ref:
-count_ref += 1
-elif nuc == "N":
-count_n += 1
-elif nuc == "lowQ":
-count_lowq += 1
-else:
-count_other += 1
-else:
-count_indel += 1
 mut_array = np.array(mut_array)
 for read in bam.fetch(until_eof=True):
 if read.is_unmapped:
 pure_tag = read.query_name[:-5]
 nuc = "na"
-for key in tag_dict[pure_tag].keys():
+if pure_tag in tag_dict.keys():  # stored all ref and alt reads --> get only alt reads
-if key not in mut_dict:
+for key in tag_dict[pure_tag].keys():
-mut_dict[key] = {}
+if key not in mut_dict:
-if read.query_name not in mut_dict[key]:
+mut_dict[key] = {}
-mut_dict[key][read.query_name] = {}
+if read.query_name not in mut_dict[key]:
-if nuc in mut_dict[key][read.query_name]:
+mut_dict[key][read.query_name] = {}
-mut_dict[key][read.query_name][nuc] += 1
+if nuc in mut_dict[key][read.query_name]:
-else:
+mut_dict[key][read.query_name][nuc] += 1
-mut_dict[key][read.query_name][nuc] = 1
+else:
+mut_dict[key][read.query_name][nuc] = 1
 bam.close()
 # create pure_tags_dict
 pure_tags_dict = {}
+pure_tags_dict_ref = {}
 for key1, value1 in sorted(mut_dict.items()):
 i = np.where(np.array(['#'.join(str(i) for i in z)
 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
 ref = mut_array[i, 2]
 alt = mut_array[i, 3]
 pure_tags_dict[key1] = {}
+pure_tags_dict_ref[key1] = {}
 for key2, value2 in sorted(value1.items()):
 for key3, value3 in value2.items():
 pure_tag = key2[:-5]
 if key3 == alt:
 if pure_tag in pure_tags_dict[key1]:
 pure_tags_dict[key1][pure_tag] += 1
 else:
 pure_tags_dict[key1][pure_tag] = 1
+if key3 == ref:
+if pure_tag in pure_tags_dict_ref[key1]:
+pure_tags_dict_ref[key1][pure_tag] += 1
+else:
+pure_tags_dict_ref[key1][pure_tag] = 1
 # create pure_tags_dict_short with thresh
 if thresh > 0:
 pure_tags_dict_short = {}
 for key, value in sorted(pure_tags_dict.items()):
 if len(value) < thresh:
 format13 = workbook3.add_format({'bg_color': '#BCF5A9'})  # green
 format23 = workbook3.add_format({'bg_color': '#FFC7CE'})  # red
 format33 = workbook3.add_format({'bg_color': '#FACC2E'})  # yellow
-header_line = ('variant ID', 'tier', 'tag', 'mate', 'read pos.ab', 'read pos.ba', 'read median length.ab',
+header_line = ('variant ID', 'tier', 'allele', 'tag', 'mate', 'read pos.ab', 'read pos.ba', 'read median length.ab',
 'read median length.ba', 'DCS median length',
 'FS.ab', 'FS.ba', 'FSqc.ab', 'FSqc.ba', 'ref.ab', 'ref.ba', 'alt.ab', 'alt.ba',
 'rel. ref.ab', 'rel. ref.ba', 'rel. alt.ab', 'rel. alt.ba',
 'na.ab', 'na.ba', 'lowq.ab', 'lowq.ba', 'trim.ab', 'trim.ba',
 'SSCS alt.ab', 'SSCS alt.ba', 'SSCS ref.ab', 'SSCS ref.ba',
 'in phase', 'chimeric tag')
 ws1.write_row(0, 0, header_line)
 csv_writer.writerow(header_line)
 counter_tier11 = 0
 counter_tier12 = 0
 counter_tier21 = 0
 counter_tier22 = 0
 counter_tier23 = 0
 counter_tier6 = 0
 counter_tier7 = 0
 row = 1
 tier_dict = {}
+tier_dict_ref = {}
 chimera_dict = {}
 for key1, value1 in sorted(mut_dict.items()):
 counts_mut = 0
 chimeric_tag_list = []
 chimeric_tag = {}
-if key1 in pure_tags_dict_short.keys():
+if (key1 in pure_tags_dict_short.keys()) or (key1 in pure_tags_dict_ref.keys()):  # ref or alt
 change_tier_after_print = []
 i = np.where(np.array(['#'.join(str(i) for i in z)
 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
 ref = mut_array[i, 2]
 alt = mut_array[i, 3]
 dcs_median = cvrg_dict[key1][2]
 whole_array = list(pure_tags_dict_short[key1].keys())
 tier_dict[key1] = {}
+tier_dict_ref[key1] = {}
 values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 2.5", 0),
 ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0),
 ("tier 6", 0), ("tier 7", 0)]
 for k, v in values_tier_dict:
 tier_dict[key1][k] = v
+tier_dict_ref[key1][k] = v
 used_keys = []
 if 'ab' in mut_pos_dict[key1].keys():
 sscs_mut_ab = mut_pos_dict[key1]['ab']
 else:
 else:
 sscs_ref_ba = 0
 for key2, value2 in sorted(value1.items()):
 add_mut14 = ""
 add_mut23 = ""
-if (key2[:-5] in pure_tags_dict_short[key1].keys()) and (key2[:-5] not in used_keys) and (key1 in tag_dict[key2[:-5]].keys()):
+if key2[:-5] not in tag_dict.keys() and key2[:-5] not in tag_dict_ref.keys():  # skip reads that have not alt or ref
+continue
+if ((key2[:-5] in tag_dict.keys() and key2[:-5] in pure_tags_dict_short[key1].keys() and key1 in tag_dict[key2[:-5]].keys()) or (key2[:-5] in tag_dict_ref.keys() and key2[:-5] in pure_tags_dict_ref[key1].keys() and key1 in tag_dict_ref[key2[:-5]].keys())) and (key2[:-5] not in used_keys):
+if key2[:-5] in pure_tags_dict_short[key1].keys():
+variant_type = "alt"
+elif key2[:-5] in pure_tags_dict_ref[key1].keys():
+variant_type = "ref"
 if key2[:-5] + '.ab.1' in mut_dict[key1].keys():
 total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values())
 if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
 na1 = mut_dict[key1][key2[:-5] + '.ab.1']['na']
 # na1f = na1/total1
 end_read4 = reads_dict[key1][key2[:-5] + '.ba.2']
 ref_positions4 = real_start_end[key1][key2[:-5] + '.ba.2']
 used_keys.append(key2[:-5])
 counts_mut += 1
-if (alt1f + alt2f + alt3f + alt4f) > 0.5:
+if (variant_type == "alt" and ((alt1f + alt2f + alt3f + alt4f) > 0.5)) or (variant_type == "ref" and ((ref1f + ref2f + ref3f + ref4f) > 0.5)):
+if variant_type == "alt":
+tier1ff, tier2ff, tier3ff, tier4ff = alt1f, alt2f, alt3f, alt4f
+tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim = alt1f, alt2f, alt3f, alt4f
+elif variant_type == "ref":
+tier1ff, tier2ff, tier3ff, tier4ff = ref1f, ref2f, ref3f, ref4f
+tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim = ref1f, ref2f, ref3f, ref4f
 total1new_trim, total2new_trim, total3new_trim, total4new_trim = total1new, total2new, total3new, total4new
 if total1new == 0:
 ref1f = alt1f = None
 alt1ff = -1
 alt1ff_trim = -1
+tier1ff = -1
+tier1ff_trim = -1
 else:
 alt1ff = alt1f
 alt1ff_trim = alt1f
+tier1ff = tier1ff
+tier1ff_trim = tier1ff_trim
 if total2new == 0:
 ref2f = alt2f = None
 alt2ff = -1
 alt2ff_trim = -1
+tier2ff = -1
+tier2ff_trim = -1
 else:
 alt2ff = alt2f
 alt2ff_trim = alt2f
+tier2ff = tier2ff
+tier2ff_trim = tier2ff_trim
 if total3new == 0:
 ref3f = alt3f = None
 alt3ff = -1
 alt3ff_trim = -1
+tier3ff = -1
+tier3ff_trim = -1
 else:
 alt3ff = alt3f
 alt3ff_trim = alt3f
+tier3ff = tier3ff
+tier3ff_trim = tier3ff_trim
 if total4new == 0:
 ref4f = alt4f = None
 alt4ff = -1
 alt4ff_trim = -1
+tier4ff = -1
+tier4ff_trim = -1
 else:
 alt4ff = alt4f
 alt4ff_trim = alt4f
+tier4ff = tier4ff
+tier4ff_trim = tier4ff_trim
 beg1 = beg4 = beg2 = beg3 = 0
 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
 ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False
 # mate 1 - SSCS ab
 softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1]
 safe_div_result = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1)))
-if (safe_div_result == None):
+if (safe_div_result is None):
 ratio1 = False
 else:
 ratio1 = safe_div_result >= threshold_reads
 if any(ij is True for ij in softclipped_idx1):
 softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1]
 ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads
 # mate 1 - SSCS ba
 softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4]
 safe_div_result = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4)))
-if (safe_div_result == None):
+if (safe_div_result is None):
 ratio4 = False
 else:
 ratio4 = safe_div_result >= threshold_reads
 if any(ij is True for ij in softclipped_idx4):
 softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4]
 ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads
 # mate 2 - SSCS ab
 softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2]
 safe_div_result = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2)))
-if (safe_div_result == None):
+if (safe_div_result is None):
 ratio2 = False
 else:
 ratio2 = safe_div_result >= threshold_reads
 if any(ij is True for ij in softclipped_idx2):
 softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2]
 ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads
 # mate 2 - SSCS ba
 softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3]
 safe_div_result = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3)))
-if (safe_div_result == None):
+if (safe_div_result is None):
 ratio3 = False
 else:
 ratio3 = safe_div_result >= threshold_reads
 if any(ij is True for ij in softclipped_idx3):
 softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3]
 else:
 dist_start_read3[nr] = thr + 1000  # use dist of end and set start to very large number
 ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads
 ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads
-if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) &  # contradictory variant
+if ((all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff]) &  # contradictory variant
-all(float(ij) == 0. for ij in [alt2ff, alt3ff])) |
+all(float(ij) == 0. for ij in [tier2ff, tier3ff])) |
-(all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) &
+(all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]) &
-all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
+all(float(ij) == 0. for ij in [tier1ff, tier4ff]))):
-alt1ff = 0
+tier1ff = 0
-alt4ff = 0
+tier4ff = 0
-alt2ff = 0
+tier2ff = 0
-alt3ff = 0
+tier3ff = 0
 trimmed = False
 contradictory = True
 # softclipping tiers
 # information of both mates available --> all reads for both mates and SSCS are softclipped
 elif (ratio1 & ratio4 & ratio2 & ratio3 &
 (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
-all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):  # all mates available
+all(float(ij) > 0. for ij in [tier1ff, tier2ff, tier3ff, tier4ff])):  # all mates available
 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
 softclipped_mutation_allMates = True
 softclipped_mutation_oneOfTwoMates = False
 softclipped_mutation_oneOfTwoSSCS = False
 softclipped_mutation_oneOfTwoSSCS_diffMates = False
 softclipped_mutation_oneMate = False
 softclipped_mutation_oneMateOneSSCS = False
-alt1ff = 0
+tier1ff = 0
-alt4ff = 0
+tier4ff = 0
-alt2ff = 0
+tier2ff = 0
-alt3ff = 0
+tier3ff = 0
 trimmed = False
 contradictory = False
 # information of both mates available --> only one mate softclipped
 elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) |
 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) &
-all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):  # all mates available
+all(float(ij) > 0. for ij in [tier1ff, tier2ff, tier3ff, tier4ff])):  # all mates available
 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
 min_start1 = min(min([ij[0] for ij in ref_positions1]), min([ij[0] for ij in ref_positions4]))  # red
 min_start2 = min(min([ij[0] for ij in ref_positions2]), min([ij[0] for ij in ref_positions3]))  # blue
 max_end1 = max(max([ij[1] for ij in ref_positions1]), max([ij[1] for ij in ref_positions4]))  # red
 max_end2 = max(max([ij[1] for ij in ref_positions2]), max([ij[1] for ij in ref_positions3]))  # blue
 n_spacer_barcode = max_end2 - max_end1
 read_len_median2 = read_len_median2 - n_spacer_barcode
 read_len_median3 = read_len_median3 - n_spacer_barcode
 else:
 softclipped_mutation_oneOfTwoMates = True
-alt1ff = 0
+tier1ff = 0
-alt4ff = 0
+tier4ff = 0
-alt2ff = 0
+tier2ff = 0
-alt3ff = 0
+tier3ff = 0
 trimmed = False
 contradictory = False
 softclipped_mutation_allMates = False
 softclipped_mutation_oneOfTwoSSCS = False
 softclipped_mutation_oneMate = False
 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))):
 beg1 = total1new
 total1new = 0
 alt1ff = 0
 alt1f = 0
+tier1ff = 0
 trimmed = True
 if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))):
 beg4 = total4new
 total4new = 0
 alt4ff = 0
 alt4f = 0
+tier4ff = 0
 trimmed = True
 if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))):
 beg2 = total2new
 total2new = 0
 alt2ff = 0
 alt2f = 0
+tier2ff = 0
 trimmed = True
 if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))):
 beg3 = total3new
 total3new = 0
 alt3ff = 0
 alt3f = 0
+tier3ff = 0
 trimmed = True
 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
 # information of both mates available --> only one mate softclipped
 elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
 ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
-all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):  # all mates available
+all(float(ij) > 0. for ij in [tier1ff, tier2ff, tier3ff, tier4ff])):  # all mates available
 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
 softclipped_mutation_allMates = False
 softclipped_mutation_oneOfTwoMates = False
 softclipped_mutation_oneOfTwoSSCS = True
 softclipped_mutation_oneOfTwoSSCS_diffMates = False
 softclipped_mutation_oneMate = False
 softclipped_mutation_oneMateOneSSCS = False
-alt1ff = 0
+tier1ff = 0
-alt4ff = 0
+tier4ff = 0
-alt2ff = 0
+tier2ff = 0
-alt3ff = 0
+tier3ff = 0
 trimmed = False
 contradictory = False
 # information of one mate available --> all reads of one mate are softclipped
 elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) &
-all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) |
+all(float(ij) < 0. for ij in [tier2ff, tier3ff]) & all(float(ij) > 0. for ij in [tier1ff, tier4ff])) |
 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
-all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))):  # all mates available
+all(float(ij) < 0. for ij in [tier1ff, tier4ff]) & all(float(ij) > 0. for ij in [tier2ff, tier3ff]))):  # all mates available
 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
 softclipped_mutation_allMates = False
 softclipped_mutation_oneOfTwoMates = False
 softclipped_mutation_oneOfTwoSSCS = False
 softclipped_mutation_oneOfTwoSSCS_diffMates = False
 softclipped_mutation_oneMate = True
 softclipped_mutation_oneMateOneSSCS = False
-alt1ff = 0
+tier1ff = 0
-alt4ff = 0
+tier4ff = 0
-alt2ff = 0
+tier2ff = 0
-alt3ff = 0
+tier3ff = 0
 trimmed = False
 contradictory = False
 # information of one mate available --> only one SSCS is softclipped
 elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
-(all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) |
+(all(float(ij) < 0. for ij in [tier2ff, tier3ff]) & all(float(ij) > 0. for ij in [tier1ff, tier4ff]))) |
 (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
-(all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))):  # all mates available
+(all(float(ij) < 0. for ij in [tier1ff, tier4ff]) & all(float(ij) < 0. for ij in [tier2ff, tier3ff])))):  # all mates available
 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
 softclipped_mutation_allMates = False
 softclipped_mutation_oneOfTwoMates = False
 softclipped_mutation_oneOfTwoSSCS = False
 softclipped_mutation_oneOfTwoSSCS_diffMates = False
 softclipped_mutation_oneMate = False
 softclipped_mutation_oneMateOneSSCS = True
-alt1ff = 0
+tier1ff = 0
-alt4ff = 0
+tier4ff = 0
-alt2ff = 0
+tier2ff = 0
-alt3ff = 0
+tier3ff = 0
 trimmed = False
 contradictory = False
 else:
 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))):
 beg1 = total1new
 total1new = 0
 alt1ff = 0
 alt1f = 0
+tier1ff = 0
 trimmed = True
 if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))):
 beg4 = total4new
 total4new = 0
 alt4ff = 0
 alt4f = 0
+tier4ff = 0
 trimmed = True
 if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))):
 beg2 = total2new
 total2new = 0
 alt2ff = 0
 alt2f = 0
+tier2ff = 0
 trimmed = True
 if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))):
 beg3 = total3new
 total3new = 0
 alt3ff = 0
 alt3f = 0
+tier3ff = 0
 trimmed = True
 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
 # assign tiers
 if ((all(int(ij) >= 3 for ij in [total1new, total4new]) &
-all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
+all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) |
 (all(int(ij) >= 3 for ij in [total2new, total3new]) &
-all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
+all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))):
 tier = "1.1"
 counter_tier11 += 1
-tier_dict[key1]["tier 1.1"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 1.1"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 1.1"] += 1
 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) &
 any(int(ij) >= 3 for ij in [total1new, total4new]) &
 any(int(ij) >= 3 for ij in [total2new, total3new]) &
-all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
+all(float(ij) >= 0.75 for ij in [tier1ff, tier2ff, tier3ff, tier4ff])):
 tier = "1.2"
 counter_tier12 += 1
-tier_dict[key1]["tier 1.2"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 1.2"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 1.2"] += 1
 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
 any(int(ij) >= 3 for ij in [total1new, total4new]) &
-all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
+all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) |
 (all(int(ij) >= 1 for ij in [total2new, total3new]) &
 any(int(ij) >= 3 for ij in [total2new, total3new]) &
-all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
+all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))):
 tier = "2.1"
 counter_tier21 += 1
-tier_dict[key1]["tier 2.1"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 2.1"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 2.1"] += 1
 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) &
-all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
+all(float(ij) >= 0.75 for ij in [tier1ff, tier2ff, tier3ff, tier4ff])):
 tier = "2.2"
 counter_tier22 += 1
-tier_dict[key1]["tier 2.2"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 2.2"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 2.2"] += 1
 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
 any(int(ij) >= 3 for ij in [total2new, total3new]) &
-all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) &
+all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff]) &
-any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff])) |
+any(float(ij) >= 0.75 for ij in [tier2ff, tier3ff])) |
 (all(int(ij) >= 1 for ij in [total2new, total3new]) &
 any(int(ij) >= 3 for ij in [total1new, total4new]) &
-all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) &
+all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]) &
-any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))):
+any(float(ij) >= 0.75 for ij in [tier1ff, tier4ff]))):
 tier = "2.3"
 counter_tier23 += 1
-tier_dict[key1]["tier 2.3"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 2.3"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 2.3"] += 1
 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
-all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
+all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) |
 (all(int(ij) >= 1 for ij in [total2new, total3new]) &
-all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
+all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))):
 tier = "2.4"
 counter_tier24 += 1
-tier_dict[key1]["tier 2.4"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 2.4"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 2.4"] += 1
 elif ((len(pure_tags_dict_short[key1]) > 1) &
-(all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) |
+(all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff]) |
-all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
+all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]))):
 tier = "3.1"
 counter_tier31 += 1
-tier_dict[key1]["tier 3.1"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 3.1"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 3.1"] += 1
 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
-all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff])) |
+all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff])) |
 (all(int(ij) >= 1 for ij in [total2new, total3new]) &
-all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
+all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]))):
 tier = "3.2"
 counter_tier32 += 1
-tier_dict[key1]["tier 3.2"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 3.2"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 3.2"] += 1
 elif (trimmed):
 tier = "4"
 counter_tier4 += 1
-tier_dict[key1]["tier 4"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 4"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 4"] += 1
 # assign tiers
 if ((all(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) &
-all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) |
+all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) |
 (all(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) &
-all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))):
+all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))):
 trimmed_actual_high_tier = True
 elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) &
 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) &
 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) &
-all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])):
+all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim])):
 trimmed_actual_high_tier = True
 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) &
 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) &
-all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) |
+all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) |
 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) &
 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) &
-all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))):
+all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))):
 trimmed_actual_high_tier = True
 elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) &
-all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])):
+all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim])):
 trimmed_actual_high_tier = True
 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) &
 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) &
-all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]) &
+all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim]) &
-any(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim])) |
+any(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim])) |
 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) &
 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) &
-all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]) &
+all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]) &
-any(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]))):
+any(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim]))):
 trimmed_actual_high_tier = True
 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) &
-all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) |
+all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) |
 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) &
-all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))):
+all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))):
 trimmed_actual_high_tier = True
 else:
 trimmed_actual_high_tier = False
 elif softclipped_mutation_allMates:
 tier = "5.1"
 counter_tier51 += 1
-tier_dict[key1]["tier 5.1"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 5.1"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 5.1"] += 1
 elif softclipped_mutation_oneOfTwoMates:
 tier = "5.2"
 counter_tier52 += 1
-tier_dict[key1]["tier 5.2"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 5.2"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 5.2"] += 1
 elif softclipped_mutation_oneOfTwoSSCS:
 tier = "5.3"
 counter_tier53 += 1
-tier_dict[key1]["tier 5.3"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 5.3"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 5.3"] += 1
 elif softclipped_mutation_oneMate:
 tier = "5.4"
 counter_tier54 += 1
-tier_dict[key1]["tier 5.4"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 5.4"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 5.4"] += 1
 elif softclipped_mutation_oneMateOneSSCS:
 tier = "5.5"
 counter_tier55 += 1
-tier_dict[key1]["tier 5.5"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 5.5"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 5.5"] += 1
 elif (contradictory):
 tier = "6"
 counter_tier6 += 1
-tier_dict[key1]["tier 6"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 6"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 6"] += 1
 else:
 tier = "7"
 counter_tier7 += 1
-tier_dict[key1]["tier 7"] += 1
+if variant_type == "alt":
+tier_dict[key1]["tier 7"] += 1
+elif variant_type == "ref":
+tier_dict_ref[key1]["tier 7"] += 1
 chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
 sample_tag = key2[:-5]
 array2 = np.unique(whole_array)  # remove duplicate sequences to decrease running time
 read_pos4 = read_len_median4 = None
 if (read_pos2 == -1):
 read_pos2 = read_len_median2 = None
 if (read_pos3 == -1):
 read_pos3 = read_len_median3 = None
-line = (var_id, tier, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera)
+line = (var_id, tier, variant_type, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera)
 # ws1.write_row(row, 0, line)
 # csv_writer.writerow(line)
-line2 = ("", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera)
+line2 = ("", "", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera)
 # ws1.write_row(row + 1, 0, line2)
 # csv_writer.writerow(line2)
 # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
 #                       {'type': 'formula',
 #                        'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1),
 #                        'format': format1,
 #                        'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers)
 # write to file
 # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4
 sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]])
+sum_highTiers_ref = sum([tier_dict_ref[key1][ij] for ij in list(sorted(tier_dict_ref[key1].keys()))[:6]])
 correct_tier = False
+correct_tier_ref = False
 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0:
 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"]
 tier_dict[key1]["tier 4"] = 0
 correct_tier = True
+if tier_dict_ref[key1]["tier 4"] > 0 and sum_highTiers_ref > 0:
+tier_dict_ref[key1]["tier 2.5"] = tier_dict_ref[key1]["tier 4"]
+tier_dict_ref[key1]["tier 4"] = 0
+correct_tier_ref = True
 for sample in change_tier_after_print:
 row_number = sample[0]
 line1 = sample[1]
 line2 = sample[2]
 actual_high_tier = sample[3]
 current_tier = list(line1)[1]
-if correct_tier and (current_tier == "4") and actual_high_tier:
+if line1[2] == "alt" and correct_tier and (current_tier == "4") and actual_high_tier:
 line1 = list(line1)
 line1[1] = "2.5"
 line1 = tuple(line1)
 counter_tier25 += 1
 counter_tier4 -= 1
+if line1[2] == "ref" and correct_tier_ref and (current_tier == "4") and actual_high_tier:
+line1 = list(line1)
+line1[1] = "2.5"
+line1 = tuple(line1)
+counter_tier25 += 1
+counter_tier4 -= 1
 ws1.write_row(row_number, 0, line1)
 csv_writer.writerow(line1)
 ws1.write_row(row_number + 1, 0, line2)
 csv_writer.writerow(line2)
+if line1[2] == "alt":
-ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2),
+ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
-{'type': 'formula',
+ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
-'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1),
+ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
-'format': format1,
+elif line1[2] == "ref":
-'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
+ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
-ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2),
+ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
-{'type': 'formula',
+ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
-'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1),
-'format': format3,
-'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
-ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2),
-{'type': 'formula',
-'criteria': '=$B${}>="3"'.format(row_number + 1),
-'format': format2,
-'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
 # sheet 2
 if chimera_correction:
-header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
+header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
-'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5',
+'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)',
-'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
+'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)',
-'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7')
+'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)',
+'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)'
+)
 else:
-header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
+header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
-'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5',
+'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)',
-'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
+'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)',
-'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7')
+'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)',
+'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)'
+)
 ws2.write_row(0, 0, header_line2)
 row = 0
 for key1, value1 in sorted(tier_dict.items()):
 if key1 in pure_tags_dict_short.keys():
 chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
 ref_count = cvrg_dict[key1][0]
 alt_count = cvrg_dict[key1][1]
 cvrg = ref_count + alt_count
+ref_tiers = tier_dict_ref[key1]
 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
 lst = [var_id, cvrg]
 used_tiers = []
+used_tiers_ref = [t for k, t in sorted(ref_tiers.items())]
 cum_af = []
 for key2, value2 in sorted(value1.items()):
 # calculate cummulative AF
 used_tiers.append(value2)
 if len(used_tiers) > 1:
 cum = safe_div(sum(used_tiers), cvrg)
 cum_af.append(cum)
 if sum(used_tiers) == 0:  # skip mutations that are filtered by the VA in the first place
 continue
 lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)])
-lst.extend([(cvrg - sum(used_tiers[-10:])), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (cvrg - sum(used_tiers[-10:])))])
+lst.extend([(sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])), sum(used_tiers_ref[0:7]), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])))])
 if chimera_correction:
 chimeras_all = chimera_dict[key1][1]
 new_alt = sum(used_tiers[0:7]) - chimeras_all
 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:7])))
 if fraction_chimeras is None:
 fraction_chimeras = 0.
-new_cvrg = (cvrg - sum(used_tiers[-10:])) * (1. - fraction_chimeras)
+new_cvrg = (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])) * (1. - fraction_chimeras)
 lst.extend([new_cvrg, chimeras_all, safe_div(new_alt, new_cvrg)])
 lst.extend([alt_count, safe_div(alt_count, cvrg)])
 lst.extend(used_tiers)
-lst.extend(cum_af)
+lst.extend(used_tiers_ref)
+# lst.extend(cum_af)
 lst = tuple(lst)
 ws2.write_row(row + 1, 0, lst)
 if chimera_correction:
-ws2.conditional_format('M{}:N{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$M$1="tier 1.1"', 'format': format12, 'multi_range': 'M{}:N{} M1:N1'.format(row + 2, row + 2)})
+ws2.conditional_format('N{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$N$1="tier 1.1 (alt)"', 'format': format12, 'multi_range': 'N{}:O{} N1:O1 AE{}:AF{} AE1:AF1'.format(row + 2, row + 2, row + 2, row + 2)})
-ws2.conditional_format('O{}:S{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$O$1="tier 2.1"', 'format': format32, 'multi_range': 'O{}:S{} O1:S1'.format(row + 2, row + 2)})
+ws2.conditional_format('P{}:T{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 2.1 (alt)"', 'format': format32, 'multi_range': 'P{}:T{} P1:T1 AG{}:AK{} AG1:AK1'.format(row + 2, row + 2, row + 2, row + 2)})
-ws2.conditional_format('T{}:AC{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$T$1="tier 3.1"', 'format': format22, 'multi_range': 'T{}:AC{} T1:AC1'.format(row + 2, row + 2)})
+ws2.conditional_format('U{}:AD{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$U$1="tier 3.1 (alt)"', 'format': format22, 'multi_range': 'U{}:AD{} U1:AD1 AL{}:AU{} AL1:AU1'.format(row + 2, row + 2, row + 2, row + 2)})
 else:
 ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)})
 ws2.conditional_format('L{}:P{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format32, 'multi_range': 'L{}:P{} L1:P1'.format(row + 2, row + 2)})
 ws2.conditional_format('Q{}:Z{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$Q$1="tier 3.1"', 'format': format22, 'multi_range': 'Q{}:Z{} Q1:Z1'.format(row + 2, row + 2)})
 row += 1

Mercurial > repos > mheinzl > variant_analyzer2

comparison read2mut.py @ 78:fdfe9a919ff7 draft