Mercurial > repos > mheinzl > variant_analyzer2
diff read2mut.py @ 78:fdfe9a919ff7 draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
author | mheinzl |
---|---|
date | Fri, 22 Jul 2022 09:19:44 +0000 |
parents | 1797e461d674 |
children | d7aea14291e8 |
line wrap: on
line diff
--- a/read2mut.py Mon Mar 29 09:22:57 2021 +0000 +++ b/read2mut.py Fri Jul 22 09:19:44 2022 +0000 @@ -87,6 +87,7 @@ outfile2 = args.outputFile2 outfile3 = args.outputFile3 outputFile_csv = args.outputFile_csv + thresh = args.thresh phred_score = args.phred trim = args.trim @@ -111,7 +112,7 @@ # load dicts with open(json_file, "r") as f: - (tag_dict, cvrg_dict) = json.load(f) + (tag_dict, cvrg_dict, tag_dict_ref) = json.load(f) with open(sscs_json, "r") as f: (mut_pos_dict, ref_pos_dict) = json.load(f) @@ -138,106 +139,207 @@ continue else: alt = variant.ALT[0] - chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt + + alt = alt.upper() + ref = ref.upper() + if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV) + continue - if len(ref) == len(alt): - mut_array.append([chrom, stop_pos, ref, alt]) - i += 1 - mut_dict[chrom_stop_pos] = {} - mut_read_pos_dict[chrom_stop_pos] = {} - reads_dict[chrom_stop_pos] = {} - mut_read_cigar_dict[chrom_stop_pos] = {} - real_start_end[chrom_stop_pos] = {} + chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt + mut_array.append([chrom, stop_pos, ref, alt]) + i += 1 + mut_dict[chrom_stop_pos] = {} + mut_read_pos_dict[chrom_stop_pos] = {} + reads_dict[chrom_stop_pos] = {} + mut_read_cigar_dict[chrom_stop_pos] = {} + real_start_end[chrom_stop_pos] = {} + for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): + if pileupcolumn.reference_pos == stop_pos: + count_alt = 0 + count_ref = 0 + count_indel = 0 + count_n = 0 + count_other = 0 + count_lowq = 0 + n = 0 + for pileupread in pileupcolumn.pileups: + n += 1 + if not pileupread.is_refskip: + if pileupread.is_del: + p = pileupread.query_position_or_next + e = p + len(alt) - 1 + else: + p = pileupread.query_position + e = p + len(alt) + s = p + tag = pileupread.alignment.query_name + split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring) - for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): - if pileupcolumn.reference_pos == stop_pos: - count_alt = 0 - count_ref = 0 - count_indel = 0 - count_n = 0 - count_other = 0 - count_lowq = 0 - n = 0 - for pileupread in pileupcolumn.pileups: - n += 1 - if not pileupread.is_del and not pileupread.is_refskip: - tag = pileupread.alignment.query_name - nuc = pileupread.alignment.query_sequence[pileupread.query_position] - phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33 - # if read is softclipped, store real position in reference - if "S" in pileupread.alignment.cigarstring: - # spftclipped at start - if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring): - start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0]) - end = pileupread.alignment.reference_end - # softclipped at end - elif re.search(r"S$", pileupread.alignment.cigarstring): - end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2]) - start = pileupread.alignment.reference_start - else: + if len(ref) < len(alt): + if "I" in split_cigar: + all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] + for ai in all_insertions: # if multiple insertions in DCS + ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele + if "I" in split_cigar and sum(ins_index) == p + 1 and int(ins_count) >= len(alt) - 1: # if pe read matches exatcly to insertion + nuc = pileupread.alignment.query_sequence[s:e] + phred = ord(pileupread.alignment.qual[s]) - 33 + break + elif "I" in split_cigar and sum(ins_index) == p + 1 and int(ins_count) < len(alt) - 1: # if pe read has shorter insertion -- not alt + nuc = pileupread.alignment.query_sequence[s:s+int(ins_count)+1] + phred = ord(pileupread.alignment.qual[s]) - 33 + break + else: # insertion in pe reads but not at the desired position + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + elif "D" in split_cigar: # if deletion in pe read, don't count + all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] + for di, ai in enumerate(all_deletions): # if multiple insertions in DCS + if di > 0: # more than 1 deletion, don't count previous deletion to position + all_deletions_mod = split_cigar[:ai - 1] + prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] + split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] + del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] + else: # first deletion in read, sum all previous (mis)matches and insertions to position + del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + if "D" in split_cigar and sum(del_index) == p + 1: # if deletion at that position + nuc = "D" + phred = ord(pileupread.alignment.qual[s]) - 33 + break + else: + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + else: # insertion in pe reads but not at the desired position + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + + elif len(ref) > len(alt): + ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)] + if "D" in split_cigar: + all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] + for di, ai in enumerate(all_deletions): # if multiple insertions in DCS + if di > 0: # more than 1 deletion, don't count previous deletion to position + all_deletions_mod = split_cigar[:ai - 1] + prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] + split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] + del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] + else: # first deletion in read, sum all previous (mis)matches and insertions to position + del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + del_count = split_cigar[ai - 1] # deletion on that position but does not necesserily match in the length + if "D" in split_cigar and sum(del_index) == p + 1 and int(del_count) >= len(ref) - 1: # if pe read matches exatcly to deletion + nuc = pileupread.alignment.query_sequence[s:e] + phred = ord(pileupread.alignment.qual[s]) - 33 + if nuc == "": + nuc = str(alt) + break + elif "D" in split_cigar and sum(del_index) == p + 1 and int(del_count) < len(ref) - 1: # if pe read has shorter deletion --> not alt + nuc = str(ref)[:int(del_count)+1] + phred = ord(pileupread.alignment.qual[s]) - 33 + break + else: # deletion in pe reads but not at the desired position + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + phred = ord(pileupread.alignment.qual[s]) - 33 + elif "I" in split_cigar: # if pe read has insertion --> not count + all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] + for ai in all_insertions: # if multiple insertions in DCS + ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele + if "I" in split_cigar and sum(ins_index) == p + 1: + nuc = "I" + phred = ord(pileupread.alignment.qual[s]) - 33 + break + else: + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there + nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)] + phred = ord(pileupread.alignment.qual[s]) - 33 + if nuc.upper() == ref[:len(nuc)]: + nuc = str(ref) + else: # deletion in pe reads but not at the desired position + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + phred = ord(pileupread.alignment.qual[s]) - 33 + else: # SNV: query position is None if is_del or is_refskip is set. + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + + nuc = nuc.upper() + + # if read is softclipped, store real position in reference + if "S" in pileupread.alignment.cigarstring: + # spftclipped at start + if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring): + start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0]) end = pileupread.alignment.reference_end + # softclipped at end + elif re.search(r"S$", pileupread.alignment.cigarstring): + end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2]) start = pileupread.alignment.reference_start - - if phred < phred_score: - nuc = "lowQ" - if tag not in mut_dict[chrom_stop_pos]: - mut_dict[chrom_stop_pos][tag] = {} - if nuc in mut_dict[chrom_stop_pos][tag]: - mut_dict[chrom_stop_pos][tag][nuc] += 1 - else: - mut_dict[chrom_stop_pos][tag][nuc] = 1 - if tag not in mut_read_pos_dict[chrom_stop_pos]: - mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1] - reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] - mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] - real_start_end[chrom_stop_pos][tag] = [(start, end)] + else: + end = pileupread.alignment.reference_end + start = pileupread.alignment.reference_start + if phred < phred_score: + nuc = "lowQ" + if tag not in mut_dict[chrom_stop_pos]: + mut_dict[chrom_stop_pos][tag] = {} + if nuc in mut_dict[chrom_stop_pos][tag]: + mut_dict[chrom_stop_pos][tag][nuc] += 1 + else: + mut_dict[chrom_stop_pos][tag][nuc] = 1 + if tag not in mut_read_pos_dict[chrom_stop_pos]: + mut_read_pos_dict[chrom_stop_pos][tag] = [s + 1] + reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] + mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] + real_start_end[chrom_stop_pos][tag] = [(start, end)] + else: + mut_read_pos_dict[chrom_stop_pos][tag].append(s + 1) + reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) + mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) + real_start_end[chrom_stop_pos][tag].append((start, end)) + if nuc == alt: + count_alt += 1 + if tag not in mut_read_dict: + mut_read_dict[tag] = {} + mut_read_dict[tag][chrom_stop_pos] = (alt, ref) else: - mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) - reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) - mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) - real_start_end[chrom_stop_pos][tag].append((start, end)) - if nuc == alt: - count_alt += 1 - if tag not in mut_read_dict: - mut_read_dict[tag] = {} - mut_read_dict[tag][chrom_stop_pos] = (alt, ref) - else: - mut_read_dict[tag][chrom_stop_pos] = (alt, ref) - elif nuc == ref: - count_ref += 1 - elif nuc == "N": - count_n += 1 - elif nuc == "lowQ": - count_lowq += 1 - else: - count_other += 1 + mut_read_dict[tag][chrom_stop_pos] = (alt, ref) + elif nuc == ref: + count_ref += 1 + elif nuc == "N": + count_n += 1 + elif nuc == "lowQ": + count_lowq += 1 else: - count_indel += 1 + count_other += 1 + else: + count_indel += 1 mut_array = np.array(mut_array) for read in bam.fetch(until_eof=True): if read.is_unmapped: pure_tag = read.query_name[:-5] nuc = "na" - for key in tag_dict[pure_tag].keys(): - if key not in mut_dict: - mut_dict[key] = {} - if read.query_name not in mut_dict[key]: - mut_dict[key][read.query_name] = {} - if nuc in mut_dict[key][read.query_name]: - mut_dict[key][read.query_name][nuc] += 1 - else: - mut_dict[key][read.query_name][nuc] = 1 + if pure_tag in tag_dict.keys(): # stored all ref and alt reads --> get only alt reads + for key in tag_dict[pure_tag].keys(): + if key not in mut_dict: + mut_dict[key] = {} + if read.query_name not in mut_dict[key]: + mut_dict[key][read.query_name] = {} + if nuc in mut_dict[key][read.query_name]: + mut_dict[key][read.query_name][nuc] += 1 + else: + mut_dict[key][read.query_name][nuc] = 1 bam.close() - # create pure_tags_dict pure_tags_dict = {} + pure_tags_dict_ref = {} for key1, value1 in sorted(mut_dict.items()): i = np.where(np.array(['#'.join(str(i) for i in z) for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] ref = mut_array[i, 2] alt = mut_array[i, 3] pure_tags_dict[key1] = {} + pure_tags_dict_ref[key1] = {} for key2, value2 in sorted(value1.items()): for key3, value3 in value2.items(): pure_tag = key2[:-5] @@ -247,6 +349,12 @@ else: pure_tags_dict[key1][pure_tag] = 1 + if key3 == ref: + if pure_tag in pure_tags_dict_ref[key1]: + pure_tags_dict_ref[key1][pure_tag] += 1 + else: + pure_tags_dict_ref[key1][pure_tag] = 1 + # create pure_tags_dict_short with thresh if thresh > 0: pure_tags_dict_short = {} @@ -279,7 +387,7 @@ format23 = workbook3.add_format({'bg_color': '#FFC7CE'}) # red format33 = workbook3.add_format({'bg_color': '#FACC2E'}) # yellow - header_line = ('variant ID', 'tier', 'tag', 'mate', 'read pos.ab', 'read pos.ba', 'read median length.ab', + header_line = ('variant ID', 'tier', 'allele', 'tag', 'mate', 'read pos.ab', 'read pos.ba', 'read median length.ab', 'read median length.ba', 'DCS median length', 'FS.ab', 'FS.ba', 'FSqc.ab', 'FSqc.ba', 'ref.ab', 'ref.ba', 'alt.ab', 'alt.ba', 'rel. ref.ab', 'rel. ref.ba', 'rel. alt.ab', 'rel. alt.ba', @@ -288,6 +396,7 @@ 'in phase', 'chimeric tag') ws1.write_row(0, 0, header_line) csv_writer.writerow(header_line) + counter_tier11 = 0 counter_tier12 = 0 counter_tier21 = 0 @@ -308,12 +417,14 @@ row = 1 tier_dict = {} + tier_dict_ref = {} chimera_dict = {} for key1, value1 in sorted(mut_dict.items()): counts_mut = 0 chimeric_tag_list = [] chimeric_tag = {} - if key1 in pure_tags_dict_short.keys(): + if (key1 in pure_tags_dict_short.keys()) or (key1 in pure_tags_dict_ref.keys()): # ref or alt + change_tier_after_print = [] i = np.where(np.array(['#'.join(str(i) for i in z) for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] @@ -323,11 +434,13 @@ whole_array = list(pure_tags_dict_short[key1].keys()) tier_dict[key1] = {} + tier_dict_ref[key1] = {} values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 2.5", 0), ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0), ("tier 6", 0), ("tier 7", 0)] for k, v in values_tier_dict: tier_dict[key1][k] = v + tier_dict_ref[key1][k] = v used_keys = [] if 'ab' in mut_pos_dict[key1].keys(): @@ -349,7 +462,16 @@ for key2, value2 in sorted(value1.items()): add_mut14 = "" add_mut23 = "" - if (key2[:-5] in pure_tags_dict_short[key1].keys()) and (key2[:-5] not in used_keys) and (key1 in tag_dict[key2[:-5]].keys()): + if key2[:-5] not in tag_dict.keys() and key2[:-5] not in tag_dict_ref.keys(): # skip reads that have not alt or ref + continue + + if ((key2[:-5] in tag_dict.keys() and key2[:-5] in pure_tags_dict_short[key1].keys() and key1 in tag_dict[key2[:-5]].keys()) or (key2[:-5] in tag_dict_ref.keys() and key2[:-5] in pure_tags_dict_ref[key1].keys() and key1 in tag_dict_ref[key2[:-5]].keys())) and (key2[:-5] not in used_keys): + + if key2[:-5] in pure_tags_dict_short[key1].keys(): + variant_type = "alt" + elif key2[:-5] in pure_tags_dict_ref[key1].keys(): + variant_type = "ref" + if key2[:-5] + '.ab.1' in mut_dict[key1].keys(): total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values()) if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys(): @@ -550,36 +672,59 @@ used_keys.append(key2[:-5]) counts_mut += 1 - if (alt1f + alt2f + alt3f + alt4f) > 0.5: + if (variant_type == "alt" and ((alt1f + alt2f + alt3f + alt4f) > 0.5)) or (variant_type == "ref" and ((ref1f + ref2f + ref3f + ref4f) > 0.5)): + if variant_type == "alt": + tier1ff, tier2ff, tier3ff, tier4ff = alt1f, alt2f, alt3f, alt4f + tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim = alt1f, alt2f, alt3f, alt4f + elif variant_type == "ref": + tier1ff, tier2ff, tier3ff, tier4ff = ref1f, ref2f, ref3f, ref4f + tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim = ref1f, ref2f, ref3f, ref4f + total1new_trim, total2new_trim, total3new_trim, total4new_trim = total1new, total2new, total3new, total4new if total1new == 0: ref1f = alt1f = None alt1ff = -1 alt1ff_trim = -1 + tier1ff = -1 + tier1ff_trim = -1 else: alt1ff = alt1f alt1ff_trim = alt1f + tier1ff = tier1ff + tier1ff_trim = tier1ff_trim if total2new == 0: ref2f = alt2f = None alt2ff = -1 alt2ff_trim = -1 + tier2ff = -1 + tier2ff_trim = -1 else: alt2ff = alt2f alt2ff_trim = alt2f + tier2ff = tier2ff + tier2ff_trim = tier2ff_trim if total3new == 0: ref3f = alt3f = None alt3ff = -1 alt3ff_trim = -1 + tier3ff = -1 + tier3ff_trim = -1 else: alt3ff = alt3f alt3ff_trim = alt3f + tier3ff = tier3ff + tier3ff_trim = tier3ff_trim if total4new == 0: ref4f = alt4f = None alt4ff = -1 alt4ff_trim = -1 + tier4ff = -1 + tier4ff_trim = -1 else: alt4ff = alt4f alt4ff_trim = alt4f + tier4ff = tier4ff + tier4ff_trim = tier4ff_trim beg1 = beg4 = beg2 = beg3 = 0 @@ -605,7 +750,7 @@ # mate 1 - SSCS ab softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1] safe_div_result = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) - if (safe_div_result == None): + if (safe_div_result is None): ratio1 = False else: ratio1 = safe_div_result >= threshold_reads @@ -629,7 +774,7 @@ # mate 1 - SSCS ba softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4] safe_div_result = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) - if (safe_div_result == None): + if (safe_div_result is None): ratio4 = False else: ratio4 = safe_div_result >= threshold_reads @@ -653,7 +798,7 @@ # mate 2 - SSCS ab softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2] safe_div_result = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) - if (safe_div_result == None): + if (safe_div_result is None): ratio2 = False else: ratio2 = safe_div_result >= threshold_reads @@ -677,7 +822,7 @@ # mate 2 - SSCS ba softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3] safe_div_result = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) - if (safe_div_result == None): + if (safe_div_result is None): ratio3 = False else: ratio3 = safe_div_result >= threshold_reads @@ -698,21 +843,21 @@ ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads - if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant - all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | - (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & - all(float(ij) == 0. for ij in [alt1ff, alt4ff]))): - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + if ((all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff]) & # contradictory variant + all(float(ij) == 0. for ij in [tier2ff, tier3ff])) | + (all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]) & + all(float(ij) == 0. for ij in [tier1ff, tier4ff]))): + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = True # softclipping tiers # information of both mates available --> all reads for both mates and SSCS are softclipped elif (ratio1 & ratio4 & ratio2 & ratio3 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & - all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + all(float(ij) > 0. for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = True softclipped_mutation_oneOfTwoMates = False @@ -720,16 +865,16 @@ softclipped_mutation_oneOfTwoSSCS_diffMates = False softclipped_mutation_oneMate = False softclipped_mutation_oneMateOneSSCS = False - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False # information of both mates available --> only one mate softclipped elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) | (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) & - all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + all(float(ij) > 0. for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold min_start1 = min(min([ij[0] for ij in ref_positions1]), min([ij[0] for ij in ref_positions4])) # red min_start2 = min(min([ij[0] for ij in ref_positions2]), min([ij[0] for ij in ref_positions3])) # blue @@ -763,10 +908,10 @@ read_len_median3 = read_len_median3 - n_spacer_barcode else: softclipped_mutation_oneOfTwoMates = True - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False softclipped_mutation_allMates = False @@ -780,6 +925,7 @@ total1new = 0 alt1ff = 0 alt1f = 0 + tier1ff = 0 trimmed = True if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))): @@ -787,6 +933,7 @@ total4new = 0 alt4ff = 0 alt4f = 0 + tier4ff = 0 trimmed = True if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))): @@ -794,6 +941,7 @@ total2new = 0 alt2ff = 0 alt2f = 0 + tier2ff = 0 trimmed = True if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))): @@ -801,6 +949,7 @@ total3new = 0 alt3ff = 0 alt3f = 0 + tier3ff = 0 trimmed = True details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) @@ -808,7 +957,7 @@ # information of both mates available --> only one mate softclipped elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & - all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + all(float(ij) > 0. for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = False softclipped_mutation_oneOfTwoMates = False @@ -816,18 +965,18 @@ softclipped_mutation_oneOfTwoSSCS_diffMates = False softclipped_mutation_oneMate = False softclipped_mutation_oneMateOneSSCS = False - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False # information of one mate available --> all reads of one mate are softclipped elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & - all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) | + all(float(ij) < 0. for ij in [tier2ff, tier3ff]) & all(float(ij) > 0. for ij in [tier1ff, tier4ff])) | (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & - all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available + all(float(ij) < 0. for ij in [tier1ff, tier4ff]) & all(float(ij) > 0. for ij in [tier2ff, tier3ff]))): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = False softclipped_mutation_oneOfTwoMates = False @@ -835,17 +984,17 @@ softclipped_mutation_oneOfTwoSSCS_diffMates = False softclipped_mutation_oneMate = True softclipped_mutation_oneMateOneSSCS = False - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False # information of one mate available --> only one SSCS is softclipped elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & - (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) | + (all(float(ij) < 0. for ij in [tier2ff, tier3ff]) & all(float(ij) > 0. for ij in [tier1ff, tier4ff]))) | (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & - (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available + (all(float(ij) < 0. for ij in [tier1ff, tier4ff]) & all(float(ij) < 0. for ij in [tier2ff, tier3ff])))): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = False softclipped_mutation_oneOfTwoMates = False @@ -853,10 +1002,10 @@ softclipped_mutation_oneOfTwoSSCS_diffMates = False softclipped_mutation_oneMate = False softclipped_mutation_oneMateOneSSCS = True - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False @@ -866,6 +1015,7 @@ total1new = 0 alt1ff = 0 alt1f = 0 + tier1ff = 0 trimmed = True if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))): @@ -873,6 +1023,7 @@ total4new = 0 alt4ff = 0 alt4f = 0 + tier4ff = 0 trimmed = True if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))): @@ -880,6 +1031,7 @@ total2new = 0 alt2ff = 0 alt2f = 0 + tier2ff = 0 trimmed = True if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))): @@ -887,117 +1039,145 @@ total3new = 0 alt3ff = 0 alt3f = 0 + tier3ff = 0 trimmed = True details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) # assign tiers if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | + all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) | (all(int(ij) >= 3 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): + all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))): tier = "1.1" counter_tier11 += 1 - tier_dict[key1]["tier 1.1"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 1.1"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 1.1"] += 1 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): + all(float(ij) >= 0.75 for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): tier = "1.2" counter_tier12 += 1 - tier_dict[key1]["tier 1.2"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 1.2"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 1.2"] += 1 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | + all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) | (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): + all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))): tier = "2.1" counter_tier21 += 1 - tier_dict[key1]["tier 2.1"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 2.1"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 2.1"] += 1 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): + all(float(ij) >= 0.75 for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): tier = "2.2" counter_tier22 += 1 - tier_dict[key1]["tier 2.2"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 2.2"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 2.2"] += 1 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) & - any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff])) | + all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff]) & + any(float(ij) >= 0.75 for ij in [tier2ff, tier3ff])) | (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) & - any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))): + all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]) & + any(float(ij) >= 0.75 for ij in [tier1ff, tier4ff]))): tier = "2.3" counter_tier23 += 1 - tier_dict[key1]["tier 2.3"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 2.3"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 2.3"] += 1 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | + all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) | (all(int(ij) >= 1 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): + all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))): tier = "2.4" counter_tier24 += 1 - tier_dict[key1]["tier 2.4"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 2.4"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 2.4"] += 1 elif ((len(pure_tags_dict_short[key1]) > 1) & - (all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) | - all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): + (all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff]) | + all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]))): tier = "3.1" counter_tier31 += 1 - tier_dict[key1]["tier 3.1"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 3.1"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 3.1"] += 1 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & - all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff])) | + all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff])) | (all(int(ij) >= 1 for ij in [total2new, total3new]) & - all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): + all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]))): tier = "3.2" counter_tier32 += 1 - tier_dict[key1]["tier 3.2"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 3.2"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 3.2"] += 1 elif (trimmed): tier = "4" counter_tier4 += 1 - tier_dict[key1]["tier 4"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 4"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 4"] += 1 # assign tiers if ((all(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) | (all(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): + all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))): trimmed_actual_high_tier = True elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) & any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])): + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim])): trimmed_actual_high_tier = True elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) | (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): + all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))): trimmed_actual_high_tier = True elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])): + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim])): trimmed_actual_high_tier = True elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]) & - any(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim])) | + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim]) & + any(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim])) | (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]) & - any(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]))): + all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]) & + any(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim]))): trimmed_actual_high_tier = True elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) | (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): + all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))): trimmed_actual_high_tier = True else: trimmed_actual_high_tier = False @@ -1005,37 +1185,58 @@ elif softclipped_mutation_allMates: tier = "5.1" counter_tier51 += 1 - tier_dict[key1]["tier 5.1"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.1"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.1"] += 1 elif softclipped_mutation_oneOfTwoMates: tier = "5.2" counter_tier52 += 1 - tier_dict[key1]["tier 5.2"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.2"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.2"] += 1 elif softclipped_mutation_oneOfTwoSSCS: tier = "5.3" counter_tier53 += 1 - tier_dict[key1]["tier 5.3"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.3"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.3"] += 1 elif softclipped_mutation_oneMate: tier = "5.4" counter_tier54 += 1 - tier_dict[key1]["tier 5.4"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.4"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.4"] += 1 elif softclipped_mutation_oneMateOneSSCS: tier = "5.5" counter_tier55 += 1 - tier_dict[key1]["tier 5.5"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.5"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.5"] += 1 elif (contradictory): tier = "6" counter_tier6 += 1 - tier_dict[key1]["tier 6"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 6"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 6"] += 1 else: tier = "7" counter_tier7 += 1 - tier_dict[key1]["tier 7"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 7"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 7"] += 1 chrom, pos, ref_a, alt_a = re.split(r'\#', key1) var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) @@ -1114,13 +1315,12 @@ read_pos2 = read_len_median2 = None if (read_pos3 == -1): read_pos3 = read_len_median3 = None - line = (var_id, tier, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera) + line = (var_id, tier, variant_type, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera) # ws1.write_row(row, 0, line) # csv_writer.writerow(line) - line2 = ("", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera) + line2 = ("", "", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera) # ws1.write_row(row + 1, 0, line2) # csv_writer.writerow(line2) - # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), # {'type': 'formula', # 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), @@ -1155,12 +1355,19 @@ # write to file # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4 sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]]) + sum_highTiers_ref = sum([tier_dict_ref[key1][ij] for ij in list(sorted(tier_dict_ref[key1].keys()))[:6]]) correct_tier = False + correct_tier_ref = False if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] tier_dict[key1]["tier 4"] = 0 correct_tier = True + if tier_dict_ref[key1]["tier 4"] > 0 and sum_highTiers_ref > 0: + tier_dict_ref[key1]["tier 2.5"] = tier_dict_ref[key1]["tier 4"] + tier_dict_ref[key1]["tier 4"] = 0 + correct_tier_ref = True + for sample in change_tier_after_print: row_number = sample[0] line1 = sample[1] @@ -1168,44 +1375,47 @@ actual_high_tier = sample[3] current_tier = list(line1)[1] - if correct_tier and (current_tier == "4") and actual_high_tier: + if line1[2] == "alt" and correct_tier and (current_tier == "4") and actual_high_tier: + line1 = list(line1) + line1[1] = "2.5" + line1 = tuple(line1) + counter_tier25 += 1 + counter_tier4 -= 1 + if line1[2] == "ref" and correct_tier_ref and (current_tier == "4") and actual_high_tier: line1 = list(line1) line1[1] = "2.5" line1 = tuple(line1) counter_tier25 += 1 counter_tier4 -= 1 + ws1.write_row(row_number, 0, line1) csv_writer.writerow(line1) ws1.write_row(row_number + 1, 0, line2) csv_writer.writerow(line2) + if line1[2] == "alt": + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + elif line1[2] == "ref": + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), - {'type': 'formula', - 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), - 'format': format1, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), - {'type': 'formula', - 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), - 'format': format3, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), - {'type': 'formula', - 'criteria': '=$B${}>="3"'.format(row_number + 1), - 'format': format2, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) # sheet 2 if chimera_correction: - header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', - 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', - 'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', - 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7') + header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', + 'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)', + 'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)', + 'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)', + 'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)' + ) else: - header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', - 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', - 'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', - 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7') - + header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', + 'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)', + 'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)', + 'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)', + 'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)' + ) ws2.write_row(0, 0, header_line2) row = 0 @@ -1220,9 +1430,12 @@ alt_count = cvrg_dict[key1][1] cvrg = ref_count + alt_count + ref_tiers = tier_dict_ref[key1] + var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) lst = [var_id, cvrg] used_tiers = [] + used_tiers_ref = [t for k, t in sorted(ref_tiers.items())] cum_af = [] for key2, value2 in sorted(value1.items()): # calculate cummulative AF @@ -1233,24 +1446,25 @@ if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place continue lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)]) - lst.extend([(cvrg - sum(used_tiers[-10:])), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (cvrg - sum(used_tiers[-10:])))]) + lst.extend([(sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])), sum(used_tiers_ref[0:7]), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])))]) if chimera_correction: chimeras_all = chimera_dict[key1][1] new_alt = sum(used_tiers[0:7]) - chimeras_all fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:7]))) if fraction_chimeras is None: fraction_chimeras = 0. - new_cvrg = (cvrg - sum(used_tiers[-10:])) * (1. - fraction_chimeras) + new_cvrg = (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])) * (1. - fraction_chimeras) lst.extend([new_cvrg, chimeras_all, safe_div(new_alt, new_cvrg)]) lst.extend([alt_count, safe_div(alt_count, cvrg)]) lst.extend(used_tiers) - lst.extend(cum_af) + lst.extend(used_tiers_ref) + # lst.extend(cum_af) lst = tuple(lst) ws2.write_row(row + 1, 0, lst) if chimera_correction: - ws2.conditional_format('M{}:N{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$M$1="tier 1.1"', 'format': format12, 'multi_range': 'M{}:N{} M1:N1'.format(row + 2, row + 2)}) - ws2.conditional_format('O{}:S{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$O$1="tier 2.1"', 'format': format32, 'multi_range': 'O{}:S{} O1:S1'.format(row + 2, row + 2)}) - ws2.conditional_format('T{}:AC{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$T$1="tier 3.1"', 'format': format22, 'multi_range': 'T{}:AC{} T1:AC1'.format(row + 2, row + 2)}) + ws2.conditional_format('N{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$N$1="tier 1.1 (alt)"', 'format': format12, 'multi_range': 'N{}:O{} N1:O1 AE{}:AF{} AE1:AF1'.format(row + 2, row + 2, row + 2, row + 2)}) + ws2.conditional_format('P{}:T{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 2.1 (alt)"', 'format': format32, 'multi_range': 'P{}:T{} P1:T1 AG{}:AK{} AG1:AK1'.format(row + 2, row + 2, row + 2, row + 2)}) + ws2.conditional_format('U{}:AD{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$U$1="tier 3.1 (alt)"', 'format': format22, 'multi_range': 'U{}:AD{} U1:AD1 AL{}:AU{} AL1:AU1'.format(row + 2, row + 2, row + 2, row + 2)}) else: ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)}) ws2.conditional_format('L{}:P{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format32, 'multi_range': 'L{}:P{} L1:P1'.format(row + 2, row + 2)})