Mercurial > repos > mheinzl > variant_analyzer2
comparison read2mut.py @ 75:6ccff403db8a draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author | mheinzl |
---|---|
date | Tue, 23 Mar 2021 15:18:17 +0000 |
parents | eca1365eb42c |
children | 56f271641828 |
comparison
equal
deleted
inserted
replaced
74:5023186c2061 | 75:6ccff403db8a |
---|---|
129 real_start_end = {} | 129 real_start_end = {} |
130 i = 0 | 130 i = 0 |
131 mut_array = [] | 131 mut_array = [] |
132 | 132 |
133 for count, variant in enumerate(VCF(file1)): | 133 for count, variant in enumerate(VCF(file1)): |
134 #if count == 2000: | |
135 # break | |
136 chrom = variant.CHROM | 134 chrom = variant.CHROM |
137 stop_pos = variant.start | 135 stop_pos = variant.start |
138 #chrom_stop_pos = str(chrom) + "#" + str(stop_pos) | |
139 ref = variant.REF | 136 ref = variant.REF |
140 if len(variant.ALT) == 0: | 137 if len(variant.ALT) == 0: |
141 continue | 138 continue |
142 else: | 139 else: |
143 alt = variant.ALT[0] | 140 alt = variant.ALT[0] |
159 count_indel = 0 | 156 count_indel = 0 |
160 count_n = 0 | 157 count_n = 0 |
161 count_other = 0 | 158 count_other = 0 |
162 count_lowq = 0 | 159 count_lowq = 0 |
163 n = 0 | 160 n = 0 |
164 #print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), | |
165 # "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) | |
166 for pileupread in pileupcolumn.pileups: | 161 for pileupread in pileupcolumn.pileups: |
167 n += 1 | 162 n += 1 |
168 if not pileupread.is_del and not pileupread.is_refskip: | 163 if not pileupread.is_del and not pileupread.is_refskip: |
169 tag = pileupread.alignment.query_name | 164 tag = pileupread.alignment.query_name |
170 nuc = pileupread.alignment.query_sequence[pileupread.query_position] | 165 nuc = pileupread.alignment.query_sequence[pileupread.query_position] |
171 phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33 | 166 phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33 |
172 | |
173 # if read is softclipped, store real position in reference | 167 # if read is softclipped, store real position in reference |
174 if "S" in pileupread.alignment.cigarstring: | 168 if "S" in pileupread.alignment.cigarstring: |
175 # spftclipped at start | 169 # spftclipped at start |
176 if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring): | 170 if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring): |
177 start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0]) | 171 start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0]) |
178 end = pileupread.alignment.reference_end | 172 end = pileupread.alignment.reference_end |
179 # softclipped at end | 173 # softclipped at end |
196 mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1] | 190 mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1] |
197 reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] | 191 reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] |
198 mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] | 192 mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] |
199 real_start_end[chrom_stop_pos][tag] = [(start, end)] | 193 real_start_end[chrom_stop_pos][tag] = [(start, end)] |
200 else: | 194 else: |
201 mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) | 195 mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) |
202 reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) | 196 reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) |
203 mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) | 197 mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) |
204 real_start_end[chrom_stop_pos][tag].append((start, end)) | 198 real_start_end[chrom_stop_pos][tag].append((start, end)) |
205 if nuc == alt: | 199 if nuc == alt: |
206 count_alt += 1 | 200 count_alt += 1 |
207 if tag not in mut_read_dict: | 201 if tag not in mut_read_dict: |
208 mut_read_dict[tag] = {} | 202 mut_read_dict[tag] = {} |
218 else: | 212 else: |
219 count_other += 1 | 213 count_other += 1 |
220 else: | 214 else: |
221 count_indel += 1 | 215 count_indel += 1 |
222 | 216 |
223 #print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq)) | |
224 #else: | |
225 # print("indels are currently not evaluated") | |
226 mut_array = np.array(mut_array) | 217 mut_array = np.array(mut_array) |
227 for read in bam.fetch(until_eof=True): | 218 for read in bam.fetch(until_eof=True): |
228 if read.is_unmapped: | 219 if read.is_unmapped: |
229 pure_tag = read.query_name[:-5] | 220 pure_tag = read.query_name[:-5] |
230 nuc = "na" | 221 nuc = "na" |
240 bam.close() | 231 bam.close() |
241 | 232 |
242 # create pure_tags_dict | 233 # create pure_tags_dict |
243 pure_tags_dict = {} | 234 pure_tags_dict = {} |
244 for key1, value1 in sorted(mut_dict.items()): | 235 for key1, value1 in sorted(mut_dict.items()): |
245 #if len(np.where(np.array(['#'.join(str(i) for i in z) | |
246 # for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0]) == 0: | |
247 # continue | |
248 | |
249 i = np.where(np.array(['#'.join(str(i) for i in z) | 236 i = np.where(np.array(['#'.join(str(i) for i in z) |
250 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] | 237 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] |
251 ref = mut_array[i, 2] | 238 ref = mut_array[i, 2] |
252 alt = mut_array[i, 3] | 239 alt = mut_array[i, 3] |
253 pure_tags_dict[key1] = {} | 240 pure_tags_dict[key1] = {} |
334 alt = mut_array[i, 3] | 321 alt = mut_array[i, 3] |
335 dcs_median = cvrg_dict[key1][2] | 322 dcs_median = cvrg_dict[key1][2] |
336 whole_array = pure_tags_dict_short[key1].keys() | 323 whole_array = pure_tags_dict_short[key1].keys() |
337 | 324 |
338 tier_dict[key1] = {} | 325 tier_dict[key1] = {} |
339 values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0),("tier 2.5", 0), | 326 values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 2.5", 0), |
340 ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0), | 327 ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0), |
341 ("tier 6", 0), ("tier 7", 0)] | 328 ("tier 6", 0), ("tier 7", 0)] |
342 for k, v in values_tier_dict: | 329 for k, v in values_tier_dict: |
343 tier_dict[key1][k] = v | 330 tier_dict[key1][k] = v |
344 | 331 |
617 | 604 |
618 # mate 1 - SSCS ab | 605 # mate 1 - SSCS ab |
619 softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1] | 606 softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1] |
620 ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads | 607 ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads |
621 if any(ij is True for ij in softclipped_idx1): | 608 if any(ij is True for ij in softclipped_idx1): |
622 softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1] | 609 softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1] |
623 softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1] | 610 softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1] |
624 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] | 611 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] |
625 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] | 612 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] |
626 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] | 613 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] |
627 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 614 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
628 if any(ij is True for ij in softclipped_both_ends_idx1): | 615 if any(ij is True for ij in softclipped_both_ends_idx1): |
629 for nr, indx in enumerate(softclipped_both_ends_idx1): | 616 for nr, indx in enumerate(softclipped_both_ends_idx1): |
630 if indx: | 617 if indx: |
631 if dist_start_read1[nr] <= dist_end_read1[nr]: | 618 if dist_start_read1[nr] <= dist_end_read1[nr]: |
632 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number | 619 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number |
633 else: | 620 else: |
634 dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number | 621 dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number |
635 ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads | 622 ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads |
636 ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads | 623 ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads |
637 | 624 |
638 # mate 1 - SSCS ba | 625 # mate 1 - SSCS ba |
639 softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4] | 626 softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4] |
640 ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads | 627 ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads |
641 if any(ij is True for ij in softclipped_idx4): | 628 if any(ij is True for ij in softclipped_idx4): |
642 softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4] | 629 softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4] |
643 softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4] | 630 softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4] |
644 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] | 631 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] |
645 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] | 632 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] |
646 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] | 633 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] |
647 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 634 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
648 if any(ij is True for ij in softclipped_both_ends_idx4): | 635 if any(ij is True for ij in softclipped_both_ends_idx4): |
649 for nr, indx in enumerate(softclipped_both_ends_idx4): | 636 for nr, indx in enumerate(softclipped_both_ends_idx4): |
650 if indx: | 637 if indx: |
651 if dist_start_read4[nr] <= dist_end_read4[nr]: | 638 if dist_start_read4[nr] <= dist_end_read4[nr]: |
652 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number | 639 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number |
653 else: | 640 else: |
654 dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number | 641 dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number |
655 ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads | 642 ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads |
656 ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads | 643 ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads |
657 | 644 |
658 # mate 2 - SSCS ab | 645 # mate 2 - SSCS ab |
659 softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2] | 646 softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2] |
660 ratio2 = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) >= threshold_reads | 647 ratio2 = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) >= threshold_reads |
661 if any(ij is True for ij in softclipped_idx2): | 648 if any(ij is True for ij in softclipped_idx2): |
662 softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2] | 649 softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2] |
663 softclipped_start2 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs2] | 650 softclipped_start2 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs2] |
664 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2] | 651 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2] |
665 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)] | 652 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)] |
666 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] | 653 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] |
667 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 654 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
668 if any(ij is True for ij in softclipped_both_ends_idx2): | 655 if any(ij is True for ij in softclipped_both_ends_idx2): |
669 for nr, indx in enumerate(softclipped_both_ends_idx2): | 656 for nr, indx in enumerate(softclipped_both_ends_idx2): |
670 if indx: | 657 if indx: |
671 if dist_start_read2[nr] <= dist_end_read2[nr]: | 658 if dist_start_read2[nr] <= dist_end_read2[nr]: |
672 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number | 659 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number |
673 else: | 660 else: |
674 dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number | 661 dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number |
675 ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads | 662 ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads |
676 ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads | 663 ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads |
677 | 664 |
678 # mate 2 - SSCS ba | 665 # mate 2 - SSCS ba |
679 softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3] | 666 softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3] |
682 softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3] | 669 softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3] |
683 softclipped_start3 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs3] | 670 softclipped_start3 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs3] |
684 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3] | 671 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3] |
685 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)] | 672 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)] |
686 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] | 673 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] |
687 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 674 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
688 if any(ij is True for ij in softclipped_both_ends_idx3): | 675 if any(ij is True for ij in softclipped_both_ends_idx3): |
689 for nr, indx in enumerate(softclipped_both_ends_idx3): | 676 for nr, indx in enumerate(softclipped_both_ends_idx3): |
690 if indx: | 677 if indx: |
691 if dist_start_read3[nr] <= dist_end_read3[nr]: | 678 if dist_start_read3[nr] <= dist_end_read3[nr]: |
692 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh | 679 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh |
693 else: | 680 else: |
694 dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number | 681 dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number |
695 ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads | 682 ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads |
696 ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads | 683 ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads |
697 | 684 |
698 if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant | 685 if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant |
699 all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | | 686 all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | |
705 alt3ff = 0 | 692 alt3ff = 0 |
706 trimmed = False | 693 trimmed = False |
707 contradictory = True | 694 contradictory = True |
708 # softclipping tiers | 695 # softclipping tiers |
709 # information of both mates available --> all reads for both mates and SSCS are softclipped | 696 # information of both mates available --> all reads for both mates and SSCS are softclipped |
710 elif (ratio1 & ratio4 & ratio2 & ratio3 & | 697 elif (ratio1 & ratio4 & ratio2 & ratio3 & |
711 (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & | 698 (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & |
712 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available | 699 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available |
713 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 700 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
714 softclipped_mutation_allMates = True | 701 softclipped_mutation_allMates = True |
715 softclipped_mutation_oneOfTwoMates = False | 702 softclipped_mutation_oneOfTwoMates = False |
716 softclipped_mutation_oneOfTwoSSCS = False | 703 softclipped_mutation_oneOfTwoSSCS = False |
717 softclipped_mutation_oneOfTwoSSCS_diffMates = False | 704 softclipped_mutation_oneOfTwoSSCS_diffMates = False |
718 softclipped_mutation_oneMate = False | 705 softclipped_mutation_oneMate = False |
724 trimmed = False | 711 trimmed = False |
725 contradictory = False | 712 contradictory = False |
726 # information of both mates available --> only one mate softclipped | 713 # information of both mates available --> only one mate softclipped |
727 elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) | | 714 elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) | |
728 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) & | 715 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) & |
729 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available | 716 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available |
730 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 717 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
731 min_start1 = min(min([ij[0] for ij in ref_positions1]), min([ij[0] for ij in ref_positions4])) # red | 718 min_start1 = min(min([ij[0] for ij in ref_positions1]), min([ij[0] for ij in ref_positions4])) # red |
732 min_start2 = min(min([ij[0] for ij in ref_positions2]), min([ij[0] for ij in ref_positions3])) # blue | 719 min_start2 = min(min([ij[0] for ij in ref_positions2]), min([ij[0] for ij in ref_positions3])) # blue |
733 max_end1 = max(max([ij[1] for ij in ref_positions1]), max([ij[1] for ij in ref_positions4])) # red | 720 max_end1 = max(max([ij[1] for ij in ref_positions1]), max([ij[1] for ij in ref_positions4])) # red |
734 max_end2 = max(max([ij[1] for ij in ref_positions2]), max([ij[1] for ij in ref_positions3])) # blue | 721 max_end2 = max(max([ij[1] for ij in ref_positions2]), max([ij[1] for ij in ref_positions3])) # blue |
735 if (min_start1 > min_start2) or (max_end1 > max_end2): # if mate1 is red and mate2 is blue | 722 if (min_start1 > min_start2) or (max_end1 > max_end2): # if mate1 is red and mate2 is blue |
736 softclipped_mutation_oneOfTwoMates = False | 723 softclipped_mutation_oneOfTwoMates = False |
737 # blue mate at beginning softclipped | 724 # blue mate at beginning softclipped |
738 if min_start1 > min_start2: | 725 if min_start1 > min_start2: |
739 n_spacer_barcode = min_start1 - min_start2 | 726 n_spacer_barcode = min_start1 - min_start2 |
740 read_pos2 = read_pos2 - n_spacer_barcode | 727 read_pos2 = read_pos2 - n_spacer_barcode |
741 read_pos3 = read_pos3 - n_spacer_barcode | 728 read_pos3 = read_pos3 - n_spacer_barcode |
742 read_len_median2 = read_len_median2 - n_spacer_barcode | 729 read_len_median2 = read_len_median2 - n_spacer_barcode |
743 read_len_median3 = read_len_median3 - n_spacer_barcode | 730 read_len_median3 = read_len_median3 - n_spacer_barcode |
744 # red mate at end softclipped | 731 # red mate at end softclipped |
745 if max_end1 > max_end2: | 732 if max_end1 > max_end2: |
746 n_spacer_barcode = max_end1 - max_end2 | 733 n_spacer_barcode = max_end1 - max_end2 |
747 read_len_median1 = read_len_median1 - n_spacer_barcode | 734 read_len_median1 = read_len_median1 - n_spacer_barcode |
748 read_len_median4 = read_len_median4 - n_spacer_barcode | 735 read_len_median4 = read_len_median4 - n_spacer_barcode |
749 elif (min_start1 < min_start2) or (max_end1 < max_end2): # if mate1 is blue and mate2 is red | 736 elif (min_start1 < min_start2) or (max_end1 < max_end2): # if mate1 is blue and mate2 is red |
750 softclipped_mutation_oneOfTwoMates = False | 737 softclipped_mutation_oneOfTwoMates = False |
751 if min_start1 < min_start2: | 738 if min_start1 < min_start2: |
752 n_spacer_barcode = min_start2 - min_start1 | 739 n_spacer_barcode = min_start2 - min_start1 |
753 read_pos1 = read_pos1 - n_spacer_barcode | 740 read_pos1 = read_pos1 - n_spacer_barcode |
754 read_pos4 = read_pos4 - n_spacer_barcode | 741 read_pos4 = read_pos4 - n_spacer_barcode |
755 read_len_median1 = read_len_median1 - n_spacer_barcode | 742 read_len_median1 = read_len_median1 - n_spacer_barcode |
756 read_len_median4 = read_len_median4 - n_spacer_barcode | 743 read_len_median4 = read_len_median4 - n_spacer_barcode |
757 if max_end1 < max_end2: # if mate1 ends after mate 2 starts | 744 if max_end1 < max_end2: # if mate1 ends after mate 2 starts |
758 n_spacer_barcode = max_end2 - max_end1 | 745 n_spacer_barcode = max_end2 - max_end1 |
759 read_len_median2 = read_len_median2 - n_spacer_barcode | 746 read_len_median2 = read_len_median2 - n_spacer_barcode |
760 read_len_median3 = read_len_median3 - n_spacer_barcode | 747 read_len_median3 = read_len_median3 - n_spacer_barcode |
761 else: | 748 else: |
762 softclipped_mutation_oneOfTwoMates = True | 749 softclipped_mutation_oneOfTwoMates = True |
768 contradictory = False | 755 contradictory = False |
769 softclipped_mutation_allMates = False | 756 softclipped_mutation_allMates = False |
770 softclipped_mutation_oneOfTwoSSCS = False | 757 softclipped_mutation_oneOfTwoSSCS = False |
771 softclipped_mutation_oneMate = False | 758 softclipped_mutation_oneMate = False |
772 softclipped_mutation_oneMateOneSSCS = False | 759 softclipped_mutation_oneMateOneSSCS = False |
773 | 760 |
774 if softclipped_mutation_oneOfTwoMates is False: # check trimming tier | 761 if softclipped_mutation_oneOfTwoMates is False: # check trimming tier |
775 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): | 762 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): |
776 beg1 = total1new | 763 beg1 = total1new |
777 total1new = 0 | 764 total1new = 0 |
778 alt1ff = 0 | 765 alt1ff = 0 |
779 alt1f = 0 | 766 alt1f = 0 |
803 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) | 790 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) |
804 | 791 |
805 # information of both mates available --> only one mate softclipped | 792 # information of both mates available --> only one mate softclipped |
806 elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & | 793 elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & |
807 ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & | 794 ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & |
808 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available | 795 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available |
809 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 796 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
810 softclipped_mutation_allMates = False | 797 softclipped_mutation_allMates = False |
811 softclipped_mutation_oneOfTwoMates = False | 798 softclipped_mutation_oneOfTwoMates = False |
812 softclipped_mutation_oneOfTwoSSCS = True | 799 softclipped_mutation_oneOfTwoSSCS = True |
813 softclipped_mutation_oneOfTwoSSCS_diffMates = False | 800 softclipped_mutation_oneOfTwoSSCS_diffMates = False |
814 softclipped_mutation_oneMate = False | 801 softclipped_mutation_oneMate = False |
821 contradictory = False | 808 contradictory = False |
822 | 809 |
823 # information of one mate available --> all reads of one mate are softclipped | 810 # information of one mate available --> all reads of one mate are softclipped |
824 elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & | 811 elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & |
825 all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) | | 812 all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) | |
826 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & | 813 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & |
827 all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available | 814 all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available |
828 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 815 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
829 softclipped_mutation_allMates = False | 816 softclipped_mutation_allMates = False |
830 softclipped_mutation_oneOfTwoMates = False | 817 softclipped_mutation_oneOfTwoMates = False |
831 softclipped_mutation_oneOfTwoSSCS = False | 818 softclipped_mutation_oneOfTwoSSCS = False |
832 softclipped_mutation_oneOfTwoSSCS_diffMates = False | 819 softclipped_mutation_oneOfTwoSSCS_diffMates = False |
833 softclipped_mutation_oneMate = True | 820 softclipped_mutation_oneMate = True |
839 trimmed = False | 826 trimmed = False |
840 contradictory = False | 827 contradictory = False |
841 # information of one mate available --> only one SSCS is softclipped | 828 # information of one mate available --> only one SSCS is softclipped |
842 elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & | 829 elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & |
843 (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) | | 830 (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) | |
844 (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & | 831 (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & |
845 (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available | 832 (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available |
846 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 833 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
847 softclipped_mutation_allMates = False | 834 softclipped_mutation_allMates = False |
848 softclipped_mutation_oneOfTwoMates = False | 835 softclipped_mutation_oneOfTwoMates = False |
849 softclipped_mutation_oneOfTwoSSCS = False | 836 softclipped_mutation_oneOfTwoSSCS = False |
850 softclipped_mutation_oneOfTwoSSCS_diffMates = False | 837 softclipped_mutation_oneOfTwoSSCS_diffMates = False |
851 softclipped_mutation_oneMate = False | 838 softclipped_mutation_oneMate = False |
959 elif (trimmed): | 946 elif (trimmed): |
960 tier = "4" | 947 tier = "4" |
961 counter_tier4 += 1 | 948 counter_tier4 += 1 |
962 tier_dict[key1]["tier 4"] += 1 | 949 tier_dict[key1]["tier 4"] += 1 |
963 | 950 |
964 # assign tiers | 951 # assign tiers |
965 if ((all(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & | 952 if ((all(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & |
966 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | | 953 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | |
967 (all(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & | 954 (all(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & |
968 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): | 955 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): |
969 trimmed_actual_high_tier = True | 956 trimmed_actual_high_tier = True |
1110 if (read_pos2 == -1): | 1097 if (read_pos2 == -1): |
1111 read_pos2 = read_len_median2 = None | 1098 read_pos2 = read_len_median2 = None |
1112 if (read_pos3 == -1): | 1099 if (read_pos3 == -1): |
1113 read_pos3 = read_len_median3 = None | 1100 read_pos3 = read_len_median3 = None |
1114 line = (var_id, tier, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera) | 1101 line = (var_id, tier, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera) |
1115 #ws1.write_row(row, 0, line) | 1102 # ws1.write_row(row, 0, line) |
1116 #csv_writer.writerow(line) | 1103 # csv_writer.writerow(line) |
1117 line2 = ("", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera) | 1104 line2 = ("", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera) |
1118 #ws1.write_row(row + 1, 0, line2) | 1105 # ws1.write_row(row + 1, 0, line2) |
1119 #csv_writer.writerow(line2) | 1106 # csv_writer.writerow(line2) |
1120 | 1107 |
1121 #ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), | 1108 # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), |
1122 # {'type': 'formula', | 1109 # {'type': 'formula', |
1123 # 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), | 1110 # 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), |
1124 # 'format': format1, | 1111 # 'format': format1, |
1125 # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) | 1112 # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) |
1126 #ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), | 1113 # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), |
1127 # {'type': 'formula', | 1114 # {'type': 'formula', |
1128 # 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), | 1115 # 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), |
1129 # 'format': format3, | 1116 # 'format': format3, |
1130 # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) | 1117 # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) |
1131 #ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), | 1118 # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), |
1132 # {'type': 'formula', | 1119 # {'type': 'formula', |
1133 # 'criteria': '=$B${}>="3"'.format(row + 1), | 1120 # 'criteria': '=$B${}>="3"'.format(row + 1), |
1134 # 'format': format2, | 1121 # 'format': format2, |
1135 # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) | 1122 # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) |
1136 change_tier_after_print.append((row, line, line2, trimmed_actual_high_tier)) | 1123 change_tier_after_print.append((row, line, line2, trimmed_actual_high_tier)) |
1175 csv_writer.writerow(line1) | 1162 csv_writer.writerow(line1) |
1176 ws1.write_row(row_number + 1, 0, line2) | 1163 ws1.write_row(row_number + 1, 0, line2) |
1177 csv_writer.writerow(line2) | 1164 csv_writer.writerow(line2) |
1178 | 1165 |
1179 ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), | 1166 ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), |
1180 {'type': 'formula', | 1167 {'type': 'formula', |
1181 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), | 1168 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), |
1182 'format': format1, | 1169 'format': format1, |
1183 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) | 1170 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) |
1184 ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), | 1171 ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), |
1185 {'type': 'formula', | 1172 {'type': 'formula', |
1186 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), | 1173 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), |
1187 'format': format3, | 1174 'format': format3, |
1188 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) | 1175 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) |
1189 ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), | 1176 ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), |
1190 {'type': 'formula', | 1177 {'type': 'formula', |
1191 'criteria': '=$B${}>="3"'.format(row_number + 1), | 1178 'criteria': '=$B${}>="3"'.format(row_number + 1), |
1192 'format': format2, | 1179 'format': format2, |
1193 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) | 1180 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) |
1194 | |
1195 # sheet 2 | 1181 # sheet 2 |
1196 if chimera_correction: | 1182 if chimera_correction: |
1197 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', | 1183 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', |
1198 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', | 1184 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', |
1199 'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', | 1185 'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', |
1200 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7') | 1186 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7') |
1201 else: | 1187 else: |
1202 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', | 1188 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', |
1203 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', | 1189 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', |
1204 'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', | 1190 'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', |
1205 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7') | 1191 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7') |
1226 # calculate cummulative AF | 1212 # calculate cummulative AF |
1227 used_tiers.append(value2) | 1213 used_tiers.append(value2) |
1228 if len(used_tiers) > 1: | 1214 if len(used_tiers) > 1: |
1229 cum = safe_div(sum(used_tiers), cvrg) | 1215 cum = safe_div(sum(used_tiers), cvrg) |
1230 cum_af.append(cum) | 1216 cum_af.append(cum) |
1231 if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place | 1217 if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place |
1232 continue | 1218 continue |
1233 lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)]) | 1219 lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)]) |
1234 lst.extend([(cvrg - sum(used_tiers[-10:])), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (cvrg - sum(used_tiers[-10:])))]) | 1220 lst.extend([(cvrg - sum(used_tiers[-10:])), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (cvrg - sum(used_tiers[-10:])))]) |
1235 if chimera_correction: | 1221 if chimera_correction: |
1236 chimeras_all = chimera_dict[key1][1] | 1222 chimeras_all = chimera_dict[key1][1] |
1237 new_alt = sum(used_tiers[0:7]) - chimeras_all | 1223 new_alt = sum(used_tiers[0:7]) - chimeras_all |
1255 ws2.conditional_format('Q{}:Z{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$Q$1="tier 3.1"', 'format': format22, 'multi_range': 'Q{}:Z{} Q1:Z1'.format(row + 2, row + 2)}) | 1241 ws2.conditional_format('Q{}:Z{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$Q$1="tier 3.1"', 'format': format22, 'multi_range': 'Q{}:Z{} Q1:Z1'.format(row + 2, row + 2)}) |
1256 row += 1 | 1242 row += 1 |
1257 | 1243 |
1258 # sheet 3 | 1244 # sheet 3 |
1259 sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21), | 1245 sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21), |
1260 ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24), ("tier 2.5", counter_tier25), | 1246 ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24), ("tier 2.5", counter_tier25), |
1261 ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4", counter_tier4), | 1247 ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4", counter_tier4), |
1262 ("tier 5.1", counter_tier51), ("tier 5.2", counter_tier52), | 1248 ("tier 5.1", counter_tier51), ("tier 5.2", counter_tier52), |
1263 ("tier 5.3", counter_tier53), ("tier 5.4", counter_tier54), ("tier 5.5", counter_tier55), ("tier 6", counter_tier6), ("tier 7", counter_tier7)] | 1249 ("tier 5.3", counter_tier53), ("tier 5.4", counter_tier54), ("tier 5.5", counter_tier55), ("tier 6", counter_tier6), ("tier 7", counter_tier7)] |
1264 | 1250 |
1265 header = ("tier", "count") | 1251 header = ("tier", "count") |
1401 csv_data.close() | 1387 csv_data.close() |
1402 | 1388 |
1403 | 1389 |
1404 if __name__ == '__main__': | 1390 if __name__ == '__main__': |
1405 sys.exit(read2mut(sys.argv)) | 1391 sys.exit(read2mut(sys.argv)) |
1406 |