Mercurial > repos > mheinzl > variant_analyzer2
comparison read2mut.py @ 61:3722268ffac5 draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author | mheinzl |
---|---|
date | Wed, 17 Mar 2021 13:14:40 +0000 |
parents | 9ce53bf0931c |
children | 66c1245436b9 |
comparison
equal
deleted
inserted
replaced
60:9ce53bf0931c | 61:3722268ffac5 |
---|---|
124 mut_dict = {} | 124 mut_dict = {} |
125 mut_read_pos_dict = {} | 125 mut_read_pos_dict = {} |
126 mut_read_dict = {} | 126 mut_read_dict = {} |
127 reads_dict = {} | 127 reads_dict = {} |
128 mut_read_cigar_dict = {} | 128 mut_read_cigar_dict = {} |
129 real_start_end = {} | |
129 i = 0 | 130 i = 0 |
130 mut_array = [] | 131 mut_array = [] |
131 | 132 |
132 for count, variant in enumerate(VCF(file1)): | 133 for count, variant in enumerate(VCF(file1)): |
133 #if count == 2000: | 134 #if count == 2000: |
147 i += 1 | 148 i += 1 |
148 mut_dict[chrom_stop_pos] = {} | 149 mut_dict[chrom_stop_pos] = {} |
149 mut_read_pos_dict[chrom_stop_pos] = {} | 150 mut_read_pos_dict[chrom_stop_pos] = {} |
150 reads_dict[chrom_stop_pos] = {} | 151 reads_dict[chrom_stop_pos] = {} |
151 mut_read_cigar_dict[chrom_stop_pos] = {} | 152 mut_read_cigar_dict[chrom_stop_pos] = {} |
153 real_start_end[chrom_stop_pos] = {} | |
152 | 154 |
153 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): | 155 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): |
154 if pileupcolumn.reference_pos == stop_pos: | 156 if pileupcolumn.reference_pos == stop_pos: |
155 count_alt = 0 | 157 count_alt = 0 |
156 count_ref = 0 | 158 count_ref = 0 |
165 n += 1 | 167 n += 1 |
166 if not pileupread.is_del and not pileupread.is_refskip: | 168 if not pileupread.is_del and not pileupread.is_refskip: |
167 tag = pileupread.alignment.query_name | 169 tag = pileupread.alignment.query_name |
168 nuc = pileupread.alignment.query_sequence[pileupread.query_position] | 170 nuc = pileupread.alignment.query_sequence[pileupread.query_position] |
169 phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33 | 171 phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33 |
172 | |
173 # if read is softclipped, store real position in reference | |
174 if "S" in pileupread.alignment.cigarstring: | |
175 # spftclipped at start | |
176 if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring): | |
177 start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0]) | |
178 end = pileupread.alignment.reference_end | |
179 # softclipped at end | |
180 elif re.search(r"S$", pileupread.alignment.cigarstring): | |
181 end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2]) | |
182 start = pileupread.alignment.reference_start | |
183 else: | |
184 end = pileupread.alignment.reference_end | |
185 start = pileupread.alignment.reference_start | |
186 | |
170 if phred < phred_score: | 187 if phred < phred_score: |
171 nuc = "lowQ" | 188 nuc = "lowQ" |
172 if tag not in mut_dict[chrom_stop_pos]: | 189 if tag not in mut_dict[chrom_stop_pos]: |
173 mut_dict[chrom_stop_pos][tag] = {} | 190 mut_dict[chrom_stop_pos][tag] = {} |
174 if nuc in mut_dict[chrom_stop_pos][tag]: | 191 if nuc in mut_dict[chrom_stop_pos][tag]: |
177 mut_dict[chrom_stop_pos][tag][nuc] = 1 | 194 mut_dict[chrom_stop_pos][tag][nuc] = 1 |
178 if tag not in mut_read_pos_dict[chrom_stop_pos]: | 195 if tag not in mut_read_pos_dict[chrom_stop_pos]: |
179 mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1] | 196 mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1] |
180 reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] | 197 reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] |
181 mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] | 198 mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] |
182 | 199 real_start_end[chrom_stop_pos][tag] = [(start, end)] |
183 #alignedRefPositions = pileupread.get_reference_positions()[0] | |
184 else: | 200 else: |
185 mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) | 201 mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) |
186 reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) | 202 reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) |
187 mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) | 203 mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) |
204 real_start_end[chrom_stop_pos][tag].append((start, end)) | |
188 if nuc == alt: | 205 if nuc == alt: |
189 count_alt += 1 | 206 count_alt += 1 |
190 if tag not in mut_read_dict: | 207 if tag not in mut_read_dict: |
191 mut_read_dict[tag] = {} | 208 mut_read_dict[tag] = {} |
192 mut_read_dict[tag][chrom_stop_pos] = (alt, ref) | 209 mut_read_dict[tag][chrom_stop_pos] = (alt, ref) |
531 end_read1 = end_read2 = end_read3 = end_read4 = [] | 548 end_read1 = end_read2 = end_read3 = end_read4 = [] |
532 if key2[:-5] + '.ab.1' in mut_read_pos_dict[key1].keys(): | 549 if key2[:-5] + '.ab.1' in mut_read_pos_dict[key1].keys(): |
533 read_pos1 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.1'])) | 550 read_pos1 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.1'])) |
534 read_len_median1 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.1'])) | 551 read_len_median1 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.1'])) |
535 cigars_dcs1 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.1'] | 552 cigars_dcs1 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.1'] |
536 #print(mut_read_cigar_dict[key1][key2[:-5] + '.ab.1']) | |
537 pos_read1 = mut_read_pos_dict[key1][key2[:-5] + '.ab.1'] | 553 pos_read1 = mut_read_pos_dict[key1][key2[:-5] + '.ab.1'] |
538 #print(cigars_dcs1) | |
539 end_read1 = reads_dict[key1][key2[:-5] + '.ab.1'] | 554 end_read1 = reads_dict[key1][key2[:-5] + '.ab.1'] |
555 ref_positions1 = real_start_end[key1][key2[:-5] + '.ab.1'] | |
540 if key2[:-5] + '.ab.2' in mut_read_pos_dict[key1].keys(): | 556 if key2[:-5] + '.ab.2' in mut_read_pos_dict[key1].keys(): |
541 read_pos2 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.2'])) | 557 read_pos2 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.2'])) |
542 read_len_median2 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.2'])) | 558 read_len_median2 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.2'])) |
543 cigars_dcs2 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.2'] | 559 cigars_dcs2 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.2'] |
544 pos_read2 = mut_read_pos_dict[key1][key2[:-5] + '.ab.2'] | 560 pos_read2 = mut_read_pos_dict[key1][key2[:-5] + '.ab.2'] |
545 end_read2 = reads_dict[key1][key2[:-5] + '.ab.2'] | 561 end_read2 = reads_dict[key1][key2[:-5] + '.ab.2'] |
562 ref_positions2 = real_start_end[key1][key2[:-5] + '.ab.2'] | |
546 if key2[:-5] + '.ba.1' in mut_read_pos_dict[key1].keys(): | 563 if key2[:-5] + '.ba.1' in mut_read_pos_dict[key1].keys(): |
547 read_pos3 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.1'])) | 564 read_pos3 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.1'])) |
548 read_len_median3 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.1'])) | 565 read_len_median3 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.1'])) |
549 cigars_dcs3 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.1'] | 566 cigars_dcs3 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.1'] |
550 pos_read3 = mut_read_pos_dict[key1][key2[:-5] + '.ba.1'] | 567 pos_read3 = mut_read_pos_dict[key1][key2[:-5] + '.ba.1'] |
551 end_read3 = reads_dict[key1][key2[:-5] + '.ba.1'] | 568 end_read3 = reads_dict[key1][key2[:-5] + '.ba.1'] |
569 ref_positions3 = real_start_end[key1][key2[:-5] + '.ba.1'] | |
552 if key2[:-5] + '.ba.2' in mut_read_pos_dict[key1].keys(): | 570 if key2[:-5] + '.ba.2' in mut_read_pos_dict[key1].keys(): |
553 read_pos4 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.2'])) | 571 read_pos4 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.2'])) |
554 read_len_median4 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.2'])) | 572 read_len_median4 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.2'])) |
555 #print(mut_read_cigar_dict[key1][key2[:-5] + '.ba.2']) | |
556 cigars_dcs4 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.2'] | 573 cigars_dcs4 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.2'] |
557 | |
558 pos_read4 = mut_read_pos_dict[key1][key2[:-5] + '.ba.2'] | 574 pos_read4 = mut_read_pos_dict[key1][key2[:-5] + '.ba.2'] |
559 #print(cigars_dcs4) | |
560 end_read4 = reads_dict[key1][key2[:-5] + '.ba.2'] | 575 end_read4 = reads_dict[key1][key2[:-5] + '.ba.2'] |
576 ref_positions4 = real_start_end[key1][key2[:-5] + '.ba.2'] | |
561 | 577 |
562 used_keys.append(key2[:-5]) | 578 used_keys.append(key2[:-5]) |
563 counts_mut += 1 | 579 counts_mut += 1 |
564 if (alt1f + alt2f + alt3f + alt4f) > 0.5: | 580 if (alt1f + alt2f + alt3f + alt4f) > 0.5: |
581 total1new_trim, total2new_trim, total3new_trim, total4new_trim = total1new, total2new, total3new, total4new | |
565 if total1new == 0: | 582 if total1new == 0: |
566 ref1f = alt1f = None | 583 ref1f = alt1f = None |
567 alt1ff = -1 | 584 alt1ff = -1 |
585 alt1ff_trim = -1 | |
568 else: | 586 else: |
569 alt1ff = alt1f | 587 alt1ff = alt1f |
588 alt1ff_trim = alt1f | |
570 if total2new == 0: | 589 if total2new == 0: |
571 ref2f = alt2f = None | 590 ref2f = alt2f = None |
572 alt2ff = -1 | 591 alt2ff = -1 |
592 alt2ff_trim = -1 | |
573 else: | 593 else: |
574 alt2ff = alt2f | 594 alt2ff = alt2f |
595 alt2ff_trim = alt2f | |
575 if total3new == 0: | 596 if total3new == 0: |
576 ref3f = alt3f = None | 597 ref3f = alt3f = None |
577 alt3ff = -1 | 598 alt3ff = -1 |
599 alt3ff_trim = -1 | |
578 else: | 600 else: |
579 alt3ff = alt3f | 601 alt3ff = alt3f |
602 alt3ff_trim = alt3f | |
580 if total4new == 0: | 603 if total4new == 0: |
581 ref4f = alt4f = None | 604 ref4f = alt4f = None |
582 alt4ff = -1 | 605 alt4ff = -1 |
606 alt4ff_trim = alt4f | |
583 else: | 607 else: |
584 alt4ff = alt4f | 608 alt4ff = alt4f |
609 alt4ff_trim = alt4f | |
585 | 610 |
586 beg1 = beg4 = beg2 = beg3 = 0 | 611 beg1 = beg4 = beg2 = beg3 = 0 |
587 | 612 |
588 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) | 613 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) |
589 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) | 614 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) |
594 softclipped_mutation_oneOfTwoMates = False | 619 softclipped_mutation_oneOfTwoMates = False |
595 softclipped_mutation_oneOfTwoSSCS = False | 620 softclipped_mutation_oneOfTwoSSCS = False |
596 softclipped_mutation_oneOfTwoSSCS_diffMates = False | 621 softclipped_mutation_oneOfTwoSSCS_diffMates = False |
597 softclipped_mutation_oneMate = False | 622 softclipped_mutation_oneMate = False |
598 softclipped_mutation_oneMateOneSSCS = False | 623 softclipped_mutation_oneMateOneSSCS = False |
599 print() | 624 |
600 print(key1, cigars_dcs1, cigars_dcs4, cigars_dcs2, cigars_dcs3) | 625 trimmed_actual_high_tier = False |
626 | |
601 dist_start_read1 = dist_start_read2 = dist_start_read3 = dist_start_read4 = [] | 627 dist_start_read1 = dist_start_read2 = dist_start_read3 = dist_start_read4 = [] |
602 dist_end_read1 = dist_end_read2 = dist_end_read3 = dist_end_read4 = [] | 628 dist_end_read1 = dist_end_read2 = dist_end_read3 = dist_end_read4 = [] |
603 ratio_dist_start1 = ratio_dist_start2 = ratio_dist_start3 = ratio_dist_start4 = False | 629 ratio_dist_start1 = ratio_dist_start2 = ratio_dist_start3 = ratio_dist_start4 = False |
604 ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False | 630 ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False |
605 | 631 |
606 # mate 1 - SSCS ab | 632 # mate 1 - SSCS ab |
607 softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1] | 633 softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1] |
608 ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads | 634 ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads |
609 | |
610 if any(ij is True for ij in softclipped_idx1): | 635 if any(ij is True for ij in softclipped_idx1): |
611 softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1] | 636 softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1] |
612 softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1] | 637 softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1] |
613 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] | 638 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] |
614 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] | 639 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] |
615 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] | 640 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] |
616 | |
617 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 641 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
618 if any(ij is True for ij in softclipped_both_ends_idx1): | 642 if any(ij is True for ij in softclipped_both_ends_idx1): |
619 print(softclipped_both_ends_idx1) | 643 print(softclipped_both_ends_idx1) |
620 for nr, indx in enumerate(softclipped_both_ends_idx1): | 644 for nr, indx in enumerate(softclipped_both_ends_idx1): |
621 if indx: | 645 if indx: |
623 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number | 647 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number |
624 else: | 648 else: |
625 dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number | 649 dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number |
626 ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads | 650 ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads |
627 ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads | 651 ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads |
628 print(key1, "mate1 ab", dist_start_read1, dist_end_read1, cigars_dcs1, ratio1, ratio_dist_start1, ratio_dist_end1) | |
629 | 652 |
630 # mate 1 - SSCS ba | 653 # mate 1 - SSCS ba |
631 softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4] | 654 softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4] |
632 ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads | 655 ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads |
633 if any(ij is True for ij in softclipped_idx4): | 656 if any(ij is True for ij in softclipped_idx4): |
634 softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4] | 657 softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4] |
635 softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4] | 658 softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4] |
636 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] | 659 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] |
637 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] | 660 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] |
638 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] | 661 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] |
639 | |
640 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 662 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
641 if any(ij is True for ij in softclipped_both_ends_idx4): | 663 if any(ij is True for ij in softclipped_both_ends_idx4): |
642 print(softclipped_both_ends_idx4) | 664 print(softclipped_both_ends_idx4) |
643 for nr, indx in enumerate(softclipped_both_ends_idx4): | 665 for nr, indx in enumerate(softclipped_both_ends_idx4): |
644 if indx: | 666 if indx: |
646 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number | 668 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number |
647 else: | 669 else: |
648 dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number | 670 dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number |
649 ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads | 671 ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads |
650 ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads | 672 ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads |
651 print(key1, "mate1 ba", dist_start_read4, dist_end_read4,cigars_dcs4, ratio4, ratio_dist_start4, ratio_dist_end4) | |
652 | 673 |
653 # mate 2 - SSCS ab | 674 # mate 2 - SSCS ab |
654 softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2] | 675 softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2] |
655 #print(sum(softclipped_idx2)) | |
656 ratio2 = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) >= threshold_reads | 676 ratio2 = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) >= threshold_reads |
657 if any(ij is True for ij in softclipped_idx2): | 677 if any(ij is True for ij in softclipped_idx2): |
658 softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2] | 678 softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2] |
659 softclipped_start2 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs2] | 679 softclipped_start2 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs2] |
660 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2] | 680 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2] |
661 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)] | 681 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)] |
662 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] | 682 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] |
663 | |
664 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 683 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
665 if any(ij is True for ij in softclipped_both_ends_idx2): | 684 if any(ij is True for ij in softclipped_both_ends_idx2): |
666 print(softclipped_both_ends_idx2) | 685 print(softclipped_both_ends_idx2) |
667 for nr, indx in enumerate(softclipped_both_ends_idx2): | 686 for nr, indx in enumerate(softclipped_both_ends_idx2): |
668 if indx: | 687 if indx: |
669 if dist_start_read2[nr] <= dist_end_read2[nr]: | 688 if dist_start_read2[nr] <= dist_end_read2[nr]: |
670 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number | 689 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number |
671 else: | 690 else: |
672 dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number | 691 dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number |
673 ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads | 692 ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads |
674 #print(ratio_dist_end2) | |
675 #print([True if x <= thr else False for x in ratio_dist_end2]) | |
676 ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads | 693 ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads |
677 print(key1, "mate2 ab", dist_start_read2, dist_end_read2,cigars_dcs2, ratio2, ratio_dist_start2, ratio_dist_end2) | |
678 | 694 |
679 # mate 2 - SSCS ba | 695 # mate 2 - SSCS ba |
680 softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3] | 696 softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3] |
681 ratio3 = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) >= threshold_reads | 697 ratio3 = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) >= threshold_reads |
682 if any(ij is True for ij in softclipped_idx3): | 698 if any(ij is True for ij in softclipped_idx3): |
683 softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3] | 699 softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3] |
684 softclipped_start3 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs3] | 700 softclipped_start3 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs3] |
685 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3] | 701 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3] |
686 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)] | 702 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)] |
687 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] | 703 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] |
688 | |
689 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 704 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
690 if any(ij is True for ij in softclipped_both_ends_idx3): | 705 if any(ij is True for ij in softclipped_both_ends_idx3): |
691 print(softclipped_both_ends_idx3) | 706 print(softclipped_both_ends_idx3) |
692 for nr, indx in enumerate(softclipped_both_ends_idx3): | 707 for nr, indx in enumerate(softclipped_both_ends_idx3): |
693 if indx: | 708 if indx: |
694 if dist_start_read3[nr] <= dist_end_read3[nr]: | 709 if dist_start_read3[nr] <= dist_end_read3[nr]: |
695 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh | 710 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh |
696 else: | 711 else: |
697 dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number | 712 dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number |
698 #print([True if x <= thr else False for x in dist_start_read3]) | |
699 ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads | 713 ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads |
700 ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads | 714 ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads |
701 print(key1, "mate2 ba", dist_start_read3, dist_end_read3,cigars_dcs3, ratio3, ratio_dist_start3, ratio_dist_end3) | |
702 | 715 |
703 if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant | 716 if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant |
704 all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | | 717 all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | |
705 (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & | 718 (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & |
706 all(float(ij) == 0. for ij in [alt1ff, alt4ff]))): | 719 all(float(ij) == 0. for ij in [alt1ff, alt4ff]))): |
726 alt4ff = 0 | 739 alt4ff = 0 |
727 alt2ff = 0 | 740 alt2ff = 0 |
728 alt3ff = 0 | 741 alt3ff = 0 |
729 trimmed = False | 742 trimmed = False |
730 contradictory = False | 743 contradictory = False |
731 print(key1, "softclipped_mutation_allMates", softclipped_mutation_allMates) | |
732 # information of both mates available --> only one mate softclipped | 744 # information of both mates available --> only one mate softclipped |
733 elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) | | 745 elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) | |
734 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) & | 746 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) & |
735 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available | 747 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available |
736 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 748 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
749 min_start1 = min(min([ij[0] for ij in ref_positions1]), min([ij[0] for ij in ref_positions4])) # red | |
750 min_start2 = min(min([ij[0] for ij in ref_positions2]), min([ij[0] for ij in ref_positions3])) # blue | |
751 max_end1 = max(max([ij[1] for ij in ref_positions1]), max([ij[1] for ij in ref_positions4])) # red | |
752 max_end2 = max(max([ij[1] for ij in ref_positions2]), max([ij[1] for ij in ref_positions3])) # blue | |
753 if (min_start1 > min_start2) or (max_end1 > max_end2): # if mate1 is red and mate2 is blue | |
754 softclipped_mutation_oneOfTwoMates = False | |
755 # blue mate at beginning softclipped | |
756 if min_start1 > min_start2: | |
757 n_spacer_barcode = min_start1 - min_start2 | |
758 read_pos2 = read_pos2 - n_spacer_barcode | |
759 read_pos3 = read_pos3 - n_spacer_barcode | |
760 read_len_median2 = read_len_median2 - n_spacer_barcode | |
761 read_len_median3 = read_len_median3 - n_spacer_barcode | |
762 # red mate at end softclipped | |
763 if max_end1 > max_end2: | |
764 n_spacer_barcode = max_end1 - max_end2 | |
765 read_len_median1 = read_len_median1 - n_spacer_barcode | |
766 read_len_median4 = read_len_median4 - n_spacer_barcode | |
767 elif (min_start1 < min_start2) or (max_end1 < max_end2): # if mate1 is blue and mate2 is red | |
768 softclipped_mutation_oneOfTwoMates = False | |
769 if min_start1 < min_start2: | |
770 n_spacer_barcode = min_start2 - min_start1 | |
771 read_pos1 = read_pos1 - n_spacer_barcode | |
772 read_pos4 = read_pos4 - n_spacer_barcode | |
773 read_len_median1 = read_len_median1 - n_spacer_barcode | |
774 read_len_median4 = read_len_median4 - n_spacer_barcode | |
775 if max_end1 < max_end2: # if mate1 ends after mate 2 starts | |
776 n_spacer_barcode = max_end2 - max_end1 | |
777 read_len_median2 = read_len_median2 - n_spacer_barcode | |
778 read_len_median3 = read_len_median3 - n_spacer_barcode | |
779 else: | |
780 softclipped_mutation_oneOfTwoMates = True | |
781 alt1ff = 0 | |
782 alt4ff = 0 | |
783 alt2ff = 0 | |
784 alt3ff = 0 | |
785 trimmed = False | |
786 contradictory = False | |
737 softclipped_mutation_allMates = False | 787 softclipped_mutation_allMates = False |
738 softclipped_mutation_oneOfTwoMates = True | |
739 softclipped_mutation_oneOfTwoSSCS = False | 788 softclipped_mutation_oneOfTwoSSCS = False |
740 softclipped_mutation_oneOfTwoSSCS_diffMates = False | |
741 softclipped_mutation_oneMate = False | 789 softclipped_mutation_oneMate = False |
742 softclipped_mutation_oneMateOneSSCS = False | 790 softclipped_mutation_oneMateOneSSCS = False |
743 alt1ff = 0 | 791 |
744 alt4ff = 0 | 792 if softclipped_mutation_oneOfTwoMates is False: # check trimming tier |
745 alt2ff = 0 | 793 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): |
746 alt3ff = 0 | 794 beg1 = total1new |
747 trimmed = False | 795 total1new = 0 |
748 contradictory = False | 796 alt1ff = 0 |
749 print(key1, "softclipped_mutation_oneOfTwoMates", softclipped_mutation_oneOfTwoMates) | 797 alt1f = 0 |
798 trimmed = True | |
799 | |
800 if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))): | |
801 beg4 = total4new | |
802 total4new = 0 | |
803 alt4ff = 0 | |
804 alt4f = 0 | |
805 trimmed = True | |
806 | |
807 if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))): | |
808 beg2 = total2new | |
809 total2new = 0 | |
810 alt2ff = 0 | |
811 alt2f = 0 | |
812 trimmed = True | |
813 | |
814 if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))): | |
815 beg3 = total3new | |
816 total3new = 0 | |
817 alt3ff = 0 | |
818 alt3f = 0 | |
819 trimmed = True | |
820 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) | |
821 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) | |
822 | |
750 # information of both mates available --> only one mate softclipped | 823 # information of both mates available --> only one mate softclipped |
751 elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & | 824 elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & |
752 ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & | 825 ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & |
753 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available | 826 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available |
754 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 827 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
762 alt4ff = 0 | 835 alt4ff = 0 |
763 alt2ff = 0 | 836 alt2ff = 0 |
764 alt3ff = 0 | 837 alt3ff = 0 |
765 trimmed = False | 838 trimmed = False |
766 contradictory = False | 839 contradictory = False |
767 print(key1, "softclipped_mutation_oneOfTwoSSCS", softclipped_mutation_oneOfTwoSSCS, [alt1ff, alt2ff, alt3ff, alt4ff]) | |
768 | 840 |
769 # information of one mate available --> all reads of one mate are softclipped | 841 # information of one mate available --> all reads of one mate are softclipped |
770 elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & | 842 elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & |
771 all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) | | 843 all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) | |
772 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & | 844 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & |
773 all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available | 845 all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available |
774 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 846 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
775 #if ((((len(dist_start_read1) > 0 | len(dist_end_read1) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1))) & | |
776 # ((len(dist_start_read4) > 0 | len(dist_end_read4) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4)))) | | |
777 # (((len(dist_start_read2) > 0 | len(dist_end_read2) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2))) & | |
778 # ((len(dist_start_read3) > 0 | len(dist_end_read3) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3))))): | |
779 softclipped_mutation_allMates = False | 847 softclipped_mutation_allMates = False |
780 softclipped_mutation_oneOfTwoMates = False | 848 softclipped_mutation_oneOfTwoMates = False |
781 softclipped_mutation_oneOfTwoSSCS = False | 849 softclipped_mutation_oneOfTwoSSCS = False |
782 softclipped_mutation_oneOfTwoSSCS_diffMates = False | 850 softclipped_mutation_oneOfTwoSSCS_diffMates = False |
783 softclipped_mutation_oneMate = True | 851 softclipped_mutation_oneMate = True |
786 alt4ff = 0 | 854 alt4ff = 0 |
787 alt2ff = 0 | 855 alt2ff = 0 |
788 alt3ff = 0 | 856 alt3ff = 0 |
789 trimmed = False | 857 trimmed = False |
790 contradictory = False | 858 contradictory = False |
791 print(key1, "softclipped_mutation_oneMate", softclipped_mutation_oneMate) | |
792 # information of one mate available --> only one SSCS is softclipped | 859 # information of one mate available --> only one SSCS is softclipped |
793 elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & | 860 elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & |
794 (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) | | 861 (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) | |
795 (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & | 862 (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & |
796 (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available | 863 (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available |
797 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 864 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
798 #if ((all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1)) | | |
799 # all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4))) | | |
800 # (all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2)) | | |
801 # all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3)))): | |
802 softclipped_mutation_allMates = False | 865 softclipped_mutation_allMates = False |
803 softclipped_mutation_oneOfTwoMates = False | 866 softclipped_mutation_oneOfTwoMates = False |
804 softclipped_mutation_oneOfTwoSSCS = False | 867 softclipped_mutation_oneOfTwoSSCS = False |
805 softclipped_mutation_oneOfTwoSSCS_diffMates = False | 868 softclipped_mutation_oneOfTwoSSCS_diffMates = False |
806 softclipped_mutation_oneMate = False | 869 softclipped_mutation_oneMate = False |
809 alt4ff = 0 | 872 alt4ff = 0 |
810 alt2ff = 0 | 873 alt2ff = 0 |
811 alt3ff = 0 | 874 alt3ff = 0 |
812 trimmed = False | 875 trimmed = False |
813 contradictory = False | 876 contradictory = False |
814 print(key1, "softclipped_mutation_oneMateOneSSCS", softclipped_mutation_oneMateOneSSCS) | |
815 | 877 |
816 else: | 878 else: |
817 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): | 879 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): |
818 beg1 = total1new | 880 beg1 = total1new |
819 total1new = 0 | 881 total1new = 0 |
841 alt3ff = 0 | 903 alt3ff = 0 |
842 alt3f = 0 | 904 alt3f = 0 |
843 trimmed = True | 905 trimmed = True |
844 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) | 906 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) |
845 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) | 907 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) |
846 | |
847 | |
848 #sum_highTiers = sum([tier_dict[key1][ij] for ij in tier_dict[key1].keys()[:6]]) | |
849 | 908 |
850 # assign tiers | 909 # assign tiers |
851 if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & | 910 if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & |
852 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | | 911 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | |
853 (all(int(ij) >= 3 for ij in [total2new, total3new]) & | 912 (all(int(ij) >= 3 for ij in [total2new, total3new]) & |
913 all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): | 972 all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): |
914 tier = "3.2" | 973 tier = "3.2" |
915 counter_tier32 += 1 | 974 counter_tier32 += 1 |
916 tier_dict[key1]["tier 3.2"] += 1 | 975 tier_dict[key1]["tier 3.2"] += 1 |
917 | 976 |
918 #elif (trimmed) and (sum_highTiers > 1): | |
919 # tier = "2.5" | |
920 # counter_tier25 += 1 | |
921 # tier_dict[key1]["tier 2.5"] += 1 | |
922 | |
923 elif (trimmed): | 977 elif (trimmed): |
924 tier = "4" | 978 tier = "4" |
925 counter_tier4 += 1 | 979 counter_tier4 += 1 |
926 tier_dict[key1]["tier 4"] += 1 | 980 tier_dict[key1]["tier 4"] += 1 |
981 | |
982 # assign tiers | |
983 if ((all(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & | |
984 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | | |
985 (all(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & | |
986 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): | |
987 trimmed_actual_high_tier = True | |
988 elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) & | |
989 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & | |
990 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & | |
991 all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): | |
992 trimmed_actual_high_tier = True | |
993 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & | |
994 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & | |
995 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | | |
996 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & | |
997 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & | |
998 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): | |
999 trimmed_actual_high_tier = True | |
1000 elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) & | |
1001 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])): | |
1002 trimmed_actual_high_tier = True | |
1003 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & | |
1004 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & | |
1005 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]) & | |
1006 any(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim])) | | |
1007 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & | |
1008 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & | |
1009 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]) & | |
1010 any(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]))): | |
1011 trimmed_actual_high_tier = True | |
1012 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & | |
1013 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | | |
1014 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & | |
1015 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): | |
1016 trimmed_actual_high_tier = True | |
1017 else: | |
1018 trimmed_actual_high_tier = False | |
927 | 1019 |
928 elif softclipped_mutation_allMates: | 1020 elif softclipped_mutation_allMates: |
929 tier = "5.1" | 1021 tier = "5.1" |
930 counter_tier51 += 1 | 1022 counter_tier51 += 1 |
931 tier_dict[key1]["tier 5.1"] += 1 | 1023 tier_dict[key1]["tier 5.1"] += 1 |
1006 | 1098 |
1007 # tags which have identical parts: | 1099 # tags which have identical parts: |
1008 if min_value == 0 or dist2 == 0: | 1100 if min_value == 0 or dist2 == 0: |
1009 min_tags_list_zeros.append(tag) | 1101 min_tags_list_zeros.append(tag) |
1010 chimera_tags.append(max_tag) | 1102 chimera_tags.append(max_tag) |
1011 # chimeric = True | |
1012 # else: | |
1013 # chimeric = False | |
1014 | |
1015 # if mate_b is False: | |
1016 # text = "pos {}: sample tag: {}; HD a = {}; HD b' = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, min_value, dist2, list(max_tag), chimeric) | |
1017 # else: | |
1018 # text = "pos {}: sample tag: {}; HD a' = {}; HD b = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, dist2, min_value, list(max_tag), chimeric) | |
1019 i += 1 | 1103 i += 1 |
1020 chimera_tags = [x for x in chimera_tags if x != []] | 1104 chimera_tags = [x for x in chimera_tags if x != []] |
1021 chimera_tags_new = [] | 1105 chimera_tags_new = [] |
1022 for i in chimera_tags: | 1106 for i in chimera_tags: |
1023 if len(i) > 1: | 1107 if len(i) > 1: |
1070 # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) | 1154 # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) |
1071 #if trimmed: | 1155 #if trimmed: |
1072 #if key1 not in list(change_tier_after_print.keys()): | 1156 #if key1 not in list(change_tier_after_print.keys()): |
1073 # change_tier_after_print[key1] = [((row, line, line2))] | 1157 # change_tier_after_print[key1] = [((row, line, line2))] |
1074 #else: | 1158 #else: |
1075 change_tier_after_print.append((row, line, line2)) | 1159 change_tier_after_print.append((row, line, line2, trimmed_actual_high_tier)) |
1076 | 1160 |
1077 row += 3 | 1161 row += 3 |
1078 | 1162 |
1079 if chimera_correction: | 1163 if chimera_correction: |
1080 chimeric_dcs_high_tiers = 0 | 1164 chimeric_dcs_high_tiers = 0 |
1099 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: | 1183 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: |
1100 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] | 1184 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] |
1101 tier_dict[key1]["tier 4"] = 0 | 1185 tier_dict[key1]["tier 4"] = 0 |
1102 correct_tier = True | 1186 correct_tier = True |
1103 | 1187 |
1104 #lines = change_tier_after_print[key1] | |
1105 for sample in change_tier_after_print: | 1188 for sample in change_tier_after_print: |
1106 row_number = sample[0] | 1189 row_number = sample[0] |
1107 line1 = sample[1] | 1190 line1 = sample[1] |
1108 line2 = sample[2] | 1191 line2 = sample[2] |
1109 | 1192 actual_high_tier = sample[3] |
1110 if correct_tier and list(line1)[1] == "4": | 1193 current_tier = list(line1)[1] |
1194 | |
1195 if correct_tier and (current_tier == "4") and actual_high_tier: | |
1111 line1 = list(line1) | 1196 line1 = list(line1) |
1112 line1[1] = "2.5" | 1197 line1[1] = "2.5" |
1113 line1 = tuple(line1) | 1198 line1 = tuple(line1) |
1114 counter_tier25 += 1 | 1199 counter_tier25 += 1 |
1115 counter_tier4 -= 1 | 1200 counter_tier4 -= 1 |