comparison read2mut.py @ 61:3722268ffac5 draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author mheinzl
date Wed, 17 Mar 2021 13:14:40 +0000
parents 9ce53bf0931c
children 66c1245436b9
comparison
equal deleted inserted replaced
60:9ce53bf0931c 61:3722268ffac5
124 mut_dict = {} 124 mut_dict = {}
125 mut_read_pos_dict = {} 125 mut_read_pos_dict = {}
126 mut_read_dict = {} 126 mut_read_dict = {}
127 reads_dict = {} 127 reads_dict = {}
128 mut_read_cigar_dict = {} 128 mut_read_cigar_dict = {}
129 real_start_end = {}
129 i = 0 130 i = 0
130 mut_array = [] 131 mut_array = []
131 132
132 for count, variant in enumerate(VCF(file1)): 133 for count, variant in enumerate(VCF(file1)):
133 #if count == 2000: 134 #if count == 2000:
147 i += 1 148 i += 1
148 mut_dict[chrom_stop_pos] = {} 149 mut_dict[chrom_stop_pos] = {}
149 mut_read_pos_dict[chrom_stop_pos] = {} 150 mut_read_pos_dict[chrom_stop_pos] = {}
150 reads_dict[chrom_stop_pos] = {} 151 reads_dict[chrom_stop_pos] = {}
151 mut_read_cigar_dict[chrom_stop_pos] = {} 152 mut_read_cigar_dict[chrom_stop_pos] = {}
153 real_start_end[chrom_stop_pos] = {}
152 154
153 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): 155 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
154 if pileupcolumn.reference_pos == stop_pos: 156 if pileupcolumn.reference_pos == stop_pos:
155 count_alt = 0 157 count_alt = 0
156 count_ref = 0 158 count_ref = 0
165 n += 1 167 n += 1
166 if not pileupread.is_del and not pileupread.is_refskip: 168 if not pileupread.is_del and not pileupread.is_refskip:
167 tag = pileupread.alignment.query_name 169 tag = pileupread.alignment.query_name
168 nuc = pileupread.alignment.query_sequence[pileupread.query_position] 170 nuc = pileupread.alignment.query_sequence[pileupread.query_position]
169 phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33 171 phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33
172
173 # if read is softclipped, store real position in reference
174 if "S" in pileupread.alignment.cigarstring:
175 # spftclipped at start
176 if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring):
177 start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0])
178 end = pileupread.alignment.reference_end
179 # softclipped at end
180 elif re.search(r"S$", pileupread.alignment.cigarstring):
181 end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2])
182 start = pileupread.alignment.reference_start
183 else:
184 end = pileupread.alignment.reference_end
185 start = pileupread.alignment.reference_start
186
170 if phred < phred_score: 187 if phred < phred_score:
171 nuc = "lowQ" 188 nuc = "lowQ"
172 if tag not in mut_dict[chrom_stop_pos]: 189 if tag not in mut_dict[chrom_stop_pos]:
173 mut_dict[chrom_stop_pos][tag] = {} 190 mut_dict[chrom_stop_pos][tag] = {}
174 if nuc in mut_dict[chrom_stop_pos][tag]: 191 if nuc in mut_dict[chrom_stop_pos][tag]:
177 mut_dict[chrom_stop_pos][tag][nuc] = 1 194 mut_dict[chrom_stop_pos][tag][nuc] = 1
178 if tag not in mut_read_pos_dict[chrom_stop_pos]: 195 if tag not in mut_read_pos_dict[chrom_stop_pos]:
179 mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1] 196 mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1]
180 reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] 197 reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)]
181 mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] 198 mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring]
182 199 real_start_end[chrom_stop_pos][tag] = [(start, end)]
183 #alignedRefPositions = pileupread.get_reference_positions()[0]
184 else: 200 else:
185 mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) 201 mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1)
186 reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) 202 reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence))
187 mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) 203 mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring)
204 real_start_end[chrom_stop_pos][tag].append((start, end))
188 if nuc == alt: 205 if nuc == alt:
189 count_alt += 1 206 count_alt += 1
190 if tag not in mut_read_dict: 207 if tag not in mut_read_dict:
191 mut_read_dict[tag] = {} 208 mut_read_dict[tag] = {}
192 mut_read_dict[tag][chrom_stop_pos] = (alt, ref) 209 mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
531 end_read1 = end_read2 = end_read3 = end_read4 = [] 548 end_read1 = end_read2 = end_read3 = end_read4 = []
532 if key2[:-5] + '.ab.1' in mut_read_pos_dict[key1].keys(): 549 if key2[:-5] + '.ab.1' in mut_read_pos_dict[key1].keys():
533 read_pos1 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.1'])) 550 read_pos1 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.1']))
534 read_len_median1 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.1'])) 551 read_len_median1 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.1']))
535 cigars_dcs1 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.1'] 552 cigars_dcs1 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.1']
536 #print(mut_read_cigar_dict[key1][key2[:-5] + '.ab.1'])
537 pos_read1 = mut_read_pos_dict[key1][key2[:-5] + '.ab.1'] 553 pos_read1 = mut_read_pos_dict[key1][key2[:-5] + '.ab.1']
538 #print(cigars_dcs1)
539 end_read1 = reads_dict[key1][key2[:-5] + '.ab.1'] 554 end_read1 = reads_dict[key1][key2[:-5] + '.ab.1']
555 ref_positions1 = real_start_end[key1][key2[:-5] + '.ab.1']
540 if key2[:-5] + '.ab.2' in mut_read_pos_dict[key1].keys(): 556 if key2[:-5] + '.ab.2' in mut_read_pos_dict[key1].keys():
541 read_pos2 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.2'])) 557 read_pos2 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.2']))
542 read_len_median2 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.2'])) 558 read_len_median2 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.2']))
543 cigars_dcs2 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.2'] 559 cigars_dcs2 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.2']
544 pos_read2 = mut_read_pos_dict[key1][key2[:-5] + '.ab.2'] 560 pos_read2 = mut_read_pos_dict[key1][key2[:-5] + '.ab.2']
545 end_read2 = reads_dict[key1][key2[:-5] + '.ab.2'] 561 end_read2 = reads_dict[key1][key2[:-5] + '.ab.2']
562 ref_positions2 = real_start_end[key1][key2[:-5] + '.ab.2']
546 if key2[:-5] + '.ba.1' in mut_read_pos_dict[key1].keys(): 563 if key2[:-5] + '.ba.1' in mut_read_pos_dict[key1].keys():
547 read_pos3 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.1'])) 564 read_pos3 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.1']))
548 read_len_median3 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.1'])) 565 read_len_median3 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.1']))
549 cigars_dcs3 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.1'] 566 cigars_dcs3 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.1']
550 pos_read3 = mut_read_pos_dict[key1][key2[:-5] + '.ba.1'] 567 pos_read3 = mut_read_pos_dict[key1][key2[:-5] + '.ba.1']
551 end_read3 = reads_dict[key1][key2[:-5] + '.ba.1'] 568 end_read3 = reads_dict[key1][key2[:-5] + '.ba.1']
569 ref_positions3 = real_start_end[key1][key2[:-5] + '.ba.1']
552 if key2[:-5] + '.ba.2' in mut_read_pos_dict[key1].keys(): 570 if key2[:-5] + '.ba.2' in mut_read_pos_dict[key1].keys():
553 read_pos4 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.2'])) 571 read_pos4 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.2']))
554 read_len_median4 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.2'])) 572 read_len_median4 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.2']))
555 #print(mut_read_cigar_dict[key1][key2[:-5] + '.ba.2'])
556 cigars_dcs4 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.2'] 573 cigars_dcs4 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.2']
557
558 pos_read4 = mut_read_pos_dict[key1][key2[:-5] + '.ba.2'] 574 pos_read4 = mut_read_pos_dict[key1][key2[:-5] + '.ba.2']
559 #print(cigars_dcs4)
560 end_read4 = reads_dict[key1][key2[:-5] + '.ba.2'] 575 end_read4 = reads_dict[key1][key2[:-5] + '.ba.2']
576 ref_positions4 = real_start_end[key1][key2[:-5] + '.ba.2']
561 577
562 used_keys.append(key2[:-5]) 578 used_keys.append(key2[:-5])
563 counts_mut += 1 579 counts_mut += 1
564 if (alt1f + alt2f + alt3f + alt4f) > 0.5: 580 if (alt1f + alt2f + alt3f + alt4f) > 0.5:
581 total1new_trim, total2new_trim, total3new_trim, total4new_trim = total1new, total2new, total3new, total4new
565 if total1new == 0: 582 if total1new == 0:
566 ref1f = alt1f = None 583 ref1f = alt1f = None
567 alt1ff = -1 584 alt1ff = -1
585 alt1ff_trim = -1
568 else: 586 else:
569 alt1ff = alt1f 587 alt1ff = alt1f
588 alt1ff_trim = alt1f
570 if total2new == 0: 589 if total2new == 0:
571 ref2f = alt2f = None 590 ref2f = alt2f = None
572 alt2ff = -1 591 alt2ff = -1
592 alt2ff_trim = -1
573 else: 593 else:
574 alt2ff = alt2f 594 alt2ff = alt2f
595 alt2ff_trim = alt2f
575 if total3new == 0: 596 if total3new == 0:
576 ref3f = alt3f = None 597 ref3f = alt3f = None
577 alt3ff = -1 598 alt3ff = -1
599 alt3ff_trim = -1
578 else: 600 else:
579 alt3ff = alt3f 601 alt3ff = alt3f
602 alt3ff_trim = alt3f
580 if total4new == 0: 603 if total4new == 0:
581 ref4f = alt4f = None 604 ref4f = alt4f = None
582 alt4ff = -1 605 alt4ff = -1
606 alt4ff_trim = alt4f
583 else: 607 else:
584 alt4ff = alt4f 608 alt4ff = alt4f
609 alt4ff_trim = alt4f
585 610
586 beg1 = beg4 = beg2 = beg3 = 0 611 beg1 = beg4 = beg2 = beg3 = 0
587 612
588 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) 613 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
589 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) 614 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
594 softclipped_mutation_oneOfTwoMates = False 619 softclipped_mutation_oneOfTwoMates = False
595 softclipped_mutation_oneOfTwoSSCS = False 620 softclipped_mutation_oneOfTwoSSCS = False
596 softclipped_mutation_oneOfTwoSSCS_diffMates = False 621 softclipped_mutation_oneOfTwoSSCS_diffMates = False
597 softclipped_mutation_oneMate = False 622 softclipped_mutation_oneMate = False
598 softclipped_mutation_oneMateOneSSCS = False 623 softclipped_mutation_oneMateOneSSCS = False
599 print() 624
600 print(key1, cigars_dcs1, cigars_dcs4, cigars_dcs2, cigars_dcs3) 625 trimmed_actual_high_tier = False
626
601 dist_start_read1 = dist_start_read2 = dist_start_read3 = dist_start_read4 = [] 627 dist_start_read1 = dist_start_read2 = dist_start_read3 = dist_start_read4 = []
602 dist_end_read1 = dist_end_read2 = dist_end_read3 = dist_end_read4 = [] 628 dist_end_read1 = dist_end_read2 = dist_end_read3 = dist_end_read4 = []
603 ratio_dist_start1 = ratio_dist_start2 = ratio_dist_start3 = ratio_dist_start4 = False 629 ratio_dist_start1 = ratio_dist_start2 = ratio_dist_start3 = ratio_dist_start4 = False
604 ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False 630 ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False
605 631
606 # mate 1 - SSCS ab 632 # mate 1 - SSCS ab
607 softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1] 633 softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1]
608 ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads 634 ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads
609
610 if any(ij is True for ij in softclipped_idx1): 635 if any(ij is True for ij in softclipped_idx1):
611 softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1] 636 softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1]
612 softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1] 637 softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1]
613 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] 638 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1]
614 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] 639 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)]
615 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] 640 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)]
616
617 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping 641 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
618 if any(ij is True for ij in softclipped_both_ends_idx1): 642 if any(ij is True for ij in softclipped_both_ends_idx1):
619 print(softclipped_both_ends_idx1) 643 print(softclipped_both_ends_idx1)
620 for nr, indx in enumerate(softclipped_both_ends_idx1): 644 for nr, indx in enumerate(softclipped_both_ends_idx1):
621 if indx: 645 if indx:
623 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number 647 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number
624 else: 648 else:
625 dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number 649 dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number
626 ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads 650 ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads
627 ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads 651 ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads
628 print(key1, "mate1 ab", dist_start_read1, dist_end_read1, cigars_dcs1, ratio1, ratio_dist_start1, ratio_dist_end1)
629 652
630 # mate 1 - SSCS ba 653 # mate 1 - SSCS ba
631 softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4] 654 softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4]
632 ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads 655 ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads
633 if any(ij is True for ij in softclipped_idx4): 656 if any(ij is True for ij in softclipped_idx4):
634 softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4] 657 softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4]
635 softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4] 658 softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4]
636 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] 659 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4]
637 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] 660 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)]
638 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] 661 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)]
639
640 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping 662 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
641 if any(ij is True for ij in softclipped_both_ends_idx4): 663 if any(ij is True for ij in softclipped_both_ends_idx4):
642 print(softclipped_both_ends_idx4) 664 print(softclipped_both_ends_idx4)
643 for nr, indx in enumerate(softclipped_both_ends_idx4): 665 for nr, indx in enumerate(softclipped_both_ends_idx4):
644 if indx: 666 if indx:
646 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number 668 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number
647 else: 669 else:
648 dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number 670 dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number
649 ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads 671 ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads
650 ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads 672 ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads
651 print(key1, "mate1 ba", dist_start_read4, dist_end_read4,cigars_dcs4, ratio4, ratio_dist_start4, ratio_dist_end4)
652 673
653 # mate 2 - SSCS ab 674 # mate 2 - SSCS ab
654 softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2] 675 softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2]
655 #print(sum(softclipped_idx2))
656 ratio2 = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) >= threshold_reads 676 ratio2 = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) >= threshold_reads
657 if any(ij is True for ij in softclipped_idx2): 677 if any(ij is True for ij in softclipped_idx2):
658 softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2] 678 softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2]
659 softclipped_start2 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs2] 679 softclipped_start2 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs2]
660 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2] 680 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2]
661 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)] 681 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)]
662 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] 682 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)]
663
664 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping 683 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
665 if any(ij is True for ij in softclipped_both_ends_idx2): 684 if any(ij is True for ij in softclipped_both_ends_idx2):
666 print(softclipped_both_ends_idx2) 685 print(softclipped_both_ends_idx2)
667 for nr, indx in enumerate(softclipped_both_ends_idx2): 686 for nr, indx in enumerate(softclipped_both_ends_idx2):
668 if indx: 687 if indx:
669 if dist_start_read2[nr] <= dist_end_read2[nr]: 688 if dist_start_read2[nr] <= dist_end_read2[nr]:
670 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number 689 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number
671 else: 690 else:
672 dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number 691 dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number
673 ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads 692 ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads
674 #print(ratio_dist_end2)
675 #print([True if x <= thr else False for x in ratio_dist_end2])
676 ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads 693 ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads
677 print(key1, "mate2 ab", dist_start_read2, dist_end_read2,cigars_dcs2, ratio2, ratio_dist_start2, ratio_dist_end2)
678 694
679 # mate 2 - SSCS ba 695 # mate 2 - SSCS ba
680 softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3] 696 softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3]
681 ratio3 = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) >= threshold_reads 697 ratio3 = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) >= threshold_reads
682 if any(ij is True for ij in softclipped_idx3): 698 if any(ij is True for ij in softclipped_idx3):
683 softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3] 699 softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3]
684 softclipped_start3 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs3] 700 softclipped_start3 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs3]
685 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3] 701 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3]
686 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)] 702 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)]
687 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] 703 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)]
688
689 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping 704 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
690 if any(ij is True for ij in softclipped_both_ends_idx3): 705 if any(ij is True for ij in softclipped_both_ends_idx3):
691 print(softclipped_both_ends_idx3) 706 print(softclipped_both_ends_idx3)
692 for nr, indx in enumerate(softclipped_both_ends_idx3): 707 for nr, indx in enumerate(softclipped_both_ends_idx3):
693 if indx: 708 if indx:
694 if dist_start_read3[nr] <= dist_end_read3[nr]: 709 if dist_start_read3[nr] <= dist_end_read3[nr]:
695 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh 710 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh
696 else: 711 else:
697 dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number 712 dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number
698 #print([True if x <= thr else False for x in dist_start_read3])
699 ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads 713 ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads
700 ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads 714 ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads
701 print(key1, "mate2 ba", dist_start_read3, dist_end_read3,cigars_dcs3, ratio3, ratio_dist_start3, ratio_dist_end3)
702 715
703 if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant 716 if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant
704 all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | 717 all(float(ij) == 0. for ij in [alt2ff, alt3ff])) |
705 (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & 718 (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) &
706 all(float(ij) == 0. for ij in [alt1ff, alt4ff]))): 719 all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
726 alt4ff = 0 739 alt4ff = 0
727 alt2ff = 0 740 alt2ff = 0
728 alt3ff = 0 741 alt3ff = 0
729 trimmed = False 742 trimmed = False
730 contradictory = False 743 contradictory = False
731 print(key1, "softclipped_mutation_allMates", softclipped_mutation_allMates)
732 # information of both mates available --> only one mate softclipped 744 # information of both mates available --> only one mate softclipped
733 elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) | 745 elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) |
734 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) & 746 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) &
735 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available 747 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
736 # if distance between softclipping and mutation is at start or end of the read smaller than threshold 748 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
749 min_start1 = min(min([ij[0] for ij in ref_positions1]), min([ij[0] for ij in ref_positions4])) # red
750 min_start2 = min(min([ij[0] for ij in ref_positions2]), min([ij[0] for ij in ref_positions3])) # blue
751 max_end1 = max(max([ij[1] for ij in ref_positions1]), max([ij[1] for ij in ref_positions4])) # red
752 max_end2 = max(max([ij[1] for ij in ref_positions2]), max([ij[1] for ij in ref_positions3])) # blue
753 if (min_start1 > min_start2) or (max_end1 > max_end2): # if mate1 is red and mate2 is blue
754 softclipped_mutation_oneOfTwoMates = False
755 # blue mate at beginning softclipped
756 if min_start1 > min_start2:
757 n_spacer_barcode = min_start1 - min_start2
758 read_pos2 = read_pos2 - n_spacer_barcode
759 read_pos3 = read_pos3 - n_spacer_barcode
760 read_len_median2 = read_len_median2 - n_spacer_barcode
761 read_len_median3 = read_len_median3 - n_spacer_barcode
762 # red mate at end softclipped
763 if max_end1 > max_end2:
764 n_spacer_barcode = max_end1 - max_end2
765 read_len_median1 = read_len_median1 - n_spacer_barcode
766 read_len_median4 = read_len_median4 - n_spacer_barcode
767 elif (min_start1 < min_start2) or (max_end1 < max_end2): # if mate1 is blue and mate2 is red
768 softclipped_mutation_oneOfTwoMates = False
769 if min_start1 < min_start2:
770 n_spacer_barcode = min_start2 - min_start1
771 read_pos1 = read_pos1 - n_spacer_barcode
772 read_pos4 = read_pos4 - n_spacer_barcode
773 read_len_median1 = read_len_median1 - n_spacer_barcode
774 read_len_median4 = read_len_median4 - n_spacer_barcode
775 if max_end1 < max_end2: # if mate1 ends after mate 2 starts
776 n_spacer_barcode = max_end2 - max_end1
777 read_len_median2 = read_len_median2 - n_spacer_barcode
778 read_len_median3 = read_len_median3 - n_spacer_barcode
779 else:
780 softclipped_mutation_oneOfTwoMates = True
781 alt1ff = 0
782 alt4ff = 0
783 alt2ff = 0
784 alt3ff = 0
785 trimmed = False
786 contradictory = False
737 softclipped_mutation_allMates = False 787 softclipped_mutation_allMates = False
738 softclipped_mutation_oneOfTwoMates = True
739 softclipped_mutation_oneOfTwoSSCS = False 788 softclipped_mutation_oneOfTwoSSCS = False
740 softclipped_mutation_oneOfTwoSSCS_diffMates = False
741 softclipped_mutation_oneMate = False 789 softclipped_mutation_oneMate = False
742 softclipped_mutation_oneMateOneSSCS = False 790 softclipped_mutation_oneMateOneSSCS = False
743 alt1ff = 0 791
744 alt4ff = 0 792 if softclipped_mutation_oneOfTwoMates is False: # check trimming tier
745 alt2ff = 0 793 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))):
746 alt3ff = 0 794 beg1 = total1new
747 trimmed = False 795 total1new = 0
748 contradictory = False 796 alt1ff = 0
749 print(key1, "softclipped_mutation_oneOfTwoMates", softclipped_mutation_oneOfTwoMates) 797 alt1f = 0
798 trimmed = True
799
800 if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))):
801 beg4 = total4new
802 total4new = 0
803 alt4ff = 0
804 alt4f = 0
805 trimmed = True
806
807 if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))):
808 beg2 = total2new
809 total2new = 0
810 alt2ff = 0
811 alt2f = 0
812 trimmed = True
813
814 if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))):
815 beg3 = total3new
816 total3new = 0
817 alt3ff = 0
818 alt3f = 0
819 trimmed = True
820 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
821 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
822
750 # information of both mates available --> only one mate softclipped 823 # information of both mates available --> only one mate softclipped
751 elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & 824 elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
752 ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & 825 ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
753 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available 826 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
754 # if distance between softclipping and mutation is at start or end of the read smaller than threshold 827 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
762 alt4ff = 0 835 alt4ff = 0
763 alt2ff = 0 836 alt2ff = 0
764 alt3ff = 0 837 alt3ff = 0
765 trimmed = False 838 trimmed = False
766 contradictory = False 839 contradictory = False
767 print(key1, "softclipped_mutation_oneOfTwoSSCS", softclipped_mutation_oneOfTwoSSCS, [alt1ff, alt2ff, alt3ff, alt4ff])
768 840
769 # information of one mate available --> all reads of one mate are softclipped 841 # information of one mate available --> all reads of one mate are softclipped
770 elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & 842 elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) &
771 all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) | 843 all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) |
772 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & 844 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
773 all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available 845 all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available
774 # if distance between softclipping and mutation is at start or end of the read smaller than threshold 846 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
775 #if ((((len(dist_start_read1) > 0 | len(dist_end_read1) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1))) &
776 # ((len(dist_start_read4) > 0 | len(dist_end_read4) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4)))) |
777 # (((len(dist_start_read2) > 0 | len(dist_end_read2) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2))) &
778 # ((len(dist_start_read3) > 0 | len(dist_end_read3) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3))))):
779 softclipped_mutation_allMates = False 847 softclipped_mutation_allMates = False
780 softclipped_mutation_oneOfTwoMates = False 848 softclipped_mutation_oneOfTwoMates = False
781 softclipped_mutation_oneOfTwoSSCS = False 849 softclipped_mutation_oneOfTwoSSCS = False
782 softclipped_mutation_oneOfTwoSSCS_diffMates = False 850 softclipped_mutation_oneOfTwoSSCS_diffMates = False
783 softclipped_mutation_oneMate = True 851 softclipped_mutation_oneMate = True
786 alt4ff = 0 854 alt4ff = 0
787 alt2ff = 0 855 alt2ff = 0
788 alt3ff = 0 856 alt3ff = 0
789 trimmed = False 857 trimmed = False
790 contradictory = False 858 contradictory = False
791 print(key1, "softclipped_mutation_oneMate", softclipped_mutation_oneMate)
792 # information of one mate available --> only one SSCS is softclipped 859 # information of one mate available --> only one SSCS is softclipped
793 elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & 860 elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
794 (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) | 861 (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) |
795 (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & 862 (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
796 (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available 863 (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available
797 # if distance between softclipping and mutation is at start or end of the read smaller than threshold 864 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
798 #if ((all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1)) |
799 # all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4))) |
800 # (all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2)) |
801 # all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3)))):
802 softclipped_mutation_allMates = False 865 softclipped_mutation_allMates = False
803 softclipped_mutation_oneOfTwoMates = False 866 softclipped_mutation_oneOfTwoMates = False
804 softclipped_mutation_oneOfTwoSSCS = False 867 softclipped_mutation_oneOfTwoSSCS = False
805 softclipped_mutation_oneOfTwoSSCS_diffMates = False 868 softclipped_mutation_oneOfTwoSSCS_diffMates = False
806 softclipped_mutation_oneMate = False 869 softclipped_mutation_oneMate = False
809 alt4ff = 0 872 alt4ff = 0
810 alt2ff = 0 873 alt2ff = 0
811 alt3ff = 0 874 alt3ff = 0
812 trimmed = False 875 trimmed = False
813 contradictory = False 876 contradictory = False
814 print(key1, "softclipped_mutation_oneMateOneSSCS", softclipped_mutation_oneMateOneSSCS)
815 877
816 else: 878 else:
817 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): 879 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))):
818 beg1 = total1new 880 beg1 = total1new
819 total1new = 0 881 total1new = 0
841 alt3ff = 0 903 alt3ff = 0
842 alt3f = 0 904 alt3f = 0
843 trimmed = True 905 trimmed = True
844 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) 906 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
845 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) 907 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
846
847
848 #sum_highTiers = sum([tier_dict[key1][ij] for ij in tier_dict[key1].keys()[:6]])
849 908
850 # assign tiers 909 # assign tiers
851 if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & 910 if ((all(int(ij) >= 3 for ij in [total1new, total4new]) &
852 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | 911 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
853 (all(int(ij) >= 3 for ij in [total2new, total3new]) & 912 (all(int(ij) >= 3 for ij in [total2new, total3new]) &
913 all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): 972 all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
914 tier = "3.2" 973 tier = "3.2"
915 counter_tier32 += 1 974 counter_tier32 += 1
916 tier_dict[key1]["tier 3.2"] += 1 975 tier_dict[key1]["tier 3.2"] += 1
917 976
918 #elif (trimmed) and (sum_highTiers > 1):
919 # tier = "2.5"
920 # counter_tier25 += 1
921 # tier_dict[key1]["tier 2.5"] += 1
922
923 elif (trimmed): 977 elif (trimmed):
924 tier = "4" 978 tier = "4"
925 counter_tier4 += 1 979 counter_tier4 += 1
926 tier_dict[key1]["tier 4"] += 1 980 tier_dict[key1]["tier 4"] += 1
981
982 # assign tiers
983 if ((all(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) &
984 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) |
985 (all(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) &
986 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))):
987 trimmed_actual_high_tier = True
988 elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) &
989 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) &
990 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) &
991 all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
992 trimmed_actual_high_tier = True
993 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) &
994 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) &
995 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) |
996 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) &
997 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) &
998 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))):
999 trimmed_actual_high_tier = True
1000 elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) &
1001 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])):
1002 trimmed_actual_high_tier = True
1003 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) &
1004 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) &
1005 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]) &
1006 any(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim])) |
1007 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) &
1008 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) &
1009 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]) &
1010 any(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]))):
1011 trimmed_actual_high_tier = True
1012 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) &
1013 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) |
1014 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) &
1015 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))):
1016 trimmed_actual_high_tier = True
1017 else:
1018 trimmed_actual_high_tier = False
927 1019
928 elif softclipped_mutation_allMates: 1020 elif softclipped_mutation_allMates:
929 tier = "5.1" 1021 tier = "5.1"
930 counter_tier51 += 1 1022 counter_tier51 += 1
931 tier_dict[key1]["tier 5.1"] += 1 1023 tier_dict[key1]["tier 5.1"] += 1
1006 1098
1007 # tags which have identical parts: 1099 # tags which have identical parts:
1008 if min_value == 0 or dist2 == 0: 1100 if min_value == 0 or dist2 == 0:
1009 min_tags_list_zeros.append(tag) 1101 min_tags_list_zeros.append(tag)
1010 chimera_tags.append(max_tag) 1102 chimera_tags.append(max_tag)
1011 # chimeric = True
1012 # else:
1013 # chimeric = False
1014
1015 # if mate_b is False:
1016 # text = "pos {}: sample tag: {}; HD a = {}; HD b' = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, min_value, dist2, list(max_tag), chimeric)
1017 # else:
1018 # text = "pos {}: sample tag: {}; HD a' = {}; HD b = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, dist2, min_value, list(max_tag), chimeric)
1019 i += 1 1103 i += 1
1020 chimera_tags = [x for x in chimera_tags if x != []] 1104 chimera_tags = [x for x in chimera_tags if x != []]
1021 chimera_tags_new = [] 1105 chimera_tags_new = []
1022 for i in chimera_tags: 1106 for i in chimera_tags:
1023 if len(i) > 1: 1107 if len(i) > 1:
1070 # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) 1154 # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
1071 #if trimmed: 1155 #if trimmed:
1072 #if key1 not in list(change_tier_after_print.keys()): 1156 #if key1 not in list(change_tier_after_print.keys()):
1073 # change_tier_after_print[key1] = [((row, line, line2))] 1157 # change_tier_after_print[key1] = [((row, line, line2))]
1074 #else: 1158 #else:
1075 change_tier_after_print.append((row, line, line2)) 1159 change_tier_after_print.append((row, line, line2, trimmed_actual_high_tier))
1076 1160
1077 row += 3 1161 row += 3
1078 1162
1079 if chimera_correction: 1163 if chimera_correction:
1080 chimeric_dcs_high_tiers = 0 1164 chimeric_dcs_high_tiers = 0
1099 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: 1183 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0:
1100 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] 1184 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"]
1101 tier_dict[key1]["tier 4"] = 0 1185 tier_dict[key1]["tier 4"] = 0
1102 correct_tier = True 1186 correct_tier = True
1103 1187
1104 #lines = change_tier_after_print[key1]
1105 for sample in change_tier_after_print: 1188 for sample in change_tier_after_print:
1106 row_number = sample[0] 1189 row_number = sample[0]
1107 line1 = sample[1] 1190 line1 = sample[1]
1108 line2 = sample[2] 1191 line2 = sample[2]
1109 1192 actual_high_tier = sample[3]
1110 if correct_tier and list(line1)[1] == "4": 1193 current_tier = list(line1)[1]
1194
1195 if correct_tier and (current_tier == "4") and actual_high_tier:
1111 line1 = list(line1) 1196 line1 = list(line1)
1112 line1[1] = "2.5" 1197 line1[1] = "2.5"
1113 line1 = tuple(line1) 1198 line1 = tuple(line1)
1114 counter_tier25 += 1 1199 counter_tier25 += 1
1115 counter_tier4 -= 1 1200 counter_tier4 -= 1