comparison read2mut.py @ 2:9d74f30275c6 draft

planemo upload for repository https://github.com/gpovysil/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author mheinzl
date Wed, 07 Oct 2020 10:57:15 +0000
parents e5953c54cfb5
children 4fc62ab6e9e8
comparison
equal deleted inserted replaced
1:2a505d46f682 2:9d74f30275c6
21 """ 21 """
22 22
23 from __future__ import division 23 from __future__ import division
24 24
25 import argparse 25 import argparse
26 import itertools
26 import json 27 import json
27 import operator 28 import operator
28 import os 29 import os
29 import re 30 import re
30 import sys 31 import sys
31 32
32 import numpy as np 33 import numpy as np
33 import pysam 34 import pysam
34 import xlsxwriter 35 import xlsxwriter
35
36 36
37 def make_argparser(): 37 def make_argparser():
38 parser = argparse.ArgumentParser(description='Takes a tabular file with mutations, a BAM file and JSON files as input and prints stats about variants to a user specified output file.') 38 parser = argparse.ArgumentParser(description='Takes a tabular file with mutations, a BAM file and JSON files as input and prints stats about variants to a user specified output file.')
39 parser.add_argument('--mutFile', 39 parser.add_argument('--mutFile',
40 help='TABULAR file with DCS mutations.') 40 help='TABULAR file with DCS mutations.')
51 parser.add_argument('--phred', type=int, default=20, 51 parser.add_argument('--phred', type=int, default=20,
52 help='Integer threshold for Phred score. Only reads higher than this threshold are considered. Default 20.') 52 help='Integer threshold for Phred score. Only reads higher than this threshold are considered. Default 20.')
53 parser.add_argument('--trim', type=int, default=10, 53 parser.add_argument('--trim', type=int, default=10,
54 help='Integer threshold for assigning mutations at start and end of reads to lower tier. Default 10.') 54 help='Integer threshold for assigning mutations at start and end of reads to lower tier. Default 10.')
55 parser.add_argument('--chimera_correction', action="store_true", 55 parser.add_argument('--chimera_correction', action="store_true",
56 help='Add additional tier for chimeric variants and correct the variant frequencies') 56 help='Count chimeric variants and correct the variant frequencies.')
57
58 return parser 57 return parser
59 58
60 59
61 def safe_div(x, y): 60 def safe_div(x, y):
62 if y == 0: 61 if y == 0:
88 if phred_score < 0: 87 if phred_score < 0:
89 sys.exit("Error: phred is '{}', but only non-negative integers allowed".format(phred_score)) 88 sys.exit("Error: phred is '{}', but only non-negative integers allowed".format(phred_score))
90 if trim < 0: 89 if trim < 0:
91 sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thresh)) 90 sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thresh))
92 91
93 # 1. read mut file
94 with open(file1, 'r') as mut: 92 with open(file1, 'r') as mut:
95 mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str) 93 mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str)
96 94
97 # 2. load dicts 95 # load dicts
98 with open(json_file, "r") as f: 96 with open(json_file, "r") as f:
99 (tag_dict, cvrg_dict) = json.load(f) 97 (tag_dict, cvrg_dict) = json.load(f)
100 98
101 with open(sscs_json, "r") as f: 99 with open(sscs_json, "r") as f:
102 (mut_pos_dict, ref_pos_dict) = json.load(f) 100 (mut_pos_dict, ref_pos_dict) = json.load(f)
103 101
104 # 3. read bam file 102 # read bam file
105 # pysam.index(file2) 103 # pysam.index(file2)
106 bam = pysam.AlignmentFile(file2, "rb") 104 bam = pysam.AlignmentFile(file2, "rb")
107 105
108 # 4. create mut_dict 106 # 4. create mut_dict
109 mut_dict = {} 107 mut_dict = {}
113 if mut_array.shape == (1, 13): 111 if mut_array.shape == (1, 13):
114 mut_array = mut_array.reshape((1, len(mut_array))) 112 mut_array = mut_array.reshape((1, len(mut_array)))
115 113
116 for m in range(0, len(mut_array[:, 0])): 114 for m in range(0, len(mut_array[:, 0])):
117 print(str(m + 1) + " of " + str(len(mut_array[:, 0]))) 115 print(str(m + 1) + " of " + str(len(mut_array[:, 0])))
118 # for m in range(0, 5):
119 chrom = mut_array[m, 1] 116 chrom = mut_array[m, 1]
120 stop_pos = mut_array[m, 2].astype(int) 117 stop_pos = mut_array[m, 2].astype(int)
121 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) 118 chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
122 ref = mut_array[m, 9] 119 ref = mut_array[m, 9]
123 alt = mut_array[m, 10] 120 alt = mut_array[m, 10]
124 mut_dict[chrom_stop_pos] = {} 121 mut_dict[chrom_stop_pos] = {}
125 mut_read_pos_dict[chrom_stop_pos] = {} 122 mut_read_pos_dict[chrom_stop_pos] = {}
126 reads_dict[chrom_stop_pos] = {} 123 reads_dict[chrom_stop_pos] = {}
127 124
128 for pileupcolumn in bam.pileup(chrom.tostring(), stop_pos - 2, stop_pos, max_depth=1000000000): 125 for pileupcolumn in bam.pileup(chrom.tostring(), stop_pos - 2, stop_pos, max_depth=100000000):
129 if pileupcolumn.reference_pos == stop_pos - 1: 126 if pileupcolumn.reference_pos == stop_pos - 1:
130 count_alt = 0 127 count_alt = 0
131 count_ref = 0 128 count_ref = 0
132 count_indel = 0 129 count_indel = 0
133 count_n = 0 130 count_n = 0
156 else: 153 else:
157 mut_read_pos_dict[chrom_stop_pos][tag] = np.append( 154 mut_read_pos_dict[chrom_stop_pos][tag] = np.append(
158 mut_read_pos_dict[chrom_stop_pos][tag], pileupread.query_position + 1) 155 mut_read_pos_dict[chrom_stop_pos][tag], pileupread.query_position + 1)
159 reads_dict[chrom_stop_pos][tag] = np.append( 156 reads_dict[chrom_stop_pos][tag] = np.append(
160 reads_dict[chrom_stop_pos][tag], len(pileupread.alignment.query_sequence)) 157 reads_dict[chrom_stop_pos][tag], len(pileupread.alignment.query_sequence))
161
162 if nuc == alt: 158 if nuc == alt:
163 count_alt += 1 159 count_alt += 1
164 if tag not in mut_read_dict: 160 if tag not in mut_read_dict:
165 mut_read_dict[tag] = {} 161 mut_read_dict[tag] = {}
166 mut_read_dict[tag][chrom_stop_pos] = alt 162 mut_read_dict[tag][chrom_stop_pos] = alt
174 count_lowq += 1 170 count_lowq += 1
175 else: 171 else:
176 count_other += 1 172 count_other += 1
177 else: 173 else:
178 count_indel += 1 174 count_indel += 1
179
180 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq)) 175 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq))
181 176
182 for read in bam.fetch(until_eof=True): 177 for read in bam.fetch(until_eof=True):
183 if read.is_unmapped: 178 if read.is_unmapped:
184 pure_tag = read.query_name[:-5] 179 pure_tag = read.query_name[:-5]
185 nuc = "na" 180 nuc = "na"
186 for key in tag_dict[pure_tag].keys(): 181 for key in tag_dict[pure_tag].keys():
218 if len(value) < thresh: 213 if len(value) < thresh:
219 pure_tags_dict_short[key] = value 214 pure_tags_dict_short[key] = value
220 else: 215 else:
221 pure_tags_dict_short = pure_tags_dict 216 pure_tags_dict_short = pure_tags_dict
222 217
223 #whole_array = []
224 #for k in pure_tags_dict.values():
225 # whole_array.extend(k.keys())
226
227 # 7. output summary with threshold 218 # 7. output summary with threshold
228 workbook = xlsxwriter.Workbook(outfile) 219 workbook = xlsxwriter.Workbook(outfile)
229 ws1 = workbook.add_worksheet("Results") 220 ws1 = workbook.add_worksheet("Results")
230 ws2 = workbook.add_worksheet("Allele frequencies") 221 ws2 = workbook.add_worksheet("Allele frequencies")
231 ws3 = workbook.add_worksheet("Tiers") 222 ws3 = workbook.add_worksheet("Tiers")
251 counter_tier24 = 0 242 counter_tier24 = 0
252 counter_tier31 = 0 243 counter_tier31 = 0
253 counter_tier32 = 0 244 counter_tier32 = 0
254 counter_tier41 = 0 245 counter_tier41 = 0
255 counter_tier42 = 0 246 counter_tier42 = 0
256 247 #if chimera_correction:
257 if chimera_correction: 248 # counter_tier43 = 0
258 counter_tier43 = 0
259
260 counter_tier5 = 0 249 counter_tier5 = 0
261 250
262 row = 1 251 row = 1
263 tier_dict = {} 252 tier_dict = {}
253 chimera_dict = {}
264 for key1, value1 in sorted(mut_dict.items()): 254 for key1, value1 in sorted(mut_dict.items()):
265 counts_mut = 0 255 counts_mut = 0
256 chimeric_tag_list = []
257 chimeric_tag = {}
266 if key1 in pure_tags_dict_short.keys(): 258 if key1 in pure_tags_dict_short.keys():
267 i = np.where(np.array(['#'.join(str(i) for i in z) 259 i = np.where(np.array(['#'.join(str(i) for i in z)
268 for z in zip(mut_array[:, 1], mut_array[:, 2])]) == key1)[0][0] 260 for z in zip(mut_array[:, 1], mut_array[:, 2])]) == key1)[0][0]
269 ref = mut_array[i, 9] 261 ref = mut_array[i, 9]
270 alt = mut_array[i, 10] 262 alt = mut_array[i, 10]
271 dcs_median = cvrg_dict[key1][2] 263 dcs_median = cvrg_dict[key1][2]
272 whole_array = pure_tags_dict_short[key1].keys() 264 whole_array = pure_tags_dict_short[key1].keys()
273 265
274 tier_dict[key1] = {} 266 tier_dict[key1] = {}
275 if chimera_correction: 267 values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 5", 0)]
276 values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 4.3", 0), ("tier 5", 0)]
277 else:
278 values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 5", 0)]
279
280 for k, v in values_tier_dict: 268 for k, v in values_tier_dict:
281 tier_dict[key1][k] = v 269 tier_dict[key1][k] = v
282 270
283 used_keys = [] 271 used_keys = []
284 if 'ab' in mut_pos_dict[key1].keys(): 272 if 'ab' in mut_pos_dict[key1].keys():
504 beg1 = beg4 = beg2 = beg3 = 0 492 beg1 = beg4 = beg2 = beg3 = 0
505 493
506 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) 494 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
507 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) 495 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
508 496
509 chrom, pos = re.split(r'\#', key1) 497
510 var_id = '-'.join([chrom, pos, ref, alt])
511 sample_tag = key2[:-5]
512 array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time
513 # exclude identical tag from array2, to prevent comparison to itself
514 same_tag = np.where(array2 == sample_tag)
515 index_array2 = np.arange(0, len(array2), 1)
516 index_withoutSame = np.delete(index_array2, same_tag) # delete identical tag from the data
517 array2 = array2[index_withoutSame]
518 if len(array2) != 0: # only perform chimera analysis if there is more than 1 variant
519 array1_half = sample_tag[0:int(len(sample_tag) / 2)] # mate1 part1
520 array1_half2 = sample_tag[int(len(sample_tag) / 2):int(len(sample_tag))] # mate1 part 2
521 array2_half = np.array([ii[0:int(len(ii) / 2)] for ii in array2]) # mate2 part1
522 array2_half2 = np.array([ii[int(len(ii) / 2):int(len(ii))] for ii in array2]) # mate2 part2
523
524 min_tags_list_zeros = []
525 chimera_tags = []
526 for mate_b in [False, True]:
527 i = 0 # counter, only used to see how many HDs of tags were already calculated
528 if mate_b is False: # HD calculation for all a's
529 half1_mate1 = array1_half
530 half2_mate1 = array1_half2
531 half1_mate2 = array2_half
532 half2_mate2 = array2_half2
533 elif mate_b is True: # HD calculation for all b's
534 half1_mate1 = array1_half2
535 half2_mate1 = array1_half
536 half1_mate2 = array2_half2
537 half2_mate2 = array2_half
538 # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
539 dist = np.array([sum(map(operator.ne, half1_mate1, c)) for c in half1_mate2])
540 min_index = np.where(dist == dist.min()) # get index of min HD
541 # get all "b's" of the tag or all "a's" of the tag with minimum HD
542 min_tag_half2 = half2_mate2[min_index]
543 min_tag_array2 = array2[min_index] # get whole tag with min HD
544 min_value = dist.min()
545 # calculate HD of "b" to all "b's" or "a" to all "a's"
546 dist_second_half = np.array([sum(map(operator.ne, half2_mate1, e))
547 for e in min_tag_half2])
548
549 dist2 = dist_second_half.max()
550 max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD
551 max_tag = min_tag_array2[max_index]
552
553 # tags which have identical parts:
554 if min_value == 0 or dist2 == 0:
555 min_tags_list_zeros.append(tag)
556 chimera_tags.append(max_tag)
557 # chimeric = True
558 # else:
559 # chimeric = False
560
561 # if mate_b is False:
562 # text = "pos {}: sample tag: {}; HD a = {}; HD b' = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, min_value, dist2, list(max_tag), chimeric)
563 # else:
564 # text = "pos {}: sample tag: {}; HD a' = {}; HD b = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, dist2, min_value, list(max_tag), chimeric)
565 i += 1
566 chimera_tags = [x for x in chimera_tags if x != []]
567 chimera_tags_new = []
568 for i in chimera_tags:
569 if len(i) > 1:
570 for t in i:
571 chimera_tags_new.append(t)
572 else:
573 chimera_tags_new.extend(i)
574 chimera_tags_new = np.asarray(chimera_tags_new)
575 chimera = ", ".join(chimera_tags_new)
576 else:
577 chimera_tags_new = []
578 chimera = ""
579
580 trimmed = False 498 trimmed = False
581 contradictory = False 499 contradictory = False
582 chimeric_dcs = False 500
583 501 if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant
584 if chimera_correction and len(chimera_tags_new) > 0: # chimeras
585 alt1ff = 0
586 alt4ff = 0
587 alt2ff = 0
588 alt3ff = 0
589 chimeric_dcs = True
590 trimmed = False
591 contradictory = False
592 elif ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant
593 all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | 502 all(float(ij) == 0. for ij in [alt2ff, alt3ff])) |
594 (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & 503 (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) &
595 all(float(ij) == 0. for ij in [alt1ff, alt4ff]))): 504 all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
596 alt1ff = 0 505 alt1ff = 0
597 alt4ff = 0 506 alt4ff = 0
598 alt2ff = 0 507 alt2ff = 0
599 alt3ff = 0 508 alt3ff = 0
600 trimmed = False 509 trimmed = False
601 contradictory = True 510 contradictory = True
602 chimeric_dcs = False
603 else: 511 else:
604 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): 512 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))):
605 beg1 = total1new 513 beg1 = total1new
606 total1new = 0 514 total1new = 0
607 alt1ff = 0 515 alt1ff = 0
707 elif (contradictory): 615 elif (contradictory):
708 tier = "4.2" 616 tier = "4.2"
709 counter_tier42 += 1 617 counter_tier42 += 1
710 tier_dict[key1]["tier 4.2"] += 1 618 tier_dict[key1]["tier 4.2"] += 1
711 619
712 elif chimera_correction and chimeric_dcs:
713 tier = "4.3"
714 counter_tier43 += 1
715 tier_dict[key1]["tier 4.3"] += 1
716
717 else: 620 else:
718 tier = "5" 621 tier = "5"
719 counter_tier5 += 1 622 counter_tier5 += 1
720 tier_dict[key1]["tier 5"] += 1 623 tier_dict[key1]["tier 5"] += 1
624
625 chrom, pos = re.split(r'\#', key1)
626 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
627 sample_tag = key2[:-5]
628 array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time
629 # exclude identical tag from array2, to prevent comparison to itself
630 same_tag = np.where(array2 == sample_tag)
631 index_array2 = np.arange(0, len(array2), 1)
632 index_withoutSame = np.delete(index_array2, same_tag) # delete identical tag from the data
633 array2 = array2[index_withoutSame]
634 if len(array2) != 0: # only perform chimera analysis if there is more than 1 variant
635 array1_half = sample_tag[0:int(len(sample_tag) / 2)] # mate1 part1
636 array1_half2 = sample_tag[int(len(sample_tag) / 2):int(len(sample_tag))] # mate1 part 2
637 array2_half = np.array([ii[0:int(len(ii) / 2)] for ii in array2]) # mate2 part1
638 array2_half2 = np.array([ii[int(len(ii) / 2):int(len(ii))] for ii in array2]) # mate2 part2
639
640 min_tags_list_zeros = []
641 chimera_tags = []
642 for mate_b in [False, True]:
643 i = 0 # counter, only used to see how many HDs of tags were already calculated
644 if mate_b is False: # HD calculation for all a's
645 half1_mate1 = array1_half
646 half2_mate1 = array1_half2
647 half1_mate2 = array2_half
648 half2_mate2 = array2_half2
649 elif mate_b is True: # HD calculation for all b's
650 half1_mate1 = array1_half2
651 half2_mate1 = array1_half
652 half1_mate2 = array2_half2
653 half2_mate2 = array2_half
654 # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
655 dist = np.array([sum(itertools.imap(operator.ne, half1_mate1, c)) for c in half1_mate2])
656 min_index = np.where(dist == dist.min()) # get index of min HD
657 # get all "b's" of the tag or all "a's" of the tag with minimum HD
658 min_tag_half2 = half2_mate2[min_index]
659 min_tag_array2 = array2[min_index] # get whole tag with min HD
660 min_value = dist.min()
661 # calculate HD of "b" to all "b's" or "a" to all "a's"
662 dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e))
663 for e in min_tag_half2])
664
665 dist2 = dist_second_half.max()
666 max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD
667 max_tag = min_tag_array2[max_index]
668
669 # tags which have identical parts:
670 if min_value == 0 or dist2 == 0:
671 min_tags_list_zeros.append(tag)
672 chimera_tags.append(max_tag)
673 # chimeric = True
674 # else:
675 # chimeric = False
676
677 # if mate_b is False:
678 # text = "pos {}: sample tag: {}; HD a = {}; HD b' = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, min_value, dist2, list(max_tag), chimeric)
679 # else:
680 # text = "pos {}: sample tag: {}; HD a' = {}; HD b = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, dist2, min_value, list(max_tag), chimeric)
681 i += 1
682 chimera_tags = [x for x in chimera_tags if x != []]
683 chimera_tags_new = []
684 for i in chimera_tags:
685 if len(i) > 1:
686 for t in i:
687 chimera_tags_new.append(t)
688 else:
689 chimera_tags_new.extend(i)
690 chimera = ", ".join(chimera_tags_new)
691 else:
692 chimera_tags_new = []
693 chimera = ""
694
695 if len(chimera_tags_new) > 0:
696 chimera_tags_new.append(sample_tag)
697 key_chimera = ",".join(sorted(chimera_tags_new))
698 if key_chimera in chimeric_tag.keys():
699 chimeric_tag[key_chimera].append(float(tier))
700 else:
701 chimeric_tag[key_chimera] = [float(tier)]
721 702
722 if (read_pos1 == -1): 703 if (read_pos1 == -1):
723 read_pos1 = read_len_median1 = None 704 read_pos1 = read_len_median1 = None
724 if (read_pos4 == -1): 705 if (read_pos4 == -1):
725 read_pos4 = read_len_median4 = None 706 read_pos4 = read_len_median4 = None
747 'criteria': '=$B${}>="3"'.format(row + 1), 728 'criteria': '=$B${}>="3"'.format(row + 1),
748 'format': format2, 729 'format': format2,
749 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) 730 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
750 731
751 row += 3 732 row += 3
752 733 if chimera_correction:
734 chimeric_dcs_high_tiers = 0
735 chimeric_dcs = 0
736 for keys_chimera in chimeric_tag.keys():
737 tiers = chimeric_tag[keys_chimera]
738 chimeric_dcs += len(tiers) - 1
739 high_tiers = sum(1 for t in tiers if t < 3.)
740 if high_tiers == len(tiers):
741 chimeric_dcs_high_tiers += high_tiers - 1
742 else:
743 chimeric_dcs_high_tiers += high_tiers
744 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers)
745 # sheet 2
753 if chimera_correction: 746 if chimera_correction:
754 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)', 747 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'chimeras in AC alt (all tiers)', 'chimera-corrected AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'chimeras in AC alt (tiers 1.1-2.4)', 'chimera-corrected AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
755 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 748 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
756 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 4.3', 'tier 5', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', 749 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
757 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-4.3', 'AF 1.1-5') 750 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5')
758 else: 751 else:
759 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)', 752 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
760 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 753 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
761 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', 754 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
762 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5') 755 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5')
763 756
764 ws2.write_row(0, 0, header_line2) 757 ws2.write_row(0, 0, header_line2)
773 chrom, pos = re.split(r'\#', key1) 766 chrom, pos = re.split(r'\#', key1)
774 ref_count = cvrg_dict[key1][0] 767 ref_count = cvrg_dict[key1][0]
775 alt_count = cvrg_dict[key1][1] 768 alt_count = cvrg_dict[key1][1]
776 cvrg = ref_count + alt_count 769 cvrg = ref_count + alt_count
777 770
778 var_id = '-'.join([chrom, pos, ref, alt]) 771 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
779 lst = [var_id, cvrg] 772 lst = [var_id, cvrg]
780 used_tiers = [] 773 used_tiers = []
781 cum_af = [] 774 cum_af = []
782 for key2, value2 in sorted(value1.items()): 775 for key2, value2 in sorted(value1.items()):
783 # calculate cummulative AF 776 # calculate cummulative AF
785 if len(used_tiers) > 1: 778 if len(used_tiers) > 1:
786 cum = safe_div(sum(used_tiers), cvrg) 779 cum = safe_div(sum(used_tiers), cvrg)
787 cum_af.append(cum) 780 cum_af.append(cum)
788 lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)]) 781 lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)])
789 if chimera_correction: 782 if chimera_correction:
790 lst.extend([(cvrg - sum(used_tiers[-6:])), sum(used_tiers[0:6]), safe_div(sum(used_tiers[0:6]), (cvrg - sum(used_tiers[-6:]))), alt_count, safe_div(alt_count, cvrg)]) 783 chimeras_all = chimera_dict[key1][0]
791 else: 784 new_alt = sum(used_tiers) - chimeras_all
792 lst.extend([(cvrg - sum(used_tiers[-5:])), sum(used_tiers[0:6]), safe_div(sum(used_tiers[0:6]), (cvrg - sum(used_tiers[-5:]))), alt_count, safe_div(alt_count, cvrg)]) 785 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers)))
786 if fraction_chimeras is None:
787 fraction_chimeras = 0.
788 new_cvrg = cvrg * (1. - fraction_chimeras)
789 lst.extend([chimeras_all, safe_div(new_alt, new_cvrg)])
790 lst.extend([(cvrg - sum(used_tiers[-5:])), sum(used_tiers[0:6]), safe_div(sum(used_tiers[0:6]), (cvrg - sum(used_tiers[-5:])))])
791 if chimera_correction:
792 chimeras_all = chimera_dict[key1][1]
793 new_alt = sum(used_tiers[0:6]) - chimeras_all
794 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:6])))
795 if fraction_chimeras is None:
796 fraction_chimeras = 0.
797 new_cvrg = (cvrg - sum(used_tiers[-5:])) * (1. - fraction_chimeras)
798 lst.extend([chimeras_all, safe_div(new_alt, new_cvrg)])
799 lst.extend([alt_count, safe_div(alt_count, cvrg)])
793 lst.extend(used_tiers) 800 lst.extend(used_tiers)
794 lst.extend(cum_af) 801 lst.extend(cum_af)
795 lst = tuple(lst) 802 lst = tuple(lst)
796 ws2.write_row(row + 1, 0, lst) 803 ws2.write_row(row + 1, 0, lst)
797 ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format1, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)})
798 ws2.conditional_format('L{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format3, 'multi_range': 'L{}:O{} L1:O1'.format(row + 2, row + 2)})
799 if chimera_correction: 804 if chimera_correction:
800 ws2.conditional_format('P{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format2, 'multi_range': 'P{}:U{} P1:U1'.format(row + 2, row + 2)}) 805 ws2.conditional_format('N{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$N$1="tier 1.1"', 'format': format1, 'multi_range': 'N{}:O{} N1:O1'.format(row + 2, row + 2)})
806 ws2.conditional_format('P{}:S{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 2.1"', 'format': format3, 'multi_range': 'P{}:S{} P1:S1'.format(row + 2, row + 2)})
807 ws2.conditional_format('T{}:X{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$T$1="tier 3.1"', 'format': format2, 'multi_range': 'T{}:X{} T1:X1'.format(row + 2, row + 2)})
801 else: 808 else:
809 ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format1, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)})
810 ws2.conditional_format('L{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format3, 'multi_range': 'L{}:O{} L1:O1'.format(row + 2, row + 2)})
802 ws2.conditional_format('P{}:T{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format2, 'multi_range': 'P{}:T{} P1:T1'.format(row + 2, row + 2)}) 811 ws2.conditional_format('P{}:T{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format2, 'multi_range': 'P{}:T{} P1:T1'.format(row + 2, row + 2)})
803
804 row += 1 812 row += 1
805 813
806 # sheet 3 814 # sheet 3
807 if chimera_correction: 815 sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21),
808 sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21),
809 ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24),
810 ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4.1", counter_tier41),
811 ("tier 4.2", counter_tier42), ("tier 4.3", counter_tier43), ("tier 5", counter_tier5)]
812 else:
813 sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21),
814 ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24), 816 ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24),
815 ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4.1", counter_tier41), 817 ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4.1", counter_tier41),
816 ("tier 4.2", counter_tier42), ("tier 5", counter_tier5)] 818 ("tier 4.2", counter_tier42), ("tier 5", counter_tier5)]
817
818 819
819 header = ("tier", "count") 820 header = ("tier", "count")
820 ws3.write_row(0, 0, header) 821 ws3.write_row(0, 0, header)
821 822
822 for i in range(len(sheet3)): 823 for i in range(len(sheet3)):
831 'format': format3}) 832 'format': format3})
832 ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2), 833 ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2),
833 {'type': 'formula', 834 {'type': 'formula',
834 'criteria': '=$A${}>="3"'.format(i + 2), 835 'criteria': '=$A${}>="3"'.format(i + 2),
835 'format': format2}) 836 'format': format2})
836 if chimera_correction: 837
837 description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""), ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"), ("Tier 4.3", "variants that are chimeric"), ("Tier 5", "remaining variants")] 838 description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""), ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"), ("Tier 5", "remaining variants")]
838 else:
839 description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""), ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"), ("Tier 5", "remaining variants")]
840
841 examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289", 839 examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289",
842 "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", 840 "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
843 "4081", "4098", "5", "10", "", ""), 841 "4081", "4098", "5", "10", "", ""),
844 ("", "", "AAAAAGATGCCGACTACCTT", "ab2.ba1", None, None, None, None, 842 ("", "", "AAAAAGATGCCGACTACCTT", "ab2.ba1", None, None, None, None,
845 "289", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, 843 "289", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None,
900 [("Chr5:5-20000-13963-T-C", "4.2", "TTTTTAAGAATAACCCACAC", "ab1.ba2", "38", "38", "240", "283", "263", 898 [("Chr5:5-20000-13963-T-C", "4.2", "TTTTTAAGAATAACCCACAC", "ab1.ba2", "38", "38", "240", "283", "263",
901 "110", "54", "110", "54", "0", "0", "110", "54", "0", "0", "1", "1", "0", "0", "0", 899 "110", "54", "110", "54", "0", "0", "110", "54", "0", "0", "1", "1", "0", "0", "0",
902 "0", "0", "0", "1", "1", "5348", "5350", "", ""), 900 "0", "0", "0", "1", "1", "5348", "5350", "", ""),
903 ("", "", "TTTTTAAGAATAACCCACAC", "ab2.ba1", "100", "112", "140", "145", "263", 901 ("", "", "TTTTTAAGAATAACCCACAC", "ab2.ba1", "100", "112", "140", "145", "263",
904 "7", "12", "7", "12", "7", "12", "0", "0", "1", "1", "0", 902 "7", "12", "7", "12", "7", "12", "0", "0", "1", "1", "0",
905 "0", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]] 903 "0", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
906 904 [("Chr5:5-20000-13983-G-C", "5", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
907 if chimera_correction:
908 examples_tiers.extend([[("Chr5:5-20000-13963-T-C", "4.3", "TTTTTAAGAAGCTATTTTTT", "ab1.ba2", "72", "72", "217", "288", "289",
909 "1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "4081",
910 "4098", "5", "10", "", "TTTTTAAGAATAACCCACAC"),
911 ("", "", "TTTTTAAGAAGCTATTTTTT", "ab2.ba1", "153", "164", "217", "260", "289",
912 "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "1", "0", "0", "0", "0", "4081",
913 "4098", "5", "10", "", "TTTTTAAGAATAACCCACAC")],
914 [("Chr5:5-20000-13983-G-C", "5", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
915 "0", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "0", 905 "0", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "0",
916 "0", "1", "1", "5348", "5350", "", ""), 906 "0", "1", "1", "5348", "5350", "", ""),
917 ("", "", "ATGTTGTGAATAACCCACAC", "ab2.ba1", None, None, None, None, 907 ("", "", "ATGTTGTGAATAACCCACAC", "ab2.ba1", None, None, None, None,
918 "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0", 908 "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
919 "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]]) 909 "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]]
920 else:
921 examples_tiers.extend([
922 [("Chr5:5-20000-13983-G-C", "5", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
923 "0", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "0",
924 "0", "1", "1", "5348", "5350", "", ""),
925 ("", "", "ATGTTGTGAATAACCCACAC", "ab2.ba1", None, None, None, None,
926 "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
927 "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]])
928 910
929 start_row = 15 911 start_row = 15
930 ws3.write(start_row, 0, "Description of tiers with examples") 912 ws3.write(start_row, 0, "Description of tiers with examples")
931 ws3.write_row(start_row + 1, 0, header_line) 913 ws3.write_row(start_row + 1, 0, header_line)
932 row = 0 914 row = 0