comparison read2mut.py @ 43:d21960b45a6b draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author mheinzl
date Tue, 02 Mar 2021 15:32:41 +0000
parents da224c392a54
children 29226ceba7cd
comparison
equal deleted inserted replaced
42:da224c392a54 43:d21960b45a6b
8 Looks for reads with mutation at known 8 Looks for reads with mutation at known
9 positions and calculates frequencies and stats. 9 positions and calculates frequencies and stats.
10 10
11 ======= ========== ================= ================================ 11 ======= ========== ================= ================================
12 Version Date Author Description 12 Version Date Author Description
13 2.0.0 2020-10-30 Gundula Povysil - 13 0.2.1 2019-10-27 Gundula Povysil -
14 ======= ========== ================= ================================ 14 ======= ========== ================= ================================
15 15
16 16
17 USAGE: python read2mut.py --mutFile DCS_Mutations.tabular --bamFile Interesting_Reads.trim.bam 17 USAGE: python read2mut.py --mutFile DCS_Mutations.tabular --bamFile Interesting_Reads.trim.bam
18 --inputJson tag_count_dict.json --sscsJson SSCS_counts.json 18 --inputJson tag_count_dict.json --sscsJson SSCS_counts.json
19 --outputFile mutant_reads_summary_short_trim.xlsx --thresh 10 --phred 20 --trim5 10 --trim3 10 --chimera_correction 19 --outputFile mutant_reads_summary_short_trim.xlsx --thresh 10 --phred 20 --trim 10 --chimera_correction
20 20
21 """ 21 """
22 22
23 from __future__ import division 23 from __future__ import division
24 24
25 import argparse 25 import argparse
26 import csv 26 import itertools
27 import json 27 import json
28 import operator 28 import operator
29 import os 29 import os
30 import re 30 import re
31 import sys 31 import sys
32
33 32
34 import numpy as np 33 import numpy as np
35 import pysam 34 import pysam
36 import xlsxwriter 35 import xlsxwriter
37 from cyvcf2 import VCF 36 from cyvcf2 import VCF
47 help='JSON file with data collected by mut2read.py.') 46 help='JSON file with data collected by mut2read.py.')
48 parser.add_argument('--sscsJson', 47 parser.add_argument('--sscsJson',
49 help='JSON file with SSCS counts collected by mut2sscs.py.') 48 help='JSON file with SSCS counts collected by mut2sscs.py.')
50 parser.add_argument('--outputFile', 49 parser.add_argument('--outputFile',
51 help='Output xlsx file with summary of mutations.') 50 help='Output xlsx file with summary of mutations.')
52 parser.add_argument('--outputFile_csv',
53 help='Output csv file with summary of mutations.')
54 parser.add_argument('--outputFile2', 51 parser.add_argument('--outputFile2',
55 help='Output xlsx file with allele frequencies of mutations.') 52 help='Output xlsx file with allele frequencies of mutations.')
56 parser.add_argument('--outputFile3', 53 parser.add_argument('--outputFile3',
57 help='Output xlsx file with examples of the tier classification.') 54 help='Output xlsx file with examples of the tier classification.')
58 parser.add_argument('--thresh', type=int, default=0, 55 parser.add_argument('--thresh', type=int, default=0,
59 help='Integer threshold for displaying mutations. Only mutations occuring less than thresh times are displayed. Default of 0 displays all.') 56 help='Integer threshold for displaying mutations. Only mutations occuring less than thresh times are displayed. Default of 0 displays all.')
60 parser.add_argument('--phred', type=int, default=20, 57 parser.add_argument('--phred', type=int, default=20,
61 help='Integer threshold for Phred score. Only reads higher than this threshold are considered. Default 20.') 58 help='Integer threshold for Phred score. Only reads higher than this threshold are considered. Default 20.')
62 parser.add_argument('--trim5', type=int, default=10, 59 parser.add_argument('--trim', type=int, default=10,
63 help='Integer threshold for assigning mutations at the beginning of the reads to lower tier. Default 10.') 60 help='Integer threshold for assigning mutations at start and end of reads to lower tier. Default 10.')
64 parser.add_argument('--trim3', type=int, default=10,
65 help='Integer threshold for assigning mutations at the end of the reads to lower tier. Default 10.')
66 parser.add_argument('--chimera_correction', action="store_true", 61 parser.add_argument('--chimera_correction', action="store_true",
67 help='Count chimeric variants and correct the variant frequencies') 62 help='Count chimeric variants and correct the variant frequencies')
63 parser.add_argument('--softclipping_dist', type=int, default=15,
64 help='Count mutation as an artifact if mutation lies within this parameter away from the softclipping part of the read.')
65 parser.add_argument('--reads_threshold', type=float, default=1.0,
66 help='Float number which specifies the minimum percentage of softclipped reads in a family to be considered in the softclipping tiers. Default: 1.0, means all reads of a family have to be softclipped.')
68 return parser 67 return parser
69 68
70 69
71 def safe_div(x, y): 70 def safe_div(x, y):
72 if y == 0: 71 if y == 0:
82 json_file = args.inputJson 81 json_file = args.inputJson
83 sscs_json = args.sscsJson 82 sscs_json = args.sscsJson
84 outfile = args.outputFile 83 outfile = args.outputFile
85 outfile2 = args.outputFile2 84 outfile2 = args.outputFile2
86 outfile3 = args.outputFile3 85 outfile3 = args.outputFile3
87 outputFile_csv = args.outputFile_csv
88 thresh = args.thresh 86 thresh = args.thresh
89 phred_score = args.phred 87 phred_score = args.phred
90 trim5 = args.trim5 88 trim = args.trim
91 trim3 = args.trim3
92 chimera_correction = args.chimera_correction 89 chimera_correction = args.chimera_correction
90 thr = args.softclipping_dist
91 threshold_reads = args.reads_threshold
93 92
94 if os.path.isfile(file1) is False: 93 if os.path.isfile(file1) is False:
95 sys.exit("Error: Could not find '{}'".format(file1)) 94 sys.exit("Error: Could not find '{}'".format(file1))
96 if os.path.isfile(file2) is False: 95 if os.path.isfile(file2) is False:
97 sys.exit("Error: Could not find '{}'".format(file2)) 96 sys.exit("Error: Could not find '{}'".format(file2))
99 sys.exit("Error: Could not find '{}'".format(json_file)) 98 sys.exit("Error: Could not find '{}'".format(json_file))
100 if thresh < 0: 99 if thresh < 0:
101 sys.exit("Error: thresh is '{}', but only non-negative integers allowed".format(thresh)) 100 sys.exit("Error: thresh is '{}', but only non-negative integers allowed".format(thresh))
102 if phred_score < 0: 101 if phred_score < 0:
103 sys.exit("Error: phred is '{}', but only non-negative integers allowed".format(phred_score)) 102 sys.exit("Error: phred is '{}', but only non-negative integers allowed".format(phred_score))
104 if trim5 < 0: 103 if trim < 0:
105 sys.exit("Error: trim5 is '{}', but only non-negative integers allowed".format(trim5)) 104 sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thresh))
106 if trim3 < 0: 105 if thr <= 0:
107 sys.exit("Error: trim3 is '{}', but only non-negative integers allowed".format(trim3)) 106 sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thr))
108 107
109 # load dicts 108 # load dicts
110 with open(json_file, "r") as f: 109 with open(json_file, "r") as f:
111 (tag_dict, cvrg_dict) = json.load(f) 110 (tag_dict, cvrg_dict) = json.load(f)
112 111
113 with open(sscs_json, "r") as f: 112 with open(sscs_json, "r") as f:
114 (mut_pos_dict, ref_pos_dict) = json.load(f) 113 (mut_pos_dict, ref_pos_dict) = json.load(f)
115 114
116 # read bam file 115 # read bam file
116 # pysam.index(file2)
117 bam = pysam.AlignmentFile(file2, "rb") 117 bam = pysam.AlignmentFile(file2, "rb")
118 118
119 # create mut_dict 119 # create mut_dict
120 mut_dict = {} 120 mut_dict = {}
121 mut_read_pos_dict = {} 121 mut_read_pos_dict = {}
122 mut_read_dict = {} 122 mut_read_dict = {}
123 reads_dict = {} 123 reads_dict = {}
124 mut_read_cigar_dict = {}
124 i = 0 125 i = 0
125 mut_array = [] 126 mut_array = []
126 127
127 for variant in VCF(file1): 128 for count, variant in enumerate(VCF(file1)):
129 #if count == 2000:
130 # break
128 chrom = variant.CHROM 131 chrom = variant.CHROM
129 stop_pos = variant.start 132 stop_pos = variant.start
130 #chrom_stop_pos = str(chrom) + "#" + str(stop_pos) 133 #chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
131 ref = variant.REF 134 ref = variant.REF
132 alt = variant.ALT[0] 135 if len(variant.ALT) == 0:
136 continue
137 else:
138 alt = variant.ALT[0]
133 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt 139 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
134 140
135 if len(ref) == len(alt): 141 if len(ref) == len(alt):
136 mut_array.append([chrom, stop_pos, ref, alt]) 142 mut_array.append([chrom, stop_pos, ref, alt])
137 i += 1 143 i += 1
138 mut_dict[chrom_stop_pos] = {} 144 mut_dict[chrom_stop_pos] = {}
139 mut_read_pos_dict[chrom_stop_pos] = {} 145 mut_read_pos_dict[chrom_stop_pos] = {}
140 reads_dict[chrom_stop_pos] = {} 146 reads_dict[chrom_stop_pos] = {}
147 mut_read_cigar_dict[chrom_stop_pos] = {}
141 148
142 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): 149 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
143 if pileupcolumn.reference_pos == stop_pos: 150 if pileupcolumn.reference_pos == stop_pos:
144 count_alt = 0 151 count_alt = 0
145 count_ref = 0 152 count_ref = 0
146 count_indel = 0 153 count_indel = 0
147 count_n = 0 154 count_n = 0
148 count_other = 0 155 count_other = 0
149 count_lowq = 0 156 count_lowq = 0
150 n = 0 157 n = 0
151 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), 158 #print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
152 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) 159 # "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
153 for pileupread in pileupcolumn.pileups: 160 for pileupread in pileupcolumn.pileups:
154 n += 1 161 n += 1
155 if not pileupread.is_del and not pileupread.is_refskip: 162 if not pileupread.is_del and not pileupread.is_refskip:
156 tag = pileupread.alignment.query_name 163 tag = pileupread.alignment.query_name
157 nuc = pileupread.alignment.query_sequence[pileupread.query_position] 164 nuc = pileupread.alignment.query_sequence[pileupread.query_position]
163 if nuc in mut_dict[chrom_stop_pos][tag]: 170 if nuc in mut_dict[chrom_stop_pos][tag]:
164 mut_dict[chrom_stop_pos][tag][nuc] += 1 171 mut_dict[chrom_stop_pos][tag][nuc] += 1
165 else: 172 else:
166 mut_dict[chrom_stop_pos][tag][nuc] = 1 173 mut_dict[chrom_stop_pos][tag][nuc] = 1
167 if tag not in mut_read_pos_dict[chrom_stop_pos]: 174 if tag not in mut_read_pos_dict[chrom_stop_pos]:
168 mut_read_pos_dict[chrom_stop_pos][tag] = np.array(pileupread.query_position) + 1 175 mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1]
169 reads_dict[chrom_stop_pos][tag] = len(pileupread.alignment.query_sequence) 176 reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)]
177 mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring]
178
179 #alignedRefPositions = pileupread.get_reference_positions()[0]
170 else: 180 else:
171 mut_read_pos_dict[chrom_stop_pos][tag] = np.append( 181 mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1)
172 mut_read_pos_dict[chrom_stop_pos][tag], pileupread.query_position + 1) 182 reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence))
173 reads_dict[chrom_stop_pos][tag] = np.append( 183 mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring)
174 reads_dict[chrom_stop_pos][tag], len(pileupread.alignment.query_sequence))
175
176 if nuc == alt: 184 if nuc == alt:
177 count_alt += 1 185 count_alt += 1
178 if tag not in mut_read_dict: 186 if tag not in mut_read_dict:
179 mut_read_dict[tag] = {} 187 mut_read_dict[tag] = {}
180 mut_read_dict[tag][chrom_stop_pos] = (alt, ref) 188 mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
189 else: 197 else:
190 count_other += 1 198 count_other += 1
191 else: 199 else:
192 count_indel += 1 200 count_indel += 1
193 201
194 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq)) 202 #print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq))
195 else: 203 #else:
196 print("indels are currently not evaluated") 204 # print("indels are currently not evaluated")
197
198 mut_array = np.array(mut_array) 205 mut_array = np.array(mut_array)
199 for read in bam.fetch(until_eof=True): 206 for read in bam.fetch(until_eof=True):
200 if read.is_unmapped: 207 if read.is_unmapped:
201 pure_tag = read.query_name[:-5] 208 pure_tag = read.query_name[:-5]
202 nuc = "na" 209 nuc = "na"
212 bam.close() 219 bam.close()
213 220
214 # create pure_tags_dict 221 # create pure_tags_dict
215 pure_tags_dict = {} 222 pure_tags_dict = {}
216 for key1, value1 in sorted(mut_dict.items()): 223 for key1, value1 in sorted(mut_dict.items()):
224 #if len(np.where(np.array(['#'.join(str(i) for i in z)
225 # for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0]) == 0:
226 # continue
227
217 i = np.where(np.array(['#'.join(str(i) for i in z) 228 i = np.where(np.array(['#'.join(str(i) for i in z)
218 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] 229 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
219 ref = mut_array[i, 2] 230 ref = mut_array[i, 2]
220 alt = mut_array[i, 3] 231 alt = mut_array[i, 3]
221 pure_tags_dict[key1] = {} 232 pure_tags_dict[key1] = {}
235 if len(value) < thresh: 246 if len(value) < thresh:
236 pure_tags_dict_short[key] = value 247 pure_tags_dict_short[key] = value
237 else: 248 else:
238 pure_tags_dict_short = pure_tags_dict 249 pure_tags_dict_short = pure_tags_dict
239 250
240 csv_data = open(outputFile_csv, "wb") 251 # whole_array = []
241 csv_writer = csv.writer(csv_data, delimiter=",") 252 # for k in pure_tags_dict.values():
253 # if len(k) != 0:
254 # keys = k.keys()
255 # if len(keys) > 1:
256 # for k1 in keys:
257 # whole_array.append(k1)
258 # else:
259 # whole_array.append(keys[0])
242 260
243 # output summary with threshold 261 # output summary with threshold
244 workbook = xlsxwriter.Workbook(outfile) 262 workbook = xlsxwriter.Workbook(outfile)
245 workbook2 = xlsxwriter.Workbook(outfile2) 263 workbook2 = xlsxwriter.Workbook(outfile2)
246 workbook3 = xlsxwriter.Workbook(outfile3) 264 workbook3 = xlsxwriter.Workbook(outfile3)
266 'rel. ref.ab', 'rel. ref.ba', 'rel. alt.ab', 'rel. alt.ba', 284 'rel. ref.ab', 'rel. ref.ba', 'rel. alt.ab', 'rel. alt.ba',
267 'na.ab', 'na.ba', 'lowq.ab', 'lowq.ba', 'trim.ab', 'trim.ba', 285 'na.ab', 'na.ba', 'lowq.ab', 'lowq.ba', 'trim.ab', 'trim.ba',
268 'SSCS alt.ab', 'SSCS alt.ba', 'SSCS ref.ab', 'SSCS ref.ba', 286 'SSCS alt.ab', 'SSCS alt.ba', 'SSCS ref.ab', 'SSCS ref.ba',
269 'in phase', 'chimeric tag') 287 'in phase', 'chimeric tag')
270 ws1.write_row(0, 0, header_line) 288 ws1.write_row(0, 0, header_line)
271 csv_writer.writerow(header_line) 289
272 counter_tier11 = 0 290 counter_tier11 = 0
273 counter_tier12 = 0 291 counter_tier12 = 0
274 counter_tier21 = 0 292 counter_tier21 = 0
275 counter_tier22 = 0 293 counter_tier22 = 0
276 counter_tier23 = 0 294 counter_tier23 = 0
277 counter_tier24 = 0 295 counter_tier24 = 0
278 counter_tier31 = 0 296 counter_tier31 = 0
279 counter_tier32 = 0 297 counter_tier32 = 0
280 counter_tier41 = 0 298 counter_tier33 = 0
281 counter_tier42 = 0 299 counter_tier4 = 0
282 counter_tier5 = 0 300 # if chimera_correction:
301 # counter_tier43 = 0
302 counter_tier51 = 0
303 counter_tier52 = 0
304 counter_tier53 = 0
305 counter_tier54 = 0
306 counter_tier55 = 0
283 counter_tier6 = 0 307 counter_tier6 = 0
308 counter_tier7 = 0
309
284 row = 1 310 row = 1
285 tier_dict = {} 311 tier_dict = {}
286 chimera_dict = {} 312 chimera_dict = {}
287 for key1, value1 in sorted(mut_dict.items()): 313 for key1, value1 in sorted(mut_dict.items()):
288 counts_mut = 0 314 counts_mut = 0
315 chimeric_tag_list = []
289 chimeric_tag = {} 316 chimeric_tag = {}
290 if key1 in pure_tags_dict_short.keys(): 317 if key1 in pure_tags_dict_short.keys():
291 i = np.where(np.array(['#'.join(str(i) for i in z) 318 i = np.where(np.array(['#'.join(str(i) for i in z)
292 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] 319 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
293 ref = mut_array[i, 2] 320 ref = mut_array[i, 2]
294 alt = mut_array[i, 3] 321 alt = mut_array[i, 3]
295 dcs_median = cvrg_dict[key1][2] 322 dcs_median = cvrg_dict[key1][2]
296 whole_array = list(pure_tags_dict_short[key1].keys()) 323 whole_array = pure_tags_dict_short[key1].keys()
297 324
298 tier_dict[key1] = {} 325 tier_dict[key1] = {}
299 values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), 326 values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0),
300 ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 5", 0), ("tier 6", 0)] 327 ("tier 3.2", 0), ("tier 3.3", 0), ("tier 4", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0),
328 ("tier 6", 0), ("tier 7", 0)]
301 for k, v in values_tier_dict: 329 for k, v in values_tier_dict:
302 tier_dict[key1][k] = v 330 tier_dict[key1][k] = v
303 331
304 used_keys = [] 332 used_keys = []
305 if 'ab' in mut_pos_dict[key1].keys(): 333 if 'ab' in mut_pos_dict[key1].keys():
324 if (key2[:-5] in pure_tags_dict_short[key1].keys()) and (key2[:-5] not in used_keys) and (key1 in tag_dict[key2[:-5]].keys()): 352 if (key2[:-5] in pure_tags_dict_short[key1].keys()) and (key2[:-5] not in used_keys) and (key1 in tag_dict[key2[:-5]].keys()):
325 if key2[:-5] + '.ab.1' in mut_dict[key1].keys(): 353 if key2[:-5] + '.ab.1' in mut_dict[key1].keys():
326 total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values()) 354 total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values())
327 if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys(): 355 if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
328 na1 = mut_dict[key1][key2[:-5] + '.ab.1']['na'] 356 na1 = mut_dict[key1][key2[:-5] + '.ab.1']['na']
329 else: 357 # na1f = na1/total1
358 else:
359 # na1 = na1f = 0
330 na1 = 0 360 na1 = 0
331 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ab.1'].keys(): 361 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
332 lowq1 = mut_dict[key1][key2[:-5] + '.ab.1']['lowQ'] 362 lowq1 = mut_dict[key1][key2[:-5] + '.ab.1']['lowQ']
333 else: 363 # lowq1f = lowq1 / total1
364 else:
365 # lowq1 = lowq1f = 0
334 lowq1 = 0 366 lowq1 = 0
335 if ref in mut_dict[key1][key2[:-5] + '.ab.1'].keys(): 367 if ref in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
336 ref1 = mut_dict[key1][key2[:-5] + '.ab.1'][ref] 368 ref1 = mut_dict[key1][key2[:-5] + '.ab.1'][ref]
337 ref1f = ref1 / (total1 - na1 - lowq1) 369 ref1f = ref1 / (total1 - na1 - lowq1)
338 else: 370 else:
363 395
364 if key2[:-5] + '.ab.2' in mut_dict[key1].keys(): 396 if key2[:-5] + '.ab.2' in mut_dict[key1].keys():
365 total2 = sum(mut_dict[key1][key2[:-5] + '.ab.2'].values()) 397 total2 = sum(mut_dict[key1][key2[:-5] + '.ab.2'].values())
366 if 'na' in mut_dict[key1][key2[:-5] + '.ab.2'].keys(): 398 if 'na' in mut_dict[key1][key2[:-5] + '.ab.2'].keys():
367 na2 = mut_dict[key1][key2[:-5] + '.ab.2']['na'] 399 na2 = mut_dict[key1][key2[:-5] + '.ab.2']['na']
368 else: 400 # na2f = na2 / total2
401 else:
402 # na2 = na2f = 0
369 na2 = 0 403 na2 = 0
370 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ab.2'].keys(): 404 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ab.2'].keys():
371 lowq2 = mut_dict[key1][key2[:-5] + '.ab.2']['lowQ'] 405 lowq2 = mut_dict[key1][key2[:-5] + '.ab.2']['lowQ']
372 else: 406 # lowq2f = lowq2 / total2
407 else:
408 # lowq2 = lowq2f = 0
373 lowq2 = 0 409 lowq2 = 0
374 if ref in mut_dict[key1][key2[:-5] + '.ab.2'].keys(): 410 if ref in mut_dict[key1][key2[:-5] + '.ab.2'].keys():
375 ref2 = mut_dict[key1][key2[:-5] + '.ab.2'][ref] 411 ref2 = mut_dict[key1][key2[:-5] + '.ab.2'][ref]
376 ref2f = ref2 / (total2 - na2 - lowq2) 412 ref2f = ref2 / (total2 - na2 - lowq2)
377 else: 413 else:
402 438
403 if key2[:-5] + '.ba.1' in mut_dict[key1].keys(): 439 if key2[:-5] + '.ba.1' in mut_dict[key1].keys():
404 total3 = sum(mut_dict[key1][key2[:-5] + '.ba.1'].values()) 440 total3 = sum(mut_dict[key1][key2[:-5] + '.ba.1'].values())
405 if 'na' in mut_dict[key1][key2[:-5] + '.ba.1'].keys(): 441 if 'na' in mut_dict[key1][key2[:-5] + '.ba.1'].keys():
406 na3 = mut_dict[key1][key2[:-5] + '.ba.1']['na'] 442 na3 = mut_dict[key1][key2[:-5] + '.ba.1']['na']
407 else: 443 # na3f = na3 / total3
444 else:
445 # na3 = na3f = 0
408 na3 = 0 446 na3 = 0
409 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ba.1'].keys(): 447 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ba.1'].keys():
410 lowq3 = mut_dict[key1][key2[:-5] + '.ba.1']['lowQ'] 448 lowq3 = mut_dict[key1][key2[:-5] + '.ba.1']['lowQ']
411 else: 449 # lowq3f = lowq3 / total3
450 else:
451 # lowq3 = lowq3f = 0
412 lowq3 = 0 452 lowq3 = 0
413 if ref in mut_dict[key1][key2[:-5] + '.ba.1'].keys(): 453 if ref in mut_dict[key1][key2[:-5] + '.ba.1'].keys():
414 ref3 = mut_dict[key1][key2[:-5] + '.ba.1'][ref] 454 ref3 = mut_dict[key1][key2[:-5] + '.ba.1'][ref]
415 ref3f = ref3 / (total3 - na3 - lowq3) 455 ref3f = ref3 / (total3 - na3 - lowq3)
416 else: 456 else:
437 477
438 if key2[:-5] + '.ba.2' in mut_dict[key1].keys(): 478 if key2[:-5] + '.ba.2' in mut_dict[key1].keys():
439 total4 = sum(mut_dict[key1][key2[:-5] + '.ba.2'].values()) 479 total4 = sum(mut_dict[key1][key2[:-5] + '.ba.2'].values())
440 if 'na' in mut_dict[key1][key2[:-5] + '.ba.2'].keys(): 480 if 'na' in mut_dict[key1][key2[:-5] + '.ba.2'].keys():
441 na4 = mut_dict[key1][key2[:-5] + '.ba.2']['na'] 481 na4 = mut_dict[key1][key2[:-5] + '.ba.2']['na']
442 else: 482 # na4f = na4 / total4
483 else:
484 # na4 = na4f = 0
443 na4 = 0 485 na4 = 0
444 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ba.2'].keys(): 486 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ba.2'].keys():
445 lowq4 = mut_dict[key1][key2[:-5] + '.ba.2']['lowQ'] 487 lowq4 = mut_dict[key1][key2[:-5] + '.ba.2']['lowQ']
446 else: 488 # lowq4f = lowq4 / total4
489 else:
490 # lowq4 = lowq4f = 0
447 lowq4 = 0 491 lowq4 = 0
448 if ref in mut_dict[key1][key2[:-5] + '.ba.2'].keys(): 492 if ref in mut_dict[key1][key2[:-5] + '.ba.2'].keys():
449 ref4 = mut_dict[key1][key2[:-5] + '.ba.2'][ref] 493 ref4 = mut_dict[key1][key2[:-5] + '.ba.2'][ref]
450 ref4f = ref4 / (total4 - na4 - lowq4) 494 ref4f = ref4 / (total4 - na4 - lowq4)
451 else: 495 else:
470 total4 = total4new = na4 = lowq4 = 0 514 total4 = total4new = na4 = lowq4 = 0
471 ref4 = alt4 = ref4f = alt4f = 0 515 ref4 = alt4 = ref4f = alt4f = 0
472 516
473 read_pos1 = read_pos2 = read_pos3 = read_pos4 = -1 517 read_pos1 = read_pos2 = read_pos3 = read_pos4 = -1
474 read_len_median1 = read_len_median2 = read_len_median3 = read_len_median4 = 0 518 read_len_median1 = read_len_median2 = read_len_median3 = read_len_median4 = 0
475 519 cigars_dcs1 = cigars_dcs2 = cigars_dcs3 = cigars_dcs4 = []
520 pos_read1 = pos_read2 = pos_read3 = pos_read4 = []
521 end_read1 = end_read2 = end_read3 = end_read4 = []
476 if key2[:-5] + '.ab.1' in mut_read_pos_dict[key1].keys(): 522 if key2[:-5] + '.ab.1' in mut_read_pos_dict[key1].keys():
477 read_pos1 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ab.1']) 523 read_pos1 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.1']))
478 read_len_median1 = np.median(reads_dict[key1][key2[:-5] + '.ab.1']) 524 read_len_median1 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.1']))
525 cigars_dcs1 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.1']
526 #print(mut_read_cigar_dict[key1][key2[:-5] + '.ab.1'])
527 pos_read1 = mut_read_pos_dict[key1][key2[:-5] + '.ab.1']
528 #print(cigars_dcs1)
529 end_read1 = reads_dict[key1][key2[:-5] + '.ab.1']
479 if key2[:-5] + '.ab.2' in mut_read_pos_dict[key1].keys(): 530 if key2[:-5] + '.ab.2' in mut_read_pos_dict[key1].keys():
480 read_pos2 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ab.2']) 531 read_pos2 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.2']))
481 read_len_median2 = np.median(reads_dict[key1][key2[:-5] + '.ab.2']) 532 read_len_median2 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.2']))
533 cigars_dcs2 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.2']
534 pos_read2 = mut_read_pos_dict[key1][key2[:-5] + '.ab.2']
535 end_read2 = reads_dict[key1][key2[:-5] + '.ab.2']
482 if key2[:-5] + '.ba.1' in mut_read_pos_dict[key1].keys(): 536 if key2[:-5] + '.ba.1' in mut_read_pos_dict[key1].keys():
483 read_pos3 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ba.1']) 537 read_pos3 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.1']))
484 read_len_median3 = np.median(reads_dict[key1][key2[:-5] + '.ba.1']) 538 read_len_median3 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.1']))
539 cigars_dcs3 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.1']
540 pos_read3 = mut_read_pos_dict[key1][key2[:-5] + '.ba.1']
541 end_read3 = reads_dict[key1][key2[:-5] + '.ba.1']
485 if key2[:-5] + '.ba.2' in mut_read_pos_dict[key1].keys(): 542 if key2[:-5] + '.ba.2' in mut_read_pos_dict[key1].keys():
486 read_pos4 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ba.2']) 543 read_pos4 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.2']))
487 read_len_median4 = np.median(reads_dict[key1][key2[:-5] + '.ba.2']) 544 read_len_median4 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.2']))
545 #print(mut_read_cigar_dict[key1][key2[:-5] + '.ba.2'])
546 cigars_dcs4 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.2']
547
548 pos_read4 = mut_read_pos_dict[key1][key2[:-5] + '.ba.2']
549 #print(cigars_dcs4)
550 end_read4 = reads_dict[key1][key2[:-5] + '.ba.2']
488 551
489 used_keys.append(key2[:-5]) 552 used_keys.append(key2[:-5])
490 counts_mut += 1 553 counts_mut += 1
491 if (alt1f + alt2f + alt3f + alt4f) > 0.5: 554 if (alt1f + alt2f + alt3f + alt4f) > 0.5:
492 if total1new == 0: 555 if total1new == 0:
513 beg1 = beg4 = beg2 = beg3 = 0 576 beg1 = beg4 = beg2 = beg3 = 0
514 577
515 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) 578 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
516 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) 579 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
517 580
518 trimmed_five = False 581 trimmed = False
519 trimmed_three = False
520 contradictory = False 582 contradictory = False
521 583 softclipped_mutation_allMates = False
522 if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & all(float(ij) == 0. for ij in [alt1ff, alt4ff]))): 584 softclipped_mutation_oneOfTwoMates = False
585 softclipped_mutation_oneOfTwoSSCS = False
586 softclipped_mutation_oneMate = False
587 softclipped_mutation_oneMateOneSSCS = False
588 print()
589 print(key1, cigars_dcs1, cigars_dcs4, cigars_dcs2, cigars_dcs3)
590 dist_start_read1 = dist_start_read2 = dist_start_read3 = dist_start_read4 = []
591 dist_end_read1 = dist_end_read2 = dist_end_read3 = dist_end_read4 = []
592 ratio_dist_start1 = ratio_dist_start2 = ratio_dist_start3 = ratio_dist_start4 = False
593 ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False
594
595 # mate 1 - SSCS ab
596 softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1]
597 ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads
598
599 if any(ij is True for ij in softclipped_idx1):
600 softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1]
601 softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1]
602 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1]
603 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)]
604 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)]
605
606 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
607 if any(ij is True for ij in softclipped_both_ends_idx1):
608 print(softclipped_both_ends_idx1)
609 for nr, indx in enumerate(softclipped_both_ends_idx1):
610 if indx:
611 if dist_start_read1[nr] <= dist_end_read1[nr]:
612 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number
613 else:
614 dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number
615 ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads
616 ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads
617 print(key1, "mate1 ab", dist_start_read1, dist_end_read1, cigars_dcs1, ratio1, ratio_dist_start1, ratio_dist_end1)
618
619 # mate 1 - SSCS ba
620 softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4]
621 ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads
622 if any(ij is True for ij in softclipped_idx4):
623 softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4]
624 softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4]
625 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4]
626 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)]
627 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)]
628
629 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
630 if any(ij is True for ij in softclipped_both_ends_idx4):
631 print(softclipped_both_ends_idx4)
632 for nr, indx in enumerate(softclipped_both_ends_idx4):
633 if indx:
634 if dist_start_read4[nr] <= dist_end_read4[nr]:
635 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number
636 else:
637 dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number
638 ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads
639 ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads
640 print(key1, "mate1 ba", dist_start_read4, dist_end_read4,cigars_dcs4, ratio4, ratio_dist_start4, ratio_dist_end4)
641
642 # mate 2 - SSCS ab
643 softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2]
644 #print(sum(softclipped_idx2))
645 ratio2 = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) >= threshold_reads
646 if any(ij is True for ij in softclipped_idx2):
647 softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2]
648 softclipped_start2 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs2]
649 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2]
650 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)]
651 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)]
652
653 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
654 if any(ij is True for ij in softclipped_both_ends_idx2):
655 print(softclipped_both_ends_idx2)
656 for nr, indx in enumerate(softclipped_both_ends_idx2):
657 if indx:
658 if dist_start_read2[nr] <= dist_end_read2[nr]:
659 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number
660 else:
661 dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number
662 ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads
663 #print(ratio_dist_end2)
664 #print([True if x <= thr else False for x in ratio_dist_end2])
665 ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads
666 print(key1, "mate2 ab", dist_start_read2, dist_end_read2,cigars_dcs2, ratio2, ratio_dist_start2, ratio_dist_end2)
667
668 # mate 2 - SSCS ba
669 softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3]
670 ratio3 = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) >= threshold_reads
671 if any(ij is True for ij in softclipped_idx3):
672 softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3]
673 softclipped_start3 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs3]
674 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3]
675 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)]
676 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)]
677
678 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
679 if any(ij is True for ij in softclipped_both_ends_idx3):
680 print(softclipped_both_ends_idx3)
681 for nr, indx in enumerate(softclipped_both_ends_idx3):
682 if indx:
683 if dist_start_read3[nr] <= dist_end_read3[nr]:
684 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh
685 else:
686 dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number
687 #print([True if x <= thr else False for x in dist_start_read3])
688 ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads
689 ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads
690 print(key1, "mate2 ba", dist_start_read3, dist_end_read3,cigars_dcs3, ratio3, ratio_dist_start3, ratio_dist_end3)
691
692 if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant
693 all(float(ij) == 0. for ij in [alt2ff, alt3ff])) |
694 (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) &
695 all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
523 alt1ff = 0 696 alt1ff = 0
524 alt4ff = 0 697 alt4ff = 0
525 alt2ff = 0 698 alt2ff = 0
526 alt3ff = 0 699 alt3ff = 0
527 trimmed_five = False 700 trimmed = False
528 trimmed_three = False
529 contradictory = True 701 contradictory = True
530 else: 702 # softclipping tiers
531 if ((read_pos1 >= 0) and (read_pos1 <= trim5)): 703 # information of both mates available --> all reads for both mates and SSCS are softclipped
704 elif (ratio1 & ratio4 & ratio2 & ratio3 &
705 (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
706 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
707 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
708 softclipped_mutation_allMates = True
709 softclipped_mutation_oneOfTwoMates = False
710 softclipped_mutation_oneOfTwoSSCS = False
711 softclipped_mutation_oneMate = False
712 softclipped_mutation_oneMateOneSSCS = False
713 alt1ff = 0
714 alt4ff = 0
715 alt2ff = 0
716 alt3ff = 0
717 trimmed = False
718 contradictory = False
719 print(key1, "softclipped_mutation_allMates", softclipped_mutation_allMates)
720 # information of both mates available --> only one mate softclipped
721 elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) |
722 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) &
723 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
724 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
725 softclipped_mutation_allMates = False
726 softclipped_mutation_oneOfTwoMates = True
727 softclipped_mutation_oneOfTwoSSCS = False
728 softclipped_mutation_oneMate = False
729 softclipped_mutation_oneMateOneSSCS = False
730 alt1ff = 0
731 alt4ff = 0
732 alt2ff = 0
733 alt3ff = 0
734 trimmed = False
735 contradictory = False
736 print(key1, "softclipped_mutation_oneOfTwoMates", softclipped_mutation_oneOfTwoMates)
737 # information of both mates available --> only one mate softclipped
738 elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
739 ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
740 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
741 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
742 softclipped_mutation_allMates = False
743 softclipped_mutation_oneOfTwoMates = False
744 softclipped_mutation_oneOfTwoSSCS = True
745 softclipped_mutation_oneMate = False
746 softclipped_mutation_oneMateOneSSCS = False
747 alt1ff = 0
748 alt4ff = 0
749 alt2ff = 0
750 alt3ff = 0
751 trimmed = False
752 contradictory = False
753 print(key1, "softclipped_mutation_oneOfTwoSSCS", softclipped_mutation_oneOfTwoSSCS, [alt1ff, alt2ff, alt3ff, alt4ff])
754 # information of one mate available --> all reads of one mate are softclipped
755 elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) &
756 all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) |
757 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
758 all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available
759 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
760 #if ((((len(dist_start_read1) > 0 | len(dist_end_read1) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1))) &
761 # ((len(dist_start_read4) > 0 | len(dist_end_read4) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4)))) |
762 # (((len(dist_start_read2) > 0 | len(dist_end_read2) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2))) &
763 # ((len(dist_start_read3) > 0 | len(dist_end_read3) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3))))):
764 softclipped_mutation_allMates = False
765 softclipped_mutation_oneOfTwoMates = False
766 softclipped_mutation_oneOfTwoSSCS = False
767 softclipped_mutation_oneMate = True
768 softclipped_mutation_oneMateOneSSCS = False
769 alt1ff = 0
770 alt4ff = 0
771 alt2ff = 0
772 alt3ff = 0
773 trimmed = False
774 contradictory = False
775 print(key1, "softclipped_mutation_oneMate", softclipped_mutation_oneMate)
776 # information of one mate available --> only one SSCS is softclipped
777 elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
778 (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) |
779 (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
780 (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available
781 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
782 #if ((all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1)) |
783 # all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4))) |
784 # (all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2)) |
785 # all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3)))):
786 softclipped_mutation_allMates = False
787 softclipped_mutation_oneOfTwoMates = False
788 softclipped_mutation_oneOfTwoSSCS = False
789 softclipped_mutation_oneMate = False
790 softclipped_mutation_oneMateOneSSCS = True
791 alt1ff = 0
792 alt4ff = 0
793 alt2ff = 0
794 alt3ff = 0
795 trimmed = False
796 contradictory = False
797 print(key1, "softclipped_mutation_oneMateOneSSCS", softclipped_mutation_oneMateOneSSCS)
798
799 else:
800 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))):
532 beg1 = total1new 801 beg1 = total1new
533 total1new = 0 802 total1new = 0
534 alt1ff = 0 803 alt1ff = 0
535 alt1f = 0 804 alt1f = 0
536 trimmed_five = True 805 trimmed = True
537 806
538 if ((read_pos1 >= 0) and (abs(read_len_median1 - read_pos1) <= trim3)): 807 if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))):
539 beg1 = total1new
540 total1new = 0
541 alt1ff = 0
542 alt1f = 0
543 trimmed_three = True
544
545 if ((read_pos4 >= 0) and (read_pos4 <= trim5)):
546 beg4 = total4new 808 beg4 = total4new
547 total4new = 0 809 total4new = 0
548 alt4ff = 0 810 alt4ff = 0
549 alt4f = 0 811 alt4f = 0
550 trimmed_five = True 812 trimmed = True
551 813
552 if ((read_pos4 >= 0) and (abs(read_len_median4 - read_pos4) <= trim3)): 814 if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))):
553 beg4 = total4new
554 total4new = 0
555 alt4ff = 0
556 alt4f = 0
557 trimmed_three = True
558
559 if ((read_pos2 >= 0) and (read_pos2 <= trim5)):
560 beg2 = total2new 815 beg2 = total2new
561 total2new = 0 816 total2new = 0
562 alt2ff = 0 817 alt2ff = 0
563 alt2f = 0 818 alt2f = 0
564 trimmed_five = True 819 trimmed = True
565 820
566 if ((read_pos2 >= 0) and (abs(read_len_median2 - read_pos2) <= trim3)): 821 if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))):
567 beg2 = total2new
568 total2new = 0
569 alt2ff = 0
570 alt2f = 0
571 trimmed_three = True
572
573 if ((read_pos3 >= 0) and (read_pos3 <= trim5)):
574 beg3 = total3new 822 beg3 = total3new
575 total3new = 0 823 total3new = 0
576 alt3ff = 0 824 alt3ff = 0
577 alt3f = 0 825 alt3f = 0
578 trimmed_five = True 826 trimmed = True
579
580 if ((read_pos3 >= 0) and (abs(read_len_median3 - read_pos3) <= trim3)):
581 beg3 = total3new
582 total3new = 0
583 alt3ff = 0
584 alt3f = 0
585 trimmed_three = True
586
587 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) 827 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
588 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) 828 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
589 829
590 # assign tiers 830 # assign tiers
591 if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | (all(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): 831 if ((all(int(ij) >= 3 for ij in [total1new, total4new]) &
832 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
833 (all(int(ij) >= 3 for ij in [total2new, total3new]) &
834 all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
592 tier = "1.1" 835 tier = "1.1"
593 counter_tier11 += 1 836 counter_tier11 += 1
594 tier_dict[key1]["tier 1.1"] += 1 837 tier_dict[key1]["tier 1.1"] += 1
595 838
596 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) 839 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) &
597 & any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): 840 any(int(ij) >= 3 for ij in [total1new, total4new]) &
841 any(int(ij) >= 3 for ij in [total2new, total3new]) &
842 all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
598 tier = "1.2" 843 tier = "1.2"
599 counter_tier12 += 1 844 counter_tier12 += 1
600 tier_dict[key1]["tier 1.2"] += 1 845 tier_dict[key1]["tier 1.2"] += 1
601 846
602 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) 847 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
603 | (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): 848 any(int(ij) >= 3 for ij in [total1new, total4new]) &
849 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
850 (all(int(ij) >= 1 for ij in [total2new, total3new]) &
851 any(int(ij) >= 3 for ij in [total2new, total3new]) &
852 all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
604 tier = "2.1" 853 tier = "2.1"
605 counter_tier21 += 1 854 counter_tier21 += 1
606 tier_dict[key1]["tier 2.1"] += 1 855 tier_dict[key1]["tier 2.1"] += 1
607 856
608 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): 857 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) &
858 all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
609 tier = "2.2" 859 tier = "2.2"
610 counter_tier22 += 1 860 counter_tier22 += 1
611 tier_dict[key1]["tier 2.2"] += 1 861 tier_dict[key1]["tier 2.2"] += 1
612 862
613 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) & any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff])) 863 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
614 | (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) & any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))): 864 any(int(ij) >= 3 for ij in [total2new, total3new]) &
865 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) &
866 any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff])) |
867 (all(int(ij) >= 1 for ij in [total2new, total3new]) &
868 any(int(ij) >= 3 for ij in [total1new, total4new]) &
869 all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) &
870 any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))):
615 tier = "2.3" 871 tier = "2.3"
616 counter_tier23 += 1 872 counter_tier23 += 1
617 tier_dict[key1]["tier 2.3"] += 1 873 tier_dict[key1]["tier 2.3"] += 1
618 874
619 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) 875 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
620 | (all(int(ij) >= 1 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): 876 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
877 (all(int(ij) >= 1 for ij in [total2new, total3new]) &
878 all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
621 tier = "2.4" 879 tier = "2.4"
622 counter_tier24 += 1 880 counter_tier24 += 1
623 tier_dict[key1]["tier 2.4"] += 1 881 tier_dict[key1]["tier 2.4"] += 1
624 882
625 elif ((len(pure_tags_dict_short[key1]) > 1) & (all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) | all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): 883 elif ((len(pure_tags_dict_short[key1]) > 1) &
884 (all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) |
885 all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
626 tier = "3.1" 886 tier = "3.1"
627 counter_tier31 += 1 887 counter_tier31 += 1
628 tier_dict[key1]["tier 3.1"] += 1 888 tier_dict[key1]["tier 3.1"] += 1
629 889
630 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff])) 890 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
631 | (all(int(ij) >= 1 for ij in [total2new, total3new]) & all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): 891 all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff])) |
892 (all(int(ij) >= 1 for ij in [total2new, total3new]) &
893 all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
632 tier = "3.2" 894 tier = "3.2"
633 counter_tier32 += 1 895 counter_tier32 += 1
634 tier_dict[key1]["tier 3.2"] += 1 896 tier_dict[key1]["tier 3.2"] += 1
635 897
636 elif trimmed_five: 898 elif (trimmed) and (len(pure_tags_dict_short[key1]) > 1):
637 tier = "4.1" 899 tier = "3.3"
638 counter_tier41 += 1 900 counter_tier33 += 1
639 tier_dict[key1]["tier 4.1"] += 1 901 tier_dict[key1]["tier 3.3"] += 1
640 902
641 elif trimmed_three: 903 elif (trimmed):
642 tier = "4.2" 904 tier = "4"
643 counter_tier42 += 1 905 counter_tier4 += 1
644 tier_dict[key1]["tier 4.2"] += 1 906 tier_dict[key1]["tier 4"] += 1
645 907
646 elif contradictory: 908 elif softclipped_mutation_allMates:
647 tier = "5" 909 tier = "5.1"
648 counter_tier5 += 1 910 counter_tier51 += 1
649 tier_dict[key1]["tier 5"] += 1 911 tier_dict[key1]["tier 5.1"] += 1
650 else: 912
913 elif softclipped_mutation_oneOfTwoMates:
914 tier = "5.2"
915 counter_tier52 += 1
916 tier_dict[key1]["tier 5.2"] += 1
917
918 elif softclipped_mutation_oneOfTwoSSCS:
919 tier = "5.3"
920 counter_tier53 += 1
921 tier_dict[key1]["tier 5.3"] += 1
922
923 elif softclipped_mutation_oneMate:
924 tier = "5.4"
925 counter_tier54 += 1
926 tier_dict[key1]["tier 5.4"] += 1
927
928 elif softclipped_mutation_oneMateOneSSCS:
929 tier = "5.5"
930 counter_tier55 += 1
931 tier_dict[key1]["tier 5.5"] += 1
932
933 elif (contradictory):
651 tier = "6" 934 tier = "6"
652 counter_tier6 += 1 935 counter_tier6 += 1
653 tier_dict[key1]["tier 6"] += 1 936 tier_dict[key1]["tier 6"] += 1
654 937
938 else:
939 tier = "7"
940 counter_tier7 += 1
941 tier_dict[key1]["tier 7"] += 1
942
655 chrom, pos, ref_a, alt_a = re.split(r'\#', key1) 943 chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
656 var_id = '-'.join([chrom, str(int(pos) + 1), ref, alt]) 944 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
657 sample_tag = key2[:-5] 945 sample_tag = key2[:-5]
658 array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time 946 array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time
659 # exclude identical tag from array2, to prevent comparison to itself 947 # exclude identical tag from array2, to prevent comparison to itself
660 same_tag = np.where(array2 == sample_tag) 948 same_tag = np.where(array2 == sample_tag)
661 index_array2 = np.arange(0, len(array2), 1) 949 index_array2 = np.arange(0, len(array2), 1)
680 half1_mate1 = array1_half2 968 half1_mate1 = array1_half2
681 half2_mate1 = array1_half 969 half2_mate1 = array1_half
682 half1_mate2 = array2_half2 970 half1_mate2 = array2_half2
683 half2_mate2 = array2_half 971 half2_mate2 = array2_half
684 # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" 972 # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
685 dist = np.array([sum(map(operator.ne, half1_mate1, c)) for c in half1_mate2]) 973 dist = np.array([sum(itertools.imap(operator.ne, half1_mate1, c)) for c in half1_mate2])
686 min_index = np.where(dist == dist.min()) # get index of min HD 974 min_index = np.where(dist == dist.min()) # get index of min HD
687 # get all "b's" of the tag or all "a's" of the tag with minimum HD 975 # get all "b's" of the tag or all "a's" of the tag with minimum HD
688 min_tag_half2 = half2_mate2[min_index] 976 min_tag_half2 = half2_mate2[min_index]
689 min_tag_array2 = array2[min_index] # get whole tag with min HD 977 min_tag_array2 = array2[min_index] # get whole tag with min HD
690 min_value = dist.min() 978 min_value = dist.min()
691 # calculate HD of "b" to all "b's" or "a" to all "a's" 979 # calculate HD of "b" to all "b's" or "a" to all "a's"
692 dist_second_half = np.array([sum(map(operator.ne, half2_mate1, e)) 980 dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e))
693 for e in min_tag_half2]) 981 for e in min_tag_half2])
694 982
695 dist2 = dist_second_half.max() 983 dist2 = dist_second_half.max()
696 max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD 984 max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD
697 max_tag = min_tag_array2[max_index] 985 max_tag = min_tag_array2[max_index]
698 986
699 # tags which have identical parts: 987 # tags which have identical parts:
700 if min_value == 0 or dist2 == 0: 988 if min_value == 0 or dist2 == 0:
701 min_tags_list_zeros.append(tag) 989 min_tags_list_zeros.append(tag)
702 chimera_tags.append(max_tag) 990 chimera_tags.append(max_tag)
703 991 # chimeric = True
992 # else:
993 # chimeric = False
994
995 # if mate_b is False:
996 # text = "pos {}: sample tag: {}; HD a = {}; HD b' = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, min_value, dist2, list(max_tag), chimeric)
997 # else:
998 # text = "pos {}: sample tag: {}; HD a' = {}; HD b = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, dist2, min_value, list(max_tag), chimeric)
704 i += 1 999 i += 1
705 chimera_tags = [x for x in chimera_tags if x != []] 1000 chimera_tags = [x for x in chimera_tags if x != []]
706 chimera_tags_new = [] 1001 chimera_tags_new = []
707 for i in chimera_tags: 1002 for i in chimera_tags:
708 if len(i) > 1: 1003 if len(i) > 1:
731 read_pos2 = read_len_median2 = None 1026 read_pos2 = read_len_median2 = None
732 if (read_pos3 == -1): 1027 if (read_pos3 == -1):
733 read_pos3 = read_len_median3 = None 1028 read_pos3 = read_len_median3 = None
734 line = (var_id, tier, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera) 1029 line = (var_id, tier, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera)
735 ws1.write_row(row, 0, line) 1030 ws1.write_row(row, 0, line)
736 csv_writer.writerow(line)
737 line = ("", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera) 1031 line = ("", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera)
738 ws1.write_row(row + 1, 0, line) 1032 ws1.write_row(row + 1, 0, line)
739 csv_writer.writerow(line)
740 1033
741 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), 1034 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
742 {'type': 'formula', 1035 {'type': 'formula',
743 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 1036 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1),
744 'format': format1, 1037 'format': format1,
745 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)}) 1038 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
746 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), 1039 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
747 {'type': 'formula', 1040 {'type': 'formula',
748 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(row + 1, row + 1, row + 1, row + 1), 1041 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(row + 1, row + 1, row + 1, row + 1),
749 'format': format3, 1042 'format': format3,
750 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)}) 1043 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
751 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), 1044 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
752 {'type': 'formula', 1045 {'type': 'formula',
753 'criteria': '=$B${}>="3"'.format(row + 1), 1046 'criteria': '=$B${}>="3"'.format(row + 1),
754 'format': format2, 1047 'format': format2,
755 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)}) 1048 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
1049
756 row += 3 1050 row += 3
757
758 if chimera_correction: 1051 if chimera_correction:
759 chimeric_dcs_high_tiers = 0 1052 chimeric_dcs_high_tiers = 0
760 chimeric_dcs = 0 1053 chimeric_dcs = 0
761 for keys_chimera in chimeric_tag.keys(): 1054 for keys_chimera in chimeric_tag.keys():
762 tiers = chimeric_tag[keys_chimera] 1055 tiers = chimeric_tag[keys_chimera]
765 if high_tiers == len(tiers): 1058 if high_tiers == len(tiers):
766 chimeric_dcs_high_tiers += high_tiers - 1 1059 chimeric_dcs_high_tiers += high_tiers - 1
767 else: 1060 else:
768 chimeric_dcs_high_tiers += high_tiers 1061 chimeric_dcs_high_tiers += high_tiers
769 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers) 1062 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers)
770 #csv_data.close()
771
772 # sheet 2 1063 # sheet 2
773 if chimera_correction: 1064 if chimera_correction:
774 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'chimeras in AC alt (all tiers)', 'chimera-corrected cvrg', 'chimera-corrected AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'chimeras in AC alt (tiers 1.1-2.4)', 'chimera-corrected cvrg (tiers 1.1-2.4)', 'chimera-corrected AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)', 1065 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'chimeras in AC alt (all tiers)', 'chimera-corrected cvrg', 'chimera-corrected AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'chimeras in AC alt (tiers 1.1-2.4)', 'chimera-corrected cvrg (tiers 1.1-2.4)', 'chimera-corrected AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
775 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 1066 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
776 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', 1067 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
777 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5', 'AF 1.1-6') 1068 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6')
778 else: 1069 else:
779 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)', 1070 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
780 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 1071 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
781 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', 1072 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
782 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5', 'AF 1.1-6') 1073 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6')
783 1074
784 ws2.write_row(0, 0, header_line2) 1075 ws2.write_row(0, 0, header_line2)
785 #ws2.conditional_format('J1', {'type': 'formula', 'criteria': 'containing', 'value': 'tier 1.1', 'format': format1, 'multi_range': 'J1:K1'})
786
787 row = 0 1076 row = 0
788 1077
789 for key1, value1 in sorted(tier_dict.items()): 1078 for key1, value1 in sorted(tier_dict.items()):
790 if key1 in pure_tags_dict_short.keys(): 1079 if key1 in pure_tags_dict_short.keys():
791 i = np.where(np.array(['#'.join(str(i) for i in z) 1080 i = np.where(np.array(['#'.join(str(i) for i in z)
795 chrom, pos, ref_a, alt_a = re.split(r'\#', key1) 1084 chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
796 ref_count = cvrg_dict[key1][0] 1085 ref_count = cvrg_dict[key1][0]
797 alt_count = cvrg_dict[key1][1] 1086 alt_count = cvrg_dict[key1][1]
798 cvrg = ref_count + alt_count 1087 cvrg = ref_count + alt_count
799 1088
800 var_id = '-'.join([chrom, str(int(pos) + 1), ref, alt]) 1089 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
801 lst = [var_id, cvrg] 1090 lst = [var_id, cvrg]
802 used_tiers = [] 1091 used_tiers = []
803 cum_af = [] 1092 cum_af = []
804 for key2, value2 in sorted(value1.items()): 1093 for key2, value2 in sorted(value1.items()):
805 # calculate cummulative AF 1094 # calculate cummulative AF
806 used_tiers.append(value2) 1095 used_tiers.append(value2)
807 if len(used_tiers) > 1: 1096 if len(used_tiers) > 1:
808 cum = safe_div(sum(used_tiers), cvrg) 1097 cum = safe_div(sum(used_tiers), cvrg)
809 cum_af.append(cum) 1098 cum_af.append(cum)
1099 if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place
1100 continue
810 lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)]) 1101 lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)])
811 if chimera_correction: 1102 if chimera_correction:
812 chimeras_all = chimera_dict[key1][0] 1103 chimeras_all = chimera_dict[key1][0]
813 new_alt = sum(used_tiers) - chimeras_all 1104 new_alt = sum(used_tiers) - chimeras_all
814 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers))) 1105 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers)))
815 if fraction_chimeras is None: 1106 if fraction_chimeras is None:
816 fraction_chimeras = 0. 1107 fraction_chimeras = 0.
817 new_cvrg = cvrg * (1. - fraction_chimeras) 1108 new_cvrg = cvrg * (1. - fraction_chimeras)
818 lst.extend([chimeras_all, new_cvrg, safe_div(new_alt, new_cvrg)]) 1109 lst.extend([chimeras_all, new_cvrg, safe_div(new_alt, new_cvrg)])
819 lst.extend([(cvrg - sum(used_tiers[-6:])), sum(used_tiers[0:6]), safe_div(sum(used_tiers[0:6]), (cvrg - sum(used_tiers[-6:])))]) 1110 lst.extend([(cvrg - sum(used_tiers[-11:])), sum(used_tiers[0:6]), safe_div(sum(used_tiers[0:6]), (cvrg - sum(used_tiers[-11:])))])
820 if chimera_correction: 1111 if chimera_correction:
821 chimeras_all = chimera_dict[key1][1] 1112 chimeras_all = chimera_dict[key1][1]
822 new_alt = sum(used_tiers[0:6]) - chimeras_all 1113 new_alt = sum(used_tiers[0:6]) - chimeras_all
823 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:6]))) 1114 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:6])))
824 if fraction_chimeras is None: 1115 if fraction_chimeras is None:
825 fraction_chimeras = 0. 1116 fraction_chimeras = 0.
826 new_cvrg = (cvrg - sum(used_tiers[-6:])) * (1. - fraction_chimeras) 1117 new_cvrg = (cvrg - sum(used_tiers[-11:])) * (1. - fraction_chimeras)
827 lst.extend([chimeras_all, new_cvrg, safe_div(new_alt, new_cvrg)]) 1118 lst.extend([chimeras_all, new_cvrg, safe_div(new_alt, new_cvrg)])
828 lst.extend([alt_count, safe_div(alt_count, cvrg)]) 1119 lst.extend([alt_count, safe_div(alt_count, cvrg)])
829 lst.extend(used_tiers) 1120 lst.extend(used_tiers)
830 lst.extend(cum_af) 1121 lst.extend(cum_af)
831 lst = tuple(lst) 1122 lst = tuple(lst)
832 ws2.write_row(row + 1, 0, lst) 1123 ws2.write_row(row + 1, 0, lst)
833 if chimera_correction: 1124 if chimera_correction:
834 ws2.conditional_format('P{}:Q{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 1.1"', 'format': format12, 'multi_range': 'P{}:Q{} P1:Q1'.format(row + 2, row + 2)}) 1125 ws2.conditional_format('P{}:Q{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 1.1"', 'format': format12, 'multi_range': 'P{}:Q{} P1:Q1'.format(row + 2, row + 2)})
835 ws2.conditional_format('R{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$R$1="tier 2.1"', 'format': format32, 'multi_range': 'R{}:U{} R1:U1'.format(row + 2, row + 2)}) 1126 ws2.conditional_format('R{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$R$1="tier 2.1"', 'format': format32, 'multi_range': 'R{}:U{} R1:U1'.format(row + 2, row + 2)})
836 ws2.conditional_format('V{}:AA{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$V$1="tier 3.1"', 'format': format22, 'multi_range': 'V{}:AA{} V1:AA1'.format(row + 2, row + 2)}) 1127 ws2.conditional_format('V{}:AF{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$V$1="tier 3.1"', 'format': format22, 'multi_range': 'V{}:AF{} V1:AF1'.format(row + 2, row + 2)})
837 else: 1128 else:
838 ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)}) 1129 ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)})
839 ws2.conditional_format('L{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format32, 'multi_range': 'L{}:O{} L1:O1'.format(row + 2, row + 2)}) 1130 ws2.conditional_format('L{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format32, 'multi_range': 'L{}:O{} L1:O1'.format(row + 2, row + 2)})
840 ws2.conditional_format('P{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format22, 'multi_range': 'P{}:U{} P1:U1'.format(row + 2, row + 2)}) 1131 ws2.conditional_format('P{}:Z{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format22, 'multi_range': 'P{}:Z{} P1:Z1'.format(row + 2, row + 2)})
841 row += 1 1132 row += 1
842 1133
843 # sheet 3 1134 # sheet 3
844 sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21), 1135 sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21),
845 ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24), 1136 ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24),
846 ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4.1", counter_tier41), 1137 ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 3.3", counter_tier33), ("tier 4", counter_tier),
847 ("tier 4.2", counter_tier42), ("tier 5", counter_tier5), ("tier 6", counter_tier6)] 1138 ("tier 5.1", counter_tier51), ("tier 5.2", counter_tier52),
1139 ("tier 5.3", counter_tier53), ("tier 5.4", counter_tier54), ("tier 5.5", counter_tier55), ("tier 6", counter_tier6), ("tier 7", counter_tier7)]
848 1140
849 header = ("tier", "count") 1141 header = ("tier", "count")
850 ws3.write_row(0, 0, header) 1142 ws3.write_row(0, 0, header)
851 1143
852 for i in range(len(sheet3)): 1144 for i in range(len(sheet3)):
853 ws3.write_row(i + 1, 0, sheet3[i]) 1145 ws3.write_row(i + 1, 0, sheet3[i])
854 ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2), 1146 ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2),
855 {'type': 'formula', 1147 {'type': 'formula',
856 'criteria': '=OR($A${}="tier 1.1", $A${}="tier 1.2")'.format(i + 2, i + 2), 1148 'criteria': '=OR($A${}="tier 1.1", $A${}="tier 1.2")'.format(i + 2, i + 2),
857 'format': format13}) 1149 'format': format1})
858 ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2), 1150 ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2),
859 {'type': 'formula', 1151 {'type': 'formula',
860 'criteria': '=OR($A${}="tier 2.1", $A${}="tier 2.2", $A${}="tier 2.3", $A${}="tier 2.4")'.format(i + 2, i + 2, i + 2, i + 2), 1152 'criteria': '=OR($A${}="tier 2.1", $A${}="tier 2.2", $A${}="tier 2.3", $A${}="tier 2.4")'.format(i + 2, i + 2, i + 2, i + 2),
861 'format': format33}) 1153 'format': format3})
862 ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2), 1154 ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2),
863 {'type': 'formula', 1155 {'type': 'formula',
864 'criteria': '=$A${}>="3"'.format(i + 2), 1156 'criteria': '=$A${}>="3"'.format(i + 2),
865 'format': format23}) 1157 'format': format2})
866 1158
867 description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""), 1159 description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""),
868 ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), 1160 ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"),
869 ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), 1161 ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"),
870 ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), 1162 ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"),
871 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), 1163 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"),
872 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), 1164 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"),
873 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), 1165 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"),
874 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), 1166 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"),
875 ("Tier 4.1", "variants at the beginning of the reads"), 1167 ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"),
876 ("Tier 4.2", "variants at the end of the reads"), 1168 ("Tier 5.1", "variant is close to softclipping in both mates"),
877 ("Tier 5", "mates with contradictory information"), 1169 ("Tier 5.2", "variant is close to softclipping in one of the mates"),
1170 ("Tier 5.3", "variant is close to softclipping in one of the SSCS of both mates"),
1171 ("Tier 5.4", "variant is close to softclipping in one mate (no information of second mate"),
1172 ("Tier 5.5", "variant is close to softclipping in one of the SSCS (no information of the second mate"),
878 ("Tier 6", "remaining variants")] 1173 ("Tier 6", "remaining variants")]
879 examples_tiers = [[("chr5-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289", 1174 examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289",
880 "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", 1175 "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
881 "4081", "4098", "5", "10", "", ""), 1176 "4081", "4098", "5", "10", "", ""),
882 ("", "", "AAAAAGATGCCGACTACCTT", "ab2.ba1", None, None, None, None, 1177 ("", "", "AAAAAGATGCCGACTACCTT", "ab2.ba1", None, None, None, None,
883 "289", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, 1178 "289", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None,
884 "0", "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", "")], 1179 "0", "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", "")],
885 [("chr5-11068-C-G", "1.1", "AAAAATGCGTAGAAATATGC", "ab1.ba2", "254", "228", "287", "288", "289", 1180 [("Chr5:5-20000-11068-C-G", "1.1", "AAAAATGCGTAGAAATATGC", "ab1.ba2", "254", "228", "287", "288", "289",
886 "33", "43", "33", "43", "0", "0", "33", "43", "0", "0", "1", "1", "0", "0", "0", "0", "0", 1181 "33", "43", "33", "43", "0", "0", "33", "43", "0", "0", "1", "1", "0", "0", "0", "0", "0",
887 "0", "4081", "4098", "5", "10", "", ""), 1182 "0", "4081", "4098", "5", "10", "", ""),
888 ("", "", "AAAAATGCGTAGAAATATGC", "ab2.ba1", "268", "268", "270", "288", "289", 1183 ("", "", "AAAAATGCGTAGAAATATGC", "ab2.ba1", "268", "268", "270", "288", "289",
889 "11", "34", "10", "27", "0", "0", "10", "27", "0", "0", "1", "1", "0", "0", "1", 1184 "11", "34", "10", "27", "0", "0", "10", "27", "0", "0", "1", "1", "0", "0", "1",
890 "7", "0", "0", "4081", "4098", "5", "10", "", "")], 1185 "7", "0", "0", "4081", "4098", "5", "10", "", "")],
891 [("chr5-10776-G-T", "1.2", "CTATGACCCGTGAGCCCATG", "ab1.ba2", "132", "132", "287", "288", "290", 1186 [("Chr5:5-20000-10776-G-T", "1.2", "CTATGACCCGTGAGCCCATG", "ab1.ba2", "132", "132", "287", "288", "290",
892 "4", "1", "4", "1", "0", "0", "4", "1", "0", "0", "1", "1", "0", "0", "0", "0", 1187 "4", "1", "4", "1", "0", "0", "4", "1", "0", "0", "1", "1", "0", "0", "0", "0",
893 "0", "0", "1", "6", "47170", "41149", "", ""), 1188 "0", "0", "1", "6", "47170", "41149", "", ""),
894 ("", "", "CTATGACCCGTGAGCCCATG", "ab2.ba1", "77", "132", "233", "200", "290", 1189 ("", "", "CTATGACCCGTGAGCCCATG", "ab2.ba1", "77", "132", "233", "200", "290",
895 "4", "1", "4", "1", "0", "0", "4", "1", "0", "0", "1", "1", "0", "0", "0", "0", 1190 "4", "1", "4", "1", "0", "0", "4", "1", "0", "0", "1", "1", "0", "0", "0", "0",
896 "0", "0", "1", "6", "47170", "41149", "", "")], 1191 "0", "0", "1", "6", "47170", "41149", "", "")],
897 [("chr5-11068-C-G", "2.1", "AAAAAAACATCATACACCCA", "ab1.ba2", "246", "244", "287", "288", "289", 1192 [("Chr5:5-20000-11068-C-G", "2.1", "AAAAAAACATCATACACCCA", "ab1.ba2", "246", "244", "287", "288", "289",
898 "2", "8", "2", "8", "0", "0", "2", "8", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", 1193 "2", "8", "2", "8", "0", "0", "2", "8", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
899 "4081", "4098", "5", "10", "", ""), 1194 "4081", "4098", "5", "10", "", ""),
900 ("", "", "AAAAAAACATCATACACCCA", "ab2.ba1", None, None, None, None, 1195 ("", "", "AAAAAAACATCATACACCCA", "ab2.ba1", None, None, None, None,
901 "289", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0", "0", 1196 "289", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0", "0",
902 "0", "0", "0", "0", "4081", "4098", "5", "10", "", "")], 1197 "0", "0", "0", "0", "4081", "4098", "5", "10", "", "")],
903 [("chr5-11068-C-G", "2.2", "ATCAGCCATGGCTATTATTG", "ab1.ba2", "72", "72", "217", "288", "289", 1198 [("Chr5:5-20000-11068-C-G", "2.2", "ATCAGCCATGGCTATTATTG", "ab1.ba2", "72", "72", "217", "288", "289",
904 "1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", 1199 "1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
905 "4081", "4098", "5", "10", "", ""), 1200 "4081", "4098", "5", "10", "", ""),
906 ("", "", "ATCAGCCATGGCTATTATTG", "ab2.ba1", "153", "164", "217", "260", "289", 1201 ("", "", "ATCAGCCATGGCTATTATTG", "ab2.ba1", "153", "164", "217", "260", "289",
907 "1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", 1202 "1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
908 "4081", "4098", "5", "10", "", "")], 1203 "4081", "4098", "5", "10", "", "")],
909 [("chr5-11068-C-G", "2.3", "ATCAATATGGCCTCGCCACG", "ab1.ba2", None, None, None, None, 1204 [("Chr5:5-20000-11068-C-G", "2.3", "ATCAATATGGCCTCGCCACG", "ab1.ba2", None, None, None, None,
910 "289", "0", "5", "0", "5", "0", "0", "0", "5", None, None, None, "1", "0", 1205 "289", "0", "5", "0", "5", "0", "0", "0", "5", None, None, None, "1", "0",
911 "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", ""), 1206 "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", ""),
912 ("", "", "ATCAATATGGCCTCGCCACG", "ab2.ba1", "202", "255", "277", "290", "289", 1207 ("", "", "ATCAATATGGCCTCGCCACG", "ab2.ba1", "202", "255", "277", "290", "289",
913 "1", "3", "1", "3", "0", "0", "1", "3", "0", "0", "1", "1", "0", "0", "0", "0", 1208 "1", "3", "1", "3", "0", "0", "1", "3", "0", "0", "1", "1", "0", "0", "0", "0",
914 "0", "0", "4081", "4098", "5", "10", "", "")], 1209 "0", "0", "4081", "4098", "5", "10", "", "")],
915 [("chr5-11068-C-G", "2.4", "ATCAGCCATGGCTATTTTTT", "ab1.ba2", "72", "72", "217", "288", "289", 1210 [("Chr5:5-20000-11068-C-G", "2.4", "ATCAGCCATGGCTATTTTTT", "ab1.ba2", "72", "72", "217", "288", "289",
916 "1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "4081", 1211 "1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "4081",
917 "4098", "5", "10", "", ""), 1212 "4098", "5", "10", "", ""),
918 ("", "", "ATCAGCCATGGCTATTTTTT", "ab2.ba1", "153", "164", "217", "260", "289", 1213 ("", "", "ATCAGCCATGGCTATTTTTT", "ab2.ba1", "153", "164", "217", "260", "289",
919 "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "1", "0", "0", "0", "0", "4081", 1214 "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "1", "0", "0", "0", "0", "4081",
920 "4098", "5", "10", "", "")], 1215 "4098", "5", "10", "", "")],
921 [("chr5-10776-G-T", "3.1", "ATGCCTACCTCATTTGTCGT", "ab1.ba2", "46", "15", "287", "288", "290", 1216 [("Chr5:5-20000-10776-G-T", "3.1", "ATGCCTACCTCATTTGTCGT", "ab1.ba2", "46", "15", "287", "288", "290",
922 "3", "3", "3", "2", "3", "1", "0", "1", "1", "0.5", "0", "0.5", "0", "0", "0", "1", 1217 "3", "3", "3", "2", "3", "1", "0", "1", "1", "0.5", "0", "0.5", "0", "0", "0", "1",
923 "0", "0", "3", "3", "47170", "41149", "", ""), 1218 "0", "0", "3", "3", "47170", "41149", "", ""),
924 ("", "", "ATGCCTACCTCATTTGTCGT", "ab2.ba1", None, "274", None, 1219 ("", "", "ATGCCTACCTCATTTGTCGT", "ab2.ba1", None, "274", None,
925 "288", "290", "0", "3", "0", "2", "0", "1", "0", "1", None, "0.5", None, "0.5", 1220 "288", "290", "0", "3", "0", "2", "0", "1", "0", "1", None, "0.5", None, "0.5",
926 "0", "0", "0", "1", "0", "0", "3", "3", "47170", "41149", "", "")], 1221 "0", "0", "0", "1", "0", "0", "3", "3", "47170", "41149", "", "")],
927 [("chr5-11315-C-T", "3.2", "ACAACATCACGTATTCAGGT", "ab1.ba2", "197", "197", "240", "255", "271", 1222 [("Chr5:5-20000-11315-C-T", "3.2", "ACAACATCACGTATTCAGGT", "ab1.ba2", "197", "197", "240", "255", "271",
928 "2", "3", "2", "3", "0", "1", "2", "2", "0", "0.333333333333333", "1", 1223 "2", "3", "2", "3", "0", "1", "2", "2", "0", "0.333333333333333", "1",
929 "0.666666666666667", "0", "0", "0", "0", "0", "0", "1", "1", "6584", "6482", "", ""), 1224 "0.666666666666667", "0", "0", "0", "0", "0", "0", "1", "1", "6584", "6482", "", ""),
930 ("", "", "ACAACATCACGTATTCAGGT", "ab2.ba1", "35", "35", "240", "258", "271", 1225 ("", "", "ACAACATCACGTATTCAGGT", "ab2.ba1", "35", "35", "240", "258", "271",
931 "2", "3", "2", "3", "0", "1", "2", "2", "0", "0.333333333333333", "1", 1226 "2", "3", "2", "3", "0", "1", "2", "2", "0", "0.333333333333333", "1",
932 "0.666666666666667", "0", "0", "0", "0", "0", "0", "1", "1", "6584", "6482", "", "")], 1227 "0.666666666666667", "0", "0", "0", "0", "0", "0", "1", "1", "6584", "6482", "", "")],
933 [("chr5-13983-G-C", "4.1", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "1", "100", "255", "276", "269", 1228 [("Chr5:5-20000-13983-G-C", "4.1", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "0", "100", "255", "276", "269",
934 "5", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "5", "0", "1", "1", "5348", "5350", "", ""), 1229 "5", "6", "0", "6", "0", "0", "5", "6", "0", "0", "0", "1", "0", "0", "0", "0", "5", "0", "1", "1", "5348", "5350", "", ""),
935 ("", "", "AAAAAAAGAATAACCCACAC", "ab2.ba1", None, None, None, None, 1230 ("", "", "AAAAAAAGAATAACCCACAC", "ab2.ba1", None, None, None, None,
936 "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0", 1231 "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
937 "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")], 1232 "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
938 [("chr5-13983-G-C", "4.2", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "20", "270", "255", "276", "269", 1233 [("Chr5:5-20000-13963-T-C", "4.2", "TTTTTAAGAATAACCCACAC", "ab1.ba2", "38", "38", "240", "283", "263",
939 "5", "6", "5", "0", "0", "0", "5", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "6", "1", "1", "5348", "5350", "", ""),
940 ("", "", "AAAAAAAGAATAACCCACAC", "ab2.ba1", None, None, None, None,
941 "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
942 "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
943 [("chr5-13963-T-C", "5", "TTTTTAAGAATAACCCACAC", "ab1.ba2", "38", "38", "240", "283", "263",
944 "110", "54", "110", "54", "0", "0", "110", "54", "0", "0", "1", "1", "0", "0", "0", 1234 "110", "54", "110", "54", "0", "0", "110", "54", "0", "0", "1", "1", "0", "0", "0",
945 "0", "0", "0", "1", "1", "5348", "5350", "", ""), 1235 "0", "0", "0", "1", "1", "5348", "5350", "", ""),
946 ("", "", "TTTTTAAGAATAACCCACAC", "ab2.ba1", "100", "112", "140", "145", "263", 1236 ("", "", "TTTTTAAGAATAACCCACAC", "ab2.ba1", "100", "112", "140", "145", "263",
947 "7", "12", "7", "12", "7", "12", "0", "0", "1", "1", "0", 1237 "7", "12", "7", "12", "7", "12", "0", "0", "1", "1", "0",
948 "0", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")], 1238 "0", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
949 [("chr5-13983-G-C", "6", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269", 1239 [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)],
1240 [("Chr5:5-20000-13983-G-C", "6", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
950 "0", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "0", 1241 "0", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "0",
951 "0", "1", "1", "5348", "5350", "", ""), 1242 "0", "1", "1", "5348", "5350", "", ""),
952 ("", "", "ATGTTGTGAATAACCCACAC", "ab2.ba1", None, None, None, None, 1243 ("", "", "ATGTTGTGAATAACCCACAC", "ab2.ba1", None, None, None, None,
953 "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0", 1244 "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
954 "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]] 1245 "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]]
955 1246
956 start_row = 15 1247 start_row = 20
957 ws3.write(start_row, 0, "Description of tiers with examples") 1248 ws3.write(start_row, 0, "Description of tiers with examples")
958 ws3.write_row(start_row + 1, 0, header_line) 1249 ws3.write_row(start_row + 1, 0, header_line)
959 row = 0 1250 row = 0
960 for i in range(len(description_tiers)): 1251 for i in range(len(description_tiers)):
961 ws3.write_row(start_row + 2 + row + i + 1, 0, description_tiers[i]) 1252 ws3.write_row(start_row + 2 + row + i + 1, 0, description_tiers[i])
962 ex = examples_tiers[i] 1253 ex = examples_tiers[i]
963 for k in range(len(ex)): 1254 for k in range(len(ex)):
964 ws3.write_row(start_row + 2 + row + i + k + 2, 0, ex[k]) 1255 ws3.write_row(start_row + 2 + row + i + k + 2, 0, ex[k])
965 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 'format': format13, 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)}) 1256 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 'format': format13, 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
966 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), 1257 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3),
967 {'type': 'formula', 'criteria': '=OR($B${}="2.1",$B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 1258 {'type': 'formula', 'criteria': '=OR($B${}="2.1",$B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2),
968 'format': format33, 1259 'format': format33,
969 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)}) 1260 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
970 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), 1261 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3),
971 {'type': 'formula', 1262 {'type': 'formula',
972 'criteria': '=$B${}>="3"'.format(start_row + 2 + row + i + k + 2), 1263 'criteria': '=$B${}>="3"'.format(start_row + 2 + row + i + k + 2),
973 'format': format23, 1264 'format': format23,
974 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)}) 1265 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
975 row += 3 1266 row += 3
976 workbook.close() 1267 workbook.close()
977 workbook2.close() 1268 workbook2.close()
978 workbook3.close() 1269 workbook3.close()
979 csv_data.close()
980
981 1270
982 1271
983 if __name__ == '__main__': 1272 if __name__ == '__main__':
984 sys.exit(read2mut(sys.argv)) 1273 sys.exit(read2mut(sys.argv))
1274