comparison scripts/ReMatCh/modules/rematch_module.py @ 3:0cbed1c0a762 draft default tip

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Tue, 28 Jan 2020 10:42:31 -0500
parents 965517909457
children
comparison
equal deleted inserted replaced
2:6837f733b4aa 3:0cbed1c0a762
1 import os.path 1 import os.path
2 import multiprocessing 2 import multiprocessing
3 import utils
4 import functools 3 import functools
5 import sys 4 import sys
6 import pickle 5 import pickle
7 6
8 7 # https://chrisyeh96.github.io/2017/08/08/definitive-guide-python-imports.html#case-2-syspath-could-change
9 def index_fasta_samtools(fasta, region_None, region_outfile_none, print_comand_True): 8 sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
9 import utils
10
11
def index_fasta_samtools(fasta, region_none, region_outfile_none, print_comand_true):
    """Index a fasta file with `samtools faidx`, optionally extracting a region.

    If region_none is given it is appended to the command; if region_outfile_none
    is given the extracted region is redirected to that file (requires a shell).
    Returns (run_successfully, stdout).
    """
    cmd = ['samtools', 'faidx', fasta, '', '', '']
    use_shell = False
    if region_none is not None:
        cmd[3] = region_none
    if region_outfile_none is not None:
        # Shell redirection is needed to write the extracted region to a file.
        cmd[4] = '>'
        cmd[5] = region_outfile_none
        use_shell = True
    run_successfully, stdout, _ = utils.run_command_popen_communicate(cmd, use_shell, None, print_comand_true)
    return run_successfully, stdout
20 23
21 24
# Indexing reference file using Bowtie2
def index_sequence_bowtie2(reference_file, threads):
    """Build a Bowtie2 index for reference_file unless one already exists.

    An existing `<reference_file>.1.bt2` file is taken as evidence the index
    was already built. Returns True on success.
    """
    if os.path.isfile(reference_file + '.1.bt2'):
        return True
    cmd = ['bowtie2-build', '--threads', str(threads), reference_file, reference_file]
    run_successfully, _, _ = utils.run_command_popen_communicate(cmd, False, None, True)
    return run_successfully
30 33
31 34
# Mapping with Bowtie2
def mapping_bowtie2(fastq_files, reference_file, threads, outdir, num_map_loc,
                    bowtie_algorithm='--very-sensitive-local', bowtie_opt=None):
    """Map fastq reads against reference_file with Bowtie2.

    fastq_files must contain one (single-end) or two (paired-end) files.
    num_map_loc > 1 enables reporting of multiple alignments via `-k`.
    Returns (run_successfully, sam_file) with sam_file None on failure.
    """
    sam_file = os.path.join(outdir, 'alignment.sam')

    # Index reference file
    run_successfully = index_sequence_bowtie2(reference_file, threads)

    if run_successfully:
        cmd = ['bowtie2', '', '', '-q', bowtie_algorithm, '--threads', str(threads), '-x',
               reference_file, '', '--no-unal', '', '-S', sam_file]

        if num_map_loc is not None and num_map_loc > 1:
            cmd[1] = '-k'
            cmd[2] = str(num_map_loc)

        n_fastq = len(fastq_files)
        if n_fastq == 1:
            cmd[9] = '-U ' + fastq_files[0]
        elif n_fastq == 2:
            cmd[9] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1]
        else:
            # Anything other than 1 or 2 fastq files is unsupported.
            return False, None

        if bowtie_opt is not None:
            cmd[11] = bowtie_opt

        run_successfully, _, _ = utils.run_command_popen_communicate(cmd, False, None, True)

    if not run_successfully:
        sam_file = None

    return run_successfully, sam_file
75 numbers = '' 79 numbers = ''
76 80
77 return splited_cigars 81 return splited_cigars
78 82
79 83
80 def recode_cigar_based_on_base_quality(cigar, bases_quality, softClip_baseQuality, mapping_position, direct_strand_true, softClip_cigarFlagRecode): 84 def recode_cigar_based_on_base_quality(cigar, bases_quality, soft_clip_base_quality, mapping_position,
85 direct_strand_true, soft_clip_cigar_flag_recode):
81 cigar = split_cigar(cigar) 86 cigar = split_cigar(cigar)
82 soft_left = [] 87 soft_left = []
83 soft_right = [] 88 soft_right = []
84 cigar_flags_for_reads_length = ('M', 'I', 'S', '=', 'X') 89 cigar_flags_for_reads_length = ('M', 'I', 'S', '=', 'X')
85 read_length_without_right_s = sum([cigar_part[0] for cigar_part in cigar if cigar_part[1] in cigar_flags_for_reads_length]) - (cigar[len(cigar) - 1][0] if cigar[len(cigar) - 1][1] == 'S' else 0) 90 read_length_without_right_s = sum([cigar_part[0] for cigar_part in cigar if
91 cigar_part[1] in cigar_flags_for_reads_length]) - \
92 (cigar[len(cigar) - 1][0] if cigar[len(cigar) - 1][1] == 'S' else 0)
86 for x, base in enumerate(bases_quality): 93 for x, base in enumerate(bases_quality):
87 if ord(base) - 33 >= softClip_baseQuality: 94 if ord(base) - 33 >= soft_clip_base_quality:
88 if x <= cigar[0][0] - 1: 95 if x <= cigar[0][0] - 1:
89 if cigar[0][1] == 'S': 96 if cigar[0][1] == 'S':
90 soft_left.append(x) 97 soft_left.append(x)
91 elif x > read_length_without_right_s - 1: 98 elif x > read_length_without_right_s - 1:
92 if cigar[len(cigar) - 1][1] == 'S': 99 if cigar[len(cigar) - 1][1] == 'S':
94 101
95 left_changed = (False, 0) 102 left_changed = (False, 0)
96 if len(soft_left) > 0: 103 if len(soft_left) > 0:
97 soft_left = min(soft_left) + 1 104 soft_left = min(soft_left) + 1
98 if soft_left == 1: 105 if soft_left == 1:
99 cigar = [[cigar[0][0], softClip_cigarFlagRecode]] + cigar[1:] 106 cigar = [[cigar[0][0], soft_clip_cigar_flag_recode]] + cigar[1:]
100 left_changed = (True, cigar[0][0]) 107 left_changed = (True, cigar[0][0])
101 elif cigar[0][0] - soft_left > 0: 108 elif cigar[0][0] - soft_left > 0:
102 cigar = [[soft_left, 'S']] + [[cigar[0][0] - soft_left, softClip_cigarFlagRecode]] + cigar[1:] 109 cigar = [[soft_left, 'S']] + [[cigar[0][0] - soft_left, soft_clip_cigar_flag_recode]] + cigar[1:]
103 left_changed = (True, cigar[0][0] - soft_left) 110 left_changed = (True, cigar[0][0] - soft_left)
104 111
105 right_changed = (False, 0) 112 right_changed = (False, 0)
106 if len(soft_right) > 0: 113 if len(soft_right) > 0:
107 soft_right = max(soft_right) + 1 114 soft_right = max(soft_right) + 1
108 cigar = cigar[:-1] 115 cigar = cigar[:-1]
109 if soft_right - read_length_without_right_s > 0: 116 if soft_right - read_length_without_right_s > 0:
110 cigar.append([soft_right - read_length_without_right_s, softClip_cigarFlagRecode]) 117 cigar.append([soft_right - read_length_without_right_s, soft_clip_cigar_flag_recode])
111 right_changed = (True, soft_right - read_length_without_right_s) 118 right_changed = (True, soft_right - read_length_without_right_s)
112 if len(bases_quality) - soft_right > 0: 119 if len(bases_quality) - soft_right > 0:
113 cigar.append([len(bases_quality) - soft_right, 'S']) 120 cigar.append([len(bases_quality) - soft_right, 'S'])
114 121
115 if left_changed[0]: 122 if left_changed[0]:
138 if bit[-5] == '0': 145 if bit[-5] == '0':
139 direct_strand = True 146 direct_strand = True
140 return direct_strand 147 return direct_strand
141 148
142 149
143 def verify_mapped_tip(reference_length, mapping_position, read_length, cigar): 150 def verify_mapped_tip(reference_length, mapping_position, cigar):
144 tip = False 151 tip = False
145 if 'S' in cigar: 152 if 'S' in cigar:
146 cigar = split_cigar(cigar) 153 cigar = split_cigar(cigar)
147 if cigar[0][1] == 'S': 154 if cigar[0][1] == 'S':
148 if mapping_position - cigar[0][0] < 0: 155 if mapping_position - cigar[0][0] < 0:
172 cigar = ''.join([str(cigar_part[0]) + cigar_part[1] for cigar_part in reversed(split_cigar(cigar))]) 179 cigar = ''.join([str(cigar_part[0]) + cigar_part[1] for cigar_part in reversed(split_cigar(cigar))])
173 return seq, bases_quality, str(sam_flag_bit), cigar 180 return seq, bases_quality, str(sam_flag_bit), cigar
174 181
175 182
@utils.trace_unhandled_exceptions
def parallelized_recode_soft_clipping(line_collection, pickle_file, soft_clip_base_quality, sequences_length,
                                      soft_clip_cigar_flag_recode):
    """Recode soft clipping for one batch of SAM lines and pickle the result.

    Header lines ('@' prefix) pass through unchanged; alignment lines are
    recoded unless the read maps at a reference tip. The resulting list of
    lines is dumped to pickle_file.
    """
    recoded = []
    for raw_line in line_collection:
        entry = raw_line.rstrip('\r\n')
        if not entry:
            continue
        if entry.startswith('@'):
            recoded.append(entry)
            continue
        fields = entry.split('\t')
        # Only recode CIGAR/position when the read is not mapped at a reference tip.
        if not verify_mapped_tip(sequences_length[fields[2]], int(fields[3]), fields[5]):
            fields[5], fields[3] = recode_cigar_based_on_base_quality(fields[5], fields[10],
                                                                     soft_clip_base_quality,
                                                                     int(fields[3]),
                                                                     verify_mapped_direct_strand(int(fields[1])),
                                                                     soft_clip_cigar_flag_recode)
        recoded.append('\t'.join(fields))
    with open(pickle_file, 'wb') as writer:
        pickle.dump(recoded, writer)
191 202
192 203
def recode_soft_clipping_from_sam(sam_file, outdir, threads, soft_clip_base_quality, reference_dict,
                                  soft_clip_cigar_flag_recode):
    """Recode soft-clipped regions of a SAM file using a process pool.

    The input is processed in chunks of 10000 lines, each handed to
    parallelized_recode_soft_clipping which pickles its recoded lines; the
    pickles are then concatenated into a new SAM file. sam_file and the
    intermediate pickles are removed.

    Returns the path of the recoded SAM file.
    """
    pickle_files = []
    sequences_length = {}
    for x, seq_info in reference_dict.items():
        sequences_length[seq_info['header']] = seq_info['length']

    # BUGFIX: mode 'rtU' is invalid on Python 3.11+ ('U' was removed); plain
    # text mode already provides universal-newline handling.
    with open(sam_file, 'rt') as reader:
        pool = multiprocessing.Pool(processes=threads)
        line_collection = []
        x = 0
        for x, line in enumerate(reader):
            line_collection.append(line)
            if x % 10000 == 0:
                pickle_file = os.path.join(outdir, 'remove_soft_clipping.' + str(x) + '.pkl')
                pickle_files.append(pickle_file)
                pool.apply_async(parallelized_recode_soft_clipping, args=(line_collection, pickle_file,
                                                                          soft_clip_base_quality, sequences_length,
                                                                          soft_clip_cigar_flag_recode,))
                line_collection = []
        # Flush the final partial chunk, if any.
        if len(line_collection) > 0:
            pickle_file = os.path.join(outdir, 'remove_soft_clipping.' + str(x) + '.pkl')
            pickle_files.append(pickle_file)
            pool.apply_async(parallelized_recode_soft_clipping, args=(line_collection, pickle_file,
                                                                      soft_clip_base_quality, sequences_length,
                                                                      soft_clip_cigar_flag_recode,))
        pool.close()
        pool.join()

    os.remove(sam_file)

    new_sam_file = os.path.join(outdir, 'alignment_with_soft_clipping_recoded.sam')
    with open(new_sam_file, 'wt') as writer:
        for pickle_file in pickle_files:
            if os.path.isfile(pickle_file):
                with open(pickle_file, 'rb') as reader:
                    lines_sam = pickle.load(reader)
                if lines_sam is not None:
                    for line in lines_sam:
                        writer.write(line + '\n')
                os.remove(pickle_file)

    return new_sam_file
233 248
234 249
# Sort alignment file
def sort_alignment(alignment_file, output_file, sort_by_name_true, threads):
    """Sort a SAM/BAM file with `samtools sort`.

    The output format is inferred from output_file's extension; setting
    sort_by_name_true sorts by read name (`-n`) instead of coordinate.
    Returns (run_successfully, output_file) with output_file None on failure.
    """
    out_format = os.path.splitext(output_file)[1][1:].lower()
    cmd = ['samtools', 'sort', '-o', output_file, '-O', out_format, '', '-@', str(threads), alignment_file]
    if sort_by_name_true:
        cmd[6] = '-n'
    run_successfully, _, _ = utils.run_command_popen_communicate(cmd, False, None, True)
    return run_successfully, (output_file if run_successfully else None)
245 260
246 261
# Index alignment file
def index_alignment(alignment_file):
    """Index a BAM file with `samtools index`; return True on success."""
    run_successfully, _, _ = utils.run_command_popen_communicate(['samtools', 'index', alignment_file],
                                                                 False, None, True)
    return run_successfully
252 267
253 268
def mapping_reads(fastq_files, reference_file, threads, outdir, num_map_loc, rematch_run,
                  soft_clip_base_quality, soft_clip_recode_run, reference_dict, soft_clip_cigar_flag_recode,
                  bowtie_algorithm, bowtie_opt, clean_run=True):
    """Map reads with Bowtie2, optionally recode soft clipping, then sort and index.

    When clean_run is True the reference is symlinked into outdir (replacing a
    stale symlink if present) and that link is used from then on.
    Returns (run_successfully, bam_file, reference_file_used).
    """
    if clean_run:
        # Create a symbolic link to the reference_file inside outdir.
        link = os.path.join(outdir, os.path.basename(reference_file))
        if os.path.islink(link):
            os.unlink(link)
        os.symlink(reference_file, link)
        reference_file = link

    bam_file = None
    # Mapping reads using Bowtie2
    run_successfully, sam_file = mapping_bowtie2(fastq_files=fastq_files, reference_file=reference_file,
                                                 threads=threads, outdir=outdir, num_map_loc=num_map_loc,
                                                 bowtie_algorithm=bowtie_algorithm, bowtie_opt=bowtie_opt)

    if run_successfully:
        # Remove soft clipping when this run is the one selected for recoding.
        if soft_clip_recode_run in (rematch_run, 'both'):
            print('Recoding soft clipped regions')
            sam_file = recode_soft_clipping_from_sam(sam_file, outdir, threads, soft_clip_base_quality,
                                                     reference_dict, soft_clip_cigar_flag_recode)

        # Convert sam to bam and sort bam
        run_successfully, bam_file = sort_alignment(sam_file, os.path.splitext(sam_file)[0] + '.bam', False,
                                                    threads)

        if run_successfully:
            os.remove(sam_file)
            # Index bam
            run_successfully = index_alignment(bam_file)

    return run_successfully, bam_file, reference_file
278 303
279 304
def create_vcf(bam_file, sequence_to_analyse, outdir, counter, reference_file):
    """Produce an uncompressed VCF for one sequence via `samtools mpileup`.

    Returns (run_successfully, gene_vcf) with gene_vcf None on failure.
    """
    gene_vcf = os.path.join(outdir, 'samtools_mpileup.sequence_' + str(counter) + '.vcf')

    mpileup_cmd = ['samtools', 'mpileup', '--count-orphans', '--no-BAQ', '--min-BQ', '0', '--min-MQ', str(7),
                   '--fasta-ref', reference_file, '--region', sequence_to_analyse, '--output', gene_vcf,
                   '--VCF', '--uncompressed', '--output-tags', 'INFO/AD,AD,DP', bam_file]

    run_successfully, _, _ = utils.run_command_popen_communicate(mpileup_cmd, False, None, False)
    return run_successfully, (gene_vcf if run_successfully else None)
289 316
290 317
# Read vcf file
class Vcf:
    """Minimal sequential reader for a VCF file.

    Header lines ('#'-prefixed) are consumed at construction; contig lengths
    parsed from '##contig=<ID=name,length=N>' lines are stored in
    contigs_info_dict. readline() yields data lines one at a time.
    """

    def __init__(self, vcf_file, encoding=None, newline=None):
        # Older interpreters whose open() lacks encoding/newline keywords
        # raise TypeError; fall back to a plain text-mode open there.
        try:
            self.vcf = open(vcf_file, 'rt', encoding=encoding, newline=newline)
        except TypeError:
            self.vcf = open(vcf_file, 'rt')
        self.contigs_info_dict = {}
        current = self.vcf.readline()
        while current.startswith('#'):
            if current.startswith('##contig=<ID='):
                # '##contig=<ID=name,length=123>' -> name / 123
                name = current.split('=')[2].split(',')[0]
                length = current.split('=')[3].split('>')[0]
                self.contigs_info_dict[name] = int(length)
            current = self.vcf.readline()
        self.line_read = current
        self.line = current

    def readline(self):
        """Return the buffered line and advance the buffer to the next one."""
        buffered = self.line
        self.line = self.vcf.readline()
        return buffered

    def close(self):
        """Close the underlying file handle."""
        self.vcf.close()

    def get_contig_legth(self, contig):
        # NOTE(review): method name keeps the historical 'legth' typo for
        # compatibility with existing callers.
        return self.contigs_info_dict[contig]
345
346
def get_variants(gene_vcf, seq_name, encoding=None, newline=None):
    """Parse a samtools-mpileup VCF into a dict of variants keyed by position.

    Each position maps to {entry_index: {'REF', 'ALT', 'info', 'format'}}.
    Parse errors (IndexError on truncated fields, UnicodeDecodeError while
    reading) are tolerated only once the line counter has passed the contig
    length reported in the VCF header for seq_name; otherwise they propagate.
    """
    variants = {}

    vfc_file = Vcf(vcf_file=gene_vcf, encoding=encoding, newline=newline)
    line = vfc_file.readline()
    counter = 1
    while len(line) > 0:
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) > 0:
            fields[1] = int(fields[1])

            info_field = {}
            try:
                for i in fields[7].split(';'):
                    i = i.split('=')
                    if len(i) > 1:
                        info_field[i[0]] = i[1]
                    else:
                        info_field[i[0]] = None
            except IndexError:
                if counter > vfc_file.get_contig_legth(contig=seq_name):
                    break
                else:
                    # BUGFIX: was `raise IndexError`, which replaced the
                    # original exception and lost its traceback; re-raise it.
                    raise

            format_field = {}
            format_field_name = fields[8].split(':')
            format_data = fields[9].split(':')

            for i in range(0, len(format_data)):
                format_field[format_field_name[i]] = format_data[i].split(',')

            fields_to_store = {'REF': fields[3], 'ALT': fields[4].split(','), 'info': info_field,
                               'format': format_field}
            if fields[1] in variants:
                variants[fields[1]][len(variants[fields[1]])] = fields_to_store
            else:
                variants[fields[1]] = {0: fields_to_store}

        try:
            line = vfc_file.readline()
        except UnicodeDecodeError:
            if counter + 1 > vfc_file.get_contig_legth(contig=seq_name):
                break
            else:
                # BUGFIX: was `raise UnicodeDecodeError`, which constructs a
                # new UnicodeDecodeError with no arguments and therefore
                # raises TypeError (it needs 5 args); bare raise re-raises
                # the caught exception correctly.
                raise

        counter += 1
    vfc_file.close()

    return variants
344 398
345 399
def indel_entry(variant_position):
    """Split the entries at one position into indel entries and the SNP entry.

    Returns (entry_with_indel, entry_with_snp): the list of keys whose info
    dict carries an 'INDEL' flag, and the (last-seen) key without it, or None.
    """
    entry_with_indel = []
    entry_with_snp = None
    for key in variant_position:
        if 'INDEL' in variant_position[key]['info']:
            entry_with_indel.append(key)
        else:
            entry_with_snp = key

    return entry_with_indel, entry_with_snp
357 411
358 412
359 def get_alt_noMatter(variant_position, indel_true): 413 def get_alt_no_matter(variant_position, indel_true):
360 dp = sum(map(int, variant_position['format']['AD'])) 414 dp = sum(map(int, variant_position['format']['AD']))
361 index_alleles_sorted_position = sorted(zip(map(int, variant_position['format']['AD']), range(0, len(variant_position['format']['AD']))), reverse=True) 415 index_alleles_sorted_position = sorted(zip(list(map(int, variant_position['format']['AD'])),
416 list(range(0, len(variant_position['format']['AD'])))),
417 reverse=True)
362 index_dominant_allele = None 418 index_dominant_allele = None
363 if not indel_true: 419 if not indel_true:
364 ad_idv = index_alleles_sorted_position[0][0] 420 ad_idv = index_alleles_sorted_position[0][0]
365 421
366 if len([x for x in index_alleles_sorted_position if x[0] == ad_idv]) > 1: 422 if len([x for x in index_alleles_sorted_position if x[0] == ad_idv]) > 1:
375 else: 431 else:
376 ad_idv = int(variant_position['info']['IDV']) 432 ad_idv = int(variant_position['info']['IDV'])
377 433
378 if float(ad_idv) / float(dp) >= 0.5: 434 if float(ad_idv) / float(dp) >= 0.5:
379 if len([x for x in index_alleles_sorted_position if x[0] == index_alleles_sorted_position[0][0]]) > 1: 435 if len([x for x in index_alleles_sorted_position if x[0] == index_alleles_sorted_position[0][0]]) > 1:
380 index_alleles_sorted_position = sorted([x for x in index_alleles_sorted_position if x[0] == index_alleles_sorted_position[0][0]]) 436 index_alleles_sorted_position = sorted([x for x in index_alleles_sorted_position if
437 x[0] == index_alleles_sorted_position[0][0]])
381 438
382 index_dominant_allele = index_alleles_sorted_position[0][1] 439 index_dominant_allele = index_alleles_sorted_position[0][1]
383 if index_dominant_allele == 0: 440 if index_dominant_allele == 0:
384 alt = '.' 441 alt = '.'
385 else: 442 else:
402 number_diferences += 1 459 number_diferences += 1
403 460
404 return number_diferences 461 return number_diferences
405 462
406 463
407 def get_alt_correct(variant_position, alt_noMatter, dp, ad_idv, index_dominant_allele, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele): 464 def get_alt_correct(variant_position, alt_no_matter, dp, ad_idv, index_dominant_allele, minimum_depth_presence,
465 minimum_depth_call, minimum_depth_frequency_dominant_allele):
408 alt = None 466 alt = None
409 low_coverage = False 467 low_coverage = False
410 multiple_alleles = False 468 multiple_alleles = False
411 469
412 if dp >= minimum_depth_presence: 470 if dp >= minimum_depth_presence:
421 multiple_alleles = True 479 multiple_alleles = True
422 else: 480 else:
423 if float(ad_idv) / float(dp) < minimum_depth_frequency_dominant_allele: 481 if float(ad_idv) / float(dp) < minimum_depth_frequency_dominant_allele:
424 alt = 'N' * len(variant_position['REF']) 482 alt = 'N' * len(variant_position['REF'])
425 if index_dominant_allele is not None: 483 if index_dominant_allele is not None:
426 variants_coverage = [int(variant_position['format']['AD'][i]) for i in range(0, len(variant_position['ALT']) + 1) if i != index_dominant_allele] 484 variants_coverage = [int(variant_position['format']['AD'][i]) for i in
485 range(0, len(variant_position['ALT']) + 1) if i != index_dominant_allele]
427 if sum(variants_coverage) > 0: 486 if sum(variants_coverage) > 0:
428 if float(max(variants_coverage)) / float(sum(variants_coverage)) > 0.5: 487 if float(max(variants_coverage)) / float(sum(variants_coverage)) > 0.5:
429 multiple_alleles = True 488 multiple_alleles = True
430 elif float(max(variants_coverage)) / float(sum(variants_coverage)) == 0.5 and len(variants_coverage) > 2: 489 elif float(max(variants_coverage)) / float(sum(variants_coverage)) == 0.5 and \
490 len(variants_coverage) > 2:
431 multiple_alleles = True 491 multiple_alleles = True
432 else: 492 else:
433 multiple_alleles = True 493 multiple_alleles = True
434 else: 494 else:
435 alt = alt_noMatter 495 alt = alt_no_matter
436 else: 496 else:
437 low_coverage = True 497 low_coverage = True
438 498
439 return alt, low_coverage, multiple_alleles 499 return alt, low_coverage, multiple_alleles
440 500
462 for i in indels_entry: 522 for i in indels_entry:
463 indel_coverage[i] = int(variant_position['info']['IDV']) 523 indel_coverage[i] = int(variant_position['info']['IDV'])
464 return indel_coverage.index(str(max(indel_coverage.values()))) 524 return indel_coverage.index(str(max(indel_coverage.values())))
465 525
466 526
def determine_variant(variant_position, minimum_depth_presence, minimum_depth_call,
                      minimum_depth_frequency_dominant_allele, indel_true):
    """Resolve one VCF entry against the depth/frequency thresholds.

    Returns (ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter,
    alt_alignment) as produced by get_alt_no_matter / get_alt_correct /
    get_alt_alignment.
    """
    no_matter, depth, idv_depth, dominant_index = get_alt_no_matter(variant_position, indel_true)

    correct, low_coverage, multiple_alleles = get_alt_correct(variant_position, no_matter, depth, idv_depth,
                                                              dominant_index, minimum_depth_presence,
                                                              minimum_depth_call,
                                                              minimum_depth_frequency_dominant_allele)

    alignment = get_alt_alignment(variant_position['REF'], correct)

    return variant_position['REF'], correct, low_coverage, multiple_alleles, no_matter, alignment
475 539
476 540
477 def confirm_nucleotides_indel(ref, alt, variants, position_start_indel, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, alignment_true): 541 def confirm_nucleotides_indel(ref, alt, variants, position_start_indel, minimum_depth_presence, minimum_depth_call,
542 minimum_depth_frequency_dominant_allele, alignment_true):
478 alt = list(alt) 543 alt = list(alt)
479 544
480 for i in range(0, len(alt) - 1): 545 for i in range(0, len(alt) - 1):
481 if len(alt) < len(ref): 546 if len(alt) < len(ref):
482 new_position = position_start_indel + len(alt) - i - 1 547 new_position = position_start_indel + len(alt) - i - 1
494 else: 559 else:
495 alt = alt[: alt_position] 560 alt = alt[: alt_position]
496 break 561 break
497 562
498 entry_with_indel, entry_with_snp = indel_entry(variants[new_position]) 563 entry_with_indel, entry_with_snp = indel_entry(variants[new_position])
499 new_ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment = determine_variant(variants[new_position][entry_with_snp], minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False) 564 new_ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
500 if alt_noMatter != '.' and alt[alt_position] != alt_noMatter: 565 determine_variant(variants[new_position][entry_with_snp], minimum_depth_presence, minimum_depth_call,
501 alt[alt_position] = alt_noMatter 566 minimum_depth_frequency_dominant_allele, False)
567 if alt_no_matter != '.' and alt[alt_position] != alt_no_matter:
568 alt[alt_position] = alt_no_matter
502 569
503 return ''.join(alt) 570 return ''.join(alt)
504 571
505 572
def snp_indel(variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele):
    """Resolve the variant call at one reference position, reconciling SNP and indel entries.

    Returns the same 6-tuple shape as determine_variant():
    (ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment).
    """
    entry_with_indel, entry_with_snp = indel_entry(variants[position])

    # The SNP-level call is needed whether or not an indel entry exists,
    # so compute it once up front.
    snp_call = determine_variant(variants[position][entry_with_snp], minimum_depth_presence, minimum_depth_call,
                                 minimum_depth_frequency_dominant_allele, False)

    if len(entry_with_indel) == 0:
        ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = snp_call
    else:
        # Choose the most supported indel entry when several are present.
        chosen_indel = entry_with_indel[0]
        if len(entry_with_indel) > 1:
            chosen_indel = get_indel_more_likely(variants[position], entry_with_indel)

        ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
            determine_variant(variants[position][chosen_indel], minimum_depth_presence, minimum_depth_call,
                              minimum_depth_frequency_dominant_allele, True)

        if alt_no_matter == '.':
            # The indel allele is not supported at all: fall back entirely to the SNP call.
            ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = snp_call
        else:
            alt_correct_snp = snp_call[1]
            alt_no_matter_snp = snp_call[4]
            alt_alignment_snp = snp_call[5]
            # A confidently-called SNP may overrule the first base of the indel allele.
            if alt_correct is None and alt_correct_snp is not None:
                alt_correct = alt_correct_snp
            elif alt_correct is not None and alt_correct_snp is not None:
                if alt_correct_snp != '.' and alt_correct[0] != alt_correct_snp:
                    alt_correct = alt_correct_snp + alt_correct[1:] if len(alt_correct) > 1 else alt_correct_snp
                if alt_no_matter_snp != '.' and alt_no_matter[0] != alt_no_matter_snp:
                    alt_no_matter = alt_no_matter_snp + alt_no_matter[1:] if len(alt_no_matter) > 1 \
                        else alt_no_matter_snp
                if alt_alignment_snp != '.' and alt_alignment[0] != alt_alignment_snp:
                    alt_alignment = alt_alignment_snp + alt_alignment[1:] if len(alt_alignment) > 1 \
                        else alt_alignment_snp

    return ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment
541 615
542 616
543 def get_true_variants(variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence): 617 def get_true_variants(variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele,
618 sequence):
544 variants_correct = {} 619 variants_correct = {}
545 variants_noMatter = {} 620 variants_no_matter = {}
546 variants_alignment = {} 621 variants_alignment = {}
547 622
548 correct_absent_positions = {} 623 correct_absent_positions = {}
549 correct_last_absent_position = '' 624 correct_last_absent_position = ''
550 625
551 noMatter_absent_positions = {} 626 no_matter_absent_positions = {}
552 noMatter_last_absent_position = '' 627 no_matter_last_absent_position = ''
553 628
554 multiple_alleles_found = [] 629 multiple_alleles_found = []
555 630
556 counter = 1 631 counter = 1
557 while counter <= len(sequence): 632 while counter <= len(sequence):
558 if counter in variants: 633 if counter in variants:
559 noMatter_last_absent_position = '' 634 no_matter_last_absent_position = ''
560 635
561 ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment = snp_indel(variants, counter, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele) 636 ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
637 snp_indel(variants, counter, minimum_depth_presence, minimum_depth_call,
638 minimum_depth_frequency_dominant_allele)
562 639
563 if alt_alignment != '.': 640 if alt_alignment != '.':
564 variants_alignment[counter] = {'REF': ref, 'ALT': alt_alignment} 641 variants_alignment[counter] = {'REF': ref, 'ALT': alt_alignment}
565 642
566 if alt_noMatter != '.': 643 if alt_no_matter != '.':
567 variants_noMatter[counter] = {'REF': ref, 'ALT': alt_noMatter} 644 variants_no_matter[counter] = {'REF': ref, 'ALT': alt_no_matter}
568 645
569 if alt_correct is None: 646 if alt_correct is None:
570 if counter - len(correct_last_absent_position) in correct_absent_positions: 647 if counter - len(correct_last_absent_position) in correct_absent_positions:
571 correct_absent_positions[counter - len(correct_last_absent_position)]['REF'] += ref 648 correct_absent_positions[counter - len(correct_last_absent_position)]['REF'] += ref
572 else: 649 else:
598 correct_absent_positions[counter - len(correct_last_absent_position)]['REF'] += sequence[counter - 1] 675 correct_absent_positions[counter - len(correct_last_absent_position)]['REF'] += sequence[counter - 1]
599 else: 676 else:
600 correct_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''} 677 correct_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''}
601 correct_last_absent_position += sequence[counter - 1] 678 correct_last_absent_position += sequence[counter - 1]
602 679
603 if counter - len(noMatter_last_absent_position) in noMatter_absent_positions: 680 if counter - len(no_matter_last_absent_position) in no_matter_absent_positions:
604 noMatter_absent_positions[counter - len(noMatter_last_absent_position)]['REF'] += sequence[counter - 1] 681 no_matter_absent_positions[counter - len(no_matter_last_absent_position)]['REF'] += \
605 else: 682 sequence[counter - 1]
606 noMatter_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''} 683 else:
607 noMatter_last_absent_position += sequence[counter - 1] 684 no_matter_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''}
685 no_matter_last_absent_position += sequence[counter - 1]
608 686
609 counter += 1 687 counter += 1
610 688
611 for position in correct_absent_positions: 689 for position in correct_absent_positions:
612 if position == 1: 690 if position == 1:
613 variants_correct[position] = {'REF': correct_absent_positions[position]['REF'], 'ALT': 'N'} 691 variants_correct[position] = {'REF': correct_absent_positions[position]['REF'], 'ALT': 'N'}
614 else: 692 else:
615 if position - 1 not in variants_correct: 693 if position - 1 not in variants_correct:
616 variants_correct[position - 1] = {'REF': sequence[position - 2] + correct_absent_positions[position]['REF'], 'ALT': sequence[position - 2] + correct_absent_positions[position]['ALT']} 694 variants_correct[position - 1] = \
617 else: 695 {'REF': sequence[position - 2] + correct_absent_positions[position]['REF'],
618 variants_correct[position - 1] = {'REF': variants_correct[position - 1]['REF'] + correct_absent_positions[position]['REF'][len(variants_correct[position - 1]['REF']) - 1:], 'ALT': variants_correct[position - 1]['ALT'] + correct_absent_positions[position]['ALT'][len(variants_correct[position - 1]['ALT']) - 1 if len(variants_correct[position - 1]['ALT']) > 0 else 0:]} 696 'ALT': sequence[position - 2] + correct_absent_positions[position]['ALT']}
619 697 else:
620 for position in noMatter_absent_positions: 698 variants_correct[position - 1] = \
699 {'REF': variants_correct[position - 1]['REF'] +
700 correct_absent_positions[position]['REF'][len(variants_correct[position - 1]['REF']) - 1:],
701 'ALT': variants_correct[position - 1]['ALT'] +
702 correct_absent_positions[position]['ALT'][len(variants_correct[position - 1]['ALT']) - 1 if
703 len(variants_correct[position - 1]['ALT']) > 0
704 else 0:]}
705
706 for position in no_matter_absent_positions:
621 if position == 1: 707 if position == 1:
622 variants_noMatter[position] = {'REF': noMatter_absent_positions[position]['REF'], 'ALT': 'N'} 708 variants_no_matter[position] = {'REF': no_matter_absent_positions[position]['REF'], 'ALT': 'N'}
623 else: 709 else:
624 if position - 1 not in variants_noMatter: 710 if position - 1 not in variants_no_matter:
625 variants_noMatter[position - 1] = {'REF': sequence[position - 2] + noMatter_absent_positions[position]['REF'], 'ALT': sequence[position - 2] + noMatter_absent_positions[position]['ALT']} 711 variants_no_matter[position - 1] = \
626 else: 712 {'REF': sequence[position - 2] + no_matter_absent_positions[position]['REF'],
627 variants_noMatter[position - 1] = {'REF': variants_noMatter[position - 1]['REF'] + noMatter_absent_positions[position]['REF'][len(variants_noMatter[position - 1]['REF']) - 1:], 'ALT': variants_noMatter[position - 1]['ALT'] + noMatter_absent_positions[position]['ALT'][len(variants_noMatter[position - 1]['ALT']) - 1 if len(variants_noMatter[position - 1]['ALT']) > 0 else 0:]} 713 'ALT': sequence[position - 2] + no_matter_absent_positions[position]['ALT']}
628 714 else:
629 return variants_correct, variants_noMatter, variants_alignment, multiple_alleles_found 715 variants_no_matter[position - 1] = \
630 716 {'REF': variants_no_matter[position - 1]['REF'] +
631 717 no_matter_absent_positions[position]['REF'][len(variants_no_matter[position - 1]['REF']) -
632 def clean_variant_in_extra_seq_left(variant_dict, position, length_extra_seq, multiple_alleles_found, number_multi_alleles): 718 1:],
719 'ALT': variants_no_matter[position - 1]['ALT'] +
720 no_matter_absent_positions[position]['ALT'][len(variants_no_matter[position - 1]['ALT']) -
721 1 if
722 len(variants_no_matter[position - 1]['ALT']) > 0
723 else 0:]}
724
725 return variants_correct, variants_no_matter, variants_alignment, multiple_alleles_found
726
727
def clean_variant_in_extra_seq_left(variant_dict, position, length_extra_seq, multiple_alleles_found,
                                    number_multi_alleles):
    """Handle a variant located inside the left flanking (extra) sequence.

    A variant entirely inside the flank is removed. A variant whose REF crosses
    the flank boundary is re-anchored at position ``length_extra_seq`` with its
    leading flank bases trimmed, and the remaining REF/ALT differences are
    counted.

    Returns (variant_dict, number_multi_alleles, number_diferences).
    """
    diffs = 0
    offset = length_extra_seq - position
    # Remove the entry in every case; it is re-inserted at the boundary
    # position only when it spans the boundary.
    variant = variant_dict.pop(position)

    if position + len(variant['REF']) - 1 > length_extra_seq:
        if multiple_alleles_found is not None and position in multiple_alleles_found:
            number_multi_alleles += 1

        trimmed_ref = variant['REF'][offset:]
        if len(variant['ALT']) > offset:
            trimmed_alt = variant['ALT'][offset:]
        else:
            # ALT shorter than the trim: keep the reference base at the boundary.
            trimmed_alt = variant['REF'][offset]
        variant_dict[length_extra_seq] = {'REF': trimmed_ref, 'ALT': trimmed_alt}
        diffs = count_number_diferences(trimmed_ref, trimmed_alt)

    return variant_dict, number_multi_alleles, diffs
649 749
650 750
def clean_variant_in_extra_seq_rigth(variant_dict, position, sequence_length, length_extra_seq):
    """Trim a variant that spills into the right flanking (extra) sequence.

    REF (and ALT, when long enough) are cut so they end at the boundary
    ``sequence_length - length_extra_seq``; the surviving REF/ALT differences
    are counted.

    Returns (variant_dict, number_diferences).
    """
    boundary = sequence_length - length_extra_seq
    entry = variant_dict[position]

    if position + len(entry['REF']) - 1 > boundary:
        keep = boundary - position + 1  # bases of the allele inside the inner region
        entry['REF'] = entry['REF'][:keep]
        if len(entry['ALT']) >= keep:
            entry['ALT'] = entry['ALT'][:keep]

    number_diferences = count_number_diferences(entry['REF'], entry['ALT'])

    return variant_dict, number_diferences
659 763
660 764
def cleanning_variants_extra_seq(variants_correct, variants_no_matter, variants_alignment, multiple_alleles_found,
                                 length_extra_seq, sequence_length):
    """Drop or trim variants falling in the flanking (extra) sequence regions.

    Positions ``<= length_extra_seq`` belong to the left flank and positions
    ``> sequence_length - length_extra_seq`` to the right flank. Variants
    entirely inside a flank are removed; variants spanning a flank boundary
    are trimmed to the inner region.

    Returns the three cleaned variant dicts, the number of inner positions
    with multiple alleles and the total number of REF/ALT differences kept.
    """
    number_multi_alleles = 0
    number_diferences = 0

    counter = 1
    while counter <= sequence_length:
        if counter <= length_extra_seq:
            # Left flank: keep only the portion of a variant crossing the boundary.
            if counter in variants_correct:
                variants_correct, number_multi_alleles, number_diferences_found = \
                    clean_variant_in_extra_seq_left(variants_correct, counter, length_extra_seq,
                                                    multiple_alleles_found, number_multi_alleles)
                # Bug fix: accumulate the count. The previous plain assignment
                # discarded differences found at earlier left-flank positions,
                # unlike the right-flank branch below which uses '+='.
                number_diferences += number_diferences_found
            if counter in variants_no_matter:
                variants_no_matter, ignore, ignore = \
                    clean_variant_in_extra_seq_left(variants_no_matter, counter, length_extra_seq, None, None)
            if counter in variants_alignment:
                variants_alignment, ignore, ignore = \
                    clean_variant_in_extra_seq_left(variants_alignment, counter, length_extra_seq, None, None)
        elif sequence_length - length_extra_seq >= counter > length_extra_seq:
            # Inner region: trim variants that extend into the right flank.
            if counter in variants_correct:
                if counter in multiple_alleles_found:
                    number_multi_alleles += 1
                variants_correct, number_diferences_found = \
                    clean_variant_in_extra_seq_rigth(variants_correct, counter, sequence_length, length_extra_seq)
                number_diferences += number_diferences_found
            if counter in variants_no_matter:
                variants_no_matter, ignore = \
                    clean_variant_in_extra_seq_rigth(variants_no_matter, counter, sequence_length, length_extra_seq)
            if counter in variants_alignment:
                variants_alignment, ignore = \
                    clean_variant_in_extra_seq_rigth(variants_alignment, counter, sequence_length, length_extra_seq)
        else:
            # Right flank: discard entirely.
            if counter in variants_correct:
                del variants_correct[counter]
            if counter in variants_no_matter:
                del variants_no_matter[counter]
            if counter in variants_alignment:
                del variants_alignment[counter]

        counter += 1

    return variants_correct, variants_no_matter, variants_alignment, number_multi_alleles, number_diferences
695 807
696 808
def get_coverage(gene_coverage):
    """Parse a ``samtools depth`` per-position coverage file.

    Each non-empty line is tab-separated: sequence name, 1-based position,
    depth. Returns a dict mapping position (int) -> depth (int).
    """
    coverage = {}

    # Bug fix: the previous mode 'rtU' used the universal-newlines 'U' flag,
    # which was removed in Python 3.11 (open() raises ValueError). Plain text
    # mode already performs universal newline translation.
    with open(gene_coverage, 'rt') as reader:
        for line in reader:
            line = line.rstrip('\r\n')
            if len(line) > 0:
                fields = line.split('\t')
                coverage[int(fields[1])] = int(fields[2])

    return coverage
def get_coverage_report(coverage, sequence_length, minimum_depth_presence, minimum_depth_call, length_extra_seq):
    """Summarise depth over the inner region (flanks excluded) of a sequence.

    Returns (count_absent, percentage_low_coverage, mean_coverage). With an
    empty coverage dict the whole inner region is reported absent.
    """
    if not coverage:
        return sequence_length - 2 * length_extra_seq, 100.0, 0.0

    count_absent = 0
    count_low_coverage = 0
    sum_coverage = 0

    # Only positions strictly inside the flanks contribute to the report.
    for pos in range(length_extra_seq + 1, sequence_length - length_extra_seq + 1):
        depth = coverage[pos]
        if depth < minimum_depth_presence:
            count_absent += 1
            continue
        if depth < minimum_depth_call:
            count_low_coverage += 1
        sum_coverage += depth

    covered_positions = sequence_length - 2 * length_extra_seq - count_absent
    mean_coverage = 0
    percentage_low_coverage = 0
    if covered_positions > 0:
        mean_coverage = float(sum_coverage) / float(covered_positions)
        percentage_low_coverage = float(count_low_coverage) / float(covered_positions) * 100

    return count_absent, percentage_low_coverage, mean_coverage
736 849
737 850
def compute_genome_coverage_data(alignment_file, sequence_to_analyse, outdir, counter):
    """Run ``samtools depth`` for one sequence, redirecting output to a tab file.

    Returns (run_successfully, genome_coverage_data_file). The '>' redirection
    requires the command to run through a shell.
    """
    depth_file = os.path.join(outdir, 'samtools_depth.sequence_{0}.tab'.format(counter))
    command = ['samtools', 'depth', '-a', '-q', '0', '-r', sequence_to_analyse, alignment_file, '>', depth_file]
    run_successfully, _, _ = utils.run_command_popen_communicate(command, True, None, False)
    return run_successfully, depth_file
744 858
745 859
def write_variants_vcf(variants, outdir, sequence_to_analyse, sufix):
    """Write variants to a minimal VCF, then compress and index it with bcftools.

    Returns (run_successfully, compressed_vcf_file); the path is None when a
    bcftools step fails.
    """
    vcf_file = os.path.join(outdir, '{0}.{1}.vcf'.format(sequence_to_analyse, sufix))

    header_fields = ['SEQUENCE', 'POSITION', 'ID_unused', 'REFERENCE_sequence', 'ALTERNATIVE_sequence',
                     'QUALITY_unused', 'FILTER_unused', 'INFO_unused', 'FORMAT_unused']
    with open(vcf_file, 'wt') as writer:
        writer.write('##fileformat=VCFv4.2' + '\n')
        writer.write('#' + '\t'.join(header_fields) + '\n')
        for position in sorted(variants.keys()):
            row = [sequence_to_analyse, str(position), '.', variants[position]['REF'], variants[position]['ALT'],
                   '.', '.', '.', '.']
            writer.write('\t'.join(row) + '\n')

    compressed_vcf_file = vcf_file + '.gz'
    run_successfully, _, _ = utils.run_command_popen_communicate(
        ['bcftools', 'convert', '-o', compressed_vcf_file, '-O', 'z', vcf_file], False, None, False)
    if run_successfully:
        run_successfully, _, _ = utils.run_command_popen_communicate(
            ['bcftools', 'index', compressed_vcf_file], False, None, False)

    if not run_successfully:
        compressed_vcf_file = None

    return run_successfully, compressed_vcf_file
765 881
766 882
767 def parse_fasta_inMemory(fasta_memory): 883 def parse_fasta_in_memory(fasta_memory):
768 fasta_memory = fasta_memory.splitlines() 884 fasta_memory = fasta_memory.splitlines()
769 sequence_dict = {} 885 sequence_dict = {}
770 for line in fasta_memory: 886 for line in fasta_memory:
771 if len(line) > 0: 887 if len(line) > 0:
772 if line.startswith('>'): 888 if line.startswith('>'):
775 sequence_dict['sequence'] += line 891 sequence_dict['sequence'] += line
776 892
777 return sequence_dict 893 return sequence_dict
778 894
779 895
def compute_consensus_sequence(reference_file, sequence_to_analyse, compressed_vcf_file, outdir):
    """Apply variants to one reference sequence via ``bcftools consensus``.

    Extracts the sequence with samtools, left-normalises the VCF and builds the
    consensus in memory. Returns (run_successfully, sequence_dict) where
    sequence_dict is None on failure.
    """
    sequence_dict = None

    gene_fasta = os.path.join(outdir, sequence_to_analyse + '.fasta')

    run_successfully, _ = index_fasta_samtools(reference_file, sequence_to_analyse, gene_fasta, False)
    if run_successfully:
        norm_command = ['bcftools', 'norm', '-c', 's', '-f', gene_fasta, '-Ov', compressed_vcf_file]
        run_successfully, _, _ = utils.run_command_popen_communicate(norm_command, False, None, False)
        if run_successfully:
            consensus_command = ['bcftools', 'consensus', '-f', gene_fasta, compressed_vcf_file, '-H', '1']
            run_successfully, stdout, _ = utils.run_command_popen_communicate(consensus_command, False, None, False)
            if run_successfully:
                sequence_dict = parse_fasta_in_memory(stdout)

    return run_successfully, sequence_dict
796 912
797 913
def create_sample_consensus_sequence(outdir, sequence_to_analyse, reference_file, variants, minimum_depth_presence,
                                     minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence,
                                     length_extra_seq):
    """Build the three consensus flavours ('correct', 'noMatter', 'alignment') for one sequence.

    Derives the per-flavour variant sets, removes/trims flank variants, writes
    each set to a VCF and computes the consensus, trimming the flanking bases
    from the result.

    Returns (run_successfully, number_multi_alleles, consensus, number_diferences).
    """
    variants_correct, variants_no_matter, variants_alignment, multiple_alleles_found = \
        get_true_variants(variants, minimum_depth_presence, minimum_depth_call,
                          minimum_depth_frequency_dominant_allele, sequence)

    variants_correct, variants_no_matter, variants_alignment, number_multi_alleles, number_diferences = \
        cleanning_variants_extra_seq(variants_correct, variants_no_matter, variants_alignment,
                                     multiple_alleles_found, length_extra_seq, len(sequence))

    run_successfully = False
    consensus = {'correct': {}, 'noMatter': {}, 'alignment': {}}
    # Map each consensus flavour to its variant set directly, replacing the
    # previous eval()-based lookup of the local variable name (fragile and
    # an eval anti-pattern); iteration order and behaviour are unchanged.
    for sufix, variant_set in (('correct', variants_correct), ('noMatter', variants_no_matter),
                               ('alignment', variants_alignment)):
        run_successfully, compressed_vcf_file = write_variants_vcf(variant_set, outdir, sequence_to_analyse, sufix)
        if run_successfully:
            run_successfully, sequence_dict = compute_consensus_sequence(reference_file, sequence_to_analyse,
                                                                         compressed_vcf_file, outdir)
            if run_successfully:
                trimmed = sequence_dict['sequence'][length_extra_seq:len(sequence_dict['sequence']) -
                                                    length_extra_seq]
                consensus[sufix] = {'header': sequence_dict['header'], 'sequence': trimmed}

    return run_successfully, number_multi_alleles, consensus, number_diferences
813 940
814 941
@utils.trace_unhandled_exceptions
def analyse_sequence_data(bam_file, sequence_information, outdir, counter, reference_file, length_extra_seq,
                          minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele):
    """
    Analyse the mapping data of a single reference sequence and persist the results as a pickle.

    Creates a VCF and a per-position coverage file for the sequence, parses the variants
    (retrying with "utf_8" and then "latin_1" encodings on decode errors), builds the sample
    consensus sequence and the coverage report, and finally saves
    [run_successfully, counter, number_multi_alleles, count_absent, percentage_low_coverage,
    mean_coverage, consensus_sequence, number_diferences] into 'coverage_info.<counter>.pkl'
    inside outdir.

    Parameters
    ----------
    bam_file : str
        Path to the sample BAM file.
    sequence_information : dict
        Dict with at least 'header', 'sequence' and 'length' keys for the reference sequence.
    outdir : str
        Directory where intermediate files and the result pickle are written.
    counter : int
        Numeric identifier of the sequence (used in the pickle file name).
    reference_file : str
        Path to the reference fasta.
    length_extra_seq : int
        Number of extra bases flanking the gene in the reference.
    minimum_depth_presence, minimum_depth_call : int
        Depth thresholds for presence and base calling.
    minimum_depth_frequency_dominant_allele : float
        Minimum frequency of the dominant allele.
    """
    count_absent = None
    percentage_low_coverage = None
    mean_coverage = None
    number_diferences = 0
    number_multi_alleles = 0
    consensus_sequence = {'correct': {}, 'noMatter': {}, 'alignment': {}}

    # Create vcf file (for multiple alleles check)
    run_successfully, gene_vcf = create_vcf(bam_file, sequence_information['header'], outdir, counter, reference_file)
    if run_successfully:
        # Create coverage tab file
        run_successfully, gene_coverage = \
            compute_genome_coverage_data(bam_file, sequence_information['header'], outdir, counter)

        if run_successfully:
            # Parse variants; on encoding problems retry with explicit encodings before giving up.
            try:
                variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
                                        encoding=sys.getdefaultencoding())
            except UnicodeDecodeError:
                try:
                    print('It was found an enconding error while parsing the following VCF, but lets try forcing it to'
                          ' "utf_8" encoding: {}'.format(gene_vcf))
                    variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
                                            encoding='utf_8')
                except UnicodeDecodeError:
                    print('It was found an enconding error while parsing the following VCF, but lets try forcing it to'
                          ' "latin_1" encoding: {}'.format(gene_vcf))
                    variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
                                            encoding='latin_1')

            coverage = get_coverage(gene_coverage)

            run_successfully, number_multi_alleles, consensus_sequence, number_diferences = \
                create_sample_consensus_sequence(outdir, sequence_information['header'], reference_file, variants,
                                                 minimum_depth_presence, minimum_depth_call,
                                                 minimum_depth_frequency_dominant_allele,
                                                 sequence_information['sequence'], length_extra_seq)

            try:
                count_absent, percentage_low_coverage, mean_coverage = \
                    get_coverage_report(coverage, sequence_information['length'], minimum_depth_presence,
                                        minimum_depth_call, length_extra_seq)
            except KeyError:
                print('ERROR: KeyError')
                print(sequence_information)
                # BUG FIX: was `raise KeyError`, which replaced the caught exception with a new,
                # empty KeyError and lost its message/traceback; a bare `raise` re-raises the original.
                raise

    utils.save_variable_to_pickle([run_successfully, counter, number_multi_alleles, count_absent,
                                   percentage_low_coverage, mean_coverage, consensus_sequence, number_diferences],
                                  outdir, str('coverage_info.' + str(counter)))
838 995
839 996
840 def clean_header(header): 997 def clean_header(header):
841 problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"] 998 problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"]
842 new_header = str(header) 999 new_header = str(header)
854 with open(fasta_file, 'rtU') as reader: 1011 with open(fasta_file, 'rtU') as reader:
855 blank_line_found = False 1012 blank_line_found = False
856 sequence_counter = 0 1013 sequence_counter = 0
857 temp_sequence_dict = {} 1014 temp_sequence_dict = {}
858 for line in reader: 1015 for line in reader:
859 line = line.splitlines()[0] 1016 line = line.rstrip('\r\n')
860 if len(line) > 0: 1017 if len(line) > 0:
861 if not blank_line_found: 1018 if not blank_line_found:
862 if line.startswith('>'): 1019 if line.startswith('>'):
863 if len(temp_sequence_dict) > 0: 1020 if len(temp_sequence_dict) > 0:
864 if temp_sequence_dict.values()[0]['length'] - 2 * length_extra_seq > 0: 1021 if list(temp_sequence_dict.values())[0]['length'] - 2 * length_extra_seq > 0:
865 sequence_dict[temp_sequence_dict.keys()[0]] = temp_sequence_dict.values()[0] 1022 sequence_dict[list(temp_sequence_dict.keys())[0]] = list(temp_sequence_dict.values())[0]
866 else: 1023 else:
867 print '{header} sequence ignored due to length <= 0'.format(header=temp_sequence_dict.values()[0]['header']) 1024 print('{header} sequence ignored due to'
868 del headers[temp_sequence_dict.values()[0]['header']] 1025 ' length = 0'.format(header=list(temp_sequence_dict.values())[0]['header']))
1026 del headers[list(temp_sequence_dict.values())[0]['header']]
869 temp_sequence_dict = {} 1027 temp_sequence_dict = {}
870 1028
871 original_header, new_header = clean_header(line[1:]) 1029 original_header, new_header = clean_header(line[1:])
872 if new_header in headers: 1030 if new_header in headers:
873 sys.exit('Found duplicated sequence headers: {original_header}'.format(original_header=original_header)) 1031 sys.exit('Found duplicated sequence'
1032 ' headers: {original_header}'.format(original_header=original_header))
874 1033
875 sequence_counter += 1 1034 sequence_counter += 1
876 temp_sequence_dict[sequence_counter] = {'header': new_header, 'sequence': '', 'length': 0} 1035 temp_sequence_dict[sequence_counter] = {'header': new_header, 'sequence': '', 'length': 0}
877 headers[new_header] = str(original_header) 1036 headers[new_header] = str(original_header)
878 if new_header != original_header: 1037 if new_header != original_header:
879 headers_changed = True 1038 headers_changed = True
880 else: 1039 else:
881 temp_sequence_dict[sequence_counter]['sequence'] += line 1040 temp_sequence_dict[sequence_counter]['sequence'] += line.replace(' ', '')
882 temp_sequence_dict[sequence_counter]['length'] += len(line) 1041 temp_sequence_dict[sequence_counter]['length'] += len(line.replace(' ', ''))
883 else: 1042 else:
884 sys.exit('It was found a blank line between the fasta file above line ' + line) 1043 sys.exit('It was found a blank line between the fasta file above line ' + line)
885 else: 1044 else:
886 blank_line_found = True 1045 blank_line_found = True
887 1046
888 if len(temp_sequence_dict) > 0: 1047 if len(temp_sequence_dict) > 0:
889 if temp_sequence_dict.values()[0]['length'] - 2 * length_extra_seq > 0: 1048 if list(temp_sequence_dict.values())[0]['length'] - 2 * length_extra_seq > 0:
890 sequence_dict[temp_sequence_dict.keys()[0]] = temp_sequence_dict.values()[0] 1049 sequence_dict[list(temp_sequence_dict.keys())[0]] = list(temp_sequence_dict.values())[0]
891 else: 1050 else:
892 print '{header} sequence ignored due to length <= 0'.format(header=temp_sequence_dict.values()[0]['header']) 1051 print('{header} sequence ignored due to'
893 del headers[temp_sequence_dict.values()[0]['header']] 1052 ' length <= 0'.format(header=list(temp_sequence_dict.values())[0]['header']))
1053 del headers[list(temp_sequence_dict.values())[0]['header']]
894 1054
895 return sequence_dict, headers, headers_changed 1055 return sequence_dict, headers, headers_changed
896 1056
897 1057
def sequence_data(sample, reference_file, bam_file, outdir, threads, length_extra_seq, minimum_depth_presence,
                  minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, not_write_consensus):
    """
    Run the per-sequence alignment analysis for a sample, one worker per reference sequence.

    Builds a fresh 'sequence_data' working directory under outdir, loads the reference
    sequences, dispatches analyse_sequence_data for each one through a multiprocessing
    pool, and then gathers the per-sequence pickles into the sample-level results.

    Returns
    -------
    tuple
        (run_successfully, sample_data, consensus_files, consensus_sequences) as produced
        by gather_data_together.
    """
    work_dir = os.path.join(outdir, 'sequence_data', '')
    utils.remove_directory(work_dir)
    os.mkdir(work_dir)

    sequences, headers, headers_changed = get_sequence_information(reference_file, length_extra_seq)

    # One async job per reference sequence; each gets its own sub-directory.
    pool = multiprocessing.Pool(processes=threads)
    for seq_counter in sequences:
        seq_dir = os.path.join(work_dir, str(seq_counter), '')
        utils.remove_directory(seq_dir)
        os.makedirs(seq_dir)
        job_args = (bam_file, sequences[seq_counter], seq_dir, seq_counter, reference_file, length_extra_seq,
                    minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele)
        pool.apply_async(analyse_sequence_data, args=job_args)
    pool.close()
    pool.join()

    run_successfully, sample_data, consensus_files, consensus_sequences = \
        gather_data_together(sample, work_dir, sequences, outdir.rsplit('/', 2)[0], debug_mode_true,
                             length_extra_seq, not_write_consensus)

    return run_successfully, sample_data, consensus_files, consensus_sequences
926 1083
927 1084
928 def chunkstring(string, length): 1085 def chunkstring(string, length):
939 for line in fasta_sequence_lines: 1096 for line in fasta_sequence_lines:
940 writer.write(line + '\n') 1097 writer.write(line + '\n')
941 return consensus_files 1098 return consensus_files
942 1099
943 1100
def gather_data_together(sample, data_directory, sequences_information, outdir, debug_mode_true, length_extra_seq,
                         not_write_consensus):
    """
    Collect the per-sequence 'coverage_info.*.pkl' results into sample-level data.

    Walks every gene sub-directory of data_directory, unpickles each coverage_info file,
    optionally appends the consensus sequences to the sample consensus fasta files
    (removing stale fasta files on the first write), and computes per-gene coverage and
    identity statistics. Gene directories are deleted unless debug mode is active.

    Returns
    -------
    tuple
        (run_successfully, sample_data, consensus_files, consensus_sequences_together);
        run_successfully is False if fewer results were gathered than sequences expected.
    """
    run_successfully = True
    counter = 0
    sample_data = {}

    consensus_files = None
    consensus_sequences_together = {'correct': {}, 'noMatter': {}, 'alignment': {}}

    write_consensus_first_time = True

    gene_dirs = [entry for entry in os.listdir(data_directory)
                 if not entry.startswith('.') and os.path.isdir(os.path.join(data_directory, entry, ''))]
    for gene_dir in gene_dirs:
        gene_dir_path = os.path.join(data_directory, gene_dir, '')

        candidate_files = [entry for entry in os.listdir(gene_dir_path)
                           if not entry.startswith('.') and os.path.isfile(os.path.join(gene_dir_path, entry))]
        for file_found in candidate_files:
            if file_found.startswith('coverage_info.') and file_found.endswith('.pkl'):
                file_path = os.path.join(gene_dir_path, file_found)

                if run_successfully:
                    run_successfully, sequence_counter, multiple_alleles_found, count_absent, \
                        percentage_low_coverage, mean_coverage, consensus_sequence, \
                        number_diferences = utils.extract_variable_from_pickle(file_path)

                    if not not_write_consensus:
                        for consensus_type in consensus_sequence:
                            consensus_sequences_together[consensus_type][sequence_counter] = \
                                {'header': consensus_sequence[consensus_type]['header'],
                                 'sequence': consensus_sequence[consensus_type]['sequence']}

                        if write_consensus_first_time:
                            # Remove any consensus fasta left over from a previous run before appending.
                            for consensus_type in ['correct', 'noMatter', 'alignment']:
                                file_to_remove = os.path.join(outdir,
                                                              str(sample + '.' + consensus_type + '.fasta'))
                                if os.path.isfile(file_to_remove):
                                    # NOTE(review): this line sits in a gap of the diff view; reconstructed as
                                    # os.remove given the isfile guard above — confirm against upstream ReMatCh.
                                    os.remove(file_to_remove)
                            write_consensus_first_time = False
                        consensus_files = write_consensus(outdir, sample, consensus_sequence)

                    # Identity over the callable (non-absent) portion of the gene, extra sequence excluded.
                    gene_identity = 0
                    callable_length = sequences_information[sequence_counter]['length'] - \
                        2 * length_extra_seq - count_absent
                    if callable_length > 0:
                        gene_identity = 100 - (float(number_diferences) / callable_length) * 100

                    sample_data[sequence_counter] = \
                        {'header': sequences_information[sequence_counter]['header'],
                         'gene_coverage': 100 - (float(count_absent) /
                                                 (sequences_information[sequence_counter]['length'] -
                                                  2 * length_extra_seq)) * 100,
                         'gene_low_coverage': percentage_low_coverage,
                         'gene_number_positions_multiple_alleles': multiple_alleles_found,
                         'gene_mean_read_coverage': mean_coverage,
                         'gene_identity': gene_identity}
                    counter += 1

        if not debug_mode_true:
            utils.remove_directory(gene_dir_path)

    if counter != len(sequences_information):
        run_successfully = False

    return run_successfully, sample_data, consensus_files, consensus_sequences_together
993 1170
994 rematch_timer = functools.partial(utils.timer, name='ReMatCh module') 1171 rematch_timer = functools.partial(utils.timer, name='ReMatCh module')
995 1172
996 1173
@rematch_timer
def run_rematch_module(sample, fastq_files, reference_file, threads, outdir, length_extra_seq, minimum_depth_presence,
                       minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage,
                       debug_mode_true, num_map_loc, minimum_gene_identity, rematch_run,
                       soft_clip_base_quality, soft_clip_recode_run, reference_dict, soft_clip_cigar_flag_recode,
                       bowtie_algorithm, bowtie_opt, gene_list_reference, not_write_consensus, clean_run=True):
    """
    Run the full ReMatCh analysis for one sample: map reads, index the reference,
    analyse the alignments per gene, and write 'rematchModule_report.txt' in outdir.

    Genes whose coverage or identity fall below minimum_gene_coverage /
    minimum_gene_identity are counted as absent. The intermediate 'rematch_module'
    folder is removed unless debug mode is active.

    Returns
    -------
    tuple
        (run_successfully, sample_data or None, summary dict with
        'number_absent_genes' / 'number_genes_multiple_alleles' / 'mean_sample_coverage'
        (None values when unavailable), consensus_files or None,
        consensus_sequences or None).
    """
    rematch_folder = os.path.join(outdir, 'rematch_module', '')

    utils.remove_directory(rematch_folder)
    os.mkdir(rematch_folder)

    # Map reads
    run_successfully, bam_file, reference_file = mapping_reads(fastq_files=fastq_files, reference_file=reference_file,
                                                               threads=threads, outdir=rematch_folder,
                                                               num_map_loc=num_map_loc, rematch_run=rematch_run,
                                                               soft_clip_base_quality=soft_clip_base_quality,
                                                               soft_clip_recode_run=soft_clip_recode_run,
                                                               reference_dict=reference_dict,
                                                               soft_clip_cigar_flag_recode=soft_clip_cigar_flag_recode,
                                                               bowtie_algorithm=bowtie_algorithm,
                                                               bowtie_opt=bowtie_opt,
                                                               clean_run=clean_run)
    if run_successfully:
        # Index reference file
        run_successfully, stdout = index_fasta_samtools(reference_file, None, None, True)
        if run_successfully:
            print('Analysing alignment data')
            run_successfully, sample_data, consensus_files, consensus_sequences = \
                sequence_data(sample, reference_file, bam_file, rematch_folder, threads, length_extra_seq,
                              minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele,
                              debug_mode_true, not_write_consensus)

            if run_successfully:
                print('Writing report file')
                number_absent_genes = 0
                number_genes_multiple_alleles = 0
                mean_sample_coverage = 0
                with open(os.path.join(outdir, 'rematchModule_report.txt'), 'wt') as writer:
                    # BUG FIX: original did print('\n'.join(outdir, 'rematchModule_report.txt')) — str.join
                    # takes a single iterable, so that raised TypeError; the intent was to show the report path.
                    print(os.path.join(outdir, 'rematchModule_report.txt'))
                    writer.write('\t'.join(['#gene', 'percentage_gene_coverage', 'gene_mean_read_coverage',
                                            'percentage_gene_low_coverage', 'number_positions_multiple_alleles',
                                            'percentage_gene_identity']) + '\n')
                    for i in range(1, len(sample_data) + 1):
                        writer.write('\t'.join([gene_list_reference[sample_data[i]['header']],
                                                str(round(sample_data[i]['gene_coverage'], 2)),
                                                str(round(sample_data[i]['gene_mean_read_coverage'], 2)),
                                                str(round(sample_data[i]['gene_low_coverage'], 2)),
                                                str(sample_data[i]['gene_number_positions_multiple_alleles']),
                                                str(round(sample_data[i]['gene_identity'], 2))]) + '\n')

                        if sample_data[i]['gene_coverage'] < minimum_gene_coverage or \
                                sample_data[i]['gene_identity'] < minimum_gene_identity:
                            number_absent_genes += 1
                        else:
                            mean_sample_coverage += sample_data[i]['gene_mean_read_coverage']
                            if sample_data[i]['gene_number_positions_multiple_alleles'] > 0:
                                number_genes_multiple_alleles += 1

                    if len(sample_data) - number_absent_genes > 0:
                        mean_sample_coverage = \
                            float(mean_sample_coverage) / float(len(sample_data) - number_absent_genes)
                    else:
                        mean_sample_coverage = 0

                    writer.write('\n'.join(['#general',
                                            '>number_absent_genes', str(number_absent_genes),
                                            '>number_genes_multiple_alleles', str(number_genes_multiple_alleles),
                                            '>mean_sample_coverage', str(round(mean_sample_coverage, 2))]) + '\n')

                    print('\n'.join([str('number_absent_genes: ' + str(number_absent_genes)),
                                     str('number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles)),
                                     str('mean_sample_coverage: ' + str(round(mean_sample_coverage, 2)))]))

    if not debug_mode_true:
        utils.remove_directory(rematch_folder)

    return run_successfully, sample_data if 'sample_data' in locals() else None, \
        {'number_absent_genes': number_absent_genes if 'number_absent_genes' in locals() else None,
         'number_genes_multiple_alleles': number_genes_multiple_alleles if
         'number_genes_multiple_alleles' in locals() else None,
         'mean_sample_coverage': round(mean_sample_coverage, 2) if 'mean_sample_coverage' in locals() else None}, \
        consensus_files if 'consensus_files' in locals() else None, \
        consensus_sequences if 'consensus_sequences' in locals() else None