Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
comparison scripts/ReMatCh/modules/rematch_module.py @ 3:0cbed1c0a762 draft default tip
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author | cstrittmatter |
---|---|
date | Tue, 28 Jan 2020 10:42:31 -0500 |
parents | 965517909457 |
children |
comparison
equal
deleted
inserted
replaced
2:6837f733b4aa | 3:0cbed1c0a762 |
---|---|
1 import os.path | 1 import os.path |
2 import multiprocessing | 2 import multiprocessing |
3 import utils | |
4 import functools | 3 import functools |
5 import sys | 4 import sys |
6 import pickle | 5 import pickle |
7 | 6 |
8 | 7 # https://chrisyeh96.github.io/2017/08/08/definitive-guide-python-imports.html#case-2-syspath-could-change |
9 def index_fasta_samtools(fasta, region_None, region_outfile_none, print_comand_True): | 8 sys.path.insert(0, os.path.dirname(os.path.realpath(__file__))) |
9 import utils | |
10 | |
11 | |
12 def index_fasta_samtools(fasta, region_none, region_outfile_none, print_comand_true): | |
10 command = ['samtools', 'faidx', fasta, '', '', ''] | 13 command = ['samtools', 'faidx', fasta, '', '', ''] |
11 shell_true = False | 14 shell_true = False |
12 if region_None is not None: | 15 if region_none is not None: |
13 command[3] = region_None | 16 command[3] = region_none |
14 if region_outfile_none is not None: | 17 if region_outfile_none is not None: |
15 command[4] = '>' | 18 command[4] = '>' |
16 command[5] = region_outfile_none | 19 command[5] = region_outfile_none |
17 shell_true = True | 20 shell_true = True |
18 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, shell_true, None, print_comand_True) | 21 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, shell_true, None, print_comand_true) |
19 return run_successfully, stdout | 22 return run_successfully, stdout |
20 | 23 |
21 | 24 |
22 # Indexing reference file using Bowtie2 | 25 # Indexing reference file using Bowtie2 |
23 def indexSequenceBowtie2(referenceFile, threads): | 26 def index_sequence_bowtie2(reference_file, threads): |
24 if os.path.isfile(str(referenceFile + '.1.bt2')): | 27 if os.path.isfile(str(reference_file + '.1.bt2')): |
25 run_successfully = True | 28 run_successfully = True |
26 else: | 29 else: |
27 command = ['bowtie2-build', '--threads', str(threads), referenceFile, referenceFile] | 30 command = ['bowtie2-build', '--threads', str(threads), reference_file, reference_file] |
28 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | 31 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True) |
29 return run_successfully | 32 return run_successfully |
30 | 33 |
31 | 34 |
32 # Mapping with Bowtie2 | 35 # Mapping with Bowtie2 |
33 def mappingBowtie2(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc, bowtieOPT): | 36 def mapping_bowtie2(fastq_files, reference_file, threads, outdir, num_map_loc, |
37 bowtie_algorithm='--very-sensitive-local', bowtie_opt=None): | |
34 sam_file = os.path.join(outdir, str('alignment.sam')) | 38 sam_file = os.path.join(outdir, str('alignment.sam')) |
35 | 39 |
36 # Index reference file | 40 # Index reference file |
37 run_successfully = indexSequenceBowtie2(referenceFile, threads) | 41 run_successfully = index_sequence_bowtie2(reference_file, threads) |
38 | 42 |
39 if run_successfully: | 43 if run_successfully: |
40 command = ['bowtie2', '-k', str(numMapLoc), '-q', '', '--threads', str(threads), '-x', referenceFile, '', '--no-unal', '', '-S', sam_file] | 44 command = ['bowtie2', '', '', '-q', bowtie_algorithm, '--threads', str(threads), '-x', |
45 reference_file, '', '--no-unal', '', '-S', sam_file] | |
46 | |
47 if num_map_loc is not None and num_map_loc > 1: | |
48 command[1] = '-k' | |
49 command[2] = str(num_map_loc) | |
41 | 50 |
42 if len(fastq_files) == 1: | 51 if len(fastq_files) == 1: |
43 command[9] = '-U ' + fastq_files[0] | 52 command[9] = '-U ' + fastq_files[0] |
44 elif len(fastq_files) == 2: | 53 elif len(fastq_files) == 2: |
45 command[9] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1] | 54 command[9] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1] |
46 else: | 55 else: |
47 return False, None | 56 return False, None |
48 | 57 |
49 if conserved_True: | 58 if bowtie_opt is not None: |
50 command[4] = '--sensitive' | 59 command[11] = bowtie_opt |
51 else: | 60 |
52 command[4] = '--very-sensitive-local' | 61 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True) |
53 | |
54 if bowtieOPT is not None: | |
55 command[11] = bowtieOPT | |
56 | |
57 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | |
58 | 62 |
59 if not run_successfully: | 63 if not run_successfully: |
60 sam_file = None | 64 sam_file = None |
61 | 65 |
62 return run_successfully, sam_file | 66 return run_successfully, sam_file |
75 numbers = '' | 79 numbers = '' |
76 | 80 |
77 return splited_cigars | 81 return splited_cigars |
78 | 82 |
79 | 83 |
80 def recode_cigar_based_on_base_quality(cigar, bases_quality, softClip_baseQuality, mapping_position, direct_strand_true, softClip_cigarFlagRecode): | 84 def recode_cigar_based_on_base_quality(cigar, bases_quality, soft_clip_base_quality, mapping_position, |
85 direct_strand_true, soft_clip_cigar_flag_recode): | |
81 cigar = split_cigar(cigar) | 86 cigar = split_cigar(cigar) |
82 soft_left = [] | 87 soft_left = [] |
83 soft_right = [] | 88 soft_right = [] |
84 cigar_flags_for_reads_length = ('M', 'I', 'S', '=', 'X') | 89 cigar_flags_for_reads_length = ('M', 'I', 'S', '=', 'X') |
85 read_length_without_right_s = sum([cigar_part[0] for cigar_part in cigar if cigar_part[1] in cigar_flags_for_reads_length]) - (cigar[len(cigar) - 1][0] if cigar[len(cigar) - 1][1] == 'S' else 0) | 90 read_length_without_right_s = sum([cigar_part[0] for cigar_part in cigar if |
91 cigar_part[1] in cigar_flags_for_reads_length]) - \ | |
92 (cigar[len(cigar) - 1][0] if cigar[len(cigar) - 1][1] == 'S' else 0) | |
86 for x, base in enumerate(bases_quality): | 93 for x, base in enumerate(bases_quality): |
87 if ord(base) - 33 >= softClip_baseQuality: | 94 if ord(base) - 33 >= soft_clip_base_quality: |
88 if x <= cigar[0][0] - 1: | 95 if x <= cigar[0][0] - 1: |
89 if cigar[0][1] == 'S': | 96 if cigar[0][1] == 'S': |
90 soft_left.append(x) | 97 soft_left.append(x) |
91 elif x > read_length_without_right_s - 1: | 98 elif x > read_length_without_right_s - 1: |
92 if cigar[len(cigar) - 1][1] == 'S': | 99 if cigar[len(cigar) - 1][1] == 'S': |
94 | 101 |
95 left_changed = (False, 0) | 102 left_changed = (False, 0) |
96 if len(soft_left) > 0: | 103 if len(soft_left) > 0: |
97 soft_left = min(soft_left) + 1 | 104 soft_left = min(soft_left) + 1 |
98 if soft_left == 1: | 105 if soft_left == 1: |
99 cigar = [[cigar[0][0], softClip_cigarFlagRecode]] + cigar[1:] | 106 cigar = [[cigar[0][0], soft_clip_cigar_flag_recode]] + cigar[1:] |
100 left_changed = (True, cigar[0][0]) | 107 left_changed = (True, cigar[0][0]) |
101 elif cigar[0][0] - soft_left > 0: | 108 elif cigar[0][0] - soft_left > 0: |
102 cigar = [[soft_left, 'S']] + [[cigar[0][0] - soft_left, softClip_cigarFlagRecode]] + cigar[1:] | 109 cigar = [[soft_left, 'S']] + [[cigar[0][0] - soft_left, soft_clip_cigar_flag_recode]] + cigar[1:] |
103 left_changed = (True, cigar[0][0] - soft_left) | 110 left_changed = (True, cigar[0][0] - soft_left) |
104 | 111 |
105 right_changed = (False, 0) | 112 right_changed = (False, 0) |
106 if len(soft_right) > 0: | 113 if len(soft_right) > 0: |
107 soft_right = max(soft_right) + 1 | 114 soft_right = max(soft_right) + 1 |
108 cigar = cigar[:-1] | 115 cigar = cigar[:-1] |
109 if soft_right - read_length_without_right_s > 0: | 116 if soft_right - read_length_without_right_s > 0: |
110 cigar.append([soft_right - read_length_without_right_s, softClip_cigarFlagRecode]) | 117 cigar.append([soft_right - read_length_without_right_s, soft_clip_cigar_flag_recode]) |
111 right_changed = (True, soft_right - read_length_without_right_s) | 118 right_changed = (True, soft_right - read_length_without_right_s) |
112 if len(bases_quality) - soft_right > 0: | 119 if len(bases_quality) - soft_right > 0: |
113 cigar.append([len(bases_quality) - soft_right, 'S']) | 120 cigar.append([len(bases_quality) - soft_right, 'S']) |
114 | 121 |
115 if left_changed[0]: | 122 if left_changed[0]: |
138 if bit[-5] == '0': | 145 if bit[-5] == '0': |
139 direct_strand = True | 146 direct_strand = True |
140 return direct_strand | 147 return direct_strand |
141 | 148 |
142 | 149 |
143 def verify_mapped_tip(reference_length, mapping_position, read_length, cigar): | 150 def verify_mapped_tip(reference_length, mapping_position, cigar): |
144 tip = False | 151 tip = False |
145 if 'S' in cigar: | 152 if 'S' in cigar: |
146 cigar = split_cigar(cigar) | 153 cigar = split_cigar(cigar) |
147 if cigar[0][1] == 'S': | 154 if cigar[0][1] == 'S': |
148 if mapping_position - cigar[0][0] < 0: | 155 if mapping_position - cigar[0][0] < 0: |
172 cigar = ''.join([str(cigar_part[0]) + cigar_part[1] for cigar_part in reversed(split_cigar(cigar))]) | 179 cigar = ''.join([str(cigar_part[0]) + cigar_part[1] for cigar_part in reversed(split_cigar(cigar))]) |
173 return seq, bases_quality, str(sam_flag_bit), cigar | 180 return seq, bases_quality, str(sam_flag_bit), cigar |
174 | 181 |
175 | 182 |
176 @utils.trace_unhandled_exceptions | 183 @utils.trace_unhandled_exceptions |
177 def parallelized_recode_soft_clipping(line_collection, pickleFile, softClip_baseQuality, sequences_length, softClip_cigarFlagRecode): | 184 def parallelized_recode_soft_clipping(line_collection, pickle_file, soft_clip_base_quality, sequences_length, |
185 soft_clip_cigar_flag_recode): | |
178 lines_sam = [] | 186 lines_sam = [] |
179 for line in line_collection: | 187 for line in line_collection: |
180 line = line.splitlines()[0] | 188 line = line.rstrip('\r\n') |
181 if len(line) > 0: | 189 if len(line) > 0: |
182 if line.startswith('@'): | 190 if line.startswith('@'): |
183 lines_sam.append(line) | 191 lines_sam.append(line) |
184 else: | 192 else: |
185 line = line.split('\t') | 193 line = line.split('\t') |
186 if not verify_mapped_tip(sequences_length[line[2]], int(line[3]), len(line[9]), line[5]): | 194 if not verify_mapped_tip(sequences_length[line[2]], int(line[3]), line[5]): |
187 line[5], line[3] = recode_cigar_based_on_base_quality(line[5], line[10], softClip_baseQuality, int(line[3]), verify_mapped_direct_strand(int(line[1])), softClip_cigarFlagRecode) | 195 line[5], line[3] = recode_cigar_based_on_base_quality(line[5], line[10], soft_clip_base_quality, |
196 int(line[3]), | |
197 verify_mapped_direct_strand(int(line[1])), | |
198 soft_clip_cigar_flag_recode) | |
188 lines_sam.append('\t'.join(line)) | 199 lines_sam.append('\t'.join(line)) |
189 with open(pickleFile, 'wb') as writer: | 200 with open(pickle_file, 'wb') as writer: |
190 pickle.dump(lines_sam, writer) | 201 pickle.dump(lines_sam, writer) |
191 | 202 |
192 | 203 |
193 def recode_soft_clipping_from_sam(sam_file, outdir, threads, softClip_baseQuality, reference_dict, softClip_cigarFlagRecode): | 204 def recode_soft_clipping_from_sam(sam_file, outdir, threads, soft_clip_base_quality, reference_dict, |
205 soft_clip_cigar_flag_recode): | |
194 pickle_files = [] | 206 pickle_files = [] |
195 sequences_length = {} | 207 sequences_length = {} |
196 for x, seq_info in reference_dict.items(): | 208 for x, seq_info in list(reference_dict.items()): |
197 sequences_length[seq_info['header']] = seq_info['length'] | 209 sequences_length[seq_info['header']] = seq_info['length'] |
198 | 210 |
199 with open(sam_file, 'rtU') as reader: | 211 with open(sam_file, 'rtU') as reader: |
200 pool = multiprocessing.Pool(processes=threads) | 212 pool = multiprocessing.Pool(processes=threads) |
201 line_collection = [] | 213 line_collection = [] |
202 x = 0 | 214 x = 0 |
203 for x, line in enumerate(reader): | 215 for x, line in enumerate(reader): |
204 line_collection.append(line) | 216 line_collection.append(line) |
205 if x % 10000 == 0: | 217 if x % 10000 == 0: |
206 pickleFile = os.path.join(outdir, 'remove_soft_clipping.' + str(x) + '.pkl') | 218 pickle_file = os.path.join(outdir, 'remove_soft_clipping.' + str(x) + '.pkl') |
207 pickle_files.append(pickleFile) | 219 pickle_files.append(pickle_file) |
208 pool.apply_async(parallelized_recode_soft_clipping, args=(line_collection, pickleFile, softClip_baseQuality, sequences_length, softClip_cigarFlagRecode,)) | 220 pool.apply_async(parallelized_recode_soft_clipping, args=(line_collection, pickle_file, |
221 soft_clip_base_quality, sequences_length, | |
222 soft_clip_cigar_flag_recode,)) | |
209 line_collection = [] | 223 line_collection = [] |
210 if len(line_collection) > 0: | 224 if len(line_collection) > 0: |
211 pickleFile = os.path.join(outdir, 'remove_soft_clipping.' + str(x) + '.pkl') | 225 pickle_file = os.path.join(outdir, 'remove_soft_clipping.' + str(x) + '.pkl') |
212 pickle_files.append(pickleFile) | 226 pickle_files.append(pickle_file) |
213 pool.apply_async(parallelized_recode_soft_clipping, args=(line_collection, pickleFile, softClip_baseQuality, sequences_length, softClip_cigarFlagRecode,)) | 227 pool.apply_async(parallelized_recode_soft_clipping, args=(line_collection, pickle_file, |
214 line_collection = [] | 228 soft_clip_base_quality, sequences_length, |
229 soft_clip_cigar_flag_recode,)) | |
215 pool.close() | 230 pool.close() |
216 pool.join() | 231 pool.join() |
217 | 232 |
218 os.remove(sam_file) | 233 os.remove(sam_file) |
219 | 234 |
220 new_sam_file = os.path.join(outdir, 'alignment_with_soft_clipping_recoded.sam') | 235 new_sam_file = os.path.join(outdir, 'alignment_with_soft_clipping_recoded.sam') |
221 with open(new_sam_file, 'wt') as writer: | 236 with open(new_sam_file, 'wt') as writer: |
222 for pickleFile in pickle_files: | 237 for pickle_file in pickle_files: |
223 if os.path.isfile(pickleFile): | 238 if os.path.isfile(pickle_file): |
224 lines_sam = None | 239 lines_sam = None |
225 with open(pickleFile, 'rb') as reader: | 240 with open(pickle_file, 'rb') as reader: |
226 lines_sam = pickle.load(reader) | 241 lines_sam = pickle.load(reader) |
227 if lines_sam is not None: | 242 if lines_sam is not None: |
228 for line in lines_sam: | 243 for line in lines_sam: |
229 writer.write(line + '\n') | 244 writer.write(line + '\n') |
230 os.remove(pickleFile) | 245 os.remove(pickle_file) |
231 | 246 |
232 return new_sam_file | 247 return new_sam_file |
233 | 248 |
234 | 249 |
235 # Sort alignment file | 250 # Sort alignment file |
236 def sortAlignment(alignment_file, output_file, sortByName_True, threads): | 251 def sort_alignment(alignment_file, output_file, sort_by_name_true, threads): |
237 outFormat_string = os.path.splitext(output_file)[1][1:].lower() | 252 out_format_string = os.path.splitext(output_file)[1][1:].lower() |
238 command = ['samtools', 'sort', '-o', output_file, '-O', outFormat_string, '', '-@', str(threads), alignment_file] | 253 command = ['samtools', 'sort', '-o', output_file, '-O', out_format_string, '', '-@', str(threads), alignment_file] |
239 if sortByName_True: | 254 if sort_by_name_true: |
240 command[6] = '-n' | 255 command[6] = '-n' |
241 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | 256 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True) |
242 if not run_successfully: | 257 if not run_successfully: |
243 output_file = None | 258 output_file = None |
244 return run_successfully, output_file | 259 return run_successfully, output_file |
245 | 260 |
246 | 261 |
247 # Index alignment file | 262 # Index alignment file |
248 def indexAlignment(alignment_file): | 263 def index_alignment(alignment_file): |
249 command = ['samtools', 'index', alignment_file] | 264 command = ['samtools', 'index', alignment_file] |
250 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | 265 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True) |
251 return run_successfully | 266 return run_successfully |
252 | 267 |
253 | 268 |
254 def mapping_reads(fastq_files, reference_file, threads, outdir, conserved_True, numMapLoc, rematch_run, softClip_baseQuality, softClip_recodeRun, reference_dict, softClip_cigarFlagRecode, bowtieOPT): | 269 def mapping_reads(fastq_files, reference_file, threads, outdir, num_map_loc, rematch_run, |
270 soft_clip_base_quality, soft_clip_recode_run, reference_dict, soft_clip_cigar_flag_recode, | |
271 bowtie_algorithm, bowtie_opt, clean_run=True): | |
255 # Create a symbolic link to the reference_file | 272 # Create a symbolic link to the reference_file |
256 reference_link = os.path.join(outdir, os.path.basename(reference_file)) | 273 if clean_run: |
257 os.symlink(reference_file, reference_link) | 274 reference_link = os.path.join(outdir, os.path.basename(reference_file)) |
275 if os.path.islink(reference_link): | |
276 os.unlink(reference_link) | |
277 os.symlink(reference_file, reference_link) | |
278 reference_file = reference_link | |
258 | 279 |
259 bam_file = None | 280 bam_file = None |
260 # Mapping reads using Bowtie2 | 281 # Mapping reads using Bowtie2 |
261 run_successfully, sam_file = mappingBowtie2(fastq_files, reference_link, threads, outdir, conserved_True, numMapLoc, bowtieOPT) | 282 run_successfully, sam_file = mapping_bowtie2(fastq_files=fastq_files, reference_file=reference_file, |
283 threads=threads, outdir=outdir, num_map_loc=num_map_loc, | |
284 bowtie_algorithm=bowtie_algorithm, bowtie_opt=bowtie_opt) | |
262 | 285 |
263 if run_successfully: | 286 if run_successfully: |
264 # Remove soft clipping | 287 # Remove soft clipping |
265 if rematch_run == softClip_recodeRun or softClip_recodeRun == 'both': | 288 if rematch_run == soft_clip_recode_run or soft_clip_recode_run == 'both': |
266 print 'Recoding soft clipped regions' | 289 print('Recoding soft clipped regions') |
267 sam_file = recode_soft_clipping_from_sam(sam_file, outdir, threads, softClip_baseQuality, reference_dict, softClip_cigarFlagRecode) | 290 sam_file = recode_soft_clipping_from_sam(sam_file, outdir, threads, soft_clip_base_quality, reference_dict, |
291 soft_clip_cigar_flag_recode) | |
268 | 292 |
269 # Convert sam to bam and sort bam | 293 # Convert sam to bam and sort bam |
270 run_successfully, bam_file = sortAlignment(sam_file, str(os.path.splitext(sam_file)[0] + '.bam'), False, threads) | 294 run_successfully, bam_file = sort_alignment(sam_file, str(os.path.splitext(sam_file)[0] + '.bam'), False, |
295 threads) | |
271 | 296 |
272 if run_successfully: | 297 if run_successfully: |
273 os.remove(sam_file) | 298 os.remove(sam_file) |
274 # Index bam | 299 # Index bam |
275 run_successfully = indexAlignment(bam_file) | 300 run_successfully = index_alignment(bam_file) |
276 | 301 |
277 return run_successfully, bam_file, reference_link | 302 return run_successfully, bam_file, reference_file |
278 | 303 |
279 | 304 |
280 def create_vcf(bam_file, sequence_to_analyse, outdir, counter, reference_file): | 305 def create_vcf(bam_file, sequence_to_analyse, outdir, counter, reference_file): |
281 gene_vcf = os.path.join(outdir, 'samtools_mpileup.sequence_' + str(counter) + '.vcf') | 306 gene_vcf = os.path.join(outdir, 'samtools_mpileup.sequence_' + str(counter) + '.vcf') |
282 | 307 |
283 command = ['samtools', 'mpileup', '--count-orphans', '--no-BAQ', '--min-BQ', '0', '--min-MQ', str(7), '--fasta-ref', reference_file, '--region', sequence_to_analyse, '--output', gene_vcf, '--VCF', '--uncompressed', '--output-tags', 'INFO/AD,AD,DP', bam_file] | 308 command = ['samtools', 'mpileup', '--count-orphans', '--no-BAQ', '--min-BQ', '0', '--min-MQ', str(7), '--fasta-ref', |
284 | 309 reference_file, '--region', sequence_to_analyse, '--output', gene_vcf, '--VCF', '--uncompressed', |
285 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False) | 310 '--output-tags', 'INFO/AD,AD,DP', bam_file] |
311 | |
312 run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False) | |
286 if not run_successfully: | 313 if not run_successfully: |
287 gene_vcf = None | 314 gene_vcf = None |
288 return run_successfully, gene_vcf | 315 return run_successfully, gene_vcf |
289 | 316 |
290 | 317 |
291 # Read vcf file | 318 # Read vcf file |
292 class Vcf(): | 319 class Vcf: |
293 def __init__(self, vcfFile): | 320 def __init__(self, vcf_file, encoding=None, newline=None): |
294 self.vcf = open(vcfFile, 'rtU') | 321 try: |
322 self.vcf = open(vcf_file, 'rt', encoding=encoding, newline=newline) | |
323 except TypeError: | |
324 self.vcf = open(vcf_file, 'rt') | |
295 self.line_read = self.vcf.readline() | 325 self.line_read = self.vcf.readline() |
326 self.contigs_info_dict = {} | |
296 while self.line_read.startswith('#'): | 327 while self.line_read.startswith('#'): |
328 if self.line_read.startswith('##contig=<ID='): | |
329 seq = self.line_read.split('=')[2].split(',')[0] | |
330 seq_len = self.line_read.split('=')[3].split('>')[0] | |
331 self.contigs_info_dict[seq] = int(seq_len) | |
297 self.line_read = self.vcf.readline() | 332 self.line_read = self.vcf.readline() |
298 self.line = self.line_read | 333 self.line = self.line_read |
299 | 334 |
300 def readline(self): | 335 def readline(self): |
301 self.line_stored = self.line | 336 line_stored = self.line |
302 self.line = self.vcf.readline() | 337 self.line = self.vcf.readline() |
303 return self.line_stored | 338 return line_stored |
304 | 339 |
305 def close(self): | 340 def close(self): |
306 self.vcf.close() | 341 self.vcf.close() |
307 | 342 |
308 | 343 def get_contig_legth(self, contig): |
309 def get_variants(gene_vcf): | 344 return self.contigs_info_dict[contig] |
345 | |
346 | |
347 def get_variants(gene_vcf, seq_name, encoding=None, newline=None): | |
310 variants = {} | 348 variants = {} |
311 | 349 |
312 vfc_file = Vcf(gene_vcf) | 350 vfc_file = Vcf(vcf_file=gene_vcf, encoding=encoding, newline=newline) |
313 line = vfc_file.readline() | 351 line = vfc_file.readline() |
352 counter = 1 | |
314 while len(line) > 0: | 353 while len(line) > 0: |
315 fields = line.splitlines()[0].split('\t') | 354 fields = line.rstrip('\r\n').split('\t') |
316 if len(fields) > 0: | 355 if len(fields) > 0: |
317 fields[1] = int(fields[1]) | 356 fields[1] = int(fields[1]) |
318 | 357 |
319 info_field = {} | 358 info_field = {} |
320 for i in fields[7].split(';'): | 359 try: |
321 i = i.split('=') | 360 for i in fields[7].split(';'): |
322 if len(i) > 1: | 361 i = i.split('=') |
323 info_field[i[0]] = i[1] | 362 if len(i) > 1: |
363 info_field[i[0]] = i[1] | |
364 else: | |
365 info_field[i[0]] = None | |
366 except IndexError: | |
367 if counter > vfc_file.get_contig_legth(contig=seq_name): | |
368 break | |
324 else: | 369 else: |
325 info_field[i[0]] = None | 370 raise IndexError |
326 | 371 |
327 format_field = {} | 372 format_field = {} |
328 format_field_name = fields[8].split(':') | 373 format_field_name = fields[8].split(':') |
329 format_data = fields[9].split(':') | 374 format_data = fields[9].split(':') |
330 | 375 |
331 for i in range(0, len(format_data)): | 376 for i in range(0, len(format_data)): |
332 format_field[format_field_name[i]] = format_data[i].split(',') | 377 format_field[format_field_name[i]] = format_data[i].split(',') |
333 | 378 |
334 fields_to_store = {'REF': fields[3], 'ALT': fields[4].split(','), 'info': info_field, 'format': format_field} | 379 fields_to_store = {'REF': fields[3], 'ALT': fields[4].split(','), 'info': info_field, |
380 'format': format_field} | |
335 if fields[1] in variants: | 381 if fields[1] in variants: |
336 variants[fields[1]][len(variants[fields[1]])] = fields_to_store | 382 variants[fields[1]][len(variants[fields[1]])] = fields_to_store |
337 else: | 383 else: |
338 variants[fields[1]] = {0: fields_to_store} | 384 variants[fields[1]] = {0: fields_to_store} |
339 | 385 |
340 line = vfc_file.readline() | 386 try: |
387 line = vfc_file.readline() | |
388 except UnicodeDecodeError: | |
389 if counter + 1 > vfc_file.get_contig_legth(contig=seq_name): | |
390 break | |
391 else: | |
392 raise UnicodeDecodeError | |
393 | |
394 counter += 1 | |
341 vfc_file.close() | 395 vfc_file.close() |
342 | 396 |
343 return variants | 397 return variants |
344 | 398 |
345 | 399 |
346 def indel_entry(variant_position): | 400 def indel_entry(variant_position): |
347 entry_with_indel = [] | 401 entry_with_indel = [] |
348 entry_with_snp = None | 402 entry_with_snp = None |
349 for i in variant_position: | 403 for i in variant_position: |
350 keys = variant_position[i]['info'].keys() | 404 keys = list(variant_position[i]['info'].keys()) |
351 if 'INDEL' in keys: | 405 if 'INDEL' in keys: |
352 entry_with_indel.append(i) | 406 entry_with_indel.append(i) |
353 else: | 407 else: |
354 entry_with_snp = i | 408 entry_with_snp = i |
355 | 409 |
356 return entry_with_indel, entry_with_snp | 410 return entry_with_indel, entry_with_snp |
357 | 411 |
358 | 412 |
359 def get_alt_noMatter(variant_position, indel_true): | 413 def get_alt_no_matter(variant_position, indel_true): |
360 dp = sum(map(int, variant_position['format']['AD'])) | 414 dp = sum(map(int, variant_position['format']['AD'])) |
361 index_alleles_sorted_position = sorted(zip(map(int, variant_position['format']['AD']), range(0, len(variant_position['format']['AD']))), reverse=True) | 415 index_alleles_sorted_position = sorted(zip(list(map(int, variant_position['format']['AD'])), |
416 list(range(0, len(variant_position['format']['AD'])))), | |
417 reverse=True) | |
362 index_dominant_allele = None | 418 index_dominant_allele = None |
363 if not indel_true: | 419 if not indel_true: |
364 ad_idv = index_alleles_sorted_position[0][0] | 420 ad_idv = index_alleles_sorted_position[0][0] |
365 | 421 |
366 if len([x for x in index_alleles_sorted_position if x[0] == ad_idv]) > 1: | 422 if len([x for x in index_alleles_sorted_position if x[0] == ad_idv]) > 1: |
375 else: | 431 else: |
376 ad_idv = int(variant_position['info']['IDV']) | 432 ad_idv = int(variant_position['info']['IDV']) |
377 | 433 |
378 if float(ad_idv) / float(dp) >= 0.5: | 434 if float(ad_idv) / float(dp) >= 0.5: |
379 if len([x for x in index_alleles_sorted_position if x[0] == index_alleles_sorted_position[0][0]]) > 1: | 435 if len([x for x in index_alleles_sorted_position if x[0] == index_alleles_sorted_position[0][0]]) > 1: |
380 index_alleles_sorted_position = sorted([x for x in index_alleles_sorted_position if x[0] == index_alleles_sorted_position[0][0]]) | 436 index_alleles_sorted_position = sorted([x for x in index_alleles_sorted_position if |
437 x[0] == index_alleles_sorted_position[0][0]]) | |
381 | 438 |
382 index_dominant_allele = index_alleles_sorted_position[0][1] | 439 index_dominant_allele = index_alleles_sorted_position[0][1] |
383 if index_dominant_allele == 0: | 440 if index_dominant_allele == 0: |
384 alt = '.' | 441 alt = '.' |
385 else: | 442 else: |
402 number_diferences += 1 | 459 number_diferences += 1 |
403 | 460 |
404 return number_diferences | 461 return number_diferences |
405 | 462 |
406 | 463 |
407 def get_alt_correct(variant_position, alt_noMatter, dp, ad_idv, index_dominant_allele, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele): | 464 def get_alt_correct(variant_position, alt_no_matter, dp, ad_idv, index_dominant_allele, minimum_depth_presence, |
465 minimum_depth_call, minimum_depth_frequency_dominant_allele): | |
408 alt = None | 466 alt = None |
409 low_coverage = False | 467 low_coverage = False |
410 multiple_alleles = False | 468 multiple_alleles = False |
411 | 469 |
412 if dp >= minimum_depth_presence: | 470 if dp >= minimum_depth_presence: |
421 multiple_alleles = True | 479 multiple_alleles = True |
422 else: | 480 else: |
423 if float(ad_idv) / float(dp) < minimum_depth_frequency_dominant_allele: | 481 if float(ad_idv) / float(dp) < minimum_depth_frequency_dominant_allele: |
424 alt = 'N' * len(variant_position['REF']) | 482 alt = 'N' * len(variant_position['REF']) |
425 if index_dominant_allele is not None: | 483 if index_dominant_allele is not None: |
426 variants_coverage = [int(variant_position['format']['AD'][i]) for i in range(0, len(variant_position['ALT']) + 1) if i != index_dominant_allele] | 484 variants_coverage = [int(variant_position['format']['AD'][i]) for i in |
485 range(0, len(variant_position['ALT']) + 1) if i != index_dominant_allele] | |
427 if sum(variants_coverage) > 0: | 486 if sum(variants_coverage) > 0: |
428 if float(max(variants_coverage)) / float(sum(variants_coverage)) > 0.5: | 487 if float(max(variants_coverage)) / float(sum(variants_coverage)) > 0.5: |
429 multiple_alleles = True | 488 multiple_alleles = True |
430 elif float(max(variants_coverage)) / float(sum(variants_coverage)) == 0.5 and len(variants_coverage) > 2: | 489 elif float(max(variants_coverage)) / float(sum(variants_coverage)) == 0.5 and \ |
490 len(variants_coverage) > 2: | |
431 multiple_alleles = True | 491 multiple_alleles = True |
432 else: | 492 else: |
433 multiple_alleles = True | 493 multiple_alleles = True |
434 else: | 494 else: |
435 alt = alt_noMatter | 495 alt = alt_no_matter |
436 else: | 496 else: |
437 low_coverage = True | 497 low_coverage = True |
438 | 498 |
439 return alt, low_coverage, multiple_alleles | 499 return alt, low_coverage, multiple_alleles |
440 | 500 |
462 for i in indels_entry: | 522 for i in indels_entry: |
463 indel_coverage[i] = int(variant_position['info']['IDV']) | 523 indel_coverage[i] = int(variant_position['info']['IDV']) |
464 return indel_coverage.index(str(max(indel_coverage.values()))) | 524 return indel_coverage.index(str(max(indel_coverage.values()))) |
465 | 525 |
466 | 526 |
467 def determine_variant(variant_position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, indel_true): | 527 def determine_variant(variant_position, minimum_depth_presence, minimum_depth_call, |
468 alt_noMatter, dp, ad_idv, index_dominant_allele = get_alt_noMatter(variant_position, indel_true) | 528 minimum_depth_frequency_dominant_allele, indel_true): |
469 | 529 alt_no_matter, dp, ad_idv, index_dominant_allele = get_alt_no_matter(variant_position, indel_true) |
470 alt_correct, low_coverage, multiple_alleles = get_alt_correct(variant_position, alt_noMatter, dp, ad_idv, index_dominant_allele, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele) | 530 |
531 alt_correct, low_coverage, multiple_alleles = get_alt_correct(variant_position, alt_no_matter, dp, ad_idv, | |
532 index_dominant_allele, minimum_depth_presence, | |
533 minimum_depth_call, | |
534 minimum_depth_frequency_dominant_allele) | |
471 | 535 |
472 alt_alignment = get_alt_alignment(variant_position['REF'], alt_correct) | 536 alt_alignment = get_alt_alignment(variant_position['REF'], alt_correct) |
473 | 537 |
474 return variant_position['REF'], alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment | 538 return variant_position['REF'], alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment |
475 | 539 |
476 | 540 |
477 def confirm_nucleotides_indel(ref, alt, variants, position_start_indel, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, alignment_true): | 541 def confirm_nucleotides_indel(ref, alt, variants, position_start_indel, minimum_depth_presence, minimum_depth_call, |
542 minimum_depth_frequency_dominant_allele, alignment_true): | |
478 alt = list(alt) | 543 alt = list(alt) |
479 | 544 |
480 for i in range(0, len(alt) - 1): | 545 for i in range(0, len(alt) - 1): |
481 if len(alt) < len(ref): | 546 if len(alt) < len(ref): |
482 new_position = position_start_indel + len(alt) - i - 1 | 547 new_position = position_start_indel + len(alt) - i - 1 |
494 else: | 559 else: |
495 alt = alt[: alt_position] | 560 alt = alt[: alt_position] |
496 break | 561 break |
497 | 562 |
498 entry_with_indel, entry_with_snp = indel_entry(variants[new_position]) | 563 entry_with_indel, entry_with_snp = indel_entry(variants[new_position]) |
499 new_ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment = determine_variant(variants[new_position][entry_with_snp], minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False) | 564 new_ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \ |
500 if alt_noMatter != '.' and alt[alt_position] != alt_noMatter: | 565 determine_variant(variants[new_position][entry_with_snp], minimum_depth_presence, minimum_depth_call, |
501 alt[alt_position] = alt_noMatter | 566 minimum_depth_frequency_dominant_allele, False) |
567 if alt_no_matter != '.' and alt[alt_position] != alt_no_matter: | |
568 alt[alt_position] = alt_no_matter | |
502 | 569 |
503 return ''.join(alt) | 570 return ''.join(alt) |
504 | 571 |
505 | 572 |
def snp_indel(variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele):
    """
    Resolve the call at one position, reconciling SNP and indel VCF entries.

    Parameters
    ----------
    variants : dict
        Position -> indexable collection of VCF entries (as produced upstream).
        # assumes entry_with_snp / entry_with_indel index into variants[position] — TODO confirm against indel_entry()
    position : int
        Position to resolve (key into variants).
    minimum_depth_presence : int
        Depth below which the position counts as absent.
    minimum_depth_call : int
        Depth below which no confident call is made.
    minimum_depth_frequency_dominant_allele : float
        Minimum dominant-allele frequency for a confident call.

    Returns
    -------
    tuple
        (ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment)
        as produced by determine_variant().
    """
    entry_with_indel, entry_with_snp = indel_entry(variants[position])

    if len(entry_with_indel) == 0:
        # Only a SNP entry exists at this position: resolve it directly.
        ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
            determine_variant(variants[position][entry_with_snp], minimum_depth_presence, minimum_depth_call,
                              minimum_depth_frequency_dominant_allele, False)
    else:
        # Both SNP and indel entries exist: resolve each, then merge the results.
        ref_snp, alt_correct_snp, low_coverage_snp, multiple_alleles_snp, alt_no_matter_snp, alt_alignment_snp = \
            determine_variant(variants[position][entry_with_snp], minimum_depth_presence, minimum_depth_call,
                              minimum_depth_frequency_dominant_allele, False)

        # With several indel entries, pick the most supported one.
        indel_more_likely = entry_with_indel[0]
        if len(entry_with_indel) > 1:
            indel_more_likely = get_indel_more_likely(variants[position], entry_with_indel)

        ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
            determine_variant(variants[position][indel_more_likely], minimum_depth_presence, minimum_depth_call,
                              minimum_depth_frequency_dominant_allele, True)

        if alt_no_matter == '.':
            # The indel call did not stand: fall back entirely to the SNP call.
            ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \
                ref_snp, alt_correct_snp, low_coverage_snp, multiple_alleles_snp, alt_no_matter_snp, alt_alignment_snp
        else:
            # Keep the indel call, but let a disagreeing SNP call overwrite its first base.
            if alt_correct is None and alt_correct_snp is not None:
                alt_correct = alt_correct_snp
            elif alt_correct is not None and alt_correct_snp is not None:
                if alt_correct_snp != '.' and alt_correct[0] != alt_correct_snp:
                    alt_correct = alt_correct_snp + alt_correct[1:] if len(alt_correct) > 1 else alt_correct_snp
                if alt_no_matter_snp != '.' and alt_no_matter[0] != alt_no_matter_snp:
                    alt_no_matter = alt_no_matter_snp + alt_no_matter[1:] if len(alt_no_matter) > 1 else alt_no_matter_snp
                if alt_alignment_snp != '.' and alt_alignment[0] != alt_alignment_snp:
                    alt_alignment = alt_alignment_snp + alt_alignment[1:] if len(alt_alignment) > 1 else alt_alignment_snp

    # Disabled re-confirmation pass kept for reference:
    # if alt_no_matter != '.':
    #     alt_no_matter = confirm_nucleotides_indel(ref, alt_no_matter, variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False)
    # if alt_correct is not None and alt_correct != '.':
    #     alt_correct = confirm_nucleotides_indel(ref, alt_correct, variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False)
    # if alt_alignment != '.':
    #     alt_alignment = confirm_nucleotides_indel(ref, alt_alignment, variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, True)

    return ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment
541 | 615 |
542 | 616 |
543 def get_true_variants(variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence): | 617 def get_true_variants(variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, |
618 sequence): | |
544 variants_correct = {} | 619 variants_correct = {} |
545 variants_noMatter = {} | 620 variants_no_matter = {} |
546 variants_alignment = {} | 621 variants_alignment = {} |
547 | 622 |
548 correct_absent_positions = {} | 623 correct_absent_positions = {} |
549 correct_last_absent_position = '' | 624 correct_last_absent_position = '' |
550 | 625 |
551 noMatter_absent_positions = {} | 626 no_matter_absent_positions = {} |
552 noMatter_last_absent_position = '' | 627 no_matter_last_absent_position = '' |
553 | 628 |
554 multiple_alleles_found = [] | 629 multiple_alleles_found = [] |
555 | 630 |
556 counter = 1 | 631 counter = 1 |
557 while counter <= len(sequence): | 632 while counter <= len(sequence): |
558 if counter in variants: | 633 if counter in variants: |
559 noMatter_last_absent_position = '' | 634 no_matter_last_absent_position = '' |
560 | 635 |
561 ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment = snp_indel(variants, counter, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele) | 636 ref, alt_correct, low_coverage, multiple_alleles, alt_no_matter, alt_alignment = \ |
637 snp_indel(variants, counter, minimum_depth_presence, minimum_depth_call, | |
638 minimum_depth_frequency_dominant_allele) | |
562 | 639 |
563 if alt_alignment != '.': | 640 if alt_alignment != '.': |
564 variants_alignment[counter] = {'REF': ref, 'ALT': alt_alignment} | 641 variants_alignment[counter] = {'REF': ref, 'ALT': alt_alignment} |
565 | 642 |
566 if alt_noMatter != '.': | 643 if alt_no_matter != '.': |
567 variants_noMatter[counter] = {'REF': ref, 'ALT': alt_noMatter} | 644 variants_no_matter[counter] = {'REF': ref, 'ALT': alt_no_matter} |
568 | 645 |
569 if alt_correct is None: | 646 if alt_correct is None: |
570 if counter - len(correct_last_absent_position) in correct_absent_positions: | 647 if counter - len(correct_last_absent_position) in correct_absent_positions: |
571 correct_absent_positions[counter - len(correct_last_absent_position)]['REF'] += ref | 648 correct_absent_positions[counter - len(correct_last_absent_position)]['REF'] += ref |
572 else: | 649 else: |
598 correct_absent_positions[counter - len(correct_last_absent_position)]['REF'] += sequence[counter - 1] | 675 correct_absent_positions[counter - len(correct_last_absent_position)]['REF'] += sequence[counter - 1] |
599 else: | 676 else: |
600 correct_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''} | 677 correct_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''} |
601 correct_last_absent_position += sequence[counter - 1] | 678 correct_last_absent_position += sequence[counter - 1] |
602 | 679 |
603 if counter - len(noMatter_last_absent_position) in noMatter_absent_positions: | 680 if counter - len(no_matter_last_absent_position) in no_matter_absent_positions: |
604 noMatter_absent_positions[counter - len(noMatter_last_absent_position)]['REF'] += sequence[counter - 1] | 681 no_matter_absent_positions[counter - len(no_matter_last_absent_position)]['REF'] += \ |
605 else: | 682 sequence[counter - 1] |
606 noMatter_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''} | 683 else: |
607 noMatter_last_absent_position += sequence[counter - 1] | 684 no_matter_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''} |
685 no_matter_last_absent_position += sequence[counter - 1] | |
608 | 686 |
609 counter += 1 | 687 counter += 1 |
610 | 688 |
611 for position in correct_absent_positions: | 689 for position in correct_absent_positions: |
612 if position == 1: | 690 if position == 1: |
613 variants_correct[position] = {'REF': correct_absent_positions[position]['REF'], 'ALT': 'N'} | 691 variants_correct[position] = {'REF': correct_absent_positions[position]['REF'], 'ALT': 'N'} |
614 else: | 692 else: |
615 if position - 1 not in variants_correct: | 693 if position - 1 not in variants_correct: |
616 variants_correct[position - 1] = {'REF': sequence[position - 2] + correct_absent_positions[position]['REF'], 'ALT': sequence[position - 2] + correct_absent_positions[position]['ALT']} | 694 variants_correct[position - 1] = \ |
617 else: | 695 {'REF': sequence[position - 2] + correct_absent_positions[position]['REF'], |
618 variants_correct[position - 1] = {'REF': variants_correct[position - 1]['REF'] + correct_absent_positions[position]['REF'][len(variants_correct[position - 1]['REF']) - 1:], 'ALT': variants_correct[position - 1]['ALT'] + correct_absent_positions[position]['ALT'][len(variants_correct[position - 1]['ALT']) - 1 if len(variants_correct[position - 1]['ALT']) > 0 else 0:]} | 696 'ALT': sequence[position - 2] + correct_absent_positions[position]['ALT']} |
619 | 697 else: |
620 for position in noMatter_absent_positions: | 698 variants_correct[position - 1] = \ |
699 {'REF': variants_correct[position - 1]['REF'] + | |
700 correct_absent_positions[position]['REF'][len(variants_correct[position - 1]['REF']) - 1:], | |
701 'ALT': variants_correct[position - 1]['ALT'] + | |
702 correct_absent_positions[position]['ALT'][len(variants_correct[position - 1]['ALT']) - 1 if | |
703 len(variants_correct[position - 1]['ALT']) > 0 | |
704 else 0:]} | |
705 | |
706 for position in no_matter_absent_positions: | |
621 if position == 1: | 707 if position == 1: |
622 variants_noMatter[position] = {'REF': noMatter_absent_positions[position]['REF'], 'ALT': 'N'} | 708 variants_no_matter[position] = {'REF': no_matter_absent_positions[position]['REF'], 'ALT': 'N'} |
623 else: | 709 else: |
624 if position - 1 not in variants_noMatter: | 710 if position - 1 not in variants_no_matter: |
625 variants_noMatter[position - 1] = {'REF': sequence[position - 2] + noMatter_absent_positions[position]['REF'], 'ALT': sequence[position - 2] + noMatter_absent_positions[position]['ALT']} | 711 variants_no_matter[position - 1] = \ |
626 else: | 712 {'REF': sequence[position - 2] + no_matter_absent_positions[position]['REF'], |
627 variants_noMatter[position - 1] = {'REF': variants_noMatter[position - 1]['REF'] + noMatter_absent_positions[position]['REF'][len(variants_noMatter[position - 1]['REF']) - 1:], 'ALT': variants_noMatter[position - 1]['ALT'] + noMatter_absent_positions[position]['ALT'][len(variants_noMatter[position - 1]['ALT']) - 1 if len(variants_noMatter[position - 1]['ALT']) > 0 else 0:]} | 713 'ALT': sequence[position - 2] + no_matter_absent_positions[position]['ALT']} |
628 | 714 else: |
629 return variants_correct, variants_noMatter, variants_alignment, multiple_alleles_found | 715 variants_no_matter[position - 1] = \ |
630 | 716 {'REF': variants_no_matter[position - 1]['REF'] + |
631 | 717 no_matter_absent_positions[position]['REF'][len(variants_no_matter[position - 1]['REF']) - |
def clean_variant_in_extra_seq_left(variant_dict, position, length_extra_seq, multiple_alleles_found,
                                    number_multi_alleles):
    """
    Trim or remove a variant that starts inside the left extra-sequence flank.

    A variant that crosses the flank boundary is re-anchored at position
    ``length_extra_seq`` with only the part from the boundary onwards kept;
    a variant fully contained in the flank is dropped.

    Returns (updated variant_dict, updated number_multi_alleles, number of
    REF/ALT differences contributed by a kept, trimmed variant).
    """
    number_diferences = 0
    entry = variant_dict[position]
    variant_end = position + len(entry['REF']) - 1

    if variant_end > length_extra_seq:
        # Variant crosses the boundary: keep only the part from the boundary on.
        if multiple_alleles_found is not None and position in multiple_alleles_found:
            number_multi_alleles += 1

        del variant_dict[position]
        offset = length_extra_seq - position
        trimmed_ref = entry['REF'][offset:]
        if len(entry['ALT']) > offset:
            trimmed_alt = entry['ALT'][offset:]
        else:
            # ALT is shorter than the trim point: fall back to the single REF base there.
            trimmed_alt = entry['REF'][offset]
        variant_dict[length_extra_seq] = {'REF': trimmed_ref, 'ALT': trimmed_alt}
        number_diferences = count_number_diferences(trimmed_ref, trimmed_alt)
    else:
        # Variant lies entirely inside the flank: discard it.
        del variant_dict[position]

    return variant_dict, number_multi_alleles, number_diferences
649 | 749 |
650 | 750 |
def clean_variant_in_extra_seq_rigth(variant_dict, position, sequence_length, length_extra_seq):
    """
    Trim a variant that runs into the right extra-sequence flank.

    REF (and, when long enough, ALT) are cut so the variant ends at the
    boundary between the core region and the right flank.

    Returns (updated variant_dict, number of REF/ALT differences at this position).
    """
    entry = variant_dict[position]
    boundary = sequence_length - length_extra_seq

    if position + len(entry['REF']) - 1 > boundary:
        keep = - (position - boundary) + 1
        entry['REF'] = entry['REF'][: keep]
        # ALT shorter than the kept span stays untouched.
        if len(entry['ALT']) >= keep:
            entry['ALT'] = entry['ALT'][: keep]

    number_diferences = count_number_diferences(variant_dict[position]['REF'], variant_dict[position]['ALT'])

    return variant_dict, number_diferences
659 | 763 |
660 | 764 |
def cleanning_variants_extra_seq(variants_correct, variants_no_matter, variants_alignment, multiple_alleles_found,
                                 length_extra_seq, sequence_length):
    """
    Drop or trim variants that fall (partly) inside the extra flanking sequence.

    Walks every position: positions in the left flank are handled by
    clean_variant_in_extra_seq_left(), positions in the core region are
    right-trimmed via clean_variant_in_extra_seq_rigth(), and positions in the
    right flank are removed outright.

    Returns the three cleaned variant dicts plus the number of multi-allele
    positions and the number of REF/ALT differences counted.
    """
    number_multi_alleles = 0
    number_diferences = 0

    counter = 1
    while counter <= sequence_length:
        if counter <= length_extra_seq:
            # Left flank: re-anchor or drop variants starting here.
            if counter in variants_correct:
                # NOTE(review): number_diferences is overwritten (=) here rather than
                # accumulated (+=) as in the core-region branch below — confirm intended.
                variants_correct, number_multi_alleles, number_diferences = \
                    clean_variant_in_extra_seq_left(variants_correct, counter, length_extra_seq, multiple_alleles_found,
                                                    number_multi_alleles)
            if counter in variants_no_matter:
                variants_no_matter, ignore, ignore = \
                    clean_variant_in_extra_seq_left(variants_no_matter, counter, length_extra_seq, None, None)
            if counter in variants_alignment:
                variants_alignment, ignore, ignore = \
                    clean_variant_in_extra_seq_left(variants_alignment, counter, length_extra_seq, None, None)
        elif sequence_length - length_extra_seq >= counter > length_extra_seq:
            # Core region: count multi-allele sites and trim variants that run into the right flank.
            if counter in variants_correct:
                if counter in multiple_alleles_found:
                    number_multi_alleles += 1
                variants_correct, number_diferences_found = \
                    clean_variant_in_extra_seq_rigth(variants_correct, counter, sequence_length, length_extra_seq)
                number_diferences += number_diferences_found
            if counter in variants_no_matter:
                variants_no_matter, ignore = \
                    clean_variant_in_extra_seq_rigth(variants_no_matter, counter, sequence_length, length_extra_seq)
            if counter in variants_alignment:
                variants_alignment, ignore = \
                    clean_variant_in_extra_seq_rigth(variants_alignment, counter, sequence_length, length_extra_seq)
        else:
            # Right flank: discard variants entirely.
            if counter in variants_correct:
                del variants_correct[counter]
            if counter in variants_no_matter:
                del variants_no_matter[counter]
            if counter in variants_alignment:
                del variants_alignment[counter]

        counter += 1

    return variants_correct, variants_no_matter, variants_alignment, number_multi_alleles, number_diferences
695 | 807 |
696 | 808 |
def get_coverage(gene_coverage):
    """
    Parse a samtools depth output file into a {position: depth} dict.

    Parameters
    ----------
    gene_coverage : str
        Path to a tab-separated file whose columns are sequence name,
        position and depth (samtools depth format).

    Returns
    -------
    dict
        Maps int(position) -> int(depth); blank lines are skipped.
    """
    coverage = {}

    # 'rtU' was used before; the 'U' flag is deprecated and was removed in
    # Python 3.11 — universal newlines are the default in text mode anyway.
    with open(gene_coverage, 'rt') as reader:
        for line in reader:
            line = line.rstrip('\r\n')
            if len(line) > 0:
                fields = line.split('\t')
                coverage[int(fields[1])] = int(fields[2])

    return coverage
def get_coverage_report(coverage, sequence_length, minimum_depth_presence, minimum_depth_call, length_extra_seq):
    """
    Summarise depth over the core region (sequence minus both flanks).

    Returns (count_absent, percentage_low_coverage, mean_coverage):
    positions below minimum_depth_presence count as absent; of the present
    positions, those below minimum_depth_call count towards the low-coverage
    percentage; mean coverage is computed over present positions only.
    With an empty coverage dict, the whole core region is reported absent.
    """
    inner_length = sequence_length - 2 * length_extra_seq
    if not coverage:
        return inner_length, 100.0, 0.0

    count_absent = 0
    count_low_coverage = 0
    sum_coverage = 0

    # Only the core region contributes; both flanks are ignored.
    for position in range(length_extra_seq + 1, sequence_length - length_extra_seq + 1):
        depth = coverage[position]
        if depth < minimum_depth_presence:
            count_absent += 1
            continue
        if depth < minimum_depth_call:
            count_low_coverage += 1
        sum_coverage += depth

    mean_coverage = 0
    percentage_low_coverage = 0
    present_positions = inner_length - count_absent
    if present_positions > 0:
        mean_coverage = float(sum_coverage) / float(present_positions)
        percentage_low_coverage = float(count_low_coverage) / float(present_positions) * 100

    return count_absent, percentage_low_coverage, mean_coverage
736 | 849 |
737 | 850 |
# Get genome coverage data
def compute_genome_coverage_data(alignment_file, sequence_to_analyse, outdir, counter):
    """
    Run samtools depth for one sequence and capture it into a per-sequence file.

    Returns (run_successfully, path to the depth table written in outdir).
    """
    depth_file = os.path.join(outdir, 'samtools_depth.sequence_' + str(counter) + '.tab')
    # The trailing '>' redirection requires the command to run through a shell.
    command = ['samtools', 'depth', '-a', '-q', '0', '-r', sequence_to_analyse, alignment_file, '>',
               depth_file]
    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, True, None, False)
    return run_successfully, depth_file
744 | 858 |
745 | 859 |
def write_variants_vcf(variants, outdir, sequence_to_analyse, sufix):
    """
    Write variants to a VCF, then compress and index it with bcftools.

    Returns (run_successfully, path to the compressed VCF, or None on failure).
    """
    vcf_file = os.path.join(outdir, str(sequence_to_analyse + '.' + sufix + '.vcf'))
    header_fields = ['SEQUENCE', 'POSITION', 'ID_unused', 'REFERENCE_sequence', 'ALTERNATIVE_sequence',
                     'QUALITY_unused', 'FILTER_unused', 'INFO_unused', 'FORMAT_unused']

    with open(vcf_file, 'wt') as writer:
        writer.write('##fileformat=VCFv4.2' + '\n')
        writer.write('#' + '\t'.join(header_fields) + '\n')
        # Positions are written in ascending order as bcftools expects.
        for position in sorted(variants.keys()):
            row = [sequence_to_analyse, str(position), '.', variants[position]['REF'],
                   variants[position]['ALT'], '.', '.', '.', '.']
            writer.write('\t'.join(row) + '\n')

    compressed_vcf_file = vcf_file + '.gz'
    command = ['bcftools', 'convert', '-o', compressed_vcf_file, '-O', 'z', vcf_file]
    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False)
    if run_successfully:
        command = ['bcftools', 'index', compressed_vcf_file]
        run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False)

    if not run_successfully:
        compressed_vcf_file = None

    return run_successfully, compressed_vcf_file
765 | 881 |
766 | 882 |
767 def parse_fasta_inMemory(fasta_memory): | 883 def parse_fasta_in_memory(fasta_memory): |
768 fasta_memory = fasta_memory.splitlines() | 884 fasta_memory = fasta_memory.splitlines() |
769 sequence_dict = {} | 885 sequence_dict = {} |
770 for line in fasta_memory: | 886 for line in fasta_memory: |
771 if len(line) > 0: | 887 if len(line) > 0: |
772 if line.startswith('>'): | 888 if line.startswith('>'): |
775 sequence_dict['sequence'] += line | 891 sequence_dict['sequence'] += line |
776 | 892 |
777 return sequence_dict | 893 return sequence_dict |
778 | 894 |
779 | 895 |
def compute_consensus_sequence(reference_file, sequence_to_analyse, compressed_vcf_file, outdir):
    """
    Build a consensus sequence for one gene with bcftools.

    Extracts the gene fasta from the reference, normalises the compressed VCF
    against it, applies the variants with `bcftools consensus` and parses the
    resulting fasta from stdout.

    Returns (run_successfully, parsed sequence dict or None).
    """
    sequence_dict = None

    gene_fasta = os.path.join(outdir, str(sequence_to_analyse + '.fasta'))

    run_successfully, stdout = index_fasta_samtools(reference_file, sequence_to_analyse, gene_fasta, False)
    if run_successfully:
        norm_command = ['bcftools', 'norm', '-c', 's', '-f', gene_fasta, '-Ov', compressed_vcf_file]
        run_successfully, stdout, stderr = utils.run_command_popen_communicate(norm_command, False, None, False)
        if run_successfully:
            consensus_command = ['bcftools', 'consensus', '-f', gene_fasta, compressed_vcf_file, '-H', '1']
            run_successfully, stdout, stderr = utils.run_command_popen_communicate(consensus_command, False, None,
                                                                                   False)
            if run_successfully:
                # bcftools consensus writes the fasta to stdout.
                sequence_dict = parse_fasta_in_memory(stdout)

    return run_successfully, sequence_dict
796 | 912 |
797 | 913 |
def create_sample_consensus_sequence(outdir, sequence_to_analyse, reference_file, variants, minimum_depth_presence,
                                     minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence,
                                     length_extra_seq):
    """
    Produce the per-sample consensus sequences for one gene.

    Resolves the true variants, strips those in the extra flanking sequence,
    then for each variant class ('correct', 'noMatter', 'alignment') writes a
    VCF and builds the corresponding consensus with bcftools, trimming the
    flanks off the returned sequence.

    Returns (run_successfully, number_multi_alleles, consensus dict keyed by
    variant class, number_diferences).
    """
    variants_correct, variants_no_matter, variants_alignment, multiple_alleles_found = \
        get_true_variants(variants, minimum_depth_presence, minimum_depth_call,
                          minimum_depth_frequency_dominant_allele, sequence)

    variants_correct, variants_no_matter, variants_alignment, number_multi_alleles, number_diferences = \
        cleanning_variants_extra_seq(variants_correct, variants_no_matter, variants_alignment,
                                     multiple_alleles_found, length_extra_seq, len(sequence))

    run_successfully = False
    consensus = {'correct': {}, 'noMatter': {}, 'alignment': {}}
    # Explicit mapping replaces the previous eval()-based lookup of local variable
    # names; insertion order keeps the original processing order.
    variants_by_type = {'correct': variants_correct, 'noMatter': variants_no_matter,
                        'alignment': variants_alignment}
    for variant_type, type_variants in variants_by_type.items():
        run_successfully, compressed_vcf_file = \
            write_variants_vcf(type_variants, outdir, sequence_to_analyse, variant_type)
        if run_successfully:
            run_successfully, sequence_dict = \
                compute_consensus_sequence(reference_file, sequence_to_analyse, compressed_vcf_file, outdir)
            if run_successfully:
                # Trim the extra flanking sequence off both ends of the consensus.
                consensus[variant_type] = \
                    {'header': sequence_dict['header'],
                     'sequence': sequence_dict['sequence'][length_extra_seq:len(sequence_dict['sequence']) -
                                                           length_extra_seq]}

    return run_successfully, number_multi_alleles, consensus, number_diferences
813 | 940 |
814 | 941 |
815 @utils.trace_unhandled_exceptions | 942 @utils.trace_unhandled_exceptions |
816 def analyse_sequence_data(bam_file, sequence_information, outdir, counter, reference_file, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, threads): | 943 def analyse_sequence_data(bam_file, sequence_information, outdir, counter, reference_file, length_extra_seq, |
944 minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele): | |
817 count_absent = None | 945 count_absent = None |
818 percentage_lowCoverage = None | 946 percentage_low_coverage = None |
819 meanCoverage = None | 947 mean_coverage = None |
820 number_diferences = 0 | 948 number_diferences = 0 |
949 number_multi_alleles = 0 | |
950 consensus_sequence = {'correct': {}, 'noMatter': {}, 'alignment': {}} | |
821 | 951 |
822 # Create vcf file (for multiple alleles check) | 952 # Create vcf file (for multiple alleles check) |
823 run_successfully, gene_vcf = create_vcf(bam_file, sequence_information['header'], outdir, counter, reference_file) | 953 run_successfully, gene_vcf = create_vcf(bam_file, sequence_information['header'], outdir, counter, reference_file) |
824 if run_successfully: | 954 if run_successfully: |
825 # Create coverage tab file | 955 # Create coverage tab file |
826 run_successfully, gene_coverage = compute_genome_coverage_data(bam_file, sequence_information['header'], outdir, counter) | 956 run_successfully, gene_coverage = \ |
957 compute_genome_coverage_data(bam_file, sequence_information['header'], outdir, counter) | |
827 | 958 |
828 if run_successfully: | 959 if run_successfully: |
829 variants = get_variants(gene_vcf) | 960 try: |
961 variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'], | |
962 encoding=sys.getdefaultencoding()) | |
963 except UnicodeDecodeError: | |
964 try: | |
965 print('It was found an enconding error while parsing the following VCF, but lets try forcing it to' | |
966 ' "utf_8" encoding: {}'.format(gene_vcf)) | |
967 variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'], | |
968 encoding='utf_8') | |
969 except UnicodeDecodeError: | |
970 print('It was found an enconding error while parsing the following VCF, but lets try forcing it to' | |
971 ' "latin_1" encoding: {}'.format(gene_vcf)) | |
972 variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'], | |
973 encoding='latin_1') | |
830 | 974 |
831 coverage = get_coverage(gene_coverage) | 975 coverage = get_coverage(gene_coverage) |
832 | 976 |
833 run_successfully, number_multi_alleles, consensus_sequence, number_diferences = create_sample_consensus_sequence(outdir, sequence_information['header'], reference_file, variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence_information['sequence'], length_extra_seq) | 977 run_successfully, number_multi_alleles, consensus_sequence, number_diferences = \ |
834 | 978 create_sample_consensus_sequence(outdir, sequence_information['header'], reference_file, variants, |
835 count_absent, percentage_lowCoverage, meanCoverage = get_coverage_report(coverage, sequence_information['length'], minimum_depth_presence, minimum_depth_call, length_extra_seq) | 979 minimum_depth_presence, minimum_depth_call, |
836 | 980 minimum_depth_frequency_dominant_allele, |
837 utils.saveVariableToPickle([run_successfully, counter, number_multi_alleles, count_absent, percentage_lowCoverage, meanCoverage, consensus_sequence, number_diferences], outdir, str('coverage_info.' + str(counter))) | 981 sequence_information['sequence'], length_extra_seq) |
982 | |
983 try: | |
984 count_absent, percentage_low_coverage, mean_coverage = \ | |
985 get_coverage_report(coverage, sequence_information['length'], minimum_depth_presence, | |
986 minimum_depth_call, length_extra_seq) | |
987 except KeyError: | |
988 print('ERROR: KeyError') | |
989 print(sequence_information) | |
990 raise KeyError | |
991 | |
992 utils.save_variable_to_pickle([run_successfully, counter, number_multi_alleles, count_absent, | |
993 percentage_low_coverage, mean_coverage, consensus_sequence, number_diferences], | |
994 outdir, str('coverage_info.' + str(counter))) | |
838 | 995 |
839 | 996 |
840 def clean_header(header): | 997 def clean_header(header): |
841 problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"] | 998 problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"] |
842 new_header = str(header) | 999 new_header = str(header) |
854 with open(fasta_file, 'rtU') as reader: | 1011 with open(fasta_file, 'rtU') as reader: |
855 blank_line_found = False | 1012 blank_line_found = False |
856 sequence_counter = 0 | 1013 sequence_counter = 0 |
857 temp_sequence_dict = {} | 1014 temp_sequence_dict = {} |
858 for line in reader: | 1015 for line in reader: |
859 line = line.splitlines()[0] | 1016 line = line.rstrip('\r\n') |
860 if len(line) > 0: | 1017 if len(line) > 0: |
861 if not blank_line_found: | 1018 if not blank_line_found: |
862 if line.startswith('>'): | 1019 if line.startswith('>'): |
863 if len(temp_sequence_dict) > 0: | 1020 if len(temp_sequence_dict) > 0: |
864 if temp_sequence_dict.values()[0]['length'] - 2 * length_extra_seq > 0: | 1021 if list(temp_sequence_dict.values())[0]['length'] - 2 * length_extra_seq > 0: |
865 sequence_dict[temp_sequence_dict.keys()[0]] = temp_sequence_dict.values()[0] | 1022 sequence_dict[list(temp_sequence_dict.keys())[0]] = list(temp_sequence_dict.values())[0] |
866 else: | 1023 else: |
867 print '{header} sequence ignored due to length <= 0'.format(header=temp_sequence_dict.values()[0]['header']) | 1024 print('{header} sequence ignored due to' |
868 del headers[temp_sequence_dict.values()[0]['header']] | 1025 ' length = 0'.format(header=list(temp_sequence_dict.values())[0]['header'])) |
1026 del headers[list(temp_sequence_dict.values())[0]['header']] | |
869 temp_sequence_dict = {} | 1027 temp_sequence_dict = {} |
870 | 1028 |
871 original_header, new_header = clean_header(line[1:]) | 1029 original_header, new_header = clean_header(line[1:]) |
872 if new_header in headers: | 1030 if new_header in headers: |
873 sys.exit('Found duplicated sequence headers: {original_header}'.format(original_header=original_header)) | 1031 sys.exit('Found duplicated sequence' |
1032 ' headers: {original_header}'.format(original_header=original_header)) | |
874 | 1033 |
875 sequence_counter += 1 | 1034 sequence_counter += 1 |
876 temp_sequence_dict[sequence_counter] = {'header': new_header, 'sequence': '', 'length': 0} | 1035 temp_sequence_dict[sequence_counter] = {'header': new_header, 'sequence': '', 'length': 0} |
877 headers[new_header] = str(original_header) | 1036 headers[new_header] = str(original_header) |
878 if new_header != original_header: | 1037 if new_header != original_header: |
879 headers_changed = True | 1038 headers_changed = True |
880 else: | 1039 else: |
881 temp_sequence_dict[sequence_counter]['sequence'] += line | 1040 temp_sequence_dict[sequence_counter]['sequence'] += line.replace(' ', '') |
882 temp_sequence_dict[sequence_counter]['length'] += len(line) | 1041 temp_sequence_dict[sequence_counter]['length'] += len(line.replace(' ', '')) |
883 else: | 1042 else: |
884 sys.exit('It was found a blank line between the fasta file above line ' + line) | 1043 sys.exit('It was found a blank line between the fasta file above line ' + line) |
885 else: | 1044 else: |
886 blank_line_found = True | 1045 blank_line_found = True |
887 | 1046 |
888 if len(temp_sequence_dict) > 0: | 1047 if len(temp_sequence_dict) > 0: |
889 if temp_sequence_dict.values()[0]['length'] - 2 * length_extra_seq > 0: | 1048 if list(temp_sequence_dict.values())[0]['length'] - 2 * length_extra_seq > 0: |
890 sequence_dict[temp_sequence_dict.keys()[0]] = temp_sequence_dict.values()[0] | 1049 sequence_dict[list(temp_sequence_dict.keys())[0]] = list(temp_sequence_dict.values())[0] |
891 else: | 1050 else: |
892 print '{header} sequence ignored due to length <= 0'.format(header=temp_sequence_dict.values()[0]['header']) | 1051 print('{header} sequence ignored due to' |
893 del headers[temp_sequence_dict.values()[0]['header']] | 1052 ' length <= 0'.format(header=list(temp_sequence_dict.values())[0]['header'])) |
1053 del headers[list(temp_sequence_dict.values())[0]['header']] | |
894 | 1054 |
895 return sequence_dict, headers, headers_changed | 1055 return sequence_dict, headers, headers_changed |
896 | 1056 |
897 | 1057 |
898 def determine_threads_2_use(number_sequences, threads): | 1058 def sequence_data(sample, reference_file, bam_file, outdir, threads, length_extra_seq, minimum_depth_presence, |
899 if number_sequences >= threads: | 1059 minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, not_write_consensus): |
900 return 1 | |
901 else: | |
902 return threads / number_sequences | |
903 | |
904 | |
905 def sequence_data(sample, reference_file, bam_file, outdir, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, notWriteConsensus): | |
906 sequence_data_outdir = os.path.join(outdir, 'sequence_data', '') | 1060 sequence_data_outdir = os.path.join(outdir, 'sequence_data', '') |
907 utils.removeDirectory(sequence_data_outdir) | 1061 utils.remove_directory(sequence_data_outdir) |
908 os.mkdir(sequence_data_outdir) | 1062 os.mkdir(sequence_data_outdir) |
909 | 1063 |
910 sequences, headers, headers_changed = get_sequence_information(reference_file, length_extra_seq) | 1064 sequences, headers, headers_changed = get_sequence_information(reference_file, length_extra_seq) |
911 | |
912 threads_2_use = determine_threads_2_use(len(sequences), threads) | |
913 | 1065 |
914 pool = multiprocessing.Pool(processes=threads) | 1066 pool = multiprocessing.Pool(processes=threads) |
915 for sequence_counter in sequences: | 1067 for sequence_counter in sequences: |
916 sequence_dir = os.path.join(sequence_data_outdir, str(sequence_counter), '') | 1068 sequence_dir = os.path.join(sequence_data_outdir, str(sequence_counter), '') |
917 utils.removeDirectory(sequence_dir) | 1069 utils.remove_directory(sequence_dir) |
918 os.makedirs(sequence_dir) | 1070 os.makedirs(sequence_dir) |
919 pool.apply_async(analyse_sequence_data, args=(bam_file, sequences[sequence_counter], sequence_dir, sequence_counter, reference_file, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, threads_2_use,)) | 1071 pool.apply_async(analyse_sequence_data, args=(bam_file, sequences[sequence_counter], sequence_dir, |
1072 sequence_counter, reference_file, length_extra_seq, | |
1073 minimum_depth_presence, minimum_depth_call, | |
1074 minimum_depth_frequency_dominant_allele,)) | |
920 pool.close() | 1075 pool.close() |
921 pool.join() | 1076 pool.join() |
922 | 1077 |
923 run_successfully, sample_data, consensus_files, consensus_sequences = gather_data_together(sample, sequence_data_outdir, sequences, outdir.rsplit('/', 2)[0], debug_mode_true, length_extra_seq, notWriteConsensus) | 1078 run_successfully, sample_data, consensus_files, consensus_sequences = \ |
1079 gather_data_together(sample, sequence_data_outdir, sequences, outdir.rsplit('/', 2)[0], debug_mode_true, | |
1080 length_extra_seq, not_write_consensus) | |
924 | 1081 |
925 return run_successfully, sample_data, consensus_files, consensus_sequences | 1082 return run_successfully, sample_data, consensus_files, consensus_sequences |
926 | 1083 |
927 | 1084 |
928 def chunkstring(string, length): | 1085 def chunkstring(string, length): |
939 for line in fasta_sequence_lines: | 1096 for line in fasta_sequence_lines: |
940 writer.write(line + '\n') | 1097 writer.write(line + '\n') |
941 return consensus_files | 1098 return consensus_files |
942 | 1099 |
943 | 1100 |
944 def gather_data_together(sample, data_directory, sequences_information, outdir, debug_mode_true, length_extra_seq, notWriteConsensus): | 1101 def gather_data_together(sample, data_directory, sequences_information, outdir, debug_mode_true, length_extra_seq, |
1102 not_write_consensus): | |
945 run_successfully = True | 1103 run_successfully = True |
946 counter = 0 | 1104 counter = 0 |
947 sample_data = {} | 1105 sample_data = {} |
948 | 1106 |
949 consensus_files = None | 1107 consensus_files = None |
950 consensus_sequences_together = {'correct': {}, 'noMatter': {}, 'alignment': {}} | 1108 consensus_sequences_together = {'correct': {}, 'noMatter': {}, 'alignment': {}} |
951 | 1109 |
952 write_consensus_first_time = True | 1110 write_consensus_first_time = True |
953 | 1111 |
954 genes_directories = [d for d in os.listdir(data_directory) if not d.startswith('.') and os.path.isdir(os.path.join(data_directory, d, ''))] | 1112 genes_directories = [d for d in os.listdir(data_directory) if |
1113 not d.startswith('.') and | |
1114 os.path.isdir(os.path.join(data_directory, d, ''))] | |
955 for gene_dir in genes_directories: | 1115 for gene_dir in genes_directories: |
956 gene_dir_path = os.path.join(data_directory, gene_dir, '') | 1116 gene_dir_path = os.path.join(data_directory, gene_dir, '') |
957 | 1117 |
958 files = [f for f in os.listdir(gene_dir_path) if not f.startswith('.') and os.path.isfile(os.path.join(gene_dir_path, f))] | 1118 files = [f for f in os.listdir(gene_dir_path) if |
1119 not f.startswith('.') and | |
1120 os.path.isfile(os.path.join(gene_dir_path, f))] | |
959 for file_found in files: | 1121 for file_found in files: |
960 if file_found.startswith('coverage_info.') and file_found.endswith('.pkl'): | 1122 if file_found.startswith('coverage_info.') and file_found.endswith('.pkl'): |
961 file_path = os.path.join(gene_dir_path, file_found) | 1123 file_path = os.path.join(gene_dir_path, file_found) |
962 | 1124 |
963 if run_successfully: | 1125 if run_successfully: |
964 run_successfully, sequence_counter, multiple_alleles_found, count_absent, percentage_lowCoverage, meanCoverage, consensus_sequence, number_diferences = utils.extractVariableFromPickle(file_path) | 1126 run_successfully, sequence_counter, multiple_alleles_found, count_absent, percentage_low_coverage, \ |
965 | 1127 mean_coverage, consensus_sequence, \ |
966 if not notWriteConsensus: | 1128 number_diferences = utils.extract_variable_from_pickle(file_path) |
1129 | |
1130 if not not_write_consensus: | |
967 for consensus_type in consensus_sequence: | 1131 for consensus_type in consensus_sequence: |
968 consensus_sequences_together[consensus_type][sequence_counter] = {'header': consensus_sequence[consensus_type]['header'], 'sequence': consensus_sequence[consensus_type]['sequence']} | 1132 consensus_sequences_together[consensus_type][sequence_counter] = \ |
1133 {'header': consensus_sequence[consensus_type]['header'], | |
1134 'sequence': consensus_sequence[consensus_type]['sequence']} | |
969 | 1135 |
970 if write_consensus_first_time: | 1136 if write_consensus_first_time: |
971 for consensus_type in ['correct', 'noMatter', 'alignment']: | 1137 for consensus_type in ['correct', 'noMatter', 'alignment']: |
972 file_to_remove = os.path.join(outdir, str(sample + '.' + consensus_type + '.fasta')) | 1138 file_to_remove = os.path.join(outdir, str(sample + '.' + consensus_type + '.fasta')) |
973 if os.path.isfile(file_to_remove): | 1139 if os.path.isfile(file_to_remove): |
975 write_consensus_first_time = False | 1141 write_consensus_first_time = False |
976 consensus_files = write_consensus(outdir, sample, consensus_sequence) | 1142 consensus_files = write_consensus(outdir, sample, consensus_sequence) |
977 | 1143 |
978 gene_identity = 0 | 1144 gene_identity = 0 |
979 if sequences_information[sequence_counter]['length'] - 2 * length_extra_seq - count_absent > 0: | 1145 if sequences_information[sequence_counter]['length'] - 2 * length_extra_seq - count_absent > 0: |
980 gene_identity = 100 - (float(number_diferences) / (sequences_information[sequence_counter]['length'] - 2 * length_extra_seq - count_absent)) * 100 | 1146 gene_identity = 100 - \ |
981 | 1147 (float(number_diferences) / |
982 sample_data[sequence_counter] = {'header': sequences_information[sequence_counter]['header'], 'gene_coverage': 100 - (float(count_absent) / (sequences_information[sequence_counter]['length'] - 2 * length_extra_seq)) * 100, 'gene_low_coverage': percentage_lowCoverage, 'gene_number_positions_multiple_alleles': multiple_alleles_found, 'gene_mean_read_coverage': meanCoverage, 'gene_identity': gene_identity} | 1148 (sequences_information[sequence_counter]['length'] - 2 * length_extra_seq - |
1149 count_absent)) * 100 | |
1150 | |
1151 sample_data[sequence_counter] = \ | |
1152 {'header': sequences_information[sequence_counter]['header'], | |
1153 'gene_coverage': 100 - (float(count_absent) / | |
1154 (sequences_information[sequence_counter]['length'] - 2 * | |
1155 length_extra_seq)) * 100, | |
1156 'gene_low_coverage': percentage_low_coverage, | |
1157 'gene_number_positions_multiple_alleles': multiple_alleles_found, | |
1158 'gene_mean_read_coverage': mean_coverage, | |
1159 'gene_identity': gene_identity} | |
983 counter += 1 | 1160 counter += 1 |
984 | 1161 |
985 if not debug_mode_true: | 1162 if not debug_mode_true: |
986 utils.removeDirectory(gene_dir_path) | 1163 utils.remove_directory(gene_dir_path) |
987 | 1164 |
988 if counter != len(sequences_information): | 1165 if counter != len(sequences_information): |
989 run_successfully = False | 1166 run_successfully = False |
990 | 1167 |
991 return run_successfully, sample_data, consensus_files, consensus_sequences_together | 1168 return run_successfully, sample_data, consensus_files, consensus_sequences_together |
993 | 1170 |
994 rematch_timer = functools.partial(utils.timer, name='ReMatCh module') | 1171 rematch_timer = functools.partial(utils.timer, name='ReMatCh module') |
995 | 1172 |
996 | 1173 |
997 @rematch_timer | 1174 @rematch_timer |
998 def runRematchModule(sample, fastq_files, reference_file, threads, outdir, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage, conserved_True, debug_mode_true, numMapLoc, minimum_gene_identity, rematch_run, softClip_baseQuality, softClip_recodeRun, reference_dict, softClip_cigarFlagRecode, bowtieOPT, gene_list_reference, notWriteConsensus): | 1175 def run_rematch_module(sample, fastq_files, reference_file, threads, outdir, length_extra_seq, minimum_depth_presence, |
1176 minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage, | |
1177 debug_mode_true, num_map_loc, minimum_gene_identity, rematch_run, | |
1178 soft_clip_base_quality, soft_clip_recode_run, reference_dict, soft_clip_cigar_flag_recode, | |
1179 bowtie_algorithm, bowtie_opt, gene_list_reference, not_write_consensus, clean_run=True): | |
999 rematch_folder = os.path.join(outdir, 'rematch_module', '') | 1180 rematch_folder = os.path.join(outdir, 'rematch_module', '') |
1000 utils.removeDirectory(rematch_folder) | 1181 |
1182 utils.remove_directory(rematch_folder) | |
1001 os.mkdir(rematch_folder) | 1183 os.mkdir(rematch_folder) |
1002 | 1184 |
1003 # Map reads | 1185 # Map reads |
1004 run_successfully, bam_file, reference_file = mapping_reads(fastq_files, reference_file, threads, rematch_folder, conserved_True, numMapLoc, rematch_run, softClip_baseQuality, softClip_recodeRun, reference_dict, softClip_cigarFlagRecode, bowtieOPT) | 1186 run_successfully, bam_file, reference_file = mapping_reads(fastq_files=fastq_files, reference_file=reference_file, |
1005 | 1187 threads=threads, outdir=rematch_folder, |
1188 num_map_loc=num_map_loc, rematch_run=rematch_run, | |
1189 soft_clip_base_quality=soft_clip_base_quality, | |
1190 soft_clip_recode_run=soft_clip_recode_run, | |
1191 reference_dict=reference_dict, | |
1192 soft_clip_cigar_flag_recode=soft_clip_cigar_flag_recode, | |
1193 bowtie_algorithm=bowtie_algorithm, bowtie_opt=bowtie_opt, | |
1194 clean_run=clean_run) | |
1006 if run_successfully: | 1195 if run_successfully: |
1007 # Index reference file | 1196 # Index reference file |
1008 run_successfully, stdout = index_fasta_samtools(reference_file, None, None, True) | 1197 run_successfully, stdout = index_fasta_samtools(reference_file, None, None, True) |
1009 if run_successfully: | 1198 if run_successfully: |
1010 print 'Analysing alignment data' | 1199 print('Analysing alignment data') |
1011 run_successfully, sample_data, consensus_files, consensus_sequences = sequence_data(sample, reference_file, bam_file, rematch_folder, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, notWriteConsensus) | 1200 run_successfully, sample_data, consensus_files, consensus_sequences = \ |
1201 sequence_data(sample, reference_file, bam_file, rematch_folder, threads, length_extra_seq, | |
1202 minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, | |
1203 debug_mode_true, not_write_consensus) | |
1012 | 1204 |
1013 if run_successfully: | 1205 if run_successfully: |
1014 print 'Writing report file' | 1206 print('Writing report file') |
1015 number_absent_genes = 0 | 1207 number_absent_genes = 0 |
1016 number_genes_multiple_alleles = 0 | 1208 number_genes_multiple_alleles = 0 |
1017 mean_sample_coverage = 0 | 1209 mean_sample_coverage = 0 |
1018 with open(os.path.join(outdir, 'rematchModule_report.txt'), 'wt') as writer: | 1210 with open(os.path.join(outdir, 'rematchModule_report.txt'), 'wt') as writer: |
1019 writer.write('\t'.join(['#gene', 'percentage_gene_coverage', 'gene_mean_read_coverage', 'percentage_gene_low_coverage', 'number_positions_multiple_alleles', 'percentage_gene_identity']) + '\n') | 1211 print('\n'.join(outdir, 'rematchModule_report.txt')) |
1212 writer.write('\t'.join(['#gene', 'percentage_gene_coverage', 'gene_mean_read_coverage', | |
1213 'percentage_gene_low_coverage', 'number_positions_multiple_alleles', | |
1214 'percentage_gene_identity']) + '\n') | |
1020 for i in range(1, len(sample_data) + 1): | 1215 for i in range(1, len(sample_data) + 1): |
1021 writer.write('\t'.join([gene_list_reference[sample_data[i]['header']], str(round(sample_data[i]['gene_coverage'], 2)), str(round(sample_data[i]['gene_mean_read_coverage'], 2)), str(round(sample_data[i]['gene_low_coverage'], 2)), str(sample_data[i]['gene_number_positions_multiple_alleles']), str(round(sample_data[i]['gene_identity'], 2))]) + '\n') | 1216 writer.write('\t'.join([gene_list_reference[sample_data[i]['header']], |
1022 | 1217 str(round(sample_data[i]['gene_coverage'], 2)), |
1023 if sample_data[i]['gene_coverage'] < minimum_gene_coverage or sample_data[i]['gene_identity'] < minimum_gene_identity: | 1218 str(round(sample_data[i]['gene_mean_read_coverage'], 2)), |
1219 str(round(sample_data[i]['gene_low_coverage'], 2)), | |
1220 str(sample_data[i]['gene_number_positions_multiple_alleles']), | |
1221 str(round(sample_data[i]['gene_identity'], 2))]) + '\n') | |
1222 | |
1223 if sample_data[i]['gene_coverage'] < minimum_gene_coverage or \ | |
1224 sample_data[i]['gene_identity'] < minimum_gene_identity: | |
1024 number_absent_genes += 1 | 1225 number_absent_genes += 1 |
1025 else: | 1226 else: |
1026 mean_sample_coverage += sample_data[i]['gene_mean_read_coverage'] | 1227 mean_sample_coverage += sample_data[i]['gene_mean_read_coverage'] |
1027 if sample_data[i]['gene_number_positions_multiple_alleles'] > 0: | 1228 if sample_data[i]['gene_number_positions_multiple_alleles'] > 0: |
1028 number_genes_multiple_alleles += 1 | 1229 number_genes_multiple_alleles += 1 |
1029 | 1230 |
1030 if len(sample_data) - number_absent_genes > 0: | 1231 if len(sample_data) - number_absent_genes > 0: |
1031 mean_sample_coverage = float(mean_sample_coverage) / float(len(sample_data) - number_absent_genes) | 1232 mean_sample_coverage = \ |
1233 float(mean_sample_coverage) / float(len(sample_data) - number_absent_genes) | |
1032 else: | 1234 else: |
1033 mean_sample_coverage = 0 | 1235 mean_sample_coverage = 0 |
1034 | 1236 |
1035 writer.write('\n'.join(['#general', '>number_absent_genes', str(number_absent_genes), '>number_genes_multiple_alleles', str(number_genes_multiple_alleles), '>mean_sample_coverage', str(round(mean_sample_coverage, 2))]) + '\n') | 1237 writer.write('\n'.join(['#general', |
1036 | 1238 '>number_absent_genes', str(number_absent_genes), |
1037 print '\n'.join([str('number_absent_genes: ' + str(number_absent_genes)), str('number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles)), str('mean_sample_coverage: ' + str(round(mean_sample_coverage, 2)))]) | 1239 '>number_genes_multiple_alleles', str(number_genes_multiple_alleles), |
1240 '>mean_sample_coverage', str(round(mean_sample_coverage, 2))]) + '\n') | |
1241 | |
1242 print('\n'.join([str('number_absent_genes: ' + str(number_absent_genes)), | |
1243 str('number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles)), | |
1244 str('mean_sample_coverage: ' + str(round(mean_sample_coverage, 2)))])) | |
1038 | 1245 |
1039 if not debug_mode_true: | 1246 if not debug_mode_true: |
1040 utils.removeDirectory(rematch_folder) | 1247 utils.remove_directory(rematch_folder) |
1041 | 1248 |
1042 return run_successfully, sample_data if 'sample_data' in locals() else None, {'number_absent_genes': number_absent_genes if 'number_absent_genes' in locals() else None, 'number_genes_multiple_alleles': number_genes_multiple_alleles if 'number_genes_multiple_alleles' in locals() else None, 'mean_sample_coverage': round(mean_sample_coverage, 2) if 'mean_sample_coverage' in locals() else None}, consensus_files if 'consensus_files' in locals() else None, consensus_sequences if 'consensus_sequences' in locals() else None | 1249 return run_successfully, sample_data if 'sample_data' in locals() else None, \ |
1250 {'number_absent_genes': number_absent_genes if 'number_absent_genes' in locals() else None, | |
1251 'number_genes_multiple_alleles': number_genes_multiple_alleles if | |
1252 'number_genes_multiple_alleles' in locals() else None, | |
1253 'mean_sample_coverage': round(mean_sample_coverage, 2) if 'mean_sample_coverage' in locals() else None}, \ | |
1254 consensus_files if 'consensus_files' in locals() else None,\ | |
1255 consensus_sequences if 'consensus_sequences' in locals() else None |