Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
comparison scripts/ReMatCh/modules/rematch_module.py @ 0:965517909457 draft
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
| author | cstrittmatter |
|---|---|
| date | Wed, 22 Jan 2020 08:41:44 -0500 |
| parents | |
| children | 0cbed1c0a762 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:965517909457 |
|---|---|
| 1 import os.path | |
| 2 import multiprocessing | |
| 3 import utils | |
| 4 import functools | |
| 5 import sys | |
| 6 import pickle | |
| 7 | |
| 8 | |
| 9 def index_fasta_samtools(fasta, region_None, region_outfile_none, print_comand_True): | |
| 10 command = ['samtools', 'faidx', fasta, '', '', ''] | |
| 11 shell_true = False | |
| 12 if region_None is not None: | |
| 13 command[3] = region_None | |
| 14 if region_outfile_none is not None: | |
| 15 command[4] = '>' | |
| 16 command[5] = region_outfile_none | |
| 17 shell_true = True | |
| 18 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, shell_true, None, print_comand_True) | |
| 19 return run_successfully, stdout | |
| 20 | |
| 21 | |
| 22 # Indexing reference file using Bowtie2 | |
| 23 def indexSequenceBowtie2(referenceFile, threads): | |
| 24 if os.path.isfile(str(referenceFile + '.1.bt2')): | |
| 25 run_successfully = True | |
| 26 else: | |
| 27 command = ['bowtie2-build', '--threads', str(threads), referenceFile, referenceFile] | |
| 28 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | |
| 29 return run_successfully | |
| 30 | |
| 31 | |
| 32 # Mapping with Bowtie2 | |
| 33 def mappingBowtie2(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc, bowtieOPT): | |
| 34 sam_file = os.path.join(outdir, str('alignment.sam')) | |
| 35 | |
| 36 # Index reference file | |
| 37 run_successfully = indexSequenceBowtie2(referenceFile, threads) | |
| 38 | |
| 39 if run_successfully: | |
| 40 command = ['bowtie2', '-k', str(numMapLoc), '-q', '', '--threads', str(threads), '-x', referenceFile, '', '--no-unal', '', '-S', sam_file] | |
| 41 | |
| 42 if len(fastq_files) == 1: | |
| 43 command[9] = '-U ' + fastq_files[0] | |
| 44 elif len(fastq_files) == 2: | |
| 45 command[9] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1] | |
| 46 else: | |
| 47 return False, None | |
| 48 | |
| 49 if conserved_True: | |
| 50 command[4] = '--sensitive' | |
| 51 else: | |
| 52 command[4] = '--very-sensitive-local' | |
| 53 | |
| 54 if bowtieOPT is not None: | |
| 55 command[11] = bowtieOPT | |
| 56 | |
| 57 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | |
| 58 | |
| 59 if not run_successfully: | |
| 60 sam_file = None | |
| 61 | |
| 62 return run_successfully, sam_file | |
| 63 | |
| 64 | |
| 65 def split_cigar(cigar): | |
| 66 cigars = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X'] | |
| 67 | |
| 68 splited_cigars = [] | |
| 69 numbers = '' | |
| 70 for char in cigar: | |
| 71 if char not in cigars: | |
| 72 numbers += char | |
| 73 else: | |
| 74 splited_cigars.append([int(numbers), char]) | |
| 75 numbers = '' | |
| 76 | |
| 77 return splited_cigars | |
| 78 | |
| 79 | |
| 80 def recode_cigar_based_on_base_quality(cigar, bases_quality, softClip_baseQuality, mapping_position, direct_strand_true, softClip_cigarFlagRecode): | |
| 81 cigar = split_cigar(cigar) | |
| 82 soft_left = [] | |
| 83 soft_right = [] | |
| 84 cigar_flags_for_reads_length = ('M', 'I', 'S', '=', 'X') | |
| 85 read_length_without_right_s = sum([cigar_part[0] for cigar_part in cigar if cigar_part[1] in cigar_flags_for_reads_length]) - (cigar[len(cigar) - 1][0] if cigar[len(cigar) - 1][1] == 'S' else 0) | |
| 86 for x, base in enumerate(bases_quality): | |
| 87 if ord(base) - 33 >= softClip_baseQuality: | |
| 88 if x <= cigar[0][0] - 1: | |
| 89 if cigar[0][1] == 'S': | |
| 90 soft_left.append(x) | |
| 91 elif x > read_length_without_right_s - 1: | |
| 92 if cigar[len(cigar) - 1][1] == 'S': | |
| 93 soft_right.append(x) | |
| 94 | |
| 95 left_changed = (False, 0) | |
| 96 if len(soft_left) > 0: | |
| 97 soft_left = min(soft_left) + 1 | |
| 98 if soft_left == 1: | |
| 99 cigar = [[cigar[0][0], softClip_cigarFlagRecode]] + cigar[1:] | |
| 100 left_changed = (True, cigar[0][0]) | |
| 101 elif cigar[0][0] - soft_left > 0: | |
| 102 cigar = [[soft_left, 'S']] + [[cigar[0][0] - soft_left, softClip_cigarFlagRecode]] + cigar[1:] | |
| 103 left_changed = (True, cigar[0][0] - soft_left) | |
| 104 | |
| 105 right_changed = (False, 0) | |
| 106 if len(soft_right) > 0: | |
| 107 soft_right = max(soft_right) + 1 | |
| 108 cigar = cigar[:-1] | |
| 109 if soft_right - read_length_without_right_s > 0: | |
| 110 cigar.append([soft_right - read_length_without_right_s, softClip_cigarFlagRecode]) | |
| 111 right_changed = (True, soft_right - read_length_without_right_s) | |
| 112 if len(bases_quality) - soft_right > 0: | |
| 113 cigar.append([len(bases_quality) - soft_right, 'S']) | |
| 114 | |
| 115 if left_changed[0]: | |
| 116 # if direct_strand_true: | |
| 117 mapping_position = mapping_position - left_changed[1] | |
| 118 # if right_changed[0]: | |
| 119 # if not direct_strand_true: | |
| 120 # mapping_position = mapping_position + right_changed[1] | |
| 121 | |
| 122 return ''.join([str(cigar_part[0]) + cigar_part[1] for cigar_part in cigar]), str(mapping_position) | |
| 123 | |
| 124 | |
| 125 def verify_is_forward_read(sam_flag_bit): | |
| 126 # 64 = 1000000 | |
| 127 forward_read = False | |
| 128 bit = format(sam_flag_bit, 'b').zfill(7) | |
| 129 if bit[-7] == '1': | |
| 130 forward_read = True | |
| 131 return forward_read | |
| 132 | |
| 133 | |
| 134 def verify_mapped_direct_strand(sam_flag_bit): | |
| 135 # 16 = 10000 -> mapped in reverse strand | |
| 136 direct_strand = False | |
| 137 bit = format(sam_flag_bit, 'b').zfill(5) | |
| 138 if bit[-5] == '0': | |
| 139 direct_strand = True | |
| 140 return direct_strand | |
| 141 | |
| 142 | |
| 143 def verify_mapped_tip(reference_length, mapping_position, read_length, cigar): | |
| 144 tip = False | |
| 145 if 'S' in cigar: | |
| 146 cigar = split_cigar(cigar) | |
| 147 if cigar[0][1] == 'S': | |
| 148 if mapping_position - cigar[0][0] < 0: | |
| 149 tip = True | |
| 150 if cigar[len(cigar) - 1][1] == 'S': | |
| 151 if mapping_position + cigar[len(cigar) - 1][0] > reference_length: | |
| 152 tip = True | |
| 153 return tip | |
| 154 | |
| 155 | |
| 156 def change_sam_flag_bit_mapped_reverse_strand_2_direct_strand(sam_flag_bit): | |
| 157 bit = list(format(sam_flag_bit, 'b').zfill(5)) | |
| 158 bit[-5] = '0' | |
| 159 return int(''.join(bit), 2) | |
| 160 | |
| 161 | |
| 162 def change_sam_flag_bit_mate_reverse_strand_2_direct_strand(sam_flag_bit): | |
| 163 bit = list(format(sam_flag_bit, 'b').zfill(6)) | |
| 164 bit[-6] = '0' | |
| 165 return int(''.join(bit), 2) | |
| 166 | |
| 167 | |
| 168 def move_read_mapped_reverse_strand_2_direct_strand(seq, bases_quality, sam_flag_bit, cigar): | |
| 169 seq = utils.reverse_complement(seq) | |
| 170 bases_quality = ''.join(reversed(list(bases_quality))) | |
| 171 sam_flag_bit = change_sam_flag_bit_mapped_reverse_strand_2_direct_strand(sam_flag_bit) | |
| 172 cigar = ''.join([str(cigar_part[0]) + cigar_part[1] for cigar_part in reversed(split_cigar(cigar))]) | |
| 173 return seq, bases_quality, str(sam_flag_bit), cigar | |
| 174 | |
| 175 | |
| 176 @utils.trace_unhandled_exceptions | |
| 177 def parallelized_recode_soft_clipping(line_collection, pickleFile, softClip_baseQuality, sequences_length, softClip_cigarFlagRecode): | |
| 178 lines_sam = [] | |
| 179 for line in line_collection: | |
| 180 line = line.splitlines()[0] | |
| 181 if len(line) > 0: | |
| 182 if line.startswith('@'): | |
| 183 lines_sam.append(line) | |
| 184 else: | |
| 185 line = line.split('\t') | |
| 186 if not verify_mapped_tip(sequences_length[line[2]], int(line[3]), len(line[9]), line[5]): | |
| 187 line[5], line[3] = recode_cigar_based_on_base_quality(line[5], line[10], softClip_baseQuality, int(line[3]), verify_mapped_direct_strand(int(line[1])), softClip_cigarFlagRecode) | |
| 188 lines_sam.append('\t'.join(line)) | |
| 189 with open(pickleFile, 'wb') as writer: | |
| 190 pickle.dump(lines_sam, writer) | |
| 191 | |
| 192 | |
| 193 def recode_soft_clipping_from_sam(sam_file, outdir, threads, softClip_baseQuality, reference_dict, softClip_cigarFlagRecode): | |
| 194 pickle_files = [] | |
| 195 sequences_length = {} | |
| 196 for x, seq_info in reference_dict.items(): | |
| 197 sequences_length[seq_info['header']] = seq_info['length'] | |
| 198 | |
| 199 with open(sam_file, 'rtU') as reader: | |
| 200 pool = multiprocessing.Pool(processes=threads) | |
| 201 line_collection = [] | |
| 202 x = 0 | |
| 203 for x, line in enumerate(reader): | |
| 204 line_collection.append(line) | |
| 205 if x % 10000 == 0: | |
| 206 pickleFile = os.path.join(outdir, 'remove_soft_clipping.' + str(x) + '.pkl') | |
| 207 pickle_files.append(pickleFile) | |
| 208 pool.apply_async(parallelized_recode_soft_clipping, args=(line_collection, pickleFile, softClip_baseQuality, sequences_length, softClip_cigarFlagRecode,)) | |
| 209 line_collection = [] | |
| 210 if len(line_collection) > 0: | |
| 211 pickleFile = os.path.join(outdir, 'remove_soft_clipping.' + str(x) + '.pkl') | |
| 212 pickle_files.append(pickleFile) | |
| 213 pool.apply_async(parallelized_recode_soft_clipping, args=(line_collection, pickleFile, softClip_baseQuality, sequences_length, softClip_cigarFlagRecode,)) | |
| 214 line_collection = [] | |
| 215 pool.close() | |
| 216 pool.join() | |
| 217 | |
| 218 os.remove(sam_file) | |
| 219 | |
| 220 new_sam_file = os.path.join(outdir, 'alignment_with_soft_clipping_recoded.sam') | |
| 221 with open(new_sam_file, 'wt') as writer: | |
| 222 for pickleFile in pickle_files: | |
| 223 if os.path.isfile(pickleFile): | |
| 224 lines_sam = None | |
| 225 with open(pickleFile, 'rb') as reader: | |
| 226 lines_sam = pickle.load(reader) | |
| 227 if lines_sam is not None: | |
| 228 for line in lines_sam: | |
| 229 writer.write(line + '\n') | |
| 230 os.remove(pickleFile) | |
| 231 | |
| 232 return new_sam_file | |
| 233 | |
| 234 | |
| 235 # Sort alignment file | |
| 236 def sortAlignment(alignment_file, output_file, sortByName_True, threads): | |
| 237 outFormat_string = os.path.splitext(output_file)[1][1:].lower() | |
| 238 command = ['samtools', 'sort', '-o', output_file, '-O', outFormat_string, '', '-@', str(threads), alignment_file] | |
| 239 if sortByName_True: | |
| 240 command[6] = '-n' | |
| 241 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | |
| 242 if not run_successfully: | |
| 243 output_file = None | |
| 244 return run_successfully, output_file | |
| 245 | |
| 246 | |
| 247 # Index alignment file | |
| 248 def indexAlignment(alignment_file): | |
| 249 command = ['samtools', 'index', alignment_file] | |
| 250 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | |
| 251 return run_successfully | |
| 252 | |
| 253 | |
| 254 def mapping_reads(fastq_files, reference_file, threads, outdir, conserved_True, numMapLoc, rematch_run, softClip_baseQuality, softClip_recodeRun, reference_dict, softClip_cigarFlagRecode, bowtieOPT): | |
| 255 # Create a symbolic link to the reference_file | |
| 256 reference_link = os.path.join(outdir, os.path.basename(reference_file)) | |
| 257 os.symlink(reference_file, reference_link) | |
| 258 | |
| 259 bam_file = None | |
| 260 # Mapping reads using Bowtie2 | |
| 261 run_successfully, sam_file = mappingBowtie2(fastq_files, reference_link, threads, outdir, conserved_True, numMapLoc, bowtieOPT) | |
| 262 | |
| 263 if run_successfully: | |
| 264 # Remove soft clipping | |
| 265 if rematch_run == softClip_recodeRun or softClip_recodeRun == 'both': | |
| 266 print 'Recoding soft clipped regions' | |
| 267 sam_file = recode_soft_clipping_from_sam(sam_file, outdir, threads, softClip_baseQuality, reference_dict, softClip_cigarFlagRecode) | |
| 268 | |
| 269 # Convert sam to bam and sort bam | |
| 270 run_successfully, bam_file = sortAlignment(sam_file, str(os.path.splitext(sam_file)[0] + '.bam'), False, threads) | |
| 271 | |
| 272 if run_successfully: | |
| 273 os.remove(sam_file) | |
| 274 # Index bam | |
| 275 run_successfully = indexAlignment(bam_file) | |
| 276 | |
| 277 return run_successfully, bam_file, reference_link | |
| 278 | |
| 279 | |
| 280 def create_vcf(bam_file, sequence_to_analyse, outdir, counter, reference_file): | |
| 281 gene_vcf = os.path.join(outdir, 'samtools_mpileup.sequence_' + str(counter) + '.vcf') | |
| 282 | |
| 283 command = ['samtools', 'mpileup', '--count-orphans', '--no-BAQ', '--min-BQ', '0', '--min-MQ', str(7), '--fasta-ref', reference_file, '--region', sequence_to_analyse, '--output', gene_vcf, '--VCF', '--uncompressed', '--output-tags', 'INFO/AD,AD,DP', bam_file] | |
| 284 | |
| 285 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False) | |
| 286 if not run_successfully: | |
| 287 gene_vcf = None | |
| 288 return run_successfully, gene_vcf | |
| 289 | |
| 290 | |
| 291 # Read vcf file | |
| 292 class Vcf(): | |
| 293 def __init__(self, vcfFile): | |
| 294 self.vcf = open(vcfFile, 'rtU') | |
| 295 self.line_read = self.vcf.readline() | |
| 296 while self.line_read.startswith('#'): | |
| 297 self.line_read = self.vcf.readline() | |
| 298 self.line = self.line_read | |
| 299 | |
| 300 def readline(self): | |
| 301 self.line_stored = self.line | |
| 302 self.line = self.vcf.readline() | |
| 303 return self.line_stored | |
| 304 | |
| 305 def close(self): | |
| 306 self.vcf.close() | |
| 307 | |
| 308 | |
| 309 def get_variants(gene_vcf): | |
| 310 variants = {} | |
| 311 | |
| 312 vfc_file = Vcf(gene_vcf) | |
| 313 line = vfc_file.readline() | |
| 314 while len(line) > 0: | |
| 315 fields = line.splitlines()[0].split('\t') | |
| 316 if len(fields) > 0: | |
| 317 fields[1] = int(fields[1]) | |
| 318 | |
| 319 info_field = {} | |
| 320 for i in fields[7].split(';'): | |
| 321 i = i.split('=') | |
| 322 if len(i) > 1: | |
| 323 info_field[i[0]] = i[1] | |
| 324 else: | |
| 325 info_field[i[0]] = None | |
| 326 | |
| 327 format_field = {} | |
| 328 format_field_name = fields[8].split(':') | |
| 329 format_data = fields[9].split(':') | |
| 330 | |
| 331 for i in range(0, len(format_data)): | |
| 332 format_field[format_field_name[i]] = format_data[i].split(',') | |
| 333 | |
| 334 fields_to_store = {'REF': fields[3], 'ALT': fields[4].split(','), 'info': info_field, 'format': format_field} | |
| 335 if fields[1] in variants: | |
| 336 variants[fields[1]][len(variants[fields[1]])] = fields_to_store | |
| 337 else: | |
| 338 variants[fields[1]] = {0: fields_to_store} | |
| 339 | |
| 340 line = vfc_file.readline() | |
| 341 vfc_file.close() | |
| 342 | |
| 343 return variants | |
| 344 | |
| 345 | |
| 346 def indel_entry(variant_position): | |
| 347 entry_with_indel = [] | |
| 348 entry_with_snp = None | |
| 349 for i in variant_position: | |
| 350 keys = variant_position[i]['info'].keys() | |
| 351 if 'INDEL' in keys: | |
| 352 entry_with_indel.append(i) | |
| 353 else: | |
| 354 entry_with_snp = i | |
| 355 | |
| 356 return entry_with_indel, entry_with_snp | |
| 357 | |
| 358 | |
| 359 def get_alt_noMatter(variant_position, indel_true): | |
| 360 dp = sum(map(int, variant_position['format']['AD'])) | |
| 361 index_alleles_sorted_position = sorted(zip(map(int, variant_position['format']['AD']), range(0, len(variant_position['format']['AD']))), reverse=True) | |
| 362 index_dominant_allele = None | |
| 363 if not indel_true: | |
| 364 ad_idv = index_alleles_sorted_position[0][0] | |
| 365 | |
| 366 if len([x for x in index_alleles_sorted_position if x[0] == ad_idv]) > 1: | |
| 367 index_alleles_sorted_position = sorted([x for x in index_alleles_sorted_position if x[0] == ad_idv]) | |
| 368 | |
| 369 index_dominant_allele = index_alleles_sorted_position[0][1] | |
| 370 if index_dominant_allele == 0: | |
| 371 alt = '.' | |
| 372 else: | |
| 373 alt = variant_position['ALT'][index_dominant_allele - 1] | |
| 374 | |
| 375 else: | |
| 376 ad_idv = int(variant_position['info']['IDV']) | |
| 377 | |
| 378 if float(ad_idv) / float(dp) >= 0.5: | |
| 379 if len([x for x in index_alleles_sorted_position if x[0] == index_alleles_sorted_position[0][0]]) > 1: | |
| 380 index_alleles_sorted_position = sorted([x for x in index_alleles_sorted_position if x[0] == index_alleles_sorted_position[0][0]]) | |
| 381 | |
| 382 index_dominant_allele = index_alleles_sorted_position[0][1] | |
| 383 if index_dominant_allele == 0: | |
| 384 alt = '.' | |
| 385 else: | |
| 386 alt = variant_position['ALT'][index_dominant_allele - 1] | |
| 387 else: | |
| 388 ad_idv = int(variant_position['format']['AD'][0]) | |
| 389 alt = '.' | |
| 390 | |
| 391 return alt, dp, ad_idv, index_dominant_allele | |
| 392 | |
| 393 | |
| 394 def count_number_diferences(ref, alt): | |
| 395 number_diferences = 0 | |
| 396 | |
| 397 if len(ref) != len(alt): | |
| 398 number_diferences += 1 | |
| 399 | |
| 400 for i in range(0, min(len(ref), len(alt))): | |
| 401 if alt[i] != 'N' and ref[i] != alt[i]: | |
| 402 number_diferences += 1 | |
| 403 | |
| 404 return number_diferences | |
| 405 | |
| 406 | |
| 407 def get_alt_correct(variant_position, alt_noMatter, dp, ad_idv, index_dominant_allele, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele): | |
| 408 alt = None | |
| 409 low_coverage = False | |
| 410 multiple_alleles = False | |
| 411 | |
| 412 if dp >= minimum_depth_presence: | |
| 413 if dp < minimum_depth_call: | |
| 414 alt = 'N' * len(variant_position['REF']) | |
| 415 low_coverage = True | |
| 416 else: | |
| 417 if ad_idv < minimum_depth_call: | |
| 418 alt = 'N' * len(variant_position['REF']) | |
| 419 low_coverage = True | |
| 420 if float(ad_idv) / float(dp) < minimum_depth_frequency_dominant_allele: | |
| 421 multiple_alleles = True | |
| 422 else: | |
| 423 if float(ad_idv) / float(dp) < minimum_depth_frequency_dominant_allele: | |
| 424 alt = 'N' * len(variant_position['REF']) | |
| 425 if index_dominant_allele is not None: | |
| 426 variants_coverage = [int(variant_position['format']['AD'][i]) for i in range(0, len(variant_position['ALT']) + 1) if i != index_dominant_allele] | |
| 427 if sum(variants_coverage) > 0: | |
| 428 if float(max(variants_coverage)) / float(sum(variants_coverage)) > 0.5: | |
| 429 multiple_alleles = True | |
| 430 elif float(max(variants_coverage)) / float(sum(variants_coverage)) == 0.5 and len(variants_coverage) > 2: | |
| 431 multiple_alleles = True | |
| 432 else: | |
| 433 multiple_alleles = True | |
| 434 else: | |
| 435 alt = alt_noMatter | |
| 436 else: | |
| 437 low_coverage = True | |
| 438 | |
| 439 return alt, low_coverage, multiple_alleles | |
| 440 | |
| 441 | |
| 442 def get_alt_alignment(ref, alt): | |
| 443 if alt is None: | |
| 444 alt = 'N' * len(ref) | |
| 445 else: | |
| 446 if len(ref) != len(alt): | |
| 447 if len(alt) < len(ref): | |
| 448 if alt == '.': | |
| 449 alt = ref | |
| 450 alt += 'N' * (len(ref) - len(alt)) | |
| 451 else: | |
| 452 if alt[:len(ref)] == ref: | |
| 453 alt = '.' | |
| 454 else: | |
| 455 alt = alt[:len(ref)] | |
| 456 | |
| 457 return alt | |
| 458 | |
| 459 | |
| 460 def get_indel_more_likely(variant_position, indels_entry): | |
| 461 indel_coverage = {} | |
| 462 for i in indels_entry: | |
| 463 indel_coverage[i] = int(variant_position['info']['IDV']) | |
| 464 return indel_coverage.index(str(max(indel_coverage.values()))) | |
| 465 | |
| 466 | |
| 467 def determine_variant(variant_position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, indel_true): | |
| 468 alt_noMatter, dp, ad_idv, index_dominant_allele = get_alt_noMatter(variant_position, indel_true) | |
| 469 | |
| 470 alt_correct, low_coverage, multiple_alleles = get_alt_correct(variant_position, alt_noMatter, dp, ad_idv, index_dominant_allele, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele) | |
| 471 | |
| 472 alt_alignment = get_alt_alignment(variant_position['REF'], alt_correct) | |
| 473 | |
| 474 return variant_position['REF'], alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment | |
| 475 | |
| 476 | |
| 477 def confirm_nucleotides_indel(ref, alt, variants, position_start_indel, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, alignment_true): | |
| 478 alt = list(alt) | |
| 479 | |
| 480 for i in range(0, len(alt) - 1): | |
| 481 if len(alt) < len(ref): | |
| 482 new_position = position_start_indel + len(alt) - i - 1 | |
| 483 alt_position = len(alt) - i - 1 | |
| 484 else: | |
| 485 if i + 1 > len(ref): | |
| 486 break | |
| 487 new_position = position_start_indel + 1 + i | |
| 488 alt_position = 1 + i | |
| 489 | |
| 490 if alt[alt_position] != 'N': | |
| 491 if new_position not in variants: | |
| 492 if alignment_true: | |
| 493 alt[alt_position] = 'N' | |
| 494 else: | |
| 495 alt = alt[: alt_position] | |
| 496 break | |
| 497 | |
| 498 entry_with_indel, entry_with_snp = indel_entry(variants[new_position]) | |
| 499 new_ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment = determine_variant(variants[new_position][entry_with_snp], minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False) | |
| 500 if alt_noMatter != '.' and alt[alt_position] != alt_noMatter: | |
| 501 alt[alt_position] = alt_noMatter | |
| 502 | |
| 503 return ''.join(alt) | |
| 504 | |
| 505 | |
| 506 def snp_indel(variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele): | |
| 507 entry_with_indel, entry_with_snp = indel_entry(variants[position]) | |
| 508 | |
| 509 if len(entry_with_indel) == 0: | |
| 510 ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment = determine_variant(variants[position][entry_with_snp], minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False) | |
| 511 else: | |
| 512 ref_snp, alt_correct_snp, low_coverage_snp, multiple_alleles_snp, alt_noMatter_snp, alt_alignment_snp = determine_variant(variants[position][entry_with_snp], minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False) | |
| 513 | |
| 514 indel_more_likely = entry_with_indel[0] | |
| 515 if len(entry_with_indel) > 1: | |
| 516 indel_more_likely = get_indel_more_likely(variants[position], entry_with_indel) | |
| 517 | |
| 518 ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment = determine_variant(variants[position][indel_more_likely], minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, True) | |
| 519 | |
| 520 if alt_noMatter == '.': | |
| 521 ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment = ref_snp, alt_correct_snp, low_coverage_snp, multiple_alleles_snp, alt_noMatter_snp, alt_alignment_snp | |
| 522 else: | |
| 523 if alt_correct is None and alt_correct_snp is not None: | |
| 524 alt_correct = alt_correct_snp | |
| 525 elif alt_correct is not None and alt_correct_snp is not None: | |
| 526 if alt_correct_snp != '.' and alt_correct[0] != alt_correct_snp: | |
| 527 alt_correct = alt_correct_snp + alt_correct[1:] if len(alt_correct) > 1 else alt_correct_snp | |
| 528 if alt_noMatter_snp != '.' and alt_noMatter[0] != alt_noMatter_snp: | |
| 529 alt_noMatter = alt_noMatter_snp + alt_noMatter[1:] if len(alt_noMatter) > 1 else alt_noMatter_snp | |
| 530 if alt_alignment_snp != '.' and alt_alignment[0] != alt_alignment_snp: | |
| 531 alt_alignment = alt_alignment_snp + alt_alignment[1:] if len(alt_alignment) > 1 else alt_alignment_snp | |
| 532 | |
| 533 # if alt_noMatter != '.': | |
| 534 # alt_noMatter = confirm_nucleotides_indel(ref, alt_noMatter, variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False) | |
| 535 # if alt_correct is not None and alt_correct != '.': | |
| 536 # alt_correct = confirm_nucleotides_indel(ref, alt_correct, variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, False) | |
| 537 # if alt_alignment != '.': | |
| 538 # alt_alignment = confirm_nucleotides_indel(ref, alt_alignment, variants, position, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, True) | |
| 539 | |
| 540 return ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment | |
| 541 | |
| 542 | |
| 543 def get_true_variants(variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence): | |
| 544 variants_correct = {} | |
| 545 variants_noMatter = {} | |
| 546 variants_alignment = {} | |
| 547 | |
| 548 correct_absent_positions = {} | |
| 549 correct_last_absent_position = '' | |
| 550 | |
| 551 noMatter_absent_positions = {} | |
| 552 noMatter_last_absent_position = '' | |
| 553 | |
| 554 multiple_alleles_found = [] | |
| 555 | |
| 556 counter = 1 | |
| 557 while counter <= len(sequence): | |
| 558 if counter in variants: | |
| 559 noMatter_last_absent_position = '' | |
| 560 | |
| 561 ref, alt_correct, low_coverage, multiple_alleles, alt_noMatter, alt_alignment = snp_indel(variants, counter, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele) | |
| 562 | |
| 563 if alt_alignment != '.': | |
| 564 variants_alignment[counter] = {'REF': ref, 'ALT': alt_alignment} | |
| 565 | |
| 566 if alt_noMatter != '.': | |
| 567 variants_noMatter[counter] = {'REF': ref, 'ALT': alt_noMatter} | |
| 568 | |
| 569 if alt_correct is None: | |
| 570 if counter - len(correct_last_absent_position) in correct_absent_positions: | |
| 571 correct_absent_positions[counter - len(correct_last_absent_position)]['REF'] += ref | |
| 572 else: | |
| 573 correct_absent_positions[counter] = {'REF': ref, 'ALT': ''} | |
| 574 correct_last_absent_position += ref | |
| 575 else: | |
| 576 if alt_correct != '.': | |
| 577 if len(alt_correct) < len(ref): | |
| 578 if len(alt_correct) == 1: | |
| 579 correct_absent_positions[counter + 1] = {'REF': ref[1:], 'ALT': ''} | |
| 580 else: | |
| 581 correct_absent_positions[counter + 1] = {'REF': ref[1:], 'ALT': alt_correct[1:]} | |
| 582 | |
| 583 correct_last_absent_position = ref[1:] | |
| 584 else: | |
| 585 variants_correct[counter] = {'REF': ref, 'ALT': alt_correct} | |
| 586 correct_last_absent_position = '' | |
| 587 else: | |
| 588 correct_last_absent_position = '' | |
| 589 | |
| 590 if multiple_alleles: | |
| 591 multiple_alleles_found.append(counter) | |
| 592 | |
| 593 counter += len(ref) | |
| 594 else: | |
| 595 variants_alignment[counter] = {'REF': sequence[counter - 1], 'ALT': 'N'} | |
| 596 | |
| 597 if counter - len(correct_last_absent_position) in correct_absent_positions: | |
| 598 correct_absent_positions[counter - len(correct_last_absent_position)]['REF'] += sequence[counter - 1] | |
| 599 else: | |
| 600 correct_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''} | |
| 601 correct_last_absent_position += sequence[counter - 1] | |
| 602 | |
| 603 if counter - len(noMatter_last_absent_position) in noMatter_absent_positions: | |
| 604 noMatter_absent_positions[counter - len(noMatter_last_absent_position)]['REF'] += sequence[counter - 1] | |
| 605 else: | |
| 606 noMatter_absent_positions[counter] = {'REF': sequence[counter - 1], 'ALT': ''} | |
| 607 noMatter_last_absent_position += sequence[counter - 1] | |
| 608 | |
| 609 counter += 1 | |
| 610 | |
| 611 for position in correct_absent_positions: | |
| 612 if position == 1: | |
| 613 variants_correct[position] = {'REF': correct_absent_positions[position]['REF'], 'ALT': 'N'} | |
| 614 else: | |
| 615 if position - 1 not in variants_correct: | |
| 616 variants_correct[position - 1] = {'REF': sequence[position - 2] + correct_absent_positions[position]['REF'], 'ALT': sequence[position - 2] + correct_absent_positions[position]['ALT']} | |
| 617 else: | |
| 618 variants_correct[position - 1] = {'REF': variants_correct[position - 1]['REF'] + correct_absent_positions[position]['REF'][len(variants_correct[position - 1]['REF']) - 1:], 'ALT': variants_correct[position - 1]['ALT'] + correct_absent_positions[position]['ALT'][len(variants_correct[position - 1]['ALT']) - 1 if len(variants_correct[position - 1]['ALT']) > 0 else 0:]} | |
| 619 | |
| 620 for position in noMatter_absent_positions: | |
| 621 if position == 1: | |
| 622 variants_noMatter[position] = {'REF': noMatter_absent_positions[position]['REF'], 'ALT': 'N'} | |
| 623 else: | |
| 624 if position - 1 not in variants_noMatter: | |
| 625 variants_noMatter[position - 1] = {'REF': sequence[position - 2] + noMatter_absent_positions[position]['REF'], 'ALT': sequence[position - 2] + noMatter_absent_positions[position]['ALT']} | |
| 626 else: | |
| 627 variants_noMatter[position - 1] = {'REF': variants_noMatter[position - 1]['REF'] + noMatter_absent_positions[position]['REF'][len(variants_noMatter[position - 1]['REF']) - 1:], 'ALT': variants_noMatter[position - 1]['ALT'] + noMatter_absent_positions[position]['ALT'][len(variants_noMatter[position - 1]['ALT']) - 1 if len(variants_noMatter[position - 1]['ALT']) > 0 else 0:]} | |
| 628 | |
| 629 return variants_correct, variants_noMatter, variants_alignment, multiple_alleles_found | |
| 630 | |
| 631 | |
| 632 def clean_variant_in_extra_seq_left(variant_dict, position, length_extra_seq, multiple_alleles_found, number_multi_alleles): | |
| 633 number_diferences = 0 | |
| 634 | |
| 635 if position + len(variant_dict[position]['REF']) - 1 > length_extra_seq: | |
| 636 if multiple_alleles_found is not None and position in multiple_alleles_found: | |
| 637 number_multi_alleles += 1 | |
| 638 | |
| 639 temp_variant = variant_dict[position] | |
| 640 del variant_dict[position] | |
| 641 variant_dict[length_extra_seq] = {} | |
| 642 variant_dict[length_extra_seq]['REF'] = temp_variant['REF'][length_extra_seq - position:] | |
| 643 variant_dict[length_extra_seq]['ALT'] = temp_variant['ALT'][length_extra_seq - position:] if len(temp_variant['ALT']) > length_extra_seq - position else temp_variant['REF'][length_extra_seq - position] | |
| 644 number_diferences = count_number_diferences(variant_dict[length_extra_seq]['REF'], variant_dict[length_extra_seq]['ALT']) | |
| 645 else: | |
| 646 del variant_dict[position] | |
| 647 | |
| 648 return variant_dict, number_multi_alleles, number_diferences | |
| 649 | |
| 650 | |
| 651 def clean_variant_in_extra_seq_rigth(variant_dict, position, sequence_length, length_extra_seq): | |
| 652 if position + len(variant_dict[position]['REF']) - 1 > sequence_length - length_extra_seq: | |
| 653 variant_dict[position]['REF'] = variant_dict[position]['REF'][: - (position - (sequence_length - length_extra_seq)) + 1] | |
| 654 variant_dict[position]['ALT'] = variant_dict[position]['ALT'][: - (position - (sequence_length - length_extra_seq)) + 1] if len(variant_dict[position]['ALT']) >= - (position - (sequence_length - length_extra_seq)) + 1 else variant_dict[position]['ALT'] | |
| 655 | |
| 656 number_diferences = count_number_diferences(variant_dict[position]['REF'], variant_dict[position]['ALT']) | |
| 657 | |
| 658 return variant_dict, number_diferences | |
| 659 | |
| 660 | |
| 661 def cleanning_variants_extra_seq(variants_correct, variants_noMatter, variants_alignment, multiple_alleles_found, length_extra_seq, sequence_length): | |
| 662 number_multi_alleles = 0 | |
| 663 number_diferences = 0 | |
| 664 | |
| 665 counter = 1 | |
| 666 while counter <= sequence_length: | |
| 667 if counter <= length_extra_seq: | |
| 668 if counter in variants_correct: | |
| 669 variants_correct, number_multi_alleles, number_diferences = clean_variant_in_extra_seq_left(variants_correct, counter, length_extra_seq, multiple_alleles_found, number_multi_alleles) | |
| 670 if counter in variants_noMatter: | |
| 671 variants_noMatter, ignore, ignore = clean_variant_in_extra_seq_left(variants_noMatter, counter, length_extra_seq, None, None) | |
| 672 if counter in variants_alignment: | |
| 673 variants_alignment, ignore, ignore = clean_variant_in_extra_seq_left(variants_alignment, counter, length_extra_seq, None, None) | |
| 674 elif counter > length_extra_seq and counter <= sequence_length - length_extra_seq: | |
| 675 if counter in variants_correct: | |
| 676 if counter in multiple_alleles_found: | |
| 677 number_multi_alleles += 1 | |
| 678 variants_correct, number_diferences_found = clean_variant_in_extra_seq_rigth(variants_correct, counter, sequence_length, length_extra_seq) | |
| 679 number_diferences += number_diferences_found | |
| 680 if counter in variants_noMatter: | |
| 681 variants_noMatter, ignore = clean_variant_in_extra_seq_rigth(variants_noMatter, counter, sequence_length, length_extra_seq) | |
| 682 if counter in variants_alignment: | |
| 683 variants_alignment, ignore = clean_variant_in_extra_seq_rigth(variants_alignment, counter, sequence_length, length_extra_seq) | |
| 684 else: | |
| 685 if counter in variants_correct: | |
| 686 del variants_correct[counter] | |
| 687 if counter in variants_noMatter: | |
| 688 del variants_noMatter[counter] | |
| 689 if counter in variants_alignment: | |
| 690 del variants_alignment[counter] | |
| 691 | |
| 692 counter += 1 | |
| 693 | |
| 694 return variants_correct, variants_noMatter, variants_alignment, number_multi_alleles, number_diferences | |
| 695 | |
| 696 | |
| 697 def get_coverage(gene_coverage): | |
| 698 coverage = {} | |
| 699 | |
| 700 with open(gene_coverage, 'rtU') as reader: | |
| 701 for line in reader: | |
| 702 line = line.splitlines()[0] | |
| 703 if len(line) > 0: | |
| 704 line = line.split('\t') | |
| 705 coverage[int(line[1])] = int(line[2]) | |
| 706 | |
| 707 return coverage | |
| 708 | |
| 709 | |
| 710 def get_coverage_report(coverage, sequence_length, minimum_depth_presence, minimum_depth_call, length_extra_seq): | |
| 711 if len(coverage) == 0: | |
| 712 return sequence_length - 2 * length_extra_seq, 100.0, 0.0 | |
| 713 | |
| 714 count_absent = 0 | |
| 715 count_lowCoverage = 0 | |
| 716 sum_coverage = 0 | |
| 717 | |
| 718 counter = 1 | |
| 719 while counter <= sequence_length: | |
| 720 if counter > length_extra_seq and counter <= sequence_length - length_extra_seq: | |
| 721 if coverage[counter] < minimum_depth_presence: | |
| 722 count_absent += 1 | |
| 723 else: | |
| 724 if coverage[counter] < minimum_depth_call: | |
| 725 count_lowCoverage += 1 | |
| 726 sum_coverage += coverage[counter] | |
| 727 counter += 1 | |
| 728 | |
| 729 mean_coverage = 0 | |
| 730 percentage_lowCoverage = 0 | |
| 731 if sequence_length - 2 * length_extra_seq - count_absent > 0: | |
| 732 mean_coverage = float(sum_coverage) / float(sequence_length - 2 * length_extra_seq - count_absent) | |
| 733 percentage_lowCoverage = float(count_lowCoverage) / float(sequence_length - 2 * length_extra_seq - count_absent) * 100 | |
| 734 | |
| 735 return count_absent, percentage_lowCoverage, mean_coverage | |
| 736 | |
| 737 | |
| 738 # Get genome coverage data | |
| 739 def compute_genome_coverage_data(alignment_file, sequence_to_analyse, outdir, counter): | |
| 740 genome_coverage_data_file = os.path.join(outdir, 'samtools_depth.sequence_' + str(counter) + '.tab') | |
| 741 command = ['samtools', 'depth', '-a', '-q', '0', '-r', sequence_to_analyse, alignment_file, '>', genome_coverage_data_file] | |
| 742 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, False) | |
| 743 return run_successfully, genome_coverage_data_file | |
| 744 | |
| 745 | |
| 746 def write_variants_vcf(variants, outdir, sequence_to_analyse, sufix): | |
| 747 vcf_file = os.path.join(outdir, str(sequence_to_analyse + '.' + sufix + '.vcf')) | |
| 748 with open(vcf_file, 'wt') as writer: | |
| 749 writer.write('##fileformat=VCFv4.2' + '\n') | |
| 750 writer.write('#' + '\t'.join(['SEQUENCE', 'POSITION', 'ID_unused', 'REFERENCE_sequence', 'ALTERNATIVE_sequence', 'QUALITY_unused', 'FILTER_unused', 'INFO_unused', 'FORMAT_unused']) + '\n') | |
| 751 for i in sorted(variants.keys()): | |
| 752 writer.write('\t'.join([sequence_to_analyse, str(i), '.', variants[i]['REF'], variants[i]['ALT'], '.', '.', '.', '.']) + '\n') | |
| 753 | |
| 754 compressed_vcf_file = vcf_file + '.gz' | |
| 755 command = ['bcftools', 'convert', '-o', compressed_vcf_file, '-O', 'z', vcf_file] | |
| 756 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False) | |
| 757 if run_successfully: | |
| 758 command = ['bcftools', 'index', compressed_vcf_file] | |
| 759 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False) | |
| 760 | |
| 761 if not run_successfully: | |
| 762 compressed_vcf_file = None | |
| 763 | |
| 764 return run_successfully, compressed_vcf_file | |
| 765 | |
| 766 | |
| 767 def parse_fasta_inMemory(fasta_memory): | |
| 768 fasta_memory = fasta_memory.splitlines() | |
| 769 sequence_dict = {} | |
| 770 for line in fasta_memory: | |
| 771 if len(line) > 0: | |
| 772 if line.startswith('>'): | |
| 773 sequence_dict = {'header': line[1:], 'sequence': ''} | |
| 774 else: | |
| 775 sequence_dict['sequence'] += line | |
| 776 | |
| 777 return sequence_dict | |
| 778 | |
| 779 | |
| 780 def compute_consensus_sequence(reference_file, sequence_to_analyse, compressed_vcf_file, outdir, sufix): | |
| 781 sequence_dict = None | |
| 782 | |
| 783 gene_fasta = os.path.join(outdir, str(sequence_to_analyse + '.fasta')) | |
| 784 | |
| 785 run_successfully, stdout = index_fasta_samtools(reference_file, sequence_to_analyse, gene_fasta, False) | |
| 786 if run_successfully: | |
| 787 command = ['bcftools', 'norm', '-c', 's', '-f', gene_fasta, '-Ov', compressed_vcf_file] | |
| 788 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False) | |
| 789 if run_successfully: | |
| 790 command = ['bcftools', 'consensus', '-f', gene_fasta, compressed_vcf_file, '-H', '1'] | |
| 791 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False) | |
| 792 if run_successfully: | |
| 793 sequence_dict = parse_fasta_inMemory(stdout) | |
| 794 | |
| 795 return run_successfully, sequence_dict | |
| 796 | |
| 797 | |
| 798 def create_sample_consensus_sequence(outdir, sequence_to_analyse, reference_file, variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence, length_extra_seq): | |
| 799 variants_correct, variants_noMatter, variants_alignment, multiple_alleles_found = get_true_variants(variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence) | |
| 800 | |
| 801 variants_correct, variants_noMatter, variants_alignment, number_multi_alleles, number_diferences = cleanning_variants_extra_seq(variants_correct, variants_noMatter, variants_alignment, multiple_alleles_found, length_extra_seq, len(sequence)) | |
| 802 | |
| 803 run_successfully = False | |
| 804 consensus = {'correct': {}, 'noMatter': {}, 'alignment': {}} | |
| 805 for variant_type in ['variants_correct', 'variants_noMatter', 'variants_alignment']: | |
| 806 run_successfully, compressed_vcf_file = write_variants_vcf(eval(variant_type), outdir, sequence_to_analyse, variant_type.split('_', 1)[1]) | |
| 807 if run_successfully: | |
| 808 run_successfully, sequence_dict = compute_consensus_sequence(reference_file, sequence_to_analyse, compressed_vcf_file, outdir, variant_type.split('_', 1)[1]) | |
| 809 if run_successfully: | |
| 810 consensus[variant_type.split('_', 1)[1]] = {'header': sequence_dict['header'], 'sequence': sequence_dict['sequence'][length_extra_seq:len(sequence_dict['sequence']) - length_extra_seq]} | |
| 811 | |
| 812 return run_successfully, number_multi_alleles, consensus, number_diferences | |
| 813 | |
| 814 | |
| 815 @utils.trace_unhandled_exceptions | |
| 816 def analyse_sequence_data(bam_file, sequence_information, outdir, counter, reference_file, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, threads): | |
| 817 count_absent = None | |
| 818 percentage_lowCoverage = None | |
| 819 meanCoverage = None | |
| 820 number_diferences = 0 | |
| 821 | |
| 822 # Create vcf file (for multiple alleles check) | |
| 823 run_successfully, gene_vcf = create_vcf(bam_file, sequence_information['header'], outdir, counter, reference_file) | |
| 824 if run_successfully: | |
| 825 # Create coverage tab file | |
| 826 run_successfully, gene_coverage = compute_genome_coverage_data(bam_file, sequence_information['header'], outdir, counter) | |
| 827 | |
| 828 if run_successfully: | |
| 829 variants = get_variants(gene_vcf) | |
| 830 | |
| 831 coverage = get_coverage(gene_coverage) | |
| 832 | |
| 833 run_successfully, number_multi_alleles, consensus_sequence, number_diferences = create_sample_consensus_sequence(outdir, sequence_information['header'], reference_file, variants, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, sequence_information['sequence'], length_extra_seq) | |
| 834 | |
| 835 count_absent, percentage_lowCoverage, meanCoverage = get_coverage_report(coverage, sequence_information['length'], minimum_depth_presence, minimum_depth_call, length_extra_seq) | |
| 836 | |
| 837 utils.saveVariableToPickle([run_successfully, counter, number_multi_alleles, count_absent, percentage_lowCoverage, meanCoverage, consensus_sequence, number_diferences], outdir, str('coverage_info.' + str(counter))) | |
| 838 | |
| 839 | |
| 840 def clean_header(header): | |
| 841 problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"] | |
| 842 new_header = str(header) | |
| 843 if any(x in header for x in problematic_characters): | |
| 844 for x in problematic_characters: | |
| 845 new_header = new_header.replace(x, '_') | |
| 846 return header, new_header | |
| 847 | |
| 848 | |
| 849 def get_sequence_information(fasta_file, length_extra_seq): | |
| 850 sequence_dict = {} | |
| 851 headers = {} | |
| 852 headers_changed = False | |
| 853 | |
| 854 with open(fasta_file, 'rtU') as reader: | |
| 855 blank_line_found = False | |
| 856 sequence_counter = 0 | |
| 857 temp_sequence_dict = {} | |
| 858 for line in reader: | |
| 859 line = line.splitlines()[0] | |
| 860 if len(line) > 0: | |
| 861 if not blank_line_found: | |
| 862 if line.startswith('>'): | |
| 863 if len(temp_sequence_dict) > 0: | |
| 864 if temp_sequence_dict.values()[0]['length'] - 2 * length_extra_seq > 0: | |
| 865 sequence_dict[temp_sequence_dict.keys()[0]] = temp_sequence_dict.values()[0] | |
| 866 else: | |
| 867 print '{header} sequence ignored due to length <= 0'.format(header=temp_sequence_dict.values()[0]['header']) | |
| 868 del headers[temp_sequence_dict.values()[0]['header']] | |
| 869 temp_sequence_dict = {} | |
| 870 | |
| 871 original_header, new_header = clean_header(line[1:]) | |
| 872 if new_header in headers: | |
| 873 sys.exit('Found duplicated sequence headers: {original_header}'.format(original_header=original_header)) | |
| 874 | |
| 875 sequence_counter += 1 | |
| 876 temp_sequence_dict[sequence_counter] = {'header': new_header, 'sequence': '', 'length': 0} | |
| 877 headers[new_header] = str(original_header) | |
| 878 if new_header != original_header: | |
| 879 headers_changed = True | |
| 880 else: | |
| 881 temp_sequence_dict[sequence_counter]['sequence'] += line | |
| 882 temp_sequence_dict[sequence_counter]['length'] += len(line) | |
| 883 else: | |
| 884 sys.exit('It was found a blank line between the fasta file above line ' + line) | |
| 885 else: | |
| 886 blank_line_found = True | |
| 887 | |
| 888 if len(temp_sequence_dict) > 0: | |
| 889 if temp_sequence_dict.values()[0]['length'] - 2 * length_extra_seq > 0: | |
| 890 sequence_dict[temp_sequence_dict.keys()[0]] = temp_sequence_dict.values()[0] | |
| 891 else: | |
| 892 print '{header} sequence ignored due to length <= 0'.format(header=temp_sequence_dict.values()[0]['header']) | |
| 893 del headers[temp_sequence_dict.values()[0]['header']] | |
| 894 | |
| 895 return sequence_dict, headers, headers_changed | |
| 896 | |
| 897 | |
| 898 def determine_threads_2_use(number_sequences, threads): | |
| 899 if number_sequences >= threads: | |
| 900 return 1 | |
| 901 else: | |
| 902 return threads / number_sequences | |
| 903 | |
| 904 | |
| 905 def sequence_data(sample, reference_file, bam_file, outdir, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, notWriteConsensus): | |
| 906 sequence_data_outdir = os.path.join(outdir, 'sequence_data', '') | |
| 907 utils.removeDirectory(sequence_data_outdir) | |
| 908 os.mkdir(sequence_data_outdir) | |
| 909 | |
| 910 sequences, headers, headers_changed = get_sequence_information(reference_file, length_extra_seq) | |
| 911 | |
| 912 threads_2_use = determine_threads_2_use(len(sequences), threads) | |
| 913 | |
| 914 pool = multiprocessing.Pool(processes=threads) | |
| 915 for sequence_counter in sequences: | |
| 916 sequence_dir = os.path.join(sequence_data_outdir, str(sequence_counter), '') | |
| 917 utils.removeDirectory(sequence_dir) | |
| 918 os.makedirs(sequence_dir) | |
| 919 pool.apply_async(analyse_sequence_data, args=(bam_file, sequences[sequence_counter], sequence_dir, sequence_counter, reference_file, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, threads_2_use,)) | |
| 920 pool.close() | |
| 921 pool.join() | |
| 922 | |
| 923 run_successfully, sample_data, consensus_files, consensus_sequences = gather_data_together(sample, sequence_data_outdir, sequences, outdir.rsplit('/', 2)[0], debug_mode_true, length_extra_seq, notWriteConsensus) | |
| 924 | |
| 925 return run_successfully, sample_data, consensus_files, consensus_sequences | |
| 926 | |
| 927 | |
| 928 def chunkstring(string, length): | |
| 929 return (string[0 + i:length + i] for i in range(0, len(string), length)) | |
| 930 | |
| 931 | |
| 932 def write_consensus(outdir, sample, consensus_sequence): | |
| 933 consensus_files = {} | |
| 934 for consensus_type in ['correct', 'noMatter', 'alignment']: | |
| 935 consensus_files[consensus_type] = os.path.join(outdir, str(sample + '.' + consensus_type + '.fasta')) | |
| 936 with open(consensus_files[consensus_type], 'at') as writer: | |
| 937 writer.write('>' + consensus_sequence[consensus_type]['header'] + '\n') | |
| 938 fasta_sequence_lines = chunkstring(consensus_sequence[consensus_type]['sequence'], 80) | |
| 939 for line in fasta_sequence_lines: | |
| 940 writer.write(line + '\n') | |
| 941 return consensus_files | |
| 942 | |
| 943 | |
| 944 def gather_data_together(sample, data_directory, sequences_information, outdir, debug_mode_true, length_extra_seq, notWriteConsensus): | |
| 945 run_successfully = True | |
| 946 counter = 0 | |
| 947 sample_data = {} | |
| 948 | |
| 949 consensus_files = None | |
| 950 consensus_sequences_together = {'correct': {}, 'noMatter': {}, 'alignment': {}} | |
| 951 | |
| 952 write_consensus_first_time = True | |
| 953 | |
| 954 genes_directories = [d for d in os.listdir(data_directory) if not d.startswith('.') and os.path.isdir(os.path.join(data_directory, d, ''))] | |
| 955 for gene_dir in genes_directories: | |
| 956 gene_dir_path = os.path.join(data_directory, gene_dir, '') | |
| 957 | |
| 958 files = [f for f in os.listdir(gene_dir_path) if not f.startswith('.') and os.path.isfile(os.path.join(gene_dir_path, f))] | |
| 959 for file_found in files: | |
| 960 if file_found.startswith('coverage_info.') and file_found.endswith('.pkl'): | |
| 961 file_path = os.path.join(gene_dir_path, file_found) | |
| 962 | |
| 963 if run_successfully: | |
| 964 run_successfully, sequence_counter, multiple_alleles_found, count_absent, percentage_lowCoverage, meanCoverage, consensus_sequence, number_diferences = utils.extractVariableFromPickle(file_path) | |
| 965 | |
| 966 if not notWriteConsensus: | |
| 967 for consensus_type in consensus_sequence: | |
| 968 consensus_sequences_together[consensus_type][sequence_counter] = {'header': consensus_sequence[consensus_type]['header'], 'sequence': consensus_sequence[consensus_type]['sequence']} | |
| 969 | |
| 970 if write_consensus_first_time: | |
| 971 for consensus_type in ['correct', 'noMatter', 'alignment']: | |
| 972 file_to_remove = os.path.join(outdir, str(sample + '.' + consensus_type + '.fasta')) | |
| 973 if os.path.isfile(file_to_remove): | |
| 974 os.remove(file_to_remove) | |
| 975 write_consensus_first_time = False | |
| 976 consensus_files = write_consensus(outdir, sample, consensus_sequence) | |
| 977 | |
| 978 gene_identity = 0 | |
| 979 if sequences_information[sequence_counter]['length'] - 2 * length_extra_seq - count_absent > 0: | |
| 980 gene_identity = 100 - (float(number_diferences) / (sequences_information[sequence_counter]['length'] - 2 * length_extra_seq - count_absent)) * 100 | |
| 981 | |
| 982 sample_data[sequence_counter] = {'header': sequences_information[sequence_counter]['header'], 'gene_coverage': 100 - (float(count_absent) / (sequences_information[sequence_counter]['length'] - 2 * length_extra_seq)) * 100, 'gene_low_coverage': percentage_lowCoverage, 'gene_number_positions_multiple_alleles': multiple_alleles_found, 'gene_mean_read_coverage': meanCoverage, 'gene_identity': gene_identity} | |
| 983 counter += 1 | |
| 984 | |
| 985 if not debug_mode_true: | |
| 986 utils.removeDirectory(gene_dir_path) | |
| 987 | |
| 988 if counter != len(sequences_information): | |
| 989 run_successfully = False | |
| 990 | |
| 991 return run_successfully, sample_data, consensus_files, consensus_sequences_together | |
| 992 | |
| 993 | |
| 994 rematch_timer = functools.partial(utils.timer, name='ReMatCh module') | |
| 995 | |
| 996 | |
| 997 @rematch_timer | |
| 998 def runRematchModule(sample, fastq_files, reference_file, threads, outdir, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage, conserved_True, debug_mode_true, numMapLoc, minimum_gene_identity, rematch_run, softClip_baseQuality, softClip_recodeRun, reference_dict, softClip_cigarFlagRecode, bowtieOPT, gene_list_reference, notWriteConsensus): | |
| 999 rematch_folder = os.path.join(outdir, 'rematch_module', '') | |
| 1000 utils.removeDirectory(rematch_folder) | |
| 1001 os.mkdir(rematch_folder) | |
| 1002 | |
| 1003 # Map reads | |
| 1004 run_successfully, bam_file, reference_file = mapping_reads(fastq_files, reference_file, threads, rematch_folder, conserved_True, numMapLoc, rematch_run, softClip_baseQuality, softClip_recodeRun, reference_dict, softClip_cigarFlagRecode, bowtieOPT) | |
| 1005 | |
| 1006 if run_successfully: | |
| 1007 # Index reference file | |
| 1008 run_successfully, stdout = index_fasta_samtools(reference_file, None, None, True) | |
| 1009 if run_successfully: | |
| 1010 print 'Analysing alignment data' | |
| 1011 run_successfully, sample_data, consensus_files, consensus_sequences = sequence_data(sample, reference_file, bam_file, rematch_folder, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, notWriteConsensus) | |
| 1012 | |
| 1013 if run_successfully: | |
| 1014 print 'Writing report file' | |
| 1015 number_absent_genes = 0 | |
| 1016 number_genes_multiple_alleles = 0 | |
| 1017 mean_sample_coverage = 0 | |
| 1018 with open(os.path.join(outdir, 'rematchModule_report.txt'), 'wt') as writer: | |
| 1019 writer.write('\t'.join(['#gene', 'percentage_gene_coverage', 'gene_mean_read_coverage', 'percentage_gene_low_coverage', 'number_positions_multiple_alleles', 'percentage_gene_identity']) + '\n') | |
| 1020 for i in range(1, len(sample_data) + 1): | |
| 1021 writer.write('\t'.join([gene_list_reference[sample_data[i]['header']], str(round(sample_data[i]['gene_coverage'], 2)), str(round(sample_data[i]['gene_mean_read_coverage'], 2)), str(round(sample_data[i]['gene_low_coverage'], 2)), str(sample_data[i]['gene_number_positions_multiple_alleles']), str(round(sample_data[i]['gene_identity'], 2))]) + '\n') | |
| 1022 | |
| 1023 if sample_data[i]['gene_coverage'] < minimum_gene_coverage or sample_data[i]['gene_identity'] < minimum_gene_identity: | |
| 1024 number_absent_genes += 1 | |
| 1025 else: | |
| 1026 mean_sample_coverage += sample_data[i]['gene_mean_read_coverage'] | |
| 1027 if sample_data[i]['gene_number_positions_multiple_alleles'] > 0: | |
| 1028 number_genes_multiple_alleles += 1 | |
| 1029 | |
| 1030 if len(sample_data) - number_absent_genes > 0: | |
| 1031 mean_sample_coverage = float(mean_sample_coverage) / float(len(sample_data) - number_absent_genes) | |
| 1032 else: | |
| 1033 mean_sample_coverage = 0 | |
| 1034 | |
| 1035 writer.write('\n'.join(['#general', '>number_absent_genes', str(number_absent_genes), '>number_genes_multiple_alleles', str(number_genes_multiple_alleles), '>mean_sample_coverage', str(round(mean_sample_coverage, 2))]) + '\n') | |
| 1036 | |
| 1037 print '\n'.join([str('number_absent_genes: ' + str(number_absent_genes)), str('number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles)), str('mean_sample_coverage: ' + str(round(mean_sample_coverage, 2)))]) | |
| 1038 | |
| 1039 if not debug_mode_true: | |
| 1040 utils.removeDirectory(rematch_folder) | |
| 1041 | |
| 1042 return run_successfully, sample_data if 'sample_data' in locals() else None, {'number_absent_genes': number_absent_genes if 'number_absent_genes' in locals() else None, 'number_genes_multiple_alleles': number_genes_multiple_alleles if 'number_genes_multiple_alleles' in locals() else None, 'mean_sample_coverage': round(mean_sample_coverage, 2) if 'mean_sample_coverage' in locals() else None}, consensus_files if 'consensus_files' in locals() else None, consensus_sequences if 'consensus_sequences' in locals() else None |
