comparison scripts/modules/run_rematch.py @ 3:0cbed1c0a762 draft default tip

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Tue, 28 Jan 2020 10:42:31 -0500
parents 965517909457
children
comparison
equal deleted inserted replaced
2:6837f733b4aa 3:0cbed1c0a762
1 import functools 1 import functools
2 import os 2 import os
3 import sys 3 import sys
4 import multiprocessing
4 5
5 import utils 6 try:
7 import modules.utils as utils
8 except ImportError:
9 from pathotyping.modules import utils as utils
10
11
12 # {'noMatter': '/home/ubuntu/NGStools/patho_typing/mpmachado_stuff.out_test/rematch/sample.noMatter.fasta', 'correct': '/home/ubuntu/NGStools/patho_typing/mpmachado_stuff.out_test/rematch/sample.correct.fasta', 'alignment': '/home/ubuntu/NGStools/patho_typing/mpmachado_stuff.out_test/rematch/sample.alignment.fasta'}
13
6 14
7 def remove_alignment(alignment_file): 15 def remove_alignment(alignment_file):
8 directory = os.path.dirname(alignment_file) 16 directory = os.path.dirname(alignment_file)
9 files = [f for f in os.listdir(directory) if not f.startswith('.') and os.path.isfile(os.path.join(directory, f))] 17 files = [f for f in os.listdir(directory) if not f.startswith('.') and os.path.isfile(os.path.join(directory, f))]
10 for file_found in files: 18 for file_found in files:
22 30
23 31
24 def clean_rematch_folder(consensus_files, bam_file, reference_file, outdir, doNotRemoveConsensus, debug_mode_true): 32 def clean_rematch_folder(consensus_files, bam_file, reference_file, outdir, doNotRemoveConsensus, debug_mode_true):
25 if not debug_mode_true: 33 if not debug_mode_true:
26 if not doNotRemoveConsensus: 34 if not doNotRemoveConsensus:
27 for consensus_type, file_path in consensus_files.items(): 35 for consensus_type, file_path in list(consensus_files.items()):
28 if os.path.isfile(file_path): 36 if os.path.isfile(file_path):
29 os.remove(file_path) 37 os.remove(file_path)
30 if bam_file is not None: 38 if bam_file is not None:
31 remove_alignment(bam_file) 39 remove_alignment(bam_file)
32 remove_reference_stuff(outdir, reference_file) 40 remove_reference_stuff(outdir, reference_file)
37 utils.removeDirectory(sequence_data_outdir) 45 utils.removeDirectory(sequence_data_outdir)
38 os.mkdir(sequence_data_outdir) 46 os.mkdir(sequence_data_outdir)
39 47
40 sequences, headers = utils.get_sequence_information(reference_file, length_extra_seq) 48 sequences, headers = utils.get_sequence_information(reference_file, length_extra_seq)
41 49
42 threads_2_use = rematch.determine_threads_2_use(len(sequences), threads)
43
44 import multiprocessing
45
46 pool = multiprocessing.Pool(processes=threads) 50 pool = multiprocessing.Pool(processes=threads)
47 for sequence_counter in sequences: 51 for sequence_counter in sequences:
48 sequence_dir = os.path.join(sequence_data_outdir, str(sequence_counter), '') 52 sequence_dir = os.path.join(sequence_data_outdir, str(sequence_counter), '')
49 utils.removeDirectory(sequence_dir) 53 utils.removeDirectory(sequence_dir)
50 os.makedirs(sequence_dir) 54 os.makedirs(sequence_dir)
51 pool.apply_async(rematch.analyse_sequence_data, args=(bam_file, sequences[sequence_counter], sequence_dir, sequence_counter, reference_file, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, threads_2_use,)) 55 pool.apply_async(rematch.analyse_sequence_data, args=(bam_file, sequences[sequence_counter], sequence_dir, sequence_counter, reference_file, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele,))
52 pool.close() 56 pool.close()
53 pool.join() 57 pool.join()
54 58
55 run_successfully, sample_data, consensus_files, consensus_sequences = rematch.gather_data_together(sample, sequence_data_outdir, sequences, outdir.rsplit('/', 2)[0], debug_mode_true, length_extra_seq, False) 59 run_successfully, sample_data, consensus_files, consensus_sequences = rematch.gather_data_together(sample, sequence_data_outdir, sequences, outdir.rsplit('/', 2)[0], debug_mode_true, length_extra_seq, False)
56 60
57 return run_successfully, sample_data, consensus_files, consensus_sequences 61 return run_successfully, sample_data, consensus_files, consensus_sequences
58 62
59 63
60 def write_report(outdir, sample_data, minimum_gene_coverage, minimum_gene_identity): 64 def determine_general_statistics(sample_data, minimum_gene_coverage, minimum_gene_identity):
61 print 'Writing report file' 65 print('Writing report file')
62 number_absent_genes = 0 66 number_absent_genes = 0
63 number_genes_multiple_alleles = 0 67 number_genes_multiple_alleles = 0
64 mean_sample_coverage = 0 68 mean_sample_coverage = 0
65 with open(os.path.join(outdir, 'rematchModule_report.txt'), 'wt') as writer: 69
70 with open('output_dir/rematch/rematchModule_report.txt', 'wt') as writer:
66 writer.write('\t'.join(['#gene', 'percentage_gene_coverage', 'gene_mean_read_coverage', 'percentage_gene_low_coverage', 'number_positions_multiple_alleles', 'percentage_gene_identity']) + '\n') 71 writer.write('\t'.join(['#gene', 'percentage_gene_coverage', 'gene_mean_read_coverage', 'percentage_gene_low_coverage', 'number_positions_multiple_alleles', 'percentage_gene_identity']) + '\n')
67 for i in range(1, len(sample_data) + 1): 72 for i in range(1, len(sample_data) + 1):
68 writer.write('\t'.join([sample_data[i]['header'], str(round(sample_data[i]['gene_coverage'], 2)), str(round(sample_data[i]['gene_mean_read_coverage'], 2)), str(round(sample_data[i]['gene_low_coverage'], 2)), str(sample_data[i]['gene_number_positions_multiple_alleles']), str(round(sample_data[i]['gene_identity'], 2))]) + '\n') 73 writer.write('\t'.join([sample_data[i]['header'], str(round(sample_data[i]['gene_coverage'], 2)), str(round(sample_data[i]['gene_mean_read_coverage'], 2)), str(round(sample_data[i]['gene_low_coverage'], 2)), str(sample_data[i]['gene_number_positions_multiple_alleles']), str(round(sample_data[i]['gene_identity'], 2))]) + '\n')
69 74
70 if sample_data[i]['gene_coverage'] < minimum_gene_coverage or sample_data[i]['gene_identity'] < minimum_gene_identity: 75 if sample_data[i]['gene_coverage'] < minimum_gene_coverage or sample_data[i]['gene_identity'] < minimum_gene_identity:
79 else: 84 else:
80 mean_sample_coverage = 0 85 mean_sample_coverage = 0
81 86
82 writer.write('\n'.join(['#general', '>number_absent_genes', str(number_absent_genes), '>number_genes_multiple_alleles', str(number_genes_multiple_alleles), '>mean_sample_coverage', str(round(mean_sample_coverage, 2))]) + '\n') 87 writer.write('\n'.join(['#general', '>number_absent_genes', str(number_absent_genes), '>number_genes_multiple_alleles', str(number_genes_multiple_alleles), '>mean_sample_coverage', str(round(mean_sample_coverage, 2))]) + '\n')
83 88
84 print '\n'.join([str('number_absent_genes: ' + str(number_absent_genes)), str('number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles)), str('mean_sample_coverage: ' + str(round(mean_sample_coverage, 2)))]) + '\n' 89 print('\n'.join([str('number_absent_genes: ' + str(number_absent_genes)),
90 str('number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles)),
91 str('mean_sample_coverage: ' + str(round(mean_sample_coverage, 2)))]) + '\n')
85 92
86 return number_absent_genes, number_genes_multiple_alleles, mean_sample_coverage 93 return number_absent_genes, number_genes_multiple_alleles, mean_sample_coverage
87
88 94
89 module_timer = functools.partial(utils.timer, name='Module ReMatCh') 95 module_timer = functools.partial(utils.timer, name='Module ReMatCh')
90 96
91 97
92 @module_timer 98 @module_timer
93 def run_rematch(rematch, outdir, reference_file, bam_file, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage, minimum_gene_identity, debug_mode_true, doNotRemoveConsensus): 99 def run_rematch(rematch, outdir, reference_file, bam_file, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage, minimum_gene_identity, debug_mode_true, doNotRemoveConsensus):
94 module_dir = os.path.join(outdir, 'rematch', '') 100 module_dir = os.path.join(outdir, 'rematch', '')
95 utils.removeDirectory(module_dir) 101 utils.removeDirectory(module_dir)
96 os.makedirs(module_dir) 102 os.makedirs(module_dir)
97 103
98 sys.path.append(os.path.join(os.path.dirname(rematch), 'modules', '')) 104 sys.path.append(os.path.join(os.path.dirname(rematch), 'modules'))
99 import rematch_module as rematch 105 import rematch_module as rematch
100 106
101 print 'Analysing alignment data' 107 print('Analysing alignment data')
102 run_successfully, sample_data, consensus_files, consensus_sequences = sequence_data('sample', reference_file, bam_file, module_dir, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, rematch) 108 run_successfully, sample_data, consensus_files, consensus_sequences = sequence_data('sample', reference_file, bam_file, module_dir, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, rematch)
103 109
104 if run_successfully: 110 if run_successfully:
105 number_absent_genes, number_genes_multiple_alleles, mean_sample_coverage = write_report(outdir, sample_data, minimum_gene_coverage, minimum_gene_identity) 111 number_absent_genes, number_genes_multiple_alleles, mean_sample_coverage = \
112 determine_general_statistics(sample_data=sample_data, minimum_gene_coverage=minimum_gene_coverage,
113 minimum_gene_identity=minimum_gene_identity)
106 114
107 if not debug_mode_true: 115 if not debug_mode_true:
108 utils.removeDirectory(module_dir) 116 utils.removeDirectory(module_dir)
109 117
110 clean_rematch_folder(consensus_files, bam_file, reference_file, outdir, doNotRemoveConsensus, debug_mode_true) 118 clean_rematch_folder(consensus_files, bam_file, reference_file, outdir, doNotRemoveConsensus, debug_mode_true)