Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
comparison scripts/modules/run_rematch.py @ 3:0cbed1c0a762 draft default tip
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
| author | cstrittmatter |
|---|---|
| date | Tue, 28 Jan 2020 10:42:31 -0500 |
| parents | 965517909457 |
| children |
comparison
equal
deleted
inserted
replaced
| 2:6837f733b4aa | 3:0cbed1c0a762 |
|---|---|
| 1 import functools | 1 import functools |
| 2 import os | 2 import os |
| 3 import sys | 3 import sys |
| 4 import multiprocessing | |
| 4 | 5 |
| 5 import utils | 6 try: |
| 7 import modules.utils as utils | |
| 8 except ImportError: | |
| 9 from pathotyping.modules import utils as utils | |
| 10 | |
| 11 | |
| 12 # {'noMatter': '/home/ubuntu/NGStools/patho_typing/mpmachado_stuff.out_test/rematch/sample.noMatter.fasta', 'correct': '/home/ubuntu/NGStools/patho_typing/mpmachado_stuff.out_test/rematch/sample.correct.fasta', 'alignment': '/home/ubuntu/NGStools/patho_typing/mpmachado_stuff.out_test/rematch/sample.alignment.fasta'} | |
| 13 | |
| 6 | 14 |
| 7 def remove_alignment(alignment_file): | 15 def remove_alignment(alignment_file): |
| 8 directory = os.path.dirname(alignment_file) | 16 directory = os.path.dirname(alignment_file) |
| 9 files = [f for f in os.listdir(directory) if not f.startswith('.') and os.path.isfile(os.path.join(directory, f))] | 17 files = [f for f in os.listdir(directory) if not f.startswith('.') and os.path.isfile(os.path.join(directory, f))] |
| 10 for file_found in files: | 18 for file_found in files: |
| 22 | 30 |
| 23 | 31 |
| 24 def clean_rematch_folder(consensus_files, bam_file, reference_file, outdir, doNotRemoveConsensus, debug_mode_true): | 32 def clean_rematch_folder(consensus_files, bam_file, reference_file, outdir, doNotRemoveConsensus, debug_mode_true): |
| 25 if not debug_mode_true: | 33 if not debug_mode_true: |
| 26 if not doNotRemoveConsensus: | 34 if not doNotRemoveConsensus: |
| 27 for consensus_type, file_path in consensus_files.items(): | 35 for consensus_type, file_path in list(consensus_files.items()): |
| 28 if os.path.isfile(file_path): | 36 if os.path.isfile(file_path): |
| 29 os.remove(file_path) | 37 os.remove(file_path) |
| 30 if bam_file is not None: | 38 if bam_file is not None: |
| 31 remove_alignment(bam_file) | 39 remove_alignment(bam_file) |
| 32 remove_reference_stuff(outdir, reference_file) | 40 remove_reference_stuff(outdir, reference_file) |
| 37 utils.removeDirectory(sequence_data_outdir) | 45 utils.removeDirectory(sequence_data_outdir) |
| 38 os.mkdir(sequence_data_outdir) | 46 os.mkdir(sequence_data_outdir) |
| 39 | 47 |
| 40 sequences, headers = utils.get_sequence_information(reference_file, length_extra_seq) | 48 sequences, headers = utils.get_sequence_information(reference_file, length_extra_seq) |
| 41 | 49 |
| 42 threads_2_use = rematch.determine_threads_2_use(len(sequences), threads) | |
| 43 | |
| 44 import multiprocessing | |
| 45 | |
| 46 pool = multiprocessing.Pool(processes=threads) | 50 pool = multiprocessing.Pool(processes=threads) |
| 47 for sequence_counter in sequences: | 51 for sequence_counter in sequences: |
| 48 sequence_dir = os.path.join(sequence_data_outdir, str(sequence_counter), '') | 52 sequence_dir = os.path.join(sequence_data_outdir, str(sequence_counter), '') |
| 49 utils.removeDirectory(sequence_dir) | 53 utils.removeDirectory(sequence_dir) |
| 50 os.makedirs(sequence_dir) | 54 os.makedirs(sequence_dir) |
| 51 pool.apply_async(rematch.analyse_sequence_data, args=(bam_file, sequences[sequence_counter], sequence_dir, sequence_counter, reference_file, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, threads_2_use,)) | 55 pool.apply_async(rematch.analyse_sequence_data, args=(bam_file, sequences[sequence_counter], sequence_dir, sequence_counter, reference_file, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele,)) |
| 52 pool.close() | 56 pool.close() |
| 53 pool.join() | 57 pool.join() |
| 54 | 58 |
| 55 run_successfully, sample_data, consensus_files, consensus_sequences = rematch.gather_data_together(sample, sequence_data_outdir, sequences, outdir.rsplit('/', 2)[0], debug_mode_true, length_extra_seq, False) | 59 run_successfully, sample_data, consensus_files, consensus_sequences = rematch.gather_data_together(sample, sequence_data_outdir, sequences, outdir.rsplit('/', 2)[0], debug_mode_true, length_extra_seq, False) |
| 56 | 60 |
| 57 return run_successfully, sample_data, consensus_files, consensus_sequences | 61 return run_successfully, sample_data, consensus_files, consensus_sequences |
| 58 | 62 |
| 59 | 63 |
| 60 def write_report(outdir, sample_data, minimum_gene_coverage, minimum_gene_identity): | 64 def determine_general_statistics(sample_data, minimum_gene_coverage, minimum_gene_identity): |
| 61 print 'Writing report file' | 65 print('Writing report file') |
| 62 number_absent_genes = 0 | 66 number_absent_genes = 0 |
| 63 number_genes_multiple_alleles = 0 | 67 number_genes_multiple_alleles = 0 |
| 64 mean_sample_coverage = 0 | 68 mean_sample_coverage = 0 |
| 65 with open(os.path.join(outdir, 'rematchModule_report.txt'), 'wt') as writer: | 69 |
| 70 with open('output_dir/rematch/rematchModule_report.txt', 'wt') as writer: | |
| 66 writer.write('\t'.join(['#gene', 'percentage_gene_coverage', 'gene_mean_read_coverage', 'percentage_gene_low_coverage', 'number_positions_multiple_alleles', 'percentage_gene_identity']) + '\n') | 71 writer.write('\t'.join(['#gene', 'percentage_gene_coverage', 'gene_mean_read_coverage', 'percentage_gene_low_coverage', 'number_positions_multiple_alleles', 'percentage_gene_identity']) + '\n') |
| 67 for i in range(1, len(sample_data) + 1): | 72 for i in range(1, len(sample_data) + 1): |
| 68 writer.write('\t'.join([sample_data[i]['header'], str(round(sample_data[i]['gene_coverage'], 2)), str(round(sample_data[i]['gene_mean_read_coverage'], 2)), str(round(sample_data[i]['gene_low_coverage'], 2)), str(sample_data[i]['gene_number_positions_multiple_alleles']), str(round(sample_data[i]['gene_identity'], 2))]) + '\n') | 73 writer.write('\t'.join([sample_data[i]['header'], str(round(sample_data[i]['gene_coverage'], 2)), str(round(sample_data[i]['gene_mean_read_coverage'], 2)), str(round(sample_data[i]['gene_low_coverage'], 2)), str(sample_data[i]['gene_number_positions_multiple_alleles']), str(round(sample_data[i]['gene_identity'], 2))]) + '\n') |
| 69 | 74 |
| 70 if sample_data[i]['gene_coverage'] < minimum_gene_coverage or sample_data[i]['gene_identity'] < minimum_gene_identity: | 75 if sample_data[i]['gene_coverage'] < minimum_gene_coverage or sample_data[i]['gene_identity'] < minimum_gene_identity: |
| 79 else: | 84 else: |
| 80 mean_sample_coverage = 0 | 85 mean_sample_coverage = 0 |
| 81 | 86 |
| 82 writer.write('\n'.join(['#general', '>number_absent_genes', str(number_absent_genes), '>number_genes_multiple_alleles', str(number_genes_multiple_alleles), '>mean_sample_coverage', str(round(mean_sample_coverage, 2))]) + '\n') | 87 writer.write('\n'.join(['#general', '>number_absent_genes', str(number_absent_genes), '>number_genes_multiple_alleles', str(number_genes_multiple_alleles), '>mean_sample_coverage', str(round(mean_sample_coverage, 2))]) + '\n') |
| 83 | 88 |
| 84 print '\n'.join([str('number_absent_genes: ' + str(number_absent_genes)), str('number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles)), str('mean_sample_coverage: ' + str(round(mean_sample_coverage, 2)))]) + '\n' | 89 print('\n'.join([str('number_absent_genes: ' + str(number_absent_genes)), |
| 90 str('number_genes_multiple_alleles: ' + str(number_genes_multiple_alleles)), | |
| 91 str('mean_sample_coverage: ' + str(round(mean_sample_coverage, 2)))]) + '\n') | |
| 85 | 92 |
| 86 return number_absent_genes, number_genes_multiple_alleles, mean_sample_coverage | 93 return number_absent_genes, number_genes_multiple_alleles, mean_sample_coverage |
| 87 | |
| 88 | 94 |
| 89 module_timer = functools.partial(utils.timer, name='Module ReMatCh') | 95 module_timer = functools.partial(utils.timer, name='Module ReMatCh') |
| 90 | 96 |
| 91 | 97 |
| 92 @module_timer | 98 @module_timer |
| 93 def run_rematch(rematch, outdir, reference_file, bam_file, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage, minimum_gene_identity, debug_mode_true, doNotRemoveConsensus): | 99 def run_rematch(rematch, outdir, reference_file, bam_file, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage, minimum_gene_identity, debug_mode_true, doNotRemoveConsensus): |
| 94 module_dir = os.path.join(outdir, 'rematch', '') | 100 module_dir = os.path.join(outdir, 'rematch', '') |
| 95 utils.removeDirectory(module_dir) | 101 utils.removeDirectory(module_dir) |
| 96 os.makedirs(module_dir) | 102 os.makedirs(module_dir) |
| 97 | 103 |
| 98 sys.path.append(os.path.join(os.path.dirname(rematch), 'modules', '')) | 104 sys.path.append(os.path.join(os.path.dirname(rematch), 'modules')) |
| 99 import rematch_module as rematch | 105 import rematch_module as rematch |
| 100 | 106 |
| 101 print 'Analysing alignment data' | 107 print('Analysing alignment data') |
| 102 run_successfully, sample_data, consensus_files, consensus_sequences = sequence_data('sample', reference_file, bam_file, module_dir, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, rematch) | 108 run_successfully, sample_data, consensus_files, consensus_sequences = sequence_data('sample', reference_file, bam_file, module_dir, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, rematch) |
| 103 | 109 |
| 104 if run_successfully: | 110 if run_successfully: |
| 105 number_absent_genes, number_genes_multiple_alleles, mean_sample_coverage = write_report(outdir, sample_data, minimum_gene_coverage, minimum_gene_identity) | 111 number_absent_genes, number_genes_multiple_alleles, mean_sample_coverage = \ |
| 112 determine_general_statistics(sample_data=sample_data, minimum_gene_coverage=minimum_gene_coverage, | |
| 113 minimum_gene_identity=minimum_gene_identity) | |
| 106 | 114 |
| 107 if not debug_mode_true: | 115 if not debug_mode_true: |
| 108 utils.removeDirectory(module_dir) | 116 utils.removeDirectory(module_dir) |
| 109 | 117 |
| 110 clean_rematch_folder(consensus_files, bam_file, reference_file, outdir, doNotRemoveConsensus, debug_mode_true) | 118 clean_rematch_folder(consensus_files, bam_file, reference_file, outdir, doNotRemoveConsensus, debug_mode_true) |
