Mercurial > repos > bgruening > bismark
diff bismark_wrapper.py @ 8:9bfe38410155 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit 51299fa62f0566a4a897b1c149db564631282fff
author | bgruening |
---|---|
date | Wed, 22 Aug 2018 08:09:42 -0400 |
parents | |
children | 7bffcb6fc81d |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_wrapper.py Wed Aug 22 08:09:42 2018 -0400 @@ -0,0 +1,289 @@ +#!/usr/bin/env python + +import argparse +import fileinput +import logging +import math +import os +import shutil +import subprocess +import sys +import tempfile +from glob import glob + + +def stop_err(logger, msg): + logger.critical(msg) + sys.exit(1) + + +def log_subprocess_output(logger, pipe): + for line in iter(pipe.readline, b''): + logger.debug(line.decode().rstrip()) + + +def __main__(): + # Parse Command Line + parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.') + parser.add_argument('-p', '--num-threads', dest='num_threads', + type=int, default=4, help='Use this many threads to align reads. The default is 4.') + + # input options + parser.add_argument('--own-file', dest='own_file', help='') + parser.add_argument('-D', '--indexes-path', dest='index_path', + help='Indexes directory; location of .ebwt and .fa files.') + parser.add_argument('-O', '--output', dest='output') + + parser.add_argument('--output-report-file', dest='output_report_file') + parser.add_argument('--suppress-header', dest='suppress_header', action="store_true") + + parser.add_argument('--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', + default=False) + + parser.add_argument('-1', '--mate1', dest='mate1', + help='The forward reads file in Sanger FASTQ or FASTA format.') + parser.add_argument('-2', '--mate2', dest='mate2', + help='The reverse reads file in Sanger FASTQ or FASTA format.') + parser.add_argument('--sort-bam', dest='sort_bam', action="store_true") + + parser.add_argument('--output-unmapped-reads', dest='output_unmapped_reads', + help='Additional output file with unmapped reads (single-end).') + parser.add_argument('--output-unmapped-reads-l', dest='output_unmapped_reads_l', + help='File name for unmapped reads (left, paired-end).') + parser.add_argument('--output-unmapped-reads-r', dest='output_unmapped_reads_r', + help='File name for unmapped reads (right, paired-end).') + + parser.add_argument('--output-suppressed-reads', dest='output_suppressed_reads', + help='Additional output file with suppressed reads (single-end).') + parser.add_argument('--output-suppressed-reads-l', dest='output_suppressed_reads_l', + help='File name for suppressed reads (left, paired-end).') + parser.add_argument('--output-suppressed-reads-r', dest='output_suppressed_reads_r', + help='File name for suppressed reads (right, paired-end).') + parser.add_argument('--stdout', dest='output_stdout', + help='File name for the standard output of bismark.') + + parser.add_argument('--single-paired', dest='single_paired', + help='The single-end reads file in Sanger FASTQ or FASTA format.') + + parser.add_argument('--fastq', action='store_true', help='Query filetype is in FASTQ format') + parser.add_argument('--fasta', action='store_true', help='Query filetype is in FASTA format') + parser.add_argument('--phred64-quals', dest='phred64', action="store_true") + + parser.add_argument('--skip-reads', dest='skip_reads', type=int) + parser.add_argument('--qupto', type=int) + + # paired end options + parser.add_argument('-I', '--minins', dest='min_insert') + parser.add_argument('-X', '--maxins', dest='max_insert') + parser.add_argument('--no-mixed', dest='no_mixed', action="store_true") + parser.add_argument('--no-discordant', dest='no_discordant', action="store_true") + + # parse general options + # default 20 + parser.add_argument('--seed-len', dest='seed_len', type=int) + # default 15 + parser.add_argument('--seed-extention-attempts', dest='seed_extention_attempts', type=int) + # default 0 + parser.add_argument('--seed-mismatches', dest='seed_mismatches', type=int) + # default 2 + parser.add_argument('--max-reseed', dest='max_reseed', type=int) + """ + # default 70 + parser.add_argument( '--maqerr', dest='maqerr', type=int ) + """ + + """ + The number of megabytes of memory a given thread is given to store path + descriptors in --best mode. Best-first search must keep track of many paths + at once to ensure it is always extending the path with the lowest cumulative + cost. Bowtie tries to minimize the memory impact of the descriptors, but + they can still grow very large in some cases. If you receive an error message + saying that chunk memory has been exhausted in --best mode, try adjusting + this parameter up to dedicate more memory to the descriptors. Default: 512. + """ + parser.add_argument('--chunkmbs', type=int, default=512) + + args = parser.parse_args() + + logger = logging.getLogger('bismark_wrapper') + logger.setLevel(logging.DEBUG) + ch = logging.StreamHandler(sys.stdout) + if args.output_stdout: + ch.setLevel(logging.WARNING) + handler = logging.FileHandler(args.output_stdout) + handler.setLevel(logging.DEBUG) + logger.addHandler(handler) + else: + ch.setLevel(logging.DEBUG) + logger.addHandler(ch) + + # Create bismark index if necessary. + index_dir = "" + tmp_index_dir = None + if args.own_file: + logger.info("Create a temporary index with the offered files from the user. " + "Utilizing the script: bismark_genome_preparation") + tmp_index_dir = tempfile.mkdtemp() + index_path = os.path.join(tmp_index_dir, '.'.join(os.path.split(args.own_file)[1].split('.')[:-1])) + try: + # Create a hard link pointing to args.own_file named 'index_path'.fa. + os.symlink(args.own_file, index_path + '.fa') + except Exception as e: + if os.path.exists(tmp_index_dir): + shutil.rmtree(tmp_index_dir) + stop_err(logger, 'Error in linking the reference database!\n%s' % e) + # bismark_genome_preparation needs the complete path to the folder in which the database is stored + cmd_index = ['bismark_genome_preparation', '--bowtie2', tmp_index_dir] + try: + logger.info("Generating index with: '%s'", " ".join(cmd_index)) + process = subprocess.Popen(cmd_index, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + with process.stdout: + log_subprocess_output(logger, process.stdout) + exitcode = process.wait() + if exitcode != 0: + raise Exception(process.stderr) + except Exception as e: + if os.path.exists(tmp_index_dir): + shutil.rmtree(tmp_index_dir) + stop_err(logger, 'Error indexing reference sequence!\n%s' % e) + index_dir = tmp_index_dir + else: + # bowtie path is the path to the index directory and the first path of the index file name + index_dir = os.path.dirname(args.index_path) + + # Build bismark command + tmp_bismark_dir = tempfile.mkdtemp() + output_dir = os.path.join(tmp_bismark_dir, 'results') + cmd = ['bismark', '--bam', '--gzip', '--temp_dir', tmp_bismark_dir, '-o', output_dir, '--quiet'] + + if args.fasta: + # the query input files (specified as mate1,mate2 or singles) are FastA + cmd.append('--fasta') + elif args.fastq: + cmd.append('--fastq') + + # alignment options + if args.num_threads > 2: + # divide num_threads by 2 here since bismark will spawn 2 jobs with -p threads each + cmd.extend(['-p', str(math.ceil(args.num_threads / 2))]) + if args.seed_mismatches: + cmd.extend(['-N', str(args.seed_mismatches)]) + if args.seed_len: + cmd.extend(['-L', str(args.seed_len)]) + if args.seed_extention_attempts: + cmd.extend(['-D', str(args.seed_extention_attempts)]) + if args.max_reseed: + cmd.extend(['-R', str(args.max_reseed)]) + if args.no_discordant: + cmd.append('--no-discordant') + if args.no_mixed: + cmd.append('--no-mixed') + if args.skip_reads: + cmd.extend(['--skip', str(args.skip_reads)]) + if args.qupto: + cmd.extend(['--upto', 'args.qupto']) + if args.phred64: + cmd.append('--phred64-quals') + if args.suppress_header: + cmd.append('--sam-no-hd') + if args.output_unmapped_reads or (args.output_unmapped_reads_l and args.output_unmapped_reads_r): + cmd.append('--un') + if args.output_suppressed_reads or (args.output_suppressed_reads_l and args.output_suppressed_reads_r): + cmd.append('--ambiguous') + + cmd.append(index_dir) + # Set up the reads + if args.mate_paired: + # paired-end reads library + cmd.extend(['-1', args.mate1, '-2', args.mate2, '-I', str(args.min_insert), '-X', str(args.max_insert)]) + else: + # single paired reads library + cmd.append(args.single_paired) + + # Run + try: + logger.info("Running bismark with: '%s'", " ".join(cmd)) + process = subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + with process.stdout: + log_subprocess_output(logger, process.stdout) + exitcode = process.wait() + if exitcode != 0: + raise Exception(process.stderr) + except Exception as e: + stop_err(logger, 'Error in running bismark!\n%s' % e) + + # collect and copy output files + if args.output_report_file: + output_report_file = open(args.output_report_file, 'w+') + for line in fileinput.input(glob(os.path.join(output_dir, '*report.txt'))): + output_report_file.write(line) + output_report_file.close() + + if args.output_suppressed_reads: + if glob(os.path.join(output_dir, '*ambiguous_reads.txt')): + shutil.move(glob(os.path.join(output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads) + if args.output_suppressed_reads_l: + if glob(os.path.join(output_dir, '*ambiguous_reads_1.txt')): + shutil.move(glob(os.path.join(output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l) + if args.output_suppressed_reads_r: + if glob(os.path.join(output_dir, '*ambiguous_reads_2.txt')): + shutil.move(glob(os.path.join(output_dir, '*ambiguous_reads_2.txt'))[0], args.output_suppressed_reads_r) + + if args.output_unmapped_reads: + if glob(os.path.join(output_dir, '*unmapped_reads.txt')): + shutil.move(glob(os.path.join(output_dir, '*unmapped_reads.txt'))[0], args.output_unmapped_reads) + if args.output_unmapped_reads_l: + if glob(os.path.join(output_dir, '*unmapped_reads_1.txt')): + shutil.move(glob(os.path.join(output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l) + if args.output_unmapped_reads_r: + if glob(os.path.join(output_dir, '*unmapped_reads_2.txt')): + shutil.move(glob(os.path.join(output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r) + + try: + # merge all bam files + tmp_res = tempfile.NamedTemporaryFile(dir=output_dir).name + + bam_files = glob(os.path.join(output_dir, '*.bam')) + if len(bam_files) > 1: + cmd = ['samtools', 'merge', '-@', str(args.num_threads), '-f', tmp_res, ' '.join(bam_files)] + logger.info("Merging bams with: '%s'", cmd) + process = subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + with process.stdout: + log_subprocess_output(logger, process.stdout) + exitcode = process.wait() + if exitcode != 0: + raise Exception(process.stderr) + else: + tmp_res = bam_files[0] + + bam_path = "%s" % tmp_res + + if os.path.exists(bam_path): + if args.sort_bam: + cmd = ['samtools', 'sort', '-@', str(args.num_threads), bam_path, 'sorted_bam'] + logger.info("Sorting bam with: '%s'", cmd) + process = subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + with process.stdout: + log_subprocess_output(logger, process.stdout) + exitcode = process.wait() + if exitcode != 0: + raise Exception(process.stderr) + shutil.move('sorted_bam.bam', args.output) + else: + shutil.move(bam_path, args.output) + else: + stop_err(logger, 'BAM file no found:\n%s' % bam_path) + + except Exception as e: + stop_err(logger, 'Error in merging bam files!\n%s' % e) + + # Clean up temp dirs + if tmp_index_dir and os.path.exists(tmp_index_dir): + shutil.rmtree(tmp_index_dir) + if os.path.exists(tmp_bismark_dir): + shutil.rmtree(tmp_bismark_dir) + if os.path.exists(output_dir): + shutil.rmtree(output_dir) + + +if __name__ == "__main__": __main__()