diff bismark_methylation_extractor.py @ 8:9bfe38410155 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit 51299fa62f0566a4a897b1c149db564631282fff
author bgruening
date Wed, 22 Aug 2018 08:09:42 -0400
parents
children ff6ee551b153
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bismark_methylation_extractor.py	Wed Aug 22 08:09:42 2018 -0400
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+
+import argparse
+import logging
+import math
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import zipfile
+from glob import glob
+
+
+def stop_err(logger, msg):
+    logger.critical(msg)
+    sys.exit(1)
+
+
+def log_subprocess_output(logger, pipe):
+    for line in iter(pipe.readline, b''):
+        logger.debug(line.decode().rstrip())
+
+
+def zipper(dir, zip_file):
+    output_files_regex = re.compile('^(Non_)?C[pH][GH]_.*')
+    bedgraph_regex = re.compile('.*bedGraph.gz')
+    zip = zipfile.ZipFile(zip_file, 'w', compression=zipfile.ZIP_DEFLATED)
+    root_len = len(os.path.abspath(dir))
+    for root, dirs, files in os.walk(dir):
+        archive_root = os.path.abspath(root)[root_len:]
+        for f in files:
+            if re.search(output_files_regex, f) or re.search(bedgraph_regex, f):
+                fullpath = os.path.join(root, f)
+                archive_name = os.path.join(archive_root, f)
+                zip.write(fullpath, archive_name, zipfile.ZIP_DEFLATED)
+    zip.close()
+    return zip_file
+
+
+def build_genome_dir(genome_file):
+    tmp_genome_dir = tempfile.mkdtemp(prefix='tmp')
+    genome_path = os.path.join(tmp_genome_dir, '.'.join(os.path.split(genome_file)[1].split('.')[:-1]))
+    try:
+        # Create a hard link pointing to genome_file named 'genome_path'.fa.
+        os.symlink(genome_file, genome_path + '.fa')
+    except Exception as e:
+        if os.path.exists(tmp_genome_dir):
+            shutil.rmtree(tmp_genome_dir)
+        stop_err('Error in linking the reference database!\n%s' % e)
+    return tmp_genome_dir
+
+
+def __main__():
+    # Parse Command Line
+    parser = argparse.ArgumentParser(description='Wrapper for the bismark methylation caller.')
+
+    # input options
+    parser.add_argument('--infile', help='Input file in SAM or BAM format.')
+    parser.add_argument('--single-end', dest='single_end', action="store_true")
+    parser.add_argument('--paired-end', dest='paired_end', action="store_true")
+
+    parser.add_argument('--multicore', dest='multicore', type=int, default=1)
+    parser.add_argument('--splitting_report', dest='splitting_report')
+    parser.add_argument('--mbias_report', dest='mbias_report')
+    parser.add_argument('--cytosine_report', dest="cytosine_report")
+    parser.add_argument('--genome_file', dest="genome_file")
+    parser.add_argument('--cx_context', action="store_true")
+
+    parser.add_argument('--comprehensive', action="store_true")
+    parser.add_argument('--merge-non-cpg', dest='merge_non_cpg', action="store_true")
+    parser.add_argument('--no-overlap', dest='no_overlap', action="store_true")
+    parser.add_argument('--compress', dest='compress')
+    parser.add_argument('--ignore', dest='ignore', type=int)
+    parser.add_argument('--ignore_r2', dest='ignore_r2', type=int)
+    parser.add_argument('--ignore_3prime', dest='ignore_3prime', type=int)
+    parser.add_argument('--ignore_3prime_r2', dest='ignore_3prime_r2', type=int)
+    parser.add_argument('--log_report', dest='log_report', metavar='log_filename', type=str)
+    args = parser.parse_args()
+
+    logger = logging.getLogger('bismark_methylation_extractor_wrapper')
+    logger.setLevel(logging.DEBUG)
+    ch = logging.StreamHandler(sys.stdout)
+    if args.log_report:
+        ch.setLevel(logging.WARNING)
+        handler = logging.FileHandler(args.log_report)
+        handler.setLevel(logging.DEBUG)
+        logger.addHandler(handler)
+    else:
+        ch.setLevel(logging.DEBUG)
+    logger.addHandler(ch)
+
+    # Build methylation extractor command
+    output_dir = tempfile.mkdtemp()
+    cmd = ['bismark_methylation_extractor', '--no_header', '-o', output_dir]
+    # Set up all options
+    if args.multicore > 3:
+        # divide multicore by 3 here since bismark will spawn ~3 jobs.
+        cmd.extend(['--multicore', str(math.ceil(args.multicore / 3))])
+    if args.single_end:
+        cmd.append('--single-end')
+    else:
+        cmd.append('--paired-end')
+    if args.no_overlap:
+        cmd.append('--no_overlap')
+    if args.ignore:
+        cmd.extend(['--ignore', str(args.ignore)])
+    if args.ignore_r2:
+        cmd.extend(['--ignore_r2', str(args.ignore_r2)])
+    if args.ignore_3prime:
+        cmd.extend(['--ignore_3prime', str(args.ignore_3prime)])
+    if args.ignore_3prime_r2:
+        cmd.extend(['--ignore_3prime_r2', str(args.ignore_3prime_r2)])
+    if args.comprehensive:
+        cmd.append('--comprehensive')
+    if args.merge_non_cpg:
+        cmd.append('--merge_non_CpG')
+    if args.splitting_report:
+        cmd.append('--report')
+    tmp_genome_dir = None
+    if args.cytosine_report:
+        tmp_genome_dir = build_genome_dir(args.genome_file)
+        if args.cx_context:
+            cmd.extend(
+                ['--bedGraph', '--CX_context', '--cytosine_report', '--CX_context', '--genome_folder', tmp_genome_dir])
+        else:
+            cmd.extend(['--bedGraph', '--cytosine_report', '--genome_folder', tmp_genome_dir])
+
+    cmd.append(args.infile)
+
+    # Run
+    logger.info("Methylation extractor run with: '%s'", " ".join(cmd))
+    prev_dir = os.getcwd()
+    os.chdir(output_dir) # needed due to a bug in bismark where the coverage file cannot be found
+    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    with process.stdout:
+        log_subprocess_output(logger, process.stdout)
+    exitcode = process.wait()
+    if exitcode != 0:
+        stop_err(logger, "Bismark methylation extractor error (also check the log file if any)!\n%s" % process.stderr)
+    logger.info("Finished methylation extractor.")
+    # collect and copy output files
+    logger.debug("Zip output files to '%s'.", args.compress)
+    os.chdir(prev_dir)
+    zipper(output_dir, args.compress)
+
+    # cytosine report
+    if args.cytosine_report:
+        logger.debug("Collecting cytosine report.")
+        if args.cx_context:
+            shutil.move(glob(os.path.join(output_dir, '*CX_report.txt'))[0], args.cytosine_report)
+        else:
+            shutil.move(glob(os.path.join(output_dir, '*CpG_report.txt'))[0], args.cytosine_report)
+    # splitting report
+    if args.splitting_report:
+        logger.debug("Collecting splitting report.")
+        shutil.move(glob(os.path.join(output_dir, '*_splitting_report.txt'))[0], args.splitting_report)
+    if args.mbias_report:
+        logger.debug("Collecting M-Bias file.")
+        shutil.move(glob(os.path.join(output_dir, '*M-bias.txt'))[0], args.mbias_report)
+
+    # Clean up temp dirs
+    logger.debug("Cleanup temp dirs.")
+    if os.path.exists(output_dir):
+        shutil.rmtree(output_dir)
+    if tmp_genome_dir and os.path.exists(tmp_genome_dir):
+        shutil.rmtree(tmp_genome_dir)
+    logger.info("Done.")
+
+if __name__ == "__main__": __main__()