view bismark_methylation_extractor.py @ 21:120b7b35e442 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit 8fdc76a99a9dcf34549898a208317607afd18798"
author bgruening
date Thu, 22 Apr 2021 17:05:46 +0000
parents ff6ee551b153
children 8c191acde702
line wrap: on
line source

#!/usr/bin/env python

import argparse
import logging
import math
import os
import re
import shutil
import subprocess
import sys
import tempfile
import zipfile
from glob import glob


def stop_err(logger, msg):
    logger.critical(msg)
    sys.exit(1)


def log_subprocess_output(logger, pipe):
    for line in iter(pipe.readline, b""):
        logger.debug(line.decode().rstrip())


def zipper(dir, zip_file):
    output_files_regex = re.compile("^(Non_)?C[pH][GH]_.*")
    bedgraph_regex = re.compile(".*bedGraph.gz")
    zip = zipfile.ZipFile(zip_file, "w", compression=zipfile.ZIP_DEFLATED)
    root_len = len(os.path.abspath(dir))
    for root, dirs, files in os.walk(dir):
        archive_root = os.path.abspath(root)[root_len:]
        for f in files:
            if re.search(output_files_regex, f) or re.search(bedgraph_regex, f):
                fullpath = os.path.join(root, f)
                archive_name = os.path.join(archive_root, f)
                zip.write(fullpath, archive_name, zipfile.ZIP_DEFLATED)
    zip.close()
    return zip_file


def build_genome_dir(genome_file):
    tmp_genome_dir = tempfile.mkdtemp(prefix="tmp")
    genome_path = os.path.join(
        tmp_genome_dir, ".".join(os.path.split(genome_file)[1].split(".")[:-1])
    )
    try:
        # Create a hard link pointing to genome_file named 'genome_path'.fa.
        os.symlink(genome_file, genome_path + ".fa")
    except Exception as e:
        if os.path.exists(tmp_genome_dir):
            shutil.rmtree(tmp_genome_dir)
        stop_err("Error in linking the reference database!\n%s" % e)
    return tmp_genome_dir


def __main__():
    # Parse Command Line
    parser = argparse.ArgumentParser(
        description="Wrapper for the bismark methylation caller."
    )

    # input options
    parser.add_argument("--infile", help="Input file in SAM or BAM format.")
    parser.add_argument("--single-end", dest="single_end", action="store_true")
    parser.add_argument("--paired-end", dest="paired_end", action="store_true")

    parser.add_argument("--multicore", dest="multicore", type=int, default=1)
    parser.add_argument("--splitting_report", dest="splitting_report")
    parser.add_argument("--mbias_report", dest="mbias_report")
    parser.add_argument("--cytosine_report", dest="cytosine_report")
    parser.add_argument("--genome_file", dest="genome_file")
    parser.add_argument("--cx_context", action="store_true")

    parser.add_argument("--comprehensive", action="store_true")
    parser.add_argument("--merge-non-cpg", dest="merge_non_cpg", action="store_true")
    parser.add_argument("--no-overlap", dest="no_overlap", action="store_true")
    parser.add_argument("--compress", dest="compress")
    parser.add_argument("--ignore", dest="ignore", type=int)
    parser.add_argument("--ignore_r2", dest="ignore_r2", type=int)
    parser.add_argument("--ignore_3prime", dest="ignore_3prime", type=int)
    parser.add_argument("--ignore_3prime_r2", dest="ignore_3prime_r2", type=int)
    parser.add_argument(
        "--log_report", dest="log_report", metavar="log_filename", type=str
    )
    args = parser.parse_args()

    logger = logging.getLogger("bismark_methylation_extractor_wrapper")
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler(sys.stdout)
    if args.log_report:
        ch.setLevel(logging.WARNING)
        handler = logging.FileHandler(args.log_report)
        handler.setLevel(logging.DEBUG)
        logger.addHandler(handler)
    else:
        ch.setLevel(logging.DEBUG)
    logger.addHandler(ch)

    # Build methylation extractor command
    output_dir = tempfile.mkdtemp()
    cmd = ["bismark_methylation_extractor", "--no_header", "-o", output_dir]
    # Set up all options
    if args.multicore > 3:
        # divide multicore by 3 here since bismark will spawn ~3 jobs.
        cmd.extend(["--multicore", str(int(math.floor(args.multicore / 3)))])
    if args.single_end:
        cmd.append("--single-end")
    else:
        cmd.append("--paired-end")
    if args.no_overlap:
        cmd.append("--no_overlap")
    if args.ignore:
        cmd.extend(["--ignore", str(args.ignore)])
    if args.ignore_r2:
        cmd.extend(["--ignore_r2", str(args.ignore_r2)])
    if args.ignore_3prime:
        cmd.extend(["--ignore_3prime", str(args.ignore_3prime)])
    if args.ignore_3prime_r2:
        cmd.extend(["--ignore_3prime_r2", str(args.ignore_3prime_r2)])
    if args.comprehensive:
        cmd.append("--comprehensive")
    if args.merge_non_cpg:
        cmd.append("--merge_non_CpG")
    if args.splitting_report:
        cmd.append("--report")
    tmp_genome_dir = None
    if args.cytosine_report:
        tmp_genome_dir = build_genome_dir(args.genome_file)
        if args.cx_context:
            cmd.extend(
                [
                    "--bedGraph",
                    "--CX_context",
                    "--cytosine_report",
                    "--CX_context",
                    "--genome_folder",
                    tmp_genome_dir,
                ]
            )
        else:
            cmd.extend(
                ["--bedGraph", "--cytosine_report", "--genome_folder", tmp_genome_dir]
            )

    cmd.append(args.infile)

    # Run
    logger.info("Methylation extractor run with: '%s'", " ".join(cmd))
    prev_dir = os.getcwd()
    os.chdir(
        output_dir
    )  # needed due to a bug in bismark where the coverage file cannot be found
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    with process.stdout:
        log_subprocess_output(logger, process.stdout)
    exitcode = process.wait()
    if exitcode != 0:
        stop_err(
            logger,
            "Bismark methylation extractor error (also check the log file if any)!\n%s"
            % process.stderr,
        )
    logger.info("Finished methylation extractor.")
    # collect and copy output files
    logger.debug("Zip output files to '%s'.", args.compress)
    os.chdir(prev_dir)
    zipper(output_dir, args.compress)

    # cytosine report
    if args.cytosine_report:
        logger.debug("Collecting cytosine report.")
        if args.cx_context:
            shutil.move(
                glob(os.path.join(output_dir, "*CX_report.txt"))[0],
                args.cytosine_report,
            )
        else:
            shutil.move(
                glob(os.path.join(output_dir, "*CpG_report.txt"))[0],
                args.cytosine_report,
            )
    # splitting report
    if args.splitting_report:
        logger.debug("Collecting splitting report.")
        shutil.move(
            glob(os.path.join(output_dir, "*_splitting_report.txt"))[0],
            args.splitting_report,
        )
    if args.mbias_report:
        logger.debug("Collecting M-Bias file.")
        shutil.move(glob(os.path.join(output_dir, "*M-bias.txt"))[0], args.mbias_report)

    # Clean up temp dirs
    logger.debug("Cleanup temp dirs.")
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    if tmp_genome_dir and os.path.exists(tmp_genome_dir):
        shutil.rmtree(tmp_genome_dir)
    logger.info("Done.")


if __name__ == "__main__":
    __main__()