view macs21_wrapper.py @ 3:4124781932db draft

Updated to MACS 2.1.1 and use conda for dependency resolution.
author pjbriggs
date Tue, 20 Mar 2018 11:25:04 -0400
parents 00d73c812399
children
line wrap: on
line source

#!/bin/env python
#
# Galaxy wrapper to run MACS 2.1
#
# Completely rewritten from the original macs2 wrapped by Ziru Zhou
# taken from http://toolshed.g2.bx.psu.edu/view/modencode-dcc/macs2

import sys
import os
import subprocess
import tempfile
import shutil

def move_file(working_dir,name,destination):
    """Move a file 'name' from 'working_dir' to 'destination'

    """
    if destination is None:
        # Nothing to do
        return
    source = os.path.join(working_dir,name)
    if os.path.exists(source):
        shutil.move(source,destination)

def convert_xls_to_interval(xls_file,interval_file,header=None):
    """Convert MACS XLS file to interval

    From the MACS readme: "Coordinates in XLS is 1-based which is different with
    BED format."

    However this function no longer performs any coordinate conversions, it
    simply ensures that any blank or non-data lines are commented out

    """
    fp = open(interval_file,'wb')
    if header:
        fp.write('#%s\n' % header)
    for line in open(xls_file):
        # Keep all existing comment lines
        if line.startswith('#'):
            fp.write(line)
        else:
            # Split line into fields and test to see if
            # the 'start' field is actually an integer
            fields = line.split('\t')
            if len(fields) > 1:
                try:
                    int(fields[1])
                except ValueError:
                    # Integer conversion failed so comment out
                    # "bad" line instead
                    fields[0] = "#%s" % fields[0]
            fp.write( '\t'.join( fields ) )
    fp.close()

def make_bigwig_from_bedgraph(bedgraph_file,bigwig_file,
                              chrom_sizes,working_dir=None):
    """Make bigWig file from a bedGraph

    The protocol is:

    $ fetchChromSizes.sh mm9 > mm9.chrom.sizes
    $ bedClip treat.bedgraph mm9.chrom.sizes treat.clipped
    $ bedSort treat.clipped treat.clipped.sorted
    $ bedGraphToBigWig treat.clipped.sorted mm9.chrom.sizes treat.bw

    Get the binaries from
    http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/

    We skip the fetchChromSizes step if the 'chrom_sizes'
    argument supplied a valid file with the chromosome sizes
    for the genome build in question.

    """
    print "Generating bigWig from bedGraph..."
    # Check for chromosome sizes
    if not os.path.exists(chrom_sizes):
        # Determine genome build
        chrom_sizes = os.path.basename(chrom_sizes)
        genome_build = chrom_sizes.split('.')[0]
        if genome_build == '?':
            # No genome build set
            sys.stderr.write("ERROR genome build not set, cannot get sizes for '?'\n")
            sys.stderr.write("Assign a genome build to your input dataset and rerun\n")
            sys.exit(1)
        print "Missing chrom sizes file, attempting to fetch for '%s'" % genome_build
        # Run fetchChromSizes
        chrom_sizes = os.path.join(working_dir,chrom_sizes)
        stderr_file = os.path.join(working_dir,"fetchChromSizes.stderr")
        cmd = "fetchChromSizes %s" % genome_build
        print "Running %s" % cmd
        proc = subprocess.Popen(args=cmd,shell=True,cwd=working_dir,
                                stdout=open(chrom_sizes,'wb'),
                                stderr=open(stderr_file,'wb'))
        proc.wait()
        # Copy stderr from fetchChromSizes for information only
        for line in open(stderr_file,'r'):
            print line.strip()
        os.remove(stderr_file)
        # Check that the sizes file was downloaded
        if not os.path.exists(chrom_sizes):
            sys.stderr.write("Failed to download chrom sizes for '%s'\n" % genome_build)
            sys.exit(1)
    # Run bedClip
    treat_clipped = "%s.clipped" % os.path.basename(bedgraph_file)
    cmd = "bedClip %s %s %s" % (bedgraph_file,chrom_sizes,treat_clipped)
    print "Running %s" % cmd
    proc = subprocess.Popen(args=cmd,shell=True,cwd=working_dir)
    proc.wait()
    # Check that clipped file exists
    treat_clipped = os.path.join(working_dir,treat_clipped)
    if not os.path.exists(treat_clipped):
        sys.stderr.write("Failed to create clipped bed file\n")
        sys.exit(1)
    # Run bedSort
    treat_clipped_sorted = "%s.sorted" % os.path.basename(treat_clipped)
    cmd = "bedSort %s %s" % (treat_clipped,treat_clipped_sorted)
    print "Running %s" % cmd
    proc = subprocess.Popen(args=cmd,shell=True,cwd=working_dir)
    proc.wait()
    # Check that sorted file exists
    treat_clipped_sorted = os.path.join(working_dir,treat_clipped_sorted)
    if not os.path.exists(treat_clipped_sorted):
        sys.stderr.write("Failed to create sorted clipped bed file\n")
        sys.exit(1)
    # Run bedGraphToBigWig
    cmd = "bedGraphToBigWig %s %s %s" % (treat_clipped_sorted,
                                         chrom_sizes,
                                         bigwig_file)
    print "Running %s" % cmd
    proc = subprocess.Popen(args=cmd,shell=True,cwd=working_dir)
    proc.wait()
    # Clean up temporary chrom length file
    if os.path.dirname(chrom_sizes) == working_dir:
        print "Removing temporary chrom sizes file"
        os.remove(chrom_sizes)

if __name__ == "__main__":

    # Echo the command line
    print ' '.join(sys.argv)

    # Initialise output files - values are set by reading from
    # the command line supplied by the Galaxy wrapper
    output_extra_html = None
    output_extra_path = None
    output_broadpeaks = None
    output_gappedpeaks = None
    output_narrowpeaks = None
    output_treat_pileup = None
    output_lambda_bedgraph = None
    output_bigwig = None
    output_xls_to_interval_peaks_file = None
    output_peaks = None
    output_bdgcmp = None

    # Other initialisations
    chrom_sizes_file = None

    # Build the MACS 2.1 command line
    # Initial arguments are always the same: command & input ChIP-seq file name
    cmdline = ["macs2 %s -t %s" % (sys.argv[1],sys.argv[2])]

    # Process remaining args
    for arg in sys.argv[3:]:
        if arg.startswith('--format='):
            # Convert format to uppercase
            format_ = arg.split('=')[1].upper()
            cmdline.append("--format=%s" % format_)
        elif arg.startswith('--name='):
            # Replace whitespace in name with underscores
            experiment_name = '_'.join(arg.split('=')[1].split())
            cmdline.append("--name=%s" % experiment_name)
        elif arg.startswith('--length='):
            # Extract chromosome size file
            chrom_sizes_file = arg.split('=')[1]
        elif arg.startswith('--output-'):
            # Handle destinations for output files
            arg0,filen = arg.split('=')
            if arg0 == '--output-summits':
                output_summits = filen
            elif arg0 == '--output-extra-files':
                output_extra_html = filen
            elif  arg0 == '--output-extra-files-path':
                output_extra_path = filen
            elif  arg0 == '--output-broadpeaks':
                output_broadpeaks = filen
            elif  arg0 == '--output-gappedpeaks':
                output_gappedpeaks = filen
            elif  arg0 == '--output-narrowpeaks':
                output_narrowpeaks = filen
            elif  arg0 == '--output-pileup':
                output_treat_pileup = filen
            elif  arg0 == '--output-lambda-bedgraph':
                output_lambda_bedgraph = filen
            elif  arg0 == '--output-bigwig':
                output_bigwig = filen
            elif  arg0 == '--output-xls-to-interval':
                output_xls_to_interval_peaks_file = filen
            elif  arg0 == '--output-peaks':
                output_peaks = filen
        else:
            # Pass remaining args directly to MACS
            # command line
            cmdline.append(arg)
    
    cmdline = ' '.join(cmdline)
    print "Generated command line:\n%s" % cmdline

    # Execute MACS2
    #
    # Make a working directory
    working_dir = tempfile.mkdtemp()
    #
    # Collect stderr in a file for reporting later
    stderr_filen = tempfile.NamedTemporaryFile().name
    #
    # Run MACS2
    proc = subprocess.Popen(args=cmdline,shell=True,cwd=working_dir,
                            stderr=open(stderr_filen,'wb'))
    proc.wait()
    exit_code = proc.returncode
    if exit_code != 0:
        sys.stderr.write(open(stderr_filen,'rb').read())
        sys.exit(exit_code)
    
    # Run R script to create PDF from model script
    if os.path.exists(os.path.join(working_dir,"%s_model.r" % experiment_name)):
        cmdline = 'R --vanilla --slave < "%s_model.r" > "%s_model.r.log"' % \
                  (experiment_name, experiment_name)
        proc = subprocess.Popen(args=cmdline,shell=True,cwd=working_dir)
        proc.wait()

    # Convert XLS to interval, if requested
    if output_xls_to_interval_peaks_file is not None:
        peaks_xls_file = os.path.join(working_dir,'%s_peaks.xls' % experiment_name )
        if os.path.exists(peaks_xls_file):
            convert_xls_to_interval(peaks_xls_file,output_xls_to_interval_peaks_file,
                                    header='peaks file')

    # Create bigWig from bedGraph, if requested
    if output_bigwig is not None:
        treat_bedgraph_file = os.path.join(working_dir,'%s_treat_pileup.bdg' % experiment_name)
        if os.path.exists(treat_bedgraph_file):
            make_bigwig_from_bedgraph(treat_bedgraph_file,output_bigwig,
                                      chrom_sizes_file,working_dir)
        
    # Move MACS2 output files from working dir to their final destinations
    move_file(working_dir,"%s_summits.bed" % experiment_name,output_summits)
    move_file(working_dir,"%s_peaks.xls" % experiment_name,output_peaks)
    move_file(working_dir,"%s_peaks.narrowPeak" % experiment_name,output_narrowpeaks)
    move_file(working_dir,"%s_peaks.broadPeak" % experiment_name,output_broadpeaks)
    move_file(working_dir,"%s_peaks.gappedPeak" % experiment_name,output_gappedpeaks)
    move_file(working_dir,"%s_treat_pileup.bdg" % experiment_name,output_treat_pileup)
    move_file(working_dir,"%s_control_lambda.bdg" % experiment_name,output_lambda_bedgraph)
    move_file(working_dir,"bdgcmp_out.bdg",output_bdgcmp)

    # Move remaining file to the 'extra files' path and link from the HTML
    # file to allow user to access them from within Galaxy
    html_file = open(output_extra_html,'wb')
    html_file.write('<html><head><title>Additional output created by MACS (%s)</title></head><body><h3>Additional Files:</h3><p><ul>\n' % experiment_name)
    # Make the 'extra files' directory
    os.mkdir(output_extra_path)
    # Move the files
    for filen in sorted(os.listdir(working_dir)):
        shutil.move(os.path.join(working_dir,filen),
                    os.path.join(output_extra_path,filen))
        html_file.write( '<li><a href="%s">%s</a></li>\n' % (filen,filen))
    # All files moved, close out HTML
    html_file.write( '</ul></p>\n' )
    # Append any stderr output
    html_file.write('<h3>Messages from MACS:</h3>\n<p><pre>%s</pre></p>\n' %
                    open(stderr_filen,'rb').read())
    html_file.write('</body></html>\n')
    html_file.close()

    # Clean up the working directory and files
    os.unlink(stderr_filen)
    os.rmdir(working_dir)