diff shear_wrapper.py @ 0:65255beda972

Uploaded
author jjohnson
date Fri, 08 Nov 2013 16:23:10 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/shear_wrapper.py	Fri Nov 08 16:23:10 2013 -0500
@@ -0,0 +1,236 @@
+#!/usr/bin/env python
+
+import optparse, os, shutil, subprocess, sys, tempfile, os.path
+
+def stop_err( msg, code ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit(code)
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-j', '--jar_path', dest='jar_path', help='Path to SHEAR.jar' )
+    parser.add_option( '-C', '--command', dest='command', help='SHEAR command to run' )
+    parser.add_option( '-p', '--prefix', dest='prefix', help='Prefix for all generated files. Required.' )
+    parser.add_option( '-b', '--bam', dest='bam', help='BAM alignment file containing the input sequences to the assembly.' )
+    parser.add_option( '-B', '--bai', dest='bai', help='BAM index file (.bai)' )
+    parser.add_option( '-f', '--fasta_reference', dest='fasta', help='The reference sequence fasta file' )
+    parser.add_option( '-F', '--fasta_index', dest='fai', help='The .fai index file for the reference sequence fasta' )
+    parser.add_option( '-t', '--twobit', dest='twobit', help='The .2bit encoding of the reference sequence fasta generated by faToTwoBit' )
+    parser.add_option( '-i', '--bwa_index', dest='bwa_index', help='The bwa index of the reference sequence' )
+    parser.add_option( '-R', '--report', dest='report', help='The SHEAR output report' )
+    parser.add_option( '-s', '--sdi', dest='sdi', help='The SHEAR sdi input from the SHEAR sv command' )
+    parser.add_option( '-o', '--output', dest='output', help='The SHEAR output assembly fasta file' )
+    parser.add_option( '-D', '--svidx_dir', dest='svidx_dir', help='The SHEAR output assembly fasta file' )
+    parser.add_option( '-S', '--sv-only', dest='sv_only', action="store_true", help='SV Only prediction mode.' )
+    parser.add_option( '-r', '--region', dest='region', help='Region of the input alignment to analyze' )
+    (options, args) = parser.parse_args()
+
+    def make_ref(src, dest, copy=False):
+        if copy:
+            shutil.copy( src, dest )
+        else:
+            os.symlink( src, dest )
+
+    # output version # of tool
+    #try:
+    #except:
+    #    sys.stdout.write( 'Could not determine BWA version\n' )
+    if not options.jar_path:
+        stop_err('path to SHEAR.jar is required',1)
+    if options.command:
+        command = options.command
+    else:
+        stop_err('SHEAR command is required',1)
+    args = [ 'java','-jar' ]
+    args.append( options.jar_path )
+    args.append( command )
+    # Check required params for command
+    buffsize = 1048576 
+    copy_ref = True
+    prefix = options.prefix if options.prefix and len(options.prefix) > 0 else 'ref'
+    if command in ['sv']:
+        args.append( '-p' )
+        args.append( prefix )
+        if options.sv_only:
+            args.append('--sv-only')
+        if options.region:
+            args.append('-r')
+            args.append(options.region)
+        copy_ref = True # GATK has issue with symlinks for .fa and .fai 
+    if options.svidx_dir and command in ['sv']:
+        if not os.path.isdir(options.svidx_dir):
+            os.makedirs(options.svidx_dir)
+        ref_prefix = os.path.join(options.svidx_dir,prefix)
+        copy_ref = True
+    else:
+        ref_prefix = os.path.join(os.getcwd(),prefix)
+
+    if command in ['sv','assemble']:
+        ref_fasta = ref_prefix + '.fa'
+        ref_index = ref_fasta + '.fai'
+        if options.fasta:
+            make_ref( options.fasta, ref_fasta, copy=copy_ref )
+        else:
+            stop_err('Reference fasta file is required',3)
+        # if reference fasta index is not supllied, generate it
+        if options.fai:
+            make_ref( options.fai, ref_index, copy=copy_ref )
+        elif os.path.exists(options.fasta + '.fai'):
+            make_ref( options.fasta + '.fai', ref_index, copy=copy_ref )
+        else:
+            # generate fasta fai index
+            cmd1 = 'samtools faidx %s' % (ref_fasta )
+            try:
+                tmp_name = tempfile.NamedTemporaryFile(prefix='fai_', suffix='.err').name
+                tmp_stderr = open( tmp_name, 'wb' )
+                proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                if returncode != 0:
+                    stderr = ''
+                    # get stderr, allowing for case where it's very large
+                    tmp_stderr = open( tmp, 'rb' )
+                    try:
+                        while True:
+                            stderr += tmp_stderr.read( buffsize )
+                            if not stderr or len( stderr ) % buffsize != 0:
+                                break
+                    except OverflowError:
+                        pass
+                    raise Exception, stderr
+            except Exception, e:
+                stop_err( 'Error indexing reference sequence fasta file. ' + str( e ),5 )
+        args.append( '-f' )
+        args.append( ref_fasta )
+
+    if command in ['sv']:
+        if not options.bam:
+            stop_err('BAM alignment file is required',2)
+
+        # if reference 2bit is not supplied, generate it
+        twobit = ref_prefix + '.2bit'
+        if options.twobit:
+            make_ref( options.twobit, twobit, copy=copy_ref )
+        else:
+            # generate 2bit index
+            cmd1 = 'faToTwoBit %s %s' % (ref_fasta, twobit )
+            try:
+                tmp_name = tempfile.NamedTemporaryFile(prefix='twobit_', suffix='.err').name
+                tmp_stderr = open( tmp_name, 'wb' )
+                proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                if returncode != 0:
+                    stderr = ''
+                    # get stderr, allowing for case where it's very large
+                    tmp_stderr = open( tmp, 'rb' )
+                    try:
+                        while True:
+                            stderr += tmp_stderr.read( buffsize )
+                            if not stderr or len( stderr ) % buffsize != 0:
+                                break
+                    except OverflowError:
+                        pass
+                    raise Exception, stderr
+            except Exception, e:
+                stop_err( 'Error indexing reference sequence fasta file. ' + str( e ),6 )
+        args.append( '-t' )
+        args.append( twobit )
+
+        # if bwa index is not supplied, generate it
+        bwa_index = ref_fasta
+        if options.bwa_index:
+            if copy_ref:
+                shutil.copytree(os.path.dirname(bwa_index),options.svidx_dir)
+            bwa_index = options.bwa_index
+        else:
+            ONE_GB = 2**30
+            if os.stat( ref_fasta ).st_size <= ONE_GB: #use 1 GB as cut off for memory vs. max of 2gb database size; this is somewhat arbitrary
+                index_algorithm = 'is'
+            else:
+                index_algorithm = 'bwtsw'
+            cmd1 = 'bwa index -a %s %s' % ( index_algorithm, ref_fasta )
+            try:
+                tmp_name = tempfile.NamedTemporaryFile(prefix='bwa_index_', suffix='.err').name
+                tmp_stderr = open( tmp_name, 'wb' )
+                proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                if returncode != 0:
+                    stderr = ''
+                    # get stderr, allowing for case where it's very large
+                    tmp_stderr = open( tmp, 'rb' )
+                    try:
+                        while True:
+                            stderr += tmp_stderr.read( buffsize )
+                            if not stderr or len( stderr ) % buffsize != 0:
+                                break
+                    except OverflowError:
+                        pass
+                    raise Exception, stderr
+                bwa_index = ref_fasta
+            except Exception, e:
+                # clean up temp dirs
+                stop_err( 'Error creating bwa index. ' + str( e ),7 )
+        args.append( '-i' )
+        args.append( bwa_index )
+
+        bam_file = os.path.join(os.getcwd(),prefix + '.bam')
+        bai_file = bam_file + '.bai'
+        if options.bam:
+            os.symlink( options.bam, bam_file )
+        else:
+            stop_err('BAM alignment file is required',2)
+        # if bam index is not supplied, generate it
+        if options.bai:
+            os.symlink( options.bai, bai_file )
+        elif os.path.exists(options.bam + '.bai'):
+            os.symlink( options.bam + '.bai', bai_file )
+        else:
+            # generate bam index
+            cmd1 = 'samtools index %s %s' % (bam_file, bai_file )
+            try:
+                tmp_name = tempfile.NamedTemporaryFile(prefix='bai_', suffix='.err').name
+                tmp_stderr = open( tmp_name, 'wb' )
+                proc = subprocess.Popen( args=cmd1, shell=True, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                if returncode != 0:
+                    stderr = ''
+                    # get stderr, allowing for case where it's very large
+                    tmp_stderr = open( tmp, 'rb' )
+                    try:
+                        while True:
+                            stderr += tmp_stderr.read( buffsize )
+                            if not stderr or len( stderr ) % buffsize != 0:
+                                break
+                    except OverflowError:
+                        pass
+                    raise Exception, stderr
+            except Exception, e:
+                stop_err( 'Error indexing BAM alignment file. ' + str( e ),4)
+        args.append( '-b' )
+        args.append( bam_file )
+    
+    if command in ['assemble']:
+        if not options.sdi or not options.output:
+            stop_err('SHEAR .sdi file and output file are required for assemble',2)
+        args.append( '-s' )
+        args.append( options.sdi )
+        args.append( '-o' )
+        args.append( options.output )
+
+    # Run SHEAR command
+    print >> sys.stdout, "%s" % ' '.join(args)
+    try:
+        proc = subprocess.Popen( args=args, shell=False )
+        returncode = proc.wait()
+        if returncode != 0:
+            stop_err( 'Error running SHEAR ' + command, returncode )
+    except Exception, e:
+        # clean up temp dirs
+        stop_err( 'Error running SHEAR %s %s' % (command,str(e)),9 )
+
+if __name__=="__main__": __main__()
+