view novocraft_wrapper.py @ 3:c43f9a578e05 draft

Uploaded
author subazini
date Wed, 17 Dec 2014 10:17:19 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env python

import optparse, os, shutil, subprocess, sys, tempfile, fileinput,re
import time
def stop_err( msg ):
    sys.stderr.write( "%s\n" % msg )
    sys.exit()

def __main__():
    #Parse Command Line
    start = time.time()
    print "start time", start
    parser = optparse.OptionParser()
    # Wrapper options.
    parser.add_option( '-0', '--output', dest='output' )
    parser.add_option( '-1', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' )
    parser.add_option( '-2', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' )
    parser.add_option( '-Q', '--Quality', dest='quality', help='' )
    parser.add_option( '-t', '--align', dest='align' )
    parser.add_option( '-g', '--open', dest='open' )
    parser.add_option( '-x', '--extend', dest='extend' )
    parser.add_option( '-n', '--trunc', dest='trunc' )
    parser.add_option( '-r', '--refer', dest='genome_reference' )
    parser.add_option( '-f', '--ref1', dest='genome_index' )
    parser.add_option( '-k', '--kmer', dest='kmer' )
    parser.add_option( '-s', '--step', dest='step' )
    parser.add_option( '-l', '--qual', dest='quality' )
    parser.add_option( '-m', '--repeat', dest='repeat' )
    parser.add_option( '-H', '--hclip', dest='hclip' )
    parser.add_option( '-p', '--pam', dest='pam' )
    parser.add_option( '-d', '--sd', dest='sd' )
    parser.add_option( '-i', '--insert', dest='insert' )
    parser.add_option( '-e', '--settings', dest='settings' )
    (options, args) = parser.parse_args()
    print options.genome_reference    
    print options.output
    print options.input1
    print options.input2

    tmp_index_dir = tempfile.mkdtemp()
    #indexing genome
    if options.genome_reference:
        index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( options.genome_reference )[1].split( '.' )[:-1] ) )
        try:
            os.link( options.genome_reference, index_path + '.fa' )
            Mtest= os.link( options.genome_reference, index_path + '.fa' )
            print Mtest
        except:
            pass
        cmd_index = 'novoindex %s %s' % ( index_path, options.genome_reference)
       	if options.settings == "full":
            opts = ' -k %s -s %s ' %(options.kmer,options.step)
            cmd_index +=  opts   
        print cmd_index
        try:
            tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
            tmp_stderr = open( tmp, 'wb' )
            proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
            pid = proc.pid
            with file('/proc/%s/smaps' % pid) as fp:
       		sx1= sum([int(x) for x in re.findall('^Pss:\s+(\d+)',fp.read(),re.M)])
            returncode = proc.wait()
            tmp_stderr.close()
            # get stderr, allowing for case where it's very large
            tmp_stderr = open( tmp, 'rb' )
            stderr = ''
            buffsize = 1048576
            try:
                while True:
                    stderr += tmp_stderr.read( buffsize )
                    if not stderr or len( stderr ) % buffsize != 0:
                        break
            except OverflowError:
                pass
            tmp_stderr.close()
            if returncode != 0:
                raise Exception, stderr
        except Exception, e:
            if os.path.exists( tmp_index_dir ):
                shutil.rmtree( tmp_index_dir )
            stop_err( 'Error indexing reference sequence\n' + str( e ) )
    opts = ''
    #using genome reference from history
    if options.genome_index:
        index_path = options.genome_index
        sx1 = 0
    #Aligning with reference
    if options.input2:
    	cmd_index2 = 'novoalign -f %s %s -d %s '  %( options.input1,options.input2,index_path) 
    else:
	cmd_index2 = 'novoalign -f %s -d %s '  %( options.input1,index_path) 
    # Selected settimgs
    if options.settings == "full":
    	opts = ' -t %s -g %s -x %s -n %s -H %s -h %s -i %s %s,%s > %s' %(options.align,options.open,options.extend,options.trunc,options.hclip,options.repeat,options.pam,options.insert,options.sd,options.output)
    else:
    	opts = '  > %s' %(options.output)
    cmd_index2 +=  opts
    print cmd_index2
    try:
        tmp_out = tempfile.NamedTemporaryFile().name
        tmp_stdout = open( tmp_out, 'wb' )
        tmp_err = tempfile.NamedTemporaryFile().name
        tmp_stderr = open( tmp_err, 'wb' )
        proc = subprocess.Popen( args=cmd_index2, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
        pid = proc.pid
        with file('/proc/%s/smaps' % pid) as fp:
       		sx2= sum([int(x) for x in re.findall('^Pss:\s+(\d+)',fp.read(),re.M)])
       	sx = sx1 + sx2
       	print cmd_index2
        print "memory usage (KiBs)", sx
        returncode = proc.wait()
        tmp_stderr.close()
        # get stderr, allowing for case where it's very large
        tmp_stderr = open( tmp_err, 'rb' )
        stderr = ''
        buffsize = 1048576
        try:
            while True:
                stderr += tmp_stderr.read( buffsize )
                if not stderr or len( stderr ) % buffsize != 0:
                    break
        except OverflowError:
            pass
        tmp_stdout.close()
        tmp_stderr.close()
        if returncode != 0:
            raise Exception, stderr
        end =  time.time() 
        print "end time", end
        elapsed = end - start  
        print ("Time elapsed: {0:4f}sec",format(elapsed)) 
    except Exception, e:
        stop_err( 'Error in Novoalign:\n' + str( e ) ) 
if __name__=="__main__": __main__()