comparison novocraft_wrapper.py @ 3:c43f9a578e05 draft

Uploaded
author subazini
date Wed, 17 Dec 2014 10:17:19 -0500
parents
children
comparison
equal deleted inserted replaced
2:55b3da30e4e5 3:c43f9a578e05
1 #!/usr/bin/env python
2
3 import optparse, os, shutil, subprocess, sys, tempfile, fileinput,re
4 import time
5 def stop_err( msg ):
6 sys.stderr.write( "%s\n" % msg )
7 sys.exit()
8
9 def __main__():
10 #Parse Command Line
11 start = time.time()
12 print "start time", start
13 parser = optparse.OptionParser()
14 # Wrapper options.
15 parser.add_option( '-0', '--output', dest='output' )
16 parser.add_option( '-1', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' )
17 parser.add_option( '-2', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' )
18 parser.add_option( '-Q', '--Quality', dest='quality', help='' )
19 parser.add_option( '-t', '--align', dest='align' )
20 parser.add_option( '-g', '--open', dest='open' )
21 parser.add_option( '-x', '--extend', dest='extend' )
22 parser.add_option( '-n', '--trunc', dest='trunc' )
23 parser.add_option( '-r', '--refer', dest='genome_reference' )
24 parser.add_option( '-f', '--ref1', dest='genome_index' )
25 parser.add_option( '-k', '--kmer', dest='kmer' )
26 parser.add_option( '-s', '--step', dest='step' )
27 parser.add_option( '-l', '--qual', dest='quality' )
28 parser.add_option( '-m', '--repeat', dest='repeat' )
29 parser.add_option( '-H', '--hclip', dest='hclip' )
30 parser.add_option( '-p', '--pam', dest='pam' )
31 parser.add_option( '-d', '--sd', dest='sd' )
32 parser.add_option( '-i', '--insert', dest='insert' )
33 parser.add_option( '-e', '--settings', dest='settings' )
34 (options, args) = parser.parse_args()
35 print options.genome_reference
36 print options.output
37 print options.input1
38 print options.input2
39
40 tmp_index_dir = tempfile.mkdtemp()
41 #indexing genome
42 if options.genome_reference:
43 index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( options.genome_reference )[1].split( '.' )[:-1] ) )
44 try:
45 os.link( options.genome_reference, index_path + '.fa' )
46 Mtest= os.link( options.genome_reference, index_path + '.fa' )
47 print Mtest
48 except:
49 pass
50 cmd_index = 'novoindex %s %s' % ( index_path, options.genome_reference)
51 if options.settings == "full":
52 opts = ' -k %s -s %s ' %(options.kmer,options.step)
53 cmd_index += opts
54 print cmd_index
55 try:
56 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
57 tmp_stderr = open( tmp, 'wb' )
58 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
59 pid = proc.pid
60 with file('/proc/%s/smaps' % pid) as fp:
61 sx1= sum([int(x) for x in re.findall('^Pss:\s+(\d+)',fp.read(),re.M)])
62 returncode = proc.wait()
63 tmp_stderr.close()
64 # get stderr, allowing for case where it's very large
65 tmp_stderr = open( tmp, 'rb' )
66 stderr = ''
67 buffsize = 1048576
68 try:
69 while True:
70 stderr += tmp_stderr.read( buffsize )
71 if not stderr or len( stderr ) % buffsize != 0:
72 break
73 except OverflowError:
74 pass
75 tmp_stderr.close()
76 if returncode != 0:
77 raise Exception, stderr
78 except Exception, e:
79 if os.path.exists( tmp_index_dir ):
80 shutil.rmtree( tmp_index_dir )
81 stop_err( 'Error indexing reference sequence\n' + str( e ) )
82 opts = ''
83 #using genome reference from history
84 if options.genome_index:
85 index_path = options.genome_index
86 sx1 = 0
87 #Aligning with reference
88 if options.input2:
89 cmd_index2 = 'novoalign -f %s %s -d %s ' %( options.input1,options.input2,index_path)
90 else:
91 cmd_index2 = 'novoalign -f %s -d %s ' %( options.input1,index_path)
92 # Selected settimgs
93 if options.settings == "full":
94 opts = ' -t %s -g %s -x %s -n %s -H %s -h %s -i %s %s,%s > %s' %(options.align,options.open,options.extend,options.trunc,options.hclip,options.repeat,options.pam,options.insert,options.sd,options.output)
95 else:
96 opts = ' > %s' %(options.output)
97 cmd_index2 += opts
98 print cmd_index2
99 try:
100 tmp_out = tempfile.NamedTemporaryFile().name
101 tmp_stdout = open( tmp_out, 'wb' )
102 tmp_err = tempfile.NamedTemporaryFile().name
103 tmp_stderr = open( tmp_err, 'wb' )
104 proc = subprocess.Popen( args=cmd_index2, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
105 pid = proc.pid
106 with file('/proc/%s/smaps' % pid) as fp:
107 sx2= sum([int(x) for x in re.findall('^Pss:\s+(\d+)',fp.read(),re.M)])
108 sx = sx1 + sx2
109 print cmd_index2
110 print "memory usage (KiBs)", sx
111 returncode = proc.wait()
112 tmp_stderr.close()
113 # get stderr, allowing for case where it's very large
114 tmp_stderr = open( tmp_err, 'rb' )
115 stderr = ''
116 buffsize = 1048576
117 try:
118 while True:
119 stderr += tmp_stderr.read( buffsize )
120 if not stderr or len( stderr ) % buffsize != 0:
121 break
122 except OverflowError:
123 pass
124 tmp_stdout.close()
125 tmp_stderr.close()
126 if returncode != 0:
127 raise Exception, stderr
128 end = time.time()
129 print "end time", end
130 elapsed = end - start
131 print ("Time elapsed: {0:4f}sec",format(elapsed))
132 except Exception, e:
133 stop_err( 'Error in Novoalign:\n' + str( e ) )
134 if __name__=="__main__": __main__()