comparison smalt_wrapper.py @ 1:54855bd8d107 draft default tip

First attempt to get tool_dependencies.xml right.
author cjav
date Wed, 20 Mar 2013 17:07:14 -0400
parents
children
comparison
equal deleted inserted replaced
0:747433a6de00 1:54855bd8d107
1 #!/usr/bin/env python
2
3 """
4 Runs Smalt on single-end or paired-end data.
5 Produces a SAM file containing the mappings.
6 Works with Smalt version 0.7.1.
7
8 usage: smalt_wrapper.py [options]
9
10 See below for options
11 """
12
13 import optparse, os, shutil, subprocess, sys, tempfile
14
15 def stop_err( msg ):
16 sys.stderr.write( '%s\n' % msg )
17 sys.exit()
18
19 def __main__():
20 #Parse Command Line
21 parser = optparse.OptionParser()
22 parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' )
23 parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' )
24 parser.add_option( '-f', '--input1', dest='fastq', help='The (forward) fastq file to use for the mapping' )
25 parser.add_option( '-F', '--input2', dest='rfastq', help='The reverse fastq file to use for mapping if paired-end data' )
26 parser.add_option( '-u', '--output', dest='output', help='The file to save the output (SAM format)' )
27 parser.add_option( '-g', '--genAlignType', dest='genAlignType', help='The type of pairing (single or paired)' )
28 parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' )
29 parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' )
30 parser.add_option( '-x', '--exhaustiveSearch', dest='exhaustiveSearch', help='This flag triggers a more exhaustive search for alignments at the cost of decreased speed' )
31 parser.add_option( '-c', '--minCover', dest='minCover', help='Only consider mappings where the k-mer word seeds cover the query read to a minimum extent' )
32 parser.add_option( '-d', '--scorDiff', dest='scorDiff', help='Set a threshold of the Smith-Waterman alignment score relative to the maximum score' )
33 parser.add_option( '-i', '--insertMax', dest='insertMax', help='Maximum insert size (Only in paired-end mode)' )
34 parser.add_option( '-j', '--insertMin', dest='insertMin', help='Minimum insert size (Only in paired-end mode)' )
35 parser.add_option( '-l', '--pairTyp', dest='pairTyp', help='Type of read pair library, can be either pe, mp or pp' )
36 parser.add_option( '-m', '--minScor', dest='minScor', help='Sets an absolute threshold of the Smith-Waterman scores' )
37 parser.add_option( '-a', '--partialAlignments', dest='partialAlignments', help='Report partial alignments if they are complementary on the read (split reads)' )
38 parser.add_option( '-q', '--minBasq', dest='minBasq', help='Sets a base quality threshold (0 <= minbasq <= 10, default 0)' )
39 parser.add_option( '-e', '--seed', dest='seed', help='If <seed> >= 0 report an alignment selected at random where there are multiple mappings with the same best alignment score. With <seed> = 0 (default) a seed is derived from the current calendar time. If <seed> < 0 reads with multiple best mappings are reported as "not mapped".' )
40 parser.add_option( '-w', '--complexityWeighted', dest='complexityWeighted', help='Smith-Waterman scores are complexity weighted' )
41 parser.add_option( '-y', '--minId', dest='minId', help='Sets an identity threshold for a mapping to be reported' )
42 parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' )
43 parser.add_option( '-X', '--do_not_build_index', dest='do_not_build_index', action='store_true', help="Don't build index" )
44 parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' )
45 (options, args) = parser.parse_args()
46
47 # output version # of tool
48 try:
49 tmp = tempfile.NamedTemporaryFile().name
50 tmp_stdout = open( tmp, 'wb' )
51 proc = subprocess.Popen( args='smalt_`uname -i` 2>&1', shell=True, stdout=tmp_stdout )
52 tmp_stdout.close()
53 returncode = proc.wait()
54 stdout = None
55 for line in open( tmp_stdout.name, 'rb' ):
56 if line.lower().find( 'version' ) >= 0:
57 stdout = line.strip()
58 break
59 if stdout:
60 sys.stdout.write( 'SMALT %s\n' % stdout )
61 else:
62 raise Exception
63 except:
64 sys.stdout.write( 'Could not determine SMALT version\n' )
65
66 fastq = options.fastq
67 if options.rfastq:
68 rfastq = options.rfastq
69
70 # make temp directory for placement of indices
71 tmp_index_dir = tempfile.mkdtemp()
72 tmp_dir = tempfile.mkdtemp()
73 # index if necessary
74 if options.fileSource == 'history' and not options.do_not_build_index:
75 ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
76 ref_file_name = ref_file.name
77 ref_file.close()
78 os.symlink( options.ref, ref_file_name )
79 cmd1 = 'smalt_`uname -i` index %s %s' % ( ref_file_name, ref_file_name )
80 try:
81 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
82 tmp_stderr = open( tmp, 'wb' )
83 proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
84 returncode = proc.wait()
85 tmp_stderr.close()
86 # get stderr, allowing for case where it's very large
87 tmp_stderr = open( tmp, 'rb' )
88 stderr = ''
89 buffsize = 1048576
90 try:
91 while True:
92 stderr += tmp_stderr.read( buffsize )
93 if not stderr or len( stderr ) % buffsize != 0:
94 break
95 except OverflowError:
96 pass
97 tmp_stderr.close()
98 if returncode != 0:
99 raise Exception, stderr
100 except Exception, e:
101 # clean up temp dirs
102 if os.path.exists( tmp_index_dir ):
103 shutil.rmtree( tmp_index_dir )
104 if os.path.exists( tmp_dir ):
105 shutil.rmtree( tmp_dir )
106 stop_err( 'Error indexing reference sequence. ' + str( e ) )
107 else:
108 ref_file_name = options.ref
109
110 # set up aligning and generate aligning command options
111 if options.params == 'pre_set':
112 aligning_cmds = '-n %s ' % ( options.threads )
113 gen_alignment_cmds = ''
114 else:
115 if options.exhaustiveSearch == 'true':
116 exhaustiveSearch = '-x'
117 minCover = '-c %s' % options.minCover
118 else:
119 exhaustiveSearch = ''
120 minCover = ''
121 if options.partialAlignments == 'true':
122 partialAlignments = '-x'
123 else:
124 partialAlignments = ''
125 if options.complexityWeighted == 'true':
126 complexityWeighted = '-w'
127 else:
128 complexityWeighted = ''
129 aligning_cmds = '-d %s -m %s -q %s -r %s -y %s' % \
130 ( options.scorDiff, options.minScor, options.minBasq, options.seed, options.minId )
131 if options.genAlignType == 'paired':
132 gen_alignment_cmds = '-i %s -j %s -l %s' % ( options.insertMax, options.insertMin, options.pairTyp )
133 else:
134 gen_alignment_cmds = ''
135 # prepare actual aligning and generate aligning commands
136 if options.genAlignType == 'paired':
137 cmd = 'smalt_`uname -i` map %s %s -o %s %s %s ' % ( aligning_cmds, gen_alignment_cmds, options.output, ref_file_name, fastq, rfastq )
138 else:
139 cmd = 'smalt_`uname -i` map %s -o %s %s %s ' % ( aligning_cmds, options.output, ref_file_name, fastq )
140 # perform alignments
141 buffsize = 1048576
142 try:
143 # need to nest try-except in try-finally to handle 2.4
144 try:
145 # align
146 try:
147 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
148 tmp_stderr = open( tmp, 'wb' )
149 proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
150 returncode = proc.wait()
151 tmp_stderr.close()
152 # get stderr, allowing for case where it's very large
153 tmp_stderr = open( tmp, 'rb' )
154 stderr = ''
155 try:
156 while True:
157 stderr += tmp_stderr.read( buffsize )
158 if not stderr or len( stderr ) % buffsize != 0:
159 break
160 except OverflowError:
161 pass
162 tmp_stderr.close()
163 if returncode != 0:
164 raise Exception, stderr
165 except Exception, e:
166 raise Exception, 'Error aligning sequence. ' + str( e )
167 # remove header if necessary
168 if options.suppressHeader == 'true':
169 tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
170 tmp_out_name = tmp_out.name
171 tmp_out.close()
172 try:
173 shutil.move( options.output, tmp_out_name )
174 except Exception, e:
175 raise Exception, 'Error moving output file before removing headers. ' + str( e )
176 fout = file( options.output, 'w' )
177 for line in file( tmp_out.name, 'r' ):
178 if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ):
179 fout.write( line )
180 fout.close()
181 # check that there are results in the output file
182 if os.path.getsize( options.output ) > 0:
183 sys.stdout.write( 'SMALT run on %s-end data' % options.genAlignType )
184 else:
185 raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
186 except Exception, e:
187 stop_err( 'The alignment failed.\n' + str( e ) )
188 finally:
189 # clean up temp dir
190 if os.path.exists( tmp_index_dir ):
191 shutil.rmtree( tmp_index_dir )
192 if os.path.exists( tmp_dir ):
193 shutil.rmtree( tmp_dir )
194
195 if __name__=="__main__": __main__()