0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 Runs Smalt on single-end or paired-end data.
|
|
5 Produces a SAM file containing the mappings.
|
|
6 Works with Smalt version 0.7.1.
|
|
7
|
|
8 usage: smalt_wrapper.py [options]
|
|
9
|
|
10 See below for options
|
|
11 """
|
|
12
|
|
13 import optparse, os, shutil, subprocess, sys, tempfile
|
|
14
|
|
15 def stop_err( msg ):
|
|
16 sys.stderr.write( '%s\n' % msg )
|
|
17 sys.exit()
|
|
18
|
|
19 def __main__():
|
|
20 #Parse Command Line
|
|
21 parser = optparse.OptionParser()
|
|
22 parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' )
|
|
23 parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' )
|
|
24 parser.add_option( '-f', '--input1', dest='fastq', help='The (forward) fastq file to use for the mapping' )
|
|
25 parser.add_option( '-F', '--input2', dest='rfastq', help='The reverse fastq file to use for mapping if paired-end data' )
|
|
26 parser.add_option( '-u', '--output', dest='output', help='The file to save the output (SAM format)' )
|
|
27 parser.add_option( '-g', '--genAlignType', dest='genAlignType', help='The type of pairing (single or paired)' )
|
|
28 parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' )
|
|
29 parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' )
|
|
30 parser.add_option( '-x', '--exhaustiveSearch', dest='exhaustiveSearch', help='This flag triggers a more exhaustive search for alignments at the cost of decreased speed' )
|
|
31 parser.add_option( '-c', '--minCover', dest='minCover', help='Only consider mappings where the k-mer word seeds cover the query read to a minimum extent' )
|
|
32 parser.add_option( '-d', '--scorDiff', dest='scorDiff', help='Set a threshold of the Smith-Waterman alignment score relative to the maximum score' )
|
|
33 parser.add_option( '-i', '--insertMax', dest='insertMax', help='Maximum insert size (Only in paired-end mode)' )
|
|
34 parser.add_option( '-j', '--insertMin', dest='insertMin', help='Minimum insert size (Only in paired-end mode)' )
|
|
35 parser.add_option( '-l', '--pairTyp', dest='pairTyp', help='Type of read pair library, can be either pe, mp or pp' )
|
|
36 parser.add_option( '-m', '--minScor', dest='minScor', help='Sets an absolute threshold of the Smith-Waterman scores' )
|
|
37 parser.add_option( '-a', '--partialAlignments', dest='partialAlignments', help='Report partial alignments if they are complementary on the read (split reads)' )
|
|
38 parser.add_option( '-q', '--minBasq', dest='minBasq', help='Sets a base quality threshold (0 <= minbasq <= 10, default 0)' )
|
|
39 parser.add_option( '-e', '--seed', dest='seed', help='If <seed> >= 0 report an alignment selected at random where there are multiple mappings with the same best alignment score. With <seed> = 0 (default) a seed is derived from the current calendar time. If <seed> < 0 reads with multiple best mappings are reported as "not mapped".' )
|
|
40 parser.add_option( '-w', '--complexityWeighted', dest='complexityWeighted', help='Smith-Waterman scores are complexity weighted' )
|
|
41 parser.add_option( '-y', '--minId', dest='minId', help='Sets an identity threshold for a mapping to be reported' )
|
|
42 parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' )
|
|
43 parser.add_option( '-X', '--do_not_build_index', dest='do_not_build_index', action='store_true', help="Don't build index" )
|
|
44 parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' )
|
|
45 (options, args) = parser.parse_args()
|
|
46
|
|
47 # output version # of tool
|
|
48 try:
|
|
49 tmp = tempfile.NamedTemporaryFile().name
|
|
50 tmp_stdout = open( tmp, 'wb' )
|
|
51 proc = subprocess.Popen( args='smalt 2>&1', shell=True, stdout=tmp_stdout )
|
|
52 tmp_stdout.close()
|
|
53 returncode = proc.wait()
|
|
54 stdout = None
|
|
55 for line in open( tmp_stdout.name, 'rb' ):
|
|
56 if line.lower().find( 'version' ) >= 0:
|
|
57 stdout = line.strip()
|
|
58 break
|
|
59 if stdout:
|
|
60 sys.stdout.write( 'SMALT %s\n' % stdout )
|
|
61 else:
|
|
62 raise Exception
|
|
63 except:
|
|
64 sys.stdout.write( 'Could not determine SMALT version\n' )
|
|
65
|
|
66 fastq = options.fastq
|
|
67 if options.rfastq:
|
|
68 rfastq = options.rfastq
|
|
69
|
|
70 # make temp directory for placement of indices
|
|
71 tmp_index_dir = tempfile.mkdtemp()
|
|
72 tmp_dir = tempfile.mkdtemp()
|
|
73 # index if necessary
|
|
74 if options.fileSource == 'history' and not options.do_not_build_index:
|
|
75 ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
|
|
76 ref_file_name = ref_file.name
|
|
77 ref_file.close()
|
|
78 os.symlink( options.ref, ref_file_name )
|
|
79 cmd1 = 'smalt index %s %s' % ( ref_file_name, ref_file_name )
|
|
80 try:
|
|
81 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
|
|
82 tmp_stderr = open( tmp, 'wb' )
|
|
83 proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
|
|
84 returncode = proc.wait()
|
|
85 tmp_stderr.close()
|
|
86 # get stderr, allowing for case where it's very large
|
|
87 tmp_stderr = open( tmp, 'rb' )
|
|
88 stderr = ''
|
|
89 buffsize = 1048576
|
|
90 try:
|
|
91 while True:
|
|
92 stderr += tmp_stderr.read( buffsize )
|
|
93 if not stderr or len( stderr ) % buffsize != 0:
|
|
94 break
|
|
95 except OverflowError:
|
|
96 pass
|
|
97 tmp_stderr.close()
|
|
98 if returncode != 0:
|
|
99 raise Exception, stderr
|
|
100 except Exception, e:
|
|
101 # clean up temp dirs
|
|
102 if os.path.exists( tmp_index_dir ):
|
|
103 shutil.rmtree( tmp_index_dir )
|
|
104 if os.path.exists( tmp_dir ):
|
|
105 shutil.rmtree( tmp_dir )
|
|
106 stop_err( 'Error indexing reference sequence. ' + str( e ) )
|
|
107 else:
|
|
108 ref_file_name = options.ref
|
|
109
|
|
110 # set up aligning and generate aligning command options
|
|
111 if options.params == 'pre_set':
|
|
112 aligning_cmds = '-n %s ' % ( options.threads )
|
|
113 gen_alignment_cmds = ''
|
|
114 else:
|
|
115 if options.exhaustiveSearch == 'true':
|
|
116 exhaustiveSearch = '-x'
|
|
117 minCover = '-c %s' % options.minCover
|
|
118 else:
|
|
119 exhaustiveSearch = ''
|
|
120 minCover = ''
|
|
121 if options.partialAlignments == 'true':
|
|
122 partialAlignments = '-x'
|
|
123 else:
|
|
124 partialAlignments = ''
|
|
125 if options.complexityWeighted == 'true':
|
|
126 complexityWeighted = '-w'
|
|
127 else:
|
|
128 complexityWeighted = ''
|
|
129 aligning_cmds = '-d %s -m %s -q %s -r %s -y %s' % \
|
|
130 ( options.scorDiff, options.minScor, options.minBasq, options.seed, options.minId )
|
|
131 if options.genAlignType == 'paired':
|
|
132 gen_alignment_cmds = '-i %s -j %s -l %s' % ( options.insertMax, options.insertMin, options.pairTyp )
|
|
133 else:
|
|
134 gen_alignment_cmds = ''
|
|
135 # prepare actual aligning and generate aligning commands
|
|
136 if options.genAlignType == 'paired':
|
|
137 cmd = 'smalt map %s %s -o %s %s %s ' % ( aligning_cmds, gen_alignment_cmds, options.output, ref_file_name, fastq, rfastq )
|
|
138 else:
|
|
139 cmd = 'smalt map %s -o %s %s %s ' % ( aligning_cmds, options.output, ref_file_name, fastq )
|
|
140 # perform alignments
|
|
141 buffsize = 1048576
|
|
142 try:
|
|
143 # need to nest try-except in try-finally to handle 2.4
|
|
144 try:
|
|
145 # align
|
|
146 try:
|
|
147 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
|
|
148 tmp_stderr = open( tmp, 'wb' )
|
|
149 proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
|
|
150 returncode = proc.wait()
|
|
151 tmp_stderr.close()
|
|
152 # get stderr, allowing for case where it's very large
|
|
153 tmp_stderr = open( tmp, 'rb' )
|
|
154 stderr = ''
|
|
155 try:
|
|
156 while True:
|
|
157 stderr += tmp_stderr.read( buffsize )
|
|
158 if not stderr or len( stderr ) % buffsize != 0:
|
|
159 break
|
|
160 except OverflowError:
|
|
161 pass
|
|
162 tmp_stderr.close()
|
|
163 if returncode != 0:
|
|
164 raise Exception, stderr
|
|
165 except Exception, e:
|
|
166 raise Exception, 'Error aligning sequence. ' + str( e )
|
|
167 # remove header if necessary
|
|
168 if options.suppressHeader == 'true':
|
|
169 tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
|
|
170 tmp_out_name = tmp_out.name
|
|
171 tmp_out.close()
|
|
172 try:
|
|
173 shutil.move( options.output, tmp_out_name )
|
|
174 except Exception, e:
|
|
175 raise Exception, 'Error moving output file before removing headers. ' + str( e )
|
|
176 fout = file( options.output, 'w' )
|
|
177 for line in file( tmp_out.name, 'r' ):
|
|
178 if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ):
|
|
179 fout.write( line )
|
|
180 fout.close()
|
|
181 # check that there are results in the output file
|
|
182 if os.path.getsize( options.output ) > 0:
|
|
183 sys.stdout.write( 'SMALT run on %s-end data' % options.genAlignType )
|
|
184 else:
|
|
185 raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
|
|
186 except Exception, e:
|
|
187 stop_err( 'The alignment failed.\n' + str( e ) )
|
|
188 finally:
|
|
189 # clean up temp dir
|
|
190 if os.path.exists( tmp_index_dir ):
|
|
191 shutil.rmtree( tmp_index_dir )
|
|
192 if os.path.exists( tmp_dir ):
|
|
193 shutil.rmtree( tmp_dir )
|
|
194
|
|
195 if __name__=="__main__": __main__()
|