0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import optparse, os, shutil, subprocess, sys, tempfile, fileinput
|
|
4
|
|
5 def stop_err( msg ):
|
|
6 sys.stderr.write( "%s\n" % msg )
|
|
7 sys.exit()
|
|
8
|
|
9 def __main__():
|
|
10 #Parse Command Line
|
|
11 parser = optparse.OptionParser()
|
|
12 parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
|
|
13 parser.add_option( '-C', '--color-space', dest='color_space', action='store_true', help='This indicates color-space data' )
|
|
14 parser.add_option( '-J', '--junctions-output', dest='junctions_output_file', help='Junctions output file; formate is BED.' )
|
|
15 parser.add_option( '-H', '--hits-output', dest='accepted_hits_output_file', help='Accepted hits output file; formate is BAM.' )
|
|
16 parser.add_option( '', '--own-file', dest='own_file', help='' )
|
|
17 parser.add_option( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' )
|
|
18 parser.add_option( '-r', '--mate-inner-dist', dest='mate_inner_dist', help='This is the expected (mean) inner distance between mate pairs. \
|
|
19 For, example, for paired end runs with fragments selected at 300bp, \
|
|
20 where each end is 50bp, you should set -r to be 200. There is no default, \
|
|
21 and this parameter is required for paired end runs.')
|
|
22 parser.add_option( '', '--mate-std-dev', dest='mate_std_dev', help='Standard deviation of distribution on inner distances between male pairs.' )
|
|
23 parser.add_option( '-a', '--min-anchor-length', dest='min_anchor_length',
|
|
24 help='The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction.' )
|
|
25 parser.add_option( '-m', '--splice-mismatches', dest='splice_mismatches', help='The maximum number of mismatches that can appear in the anchor region of a spliced alignment.' )
|
|
26 parser.add_option( '-i', '--min-intron-length', dest='min_intron_length',
|
|
27 help='The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart.' )
|
|
28 parser.add_option( '-I', '--max-intron-length', dest='max_intron_length',
|
|
29 help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.' )
|
|
30 parser.add_option( '-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed' )
|
|
31 parser.add_option( '', '--initial-read-mismatches', dest='initial_read_mismatches', help='Number of mismatches allowed in the initial read mapping' )
|
|
32 parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' )
|
|
33 parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' )
|
|
34 parser.add_option( '', '--library-type', dest='library_type', help='TopHat will treat the reads as strand specific. Every read alignment will have an XS attribute tag. Consider supplying library type options below to select the correct RNA-seq protocol.' )
|
|
35 parser.add_option( '', '--allow-indels', action="store_true", help='Allow indel search. Indel search is disabled by default.(Not used since version 1.3.0)' )
|
|
36 parser.add_option( '', '--max-insertion-length', dest='max_insertion_length', help='The maximum insertion length. The default is 3.' )
|
|
37 parser.add_option( '', '--max-deletion-length', dest='max_deletion_length', help='The maximum deletion length. The default is 3.' )
|
|
38
|
|
39 # Options for supplying own junctions
|
|
40 parser.add_option( '-G', '--GTF', dest='gene_model_annotations', help='Supply TopHat with a list of gene model annotations. \
|
|
41 TopHat will use the exon records in this file to build \
|
|
42 a set of known splice junctions for each gene, and will \
|
|
43 attempt to align reads to these junctions even if they \
|
|
44 would not normally be covered by the initial mapping.')
|
|
45 parser.add_option( '-j', '--raw-juncs', dest='raw_juncs', help='Supply TopHat with a list of raw junctions. Junctions are \
|
|
46 specified one per line, in a tab-delimited format. Records \
|
|
47 look like: <chrom> <left> <right> <+/-> left and right are \
|
|
48 zero-based coordinates, and specify the last character of the \
|
|
49 left sequenced to be spliced to the first character of the right \
|
|
50 sequence, inclusive.')
|
|
51 parser.add_option( '', '--no-novel-juncs', action="store_true", dest='no_novel_juncs', help="Only look for junctions indicated in the \
|
|
52 supplied GFF file. (ignored without -G)")
|
|
53 parser.add_option( '', '--no-novel-indels', action="store_true", dest='no_novel_indels', help="Skip indel search. Indel search is enabled by default.")
|
|
54 # Types of search.
|
|
55 parser.add_option( '', '--microexon-search', action="store_true", dest='microexon_search', help='With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.')
|
|
56 parser.add_option( '', '--closure-search', action="store_true", dest='closure_search', help='Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (<= 50bp)')
|
|
57 parser.add_option( '', '--no-closure-search', action="store_false", dest='closure_search' )
|
|
58 parser.add_option( '', '--coverage-search', action="store_true", dest='coverage_search', help='Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.')
|
|
59 parser.add_option( '', '--no-coverage-search', action="store_false", dest='coverage_search' )
|
|
60 parser.add_option( '', '--min-segment-intron', dest='min_segment_intron', help='Minimum intron length that may be found during split-segment search' )
|
|
61 parser.add_option( '', '--max-segment-intron', dest='max_segment_intron', help='Maximum intron length that may be found during split-segment search' )
|
|
62 parser.add_option( '', '--min-closure-exon', dest='min_closure_exon', help='Minimum length for exonic hops in potential splice graph' )
|
|
63 parser.add_option( '', '--min-closure-intron', dest='min_closure_intron', help='Minimum intron length that may be found during closure search' )
|
|
64 parser.add_option( '', '--max-closure-intron', dest='max_closure_intron', help='Maximum intron length that may be found during closure search' )
|
|
65 parser.add_option( '', '--min-coverage-intron', dest='min_coverage_intron', help='Minimum intron length that may be found during coverage search' )
|
|
66 parser.add_option( '', '--max-coverage-intron', dest='max_coverage_intron', help='Maximum intron length that may be found during coverage search' )
|
|
67
|
|
68 # Wrapper options.
|
|
69 parser.add_option( '-1', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' )
|
|
70 parser.add_option( '-2', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' )
|
|
71 parser.add_option( '', '--single-paired', dest='single_paired', help='' )
|
|
72 parser.add_option( '', '--settings', dest='settings', help='' )
|
|
73
|
|
74 (options, args) = parser.parse_args()
|
|
75
|
|
76 # output version # of tool
|
|
77 try:
|
|
78 tmp = tempfile.NamedTemporaryFile().name
|
|
79 tmp_stdout = open( tmp, 'wb' )
|
|
80 proc = subprocess.Popen( args='tophat -v', shell=True, stdout=tmp_stdout )
|
|
81 tmp_stdout.close()
|
|
82 returncode = proc.wait()
|
|
83 stdout = open( tmp_stdout.name, 'rb' ).readline().strip()
|
|
84 if stdout:
|
|
85 sys.stdout.write( '%s\n' % stdout )
|
|
86 else:
|
|
87 raise Exception
|
|
88 except:
|
|
89 sys.stdout.write( 'Could not determine Tophat version\n' )
|
|
90
|
|
91 # Color or base space
|
|
92 space = ''
|
|
93 if options.color_space:
|
|
94 space = '-C'
|
|
95
|
|
96 # Creat bowtie index if necessary.
|
|
97 tmp_index_dir = tempfile.mkdtemp()
|
|
98 if options.own_file:
|
|
99 index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( options.own_file )[1].split( '.' )[:-1] ) )
|
|
100 try:
|
|
101 os.link( options.own_file, index_path + '.fa' )
|
|
102 except:
|
|
103 # Tophat prefers (but doesn't require) fasta file to be in same directory, with .fa extension
|
|
104 pass
|
|
105 cmd_index = 'bowtie-build %s -f %s %s' % ( space, options.own_file, index_path )
|
|
106 try:
|
|
107 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
|
|
108 tmp_stderr = open( tmp, 'wb' )
|
|
109 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
|
|
110 returncode = proc.wait()
|
|
111 tmp_stderr.close()
|
|
112 # get stderr, allowing for case where it's very large
|
|
113 tmp_stderr = open( tmp, 'rb' )
|
|
114 stderr = ''
|
|
115 buffsize = 1048576
|
|
116 try:
|
|
117 while True:
|
|
118 stderr += tmp_stderr.read( buffsize )
|
|
119 if not stderr or len( stderr ) % buffsize != 0:
|
|
120 break
|
|
121 except OverflowError:
|
|
122 pass
|
|
123 tmp_stderr.close()
|
|
124 if returncode != 0:
|
|
125 raise Exception, stderr
|
|
126 except Exception, e:
|
|
127 if os.path.exists( tmp_index_dir ):
|
|
128 shutil.rmtree( tmp_index_dir )
|
|
129 stop_err( 'Error indexing reference sequence\n' + str( e ) )
|
|
130 else:
|
|
131 index_path = options.index_path
|
|
132
|
|
133 # Build tophat command.
|
|
134 cmd = 'tophat %s %s %s'
|
|
135 reads = options.input1
|
|
136 if options.input2:
|
|
137 reads += ' ' + options.input2
|
|
138 opts = '-p %s %s' % ( options.num_threads, space )
|
|
139 if options.single_paired == 'paired':
|
|
140 opts += ' -r %s' % options.mate_inner_dist
|
|
141 if options.settings == 'preSet':
|
|
142 cmd = cmd % ( opts, index_path, reads )
|
|
143 else:
|
|
144 try:
|
|
145 if int( options.min_anchor_length ) >= 3:
|
|
146 opts += ' -a %s' % options.min_anchor_length
|
|
147 else:
|
|
148 raise Exception, 'Minimum anchor length must be 3 or greater'
|
|
149 opts += ' -m %s' % options.splice_mismatches
|
|
150 opts += ' -i %s' % options.min_intron_length
|
|
151 opts += ' -I %s' % options.max_intron_length
|
|
152 opts += ' -g %s' % options.max_multihits
|
|
153 # Custom junctions options.
|
|
154 if options.gene_model_annotations:
|
|
155 opts += ' -G %s' % options.gene_model_annotations
|
|
156 if options.raw_juncs:
|
|
157 opts += ' -j %s' % options.raw_juncs
|
|
158 if options.no_novel_juncs:
|
|
159 opts += ' --no-novel-juncs'
|
|
160 if options.library_type:
|
|
161 opts += ' --library-type %s' % options.library_type
|
|
162 if options.no_novel_indels:
|
|
163 opts += ' --no-novel-indels'
|
|
164 else:
|
|
165 if options.max_insertion_length:
|
|
166 opts += ' --max-insertion-length %i' % int( options.max_insertion_length )
|
|
167 if options.max_deletion_length:
|
|
168 opts += ' --max-deletion-length %i' % int( options.max_deletion_length )
|
|
169 # Max options do not work for Tophat v1.2.0, despite documentation to the contrary. (Fixed in version 1.3.1)
|
|
170 # need to warn user of this fact
|
|
171 #sys.stdout.write( "Max insertion length and max deletion length options don't work in Tophat v1.2.0\n" )
|
|
172
|
|
173 # Search type options.
|
|
174 if options.coverage_search:
|
|
175 opts += ' --coverage-search --min-coverage-intron %s --max-coverage-intron %s' % ( options.min_coverage_intron, options.max_coverage_intron )
|
|
176 else:
|
|
177 opts += ' --no-coverage-search'
|
|
178 if options.closure_search:
|
|
179 opts += ' --closure-search --min-closure-exon %s --min-closure-intron %s --max-closure-intron %s' % ( options.min_closure_exon, options.min_closure_intron, options.max_closure_intron )
|
|
180 else:
|
|
181 opts += ' --no-closure-search'
|
|
182 if options.microexon_search:
|
|
183 opts += ' --microexon-search'
|
|
184 if options.single_paired == 'paired':
|
|
185 opts += ' --mate-std-dev %s' % options.mate_std_dev
|
|
186 if options.initial_read_mismatches:
|
|
187 opts += ' --initial-read-mismatches %d' % int( options.initial_read_mismatches )
|
|
188 if options.seg_mismatches:
|
|
189 opts += ' --segment-mismatches %d' % int( options.seg_mismatches )
|
|
190 if options.seg_length:
|
|
191 opts += ' --segment-length %d' % int( options.seg_length )
|
|
192 if options.min_segment_intron:
|
|
193 opts += ' --min-segment-intron %d' % int( options.min_segment_intron )
|
|
194 if options.max_segment_intron:
|
|
195 opts += ' --max-segment-intron %d' % int( options.max_segment_intron )
|
|
196 cmd = cmd % ( opts, index_path, reads )
|
|
197 except Exception, e:
|
|
198 # Clean up temp dirs
|
|
199 if os.path.exists( tmp_index_dir ):
|
|
200 shutil.rmtree( tmp_index_dir )
|
|
201 stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) )
|
|
202 #print cmd
|
|
203
|
|
204 # Run
|
|
205 try:
|
|
206 tmp_out = tempfile.NamedTemporaryFile().name
|
|
207 tmp_stdout = open( tmp_out, 'wb' )
|
|
208 tmp_err = tempfile.NamedTemporaryFile().name
|
|
209 tmp_stderr = open( tmp_err, 'wb' )
|
|
210 print cmd
|
|
211 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
|
|
212 returncode = proc.wait()
|
|
213 tmp_stderr.close()
|
|
214 # get stderr, allowing for case where it's very large
|
|
215 tmp_stderr = open( tmp_err, 'rb' )
|
|
216 stderr = ''
|
|
217 buffsize = 1048576
|
|
218 try:
|
|
219 while True:
|
|
220 stderr += tmp_stderr.read( buffsize )
|
|
221 if not stderr or len( stderr ) % buffsize != 0:
|
|
222 break
|
|
223 except OverflowError:
|
|
224 pass
|
|
225 tmp_stdout.close()
|
|
226 tmp_stderr.close()
|
|
227 if returncode != 0:
|
|
228 raise Exception, stderr
|
|
229
|
|
230 # TODO: look for errors in program output.
|
|
231 except Exception, e:
|
|
232 stop_err( 'Error in tophat:\n' + str( e ) )
|
|
233
|
|
234 # Clean up temp dirs
|
|
235 if os.path.exists( tmp_index_dir ):
|
|
236 shutil.rmtree( tmp_index_dir )
|
|
237
|
|
238 if __name__=="__main__": __main__()
|