Mercurial > repos > yufei-luo > s_mart
comparison SMART/DiffExpAnal/tophat_parallel.py @ 31:0ab839023fe4
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 14:33:21 -0400 |
parents | 94ab73e8a190 |
children |
comparison
equal
deleted
inserted
replaced
30:5677346472b5 | 31:0ab839023fe4 |
---|---|
1 | |
2 #!/usr/bin/env python | |
3 | |
4 import optparse, os, shutil, subprocess, sys, tempfile, fileinput, tarfile,random | |
5 | |
6 def stop_err( msg ): | |
7 sys.stderr.write( "%s\n" % msg ) | |
8 sys.exit() | |
9 | |
10 def toTar(tarFileName, accepted_hits_outputNames): | |
11 fileName = os.path.splitext(tarFileName)[0] | |
12 fileNameBaseName = os.path.basename(fileName) | |
13 dir = os.path.dirname(tarFileName) | |
14 tfile = tarfile.open(tarFileName + ".tmp.tar", "w") | |
15 currentPath = os.getcwd() | |
16 os.chdir(dir) | |
17 for file in accepted_hits_outputNames: | |
18 relativeFileName = os.path.basename(file) | |
19 tfile.add(relativeFileName) | |
20 os.system("mv %s %s" % (tarFileName + ".tmp.tar", tarFileName)) | |
21 tfile.close() | |
22 os.chdir(currentPath) | |
23 | |
24 | |
25 def __main__(): | |
26 #Parse Command Line | |
27 parser = optparse.OptionParser() | |
28 parser.add_option('-o', '--outputTxtFile', dest='outputTxtFile', help='for Differential expression analysis pipeline, new output option gives a txt output containing the list of mapping results.') | |
29 parser.add_option('-t', '--tar', dest='outputTar', default=None, help='output all accepted hits results in a tar file.' ) | |
30 parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' ) | |
31 parser.add_option( '-C', '--color-space', dest='color_space', action='store_true', help='This indicates color-space data' ) | |
32 parser.add_option( '-J', '--junctions-output', dest='junctions_output_file', default='junctions_output.bed', help='Junctions output file; formate is BED.' ) | |
33 parser.add_option( '-H', '--hits-output', dest='accepted_hits_output_file', default='hits_output_%s.bam' % random.randrange(0, 10000), help='Accepted hits output file; formate is BAM.' ) | |
34 parser.add_option( '', '--own-file', dest='own_file', help='' ) | |
35 parser.add_option( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) | |
36 parser.add_option( '-r', '--mate-inner-dist', dest='mate_inner_dist', help='This is the expected (mean) inner distance between mate pairs. \ | |
37 For, example, for paired end runs with fragments selected at 300bp, \ | |
38 where each end is 50bp, you should set -r to be 200. There is no default, \ | |
39 and this parameter is required for paired end runs.') | |
40 parser.add_option( '', '--mate-std-dev', dest='mate_std_dev', help='Standard deviation of distribution on inner distances between male pairs.' ) | |
41 parser.add_option( '-a', '--min-anchor-length', dest='min_anchor_length', | |
42 help='The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction.' ) | |
43 parser.add_option( '-m', '--splice-mismatches', dest='splice_mismatches', help='The maximum number of mismatches that can appear in the anchor region of a spliced alignment.' ) | |
44 parser.add_option( '-i', '--min-intron-length', dest='min_intron_length', | |
45 help='The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart.' ) | |
46 parser.add_option( '-I', '--max-intron-length', dest='max_intron_length', | |
47 help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.' ) | |
48 parser.add_option( '-F', '--junction_filter', dest='junction_filter', help='Filter out junctions supported by too few alignments (number of reads divided by average depth of coverage)' ) | |
49 parser.add_option( '-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed' ) | |
50 parser.add_option( '', '--initial-read-mismatches', dest='initial_read_mismatches', help='Number of mismatches allowed in the initial read mapping' ) | |
51 parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' ) | |
52 parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' ) | |
53 parser.add_option( '', '--library-type', dest='library_type', help='TopHat will treat the reads as strand specific. Every read alignment will have an XS attribute tag. Consider supplying library type options below to select the correct RNA-seq protocol.' ) | |
54 parser.add_option( '', '--allow-indels', action="store_true", help='Allow indel search. Indel search is disabled by default.(Not used since version 1.3.0)' ) | |
55 parser.add_option( '', '--max-insertion-length', dest='max_insertion_length', help='The maximum insertion length. The default is 3.' ) | |
56 parser.add_option( '', '--max-deletion-length', dest='max_deletion_length', help='The maximum deletion length. The default is 3.' ) | |
57 | |
58 # Options for supplying own junctions | |
59 parser.add_option( '-G', '--GTF', dest='gene_model_annotations', help='Supply TopHat with a list of gene model annotations. \ | |
60 TopHat will use the exon records in this file to build \ | |
61 a set of known splice junctions for each gene, and will \ | |
62 attempt to align reads to these junctions even if they \ | |
63 would not normally be covered by the initial mapping.') | |
64 parser.add_option( '-j', '--raw-juncs', dest='raw_juncs', help='Supply TopHat with a list of raw junctions. Junctions are \ | |
65 specified one per line, in a tab-delimited format. Records \ | |
66 look like: <chrom> <left> <right> <+/-> left and right are \ | |
67 zero-based coordinates, and specify the last character of the \ | |
68 left sequenced to be spliced to the first character of the right \ | |
69 sequence, inclusive.') | |
70 parser.add_option( '', '--no-novel-juncs', action="store_true", dest='no_novel_juncs', help="Only look for junctions indicated in the \ | |
71 supplied GFF file. (ignored without -G)") | |
72 parser.add_option( '', '--no-novel-indels', action="store_true", dest='no_novel_indels', help="Skip indel search. Indel search is enabled by default.") | |
73 # Types of search. | |
74 parser.add_option( '', '--microexon-search', action="store_true", dest='microexon_search', help='With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.') | |
75 parser.add_option( '', '--closure-search', action="store_true", dest='closure_search', help='Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (<= 50bp)') | |
76 parser.add_option( '', '--no-closure-search', action="store_false", dest='closure_search' ) | |
77 parser.add_option( '', '--coverage-search', action="store_true", dest='coverage_search', help='Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.') | |
78 parser.add_option( '', '--no-coverage-search', action="store_false", dest='coverage_search' ) | |
79 parser.add_option( '', '--min-segment-intron', dest='min_segment_intron', help='Minimum intron length that may be found during split-segment search' ) | |
80 parser.add_option( '', '--max-segment-intron', dest='max_segment_intron', help='Maximum intron length that may be found during split-segment search' ) | |
81 parser.add_option( '', '--min-closure-exon', dest='min_closure_exon', help='Minimum length for exonic hops in potential splice graph' ) | |
82 parser.add_option( '', '--min-closure-intron', dest='min_closure_intron', help='Minimum intron length that may be found during closure search' ) | |
83 parser.add_option( '', '--max-closure-intron', dest='max_closure_intron', help='Maximum intron length that may be found during closure search' ) | |
84 parser.add_option( '', '--min-coverage-intron', dest='min_coverage_intron', help='Minimum intron length that may be found during coverage search' ) | |
85 parser.add_option( '', '--max-coverage-intron', dest='max_coverage_intron', help='Maximum intron length that may be found during coverage search' ) | |
86 | |
87 # Wrapper options. | |
88 parser.add_option( '-1', '--input1', dest='input1', help='A list of the (forward or single-end) reads files of Sanger FASTQ format, txt format' ) | |
89 #parser.add_option( '-1', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' ) | |
90 #parser.add_option( '-2', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' ) | |
91 parser.add_option( '-2', '--input2', dest='input2', help='The list of reverse reads file in Sanger FASTQ format' ) | |
92 parser.add_option( '', '--single-paired', dest='single_paired', help='' ) | |
93 parser.add_option( '', '--settings', dest='settings', help='' ) | |
94 | |
95 (options, args) = parser.parse_args() | |
96 | |
97 # output version # of tool | |
98 try: | |
99 tmp_files = [] | |
100 tmp = tempfile.NamedTemporaryFile().name | |
101 tmp_files.append(tmp) | |
102 tmp_stdout = open( tmp, 'wb' ) | |
103 proc = subprocess.Popen( args='tophat -v', shell=True, stdout=tmp_stdout ) | |
104 tmp_stdout.close() | |
105 returncode = proc.wait() | |
106 stdout = open( tmp_stdout.name, 'rb' ).readline().strip() | |
107 if stdout: | |
108 sys.stdout.write( '%s\n' % stdout ) | |
109 else: | |
110 raise Exception | |
111 except: | |
112 sys.stdout.write( 'Could not determine Tophat version\n' ) | |
113 | |
114 # Color or base space | |
115 space = '' | |
116 if options.color_space: | |
117 space = '-C' | |
118 | |
119 | |
120 #reads = options.input1 | |
121 file = open(options.input1,"r") | |
122 lines = file.readlines() | |
123 inputFileNames = [] | |
124 accepted_hits_outputNames = [] | |
125 outputName = options.outputTxtFile | |
126 resDirName = os.path.dirname(outputName) + '/' | |
127 out = open(outputName, "w") | |
128 for line in lines: | |
129 tab = line.split() | |
130 inputFileNames.append(tab[1]) | |
131 aHitOutName = resDirName + tab[0] + '_' + options.accepted_hits_output_file | |
132 accepted_hits_outputNames.append(aHitOutName) | |
133 out.write(tab[0] + '\t' + aHitOutName + '\n') | |
134 file.close() | |
135 out.close() | |
136 | |
137 if options.input2: | |
138 revFile = open(options.input2,"r") | |
139 lines = revFile.readlines() | |
140 inputRevFileNames = [] | |
141 for line in lines: | |
142 revTab = line.split() | |
143 inputRevFileNames.append(revTab[1]) | |
144 revFile.close() | |
145 | |
146 | |
147 # Creat bowtie index if necessary. | |
148 tmp_index_dirs = [] | |
149 index_paths = [] | |
150 tmp_index_dir = tempfile.mkdtemp() | |
151 tmp_index_dirs.append(tmp_index_dir) | |
152 if options.own_file: | |
153 index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( options.own_file )[1].split( '.' )[:-1] ) ) | |
154 index_paths.append(index_path) | |
155 try: | |
156 os.link( options.own_file, index_path + '.fa' ) | |
157 except: | |
158 # Tophat prefers (but doesn't require) fasta file to be in same directory, with .fa extension | |
159 pass | |
160 cmd_index = 'bowtie-build %s -f %s %s' % ( space, options.own_file, index_path ) | |
161 try: | |
162 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name | |
163 tmp_stderr = open( tmp, 'wb' ) | |
164 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() ) | |
165 returncode = proc.wait() | |
166 tmp_stderr.close() | |
167 # get stderr, allowing for case where it's very large | |
168 tmp_stderr = open( tmp, 'rb' ) | |
169 stderr = '' | |
170 buffsize = 1048576 | |
171 try: | |
172 while True: | |
173 stderr += tmp_stderr.read( buffsize ) | |
174 if not stderr or len( stderr ) % buffsize != 0: | |
175 break | |
176 except OverflowError: | |
177 pass | |
178 tmp_stderr.close() | |
179 if returncode != 0: | |
180 raise Exception, stderr | |
181 except Exception, e: | |
182 if os.path.exists( tmp_index_dir ): | |
183 shutil.rmtree( tmp_index_dir ) | |
184 stop_err( 'Error indexing reference sequence\n' + str( e ) ) | |
185 else: | |
186 for file in inputFileNames: | |
187 tmp_index_dir = tempfile.mkdtemp() | |
188 index_path = tmp_index_dir + '/' + os.path.basename(file).split('.')[0] | |
189 index_paths.append(index_path) | |
190 tmp_index_dirs.append(tmp_index_dir) | |
191 | |
192 | |
193 | |
194 # Build tophat command. | |
195 cmds = [] | |
196 # for inputFileName in inputFileNames: | |
197 for i in range(len(inputFileNames)): | |
198 cmd = 'tophat %s %s %s ' | |
199 input_files = inputFileNames[i] | |
200 if options.input2: | |
201 input_files += ' ' + inputRevFileNames[i] | |
202 opts = '-p %s %s' % ( options.num_threads, space ) | |
203 if options.single_paired == 'paired': | |
204 opts += '-r %s ' % options.mate_inner_dist | |
205 if options.settings == 'preSet': | |
206 if options.own_file: | |
207 cmd = cmd % ( opts, index_paths[0], input_files ) #here add paired end file | |
208 else: | |
209 cmd = cmd % ( opts, index_paths[i], input_files ) #here add paired end file | |
210 else: | |
211 try: | |
212 if int( options.min_anchor_length ) >= 3: | |
213 opts += '-a %s ' % options.min_anchor_length | |
214 else: | |
215 raise Exception, 'Minimum anchor length must be 3 or greater' | |
216 opts += '-m %s ' % options.splice_mismatches | |
217 opts += '-i %s ' % options.min_intron_length | |
218 opts += '-I %s ' % options.max_intron_length | |
219 if float( options.junction_filter ) != 0.0: | |
220 opts += '-F %s ' % options.junction_filter | |
221 opts += '-g %s ' % options.max_multihits | |
222 # Custom junctions options. | |
223 if options.gene_model_annotations: | |
224 opts += '-G %s ' % options.gene_model_annotations | |
225 if options.raw_juncs: | |
226 opts += '-j %s ' % options.raw_juncs | |
227 if options.no_novel_juncs: | |
228 opts += '--no-novel-juncs ' | |
229 if options.library_type: | |
230 opts += '--library-type %s ' % options.library_type | |
231 if options.no_novel_indels: | |
232 opts += '--no-novel-indels ' | |
233 else: | |
234 if options.max_insertion_length: | |
235 opts += '--max-insertion-length %i ' % int( options.max_insertion_length ) | |
236 if options.max_deletion_length: | |
237 opts += '--max-deletion-length %i ' % int( options.max_deletion_length ) | |
238 # Max options do not work for Tophat v1.2.0, despite documentation to the contrary. (Fixed in version 1.3.1) | |
239 # need to warn user of this fact | |
240 #sys.stdout.write( "Max insertion length and max deletion length options don't work in Tophat v1.2.0\n" ) | |
241 | |
242 # Search type options. | |
243 if options.coverage_search: | |
244 opts += '--coverage-search --min-coverage-intron %s --max-coverage-intron %s ' % ( options.min_coverage_intron, options.max_coverage_intron ) | |
245 else: | |
246 opts += '--no-coverage-search ' | |
247 if options.closure_search: | |
248 opts += '--closure-search --min-closure-exon %s --min-closure-intron %s --max-closure-intron %s ' % ( options.min_closure_exon, options.min_closure_intron, options.max_closure_intron ) | |
249 else: | |
250 opts += '--no-closure-search ' | |
251 if options.microexon_search: | |
252 opts += '--microexon-search ' | |
253 if options.single_paired == 'paired': | |
254 opts += '--mate-std-dev %s ' % options.mate_std_dev | |
255 if options.initial_read_mismatches: | |
256 opts += '--initial-read-mismatches %d ' % int( options.initial_read_mismatches ) | |
257 if options.seg_mismatches: | |
258 opts += '--segment-mismatches %d ' % int( options.seg_mismatches ) | |
259 if options.seg_length: | |
260 opts += '--segment-length %d ' % int( options.seg_length ) | |
261 if options.min_segment_intron: | |
262 opts += '--min-segment-intron %d ' % int( options.min_segment_intron ) | |
263 if options.max_segment_intron: | |
264 opts += '--max-segment-intron %d ' % int( options.max_segment_intron ) | |
265 if options.own_file: | |
266 cmd = cmd % ( opts, index_paths[0], input_files ) #here to add paired end file | |
267 else: | |
268 cmd = cmd % ( opts, index_paths[i], input_files ) #here to add paired end file | |
269 except Exception, e: | |
270 # Clean up temp dirs | |
271 if os.path.exists( tmp_index_dir ): | |
272 shutil.rmtree( tmp_index_dir ) | |
273 stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) ) | |
274 | |
275 cmds.append(cmd) | |
276 | |
277 # Run the command line for each file. | |
278 for i in range(len(cmds)): | |
279 try: | |
280 tmp_out = tempfile.NamedTemporaryFile().name | |
281 tmp_files.append(tmp_out) | |
282 tmp_stdout = open( tmp_out, 'wb' ) | |
283 tmp_err = tempfile.NamedTemporaryFile().name | |
284 tmp_files.append(tmp_err) | |
285 tmp_stderr = open( tmp_err, 'wb' ) | |
286 proc = subprocess.Popen( args=cmds[i], shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) | |
287 returncode = proc.wait() | |
288 tmp_stderr.close() | |
289 # get stderr, allowing for case where it's very large | |
290 tmp_stderr = open( tmp_err, 'rb' ) | |
291 stderr = '' | |
292 buffsize = 1048576 | |
293 try: | |
294 while True: | |
295 stderr += tmp_stderr.read( buffsize ) | |
296 if not stderr or len( stderr ) % buffsize != 0: | |
297 break | |
298 except OverflowError: | |
299 pass | |
300 tmp_stdout.close() | |
301 tmp_stderr.close() | |
302 if returncode != 0: | |
303 raise Exception, stderr | |
304 | |
305 # Copy output files from tmp directory to specified files. | |
306 #shutil.copyfile( os.path.join( "tophat_out", "junctions.bed" ), junctions_outputNames[i] ) | |
307 shutil.copyfile( os.path.join( "tophat_out", "accepted_hits.bam" ), accepted_hits_outputNames[i] ) | |
308 # TODO: look for errors in program output. | |
309 except Exception, e: | |
310 stop_err( 'Error in tophat:\n' + str( e ) ) | |
311 | |
312 if options.outputTar != None: | |
313 toTar(options.outputTar, accepted_hits_outputNames) | |
314 | |
315 | |
316 # Clean up temp dirs | |
317 for tmp_index_dir in tmp_index_dirs: | |
318 if os.path.exists( tmp_index_dir ): | |
319 shutil.rmtree( tmp_index_dir ) | |
320 | |
321 for tmp in tmp_files: | |
322 os.remove(tmp) | |
323 | |
324 | |
325 if __name__=="__main__": __main__() |