0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 # Wrapper supports Cuffdiff versions v1.3.0-v2.0
|
|
4
|
|
5 import optparse, os, shutil, subprocess, sys, tempfile
|
|
6
|
|
7 def group_callback( option, op_str, value, parser ):
|
|
8 groups = []
|
|
9 flist = []
|
|
10 for arg in parser.rargs:
|
|
11 arg = arg.strip()
|
|
12 if arg[0] is "-":
|
|
13 break
|
|
14 elif arg[0] is ",":
|
|
15 groups.append(flist)
|
|
16 flist = []
|
|
17 else:
|
|
18 flist.append(arg)
|
|
19 groups.append(flist)
|
|
20
|
|
21 setattr(parser.values, option.dest, groups)
|
|
22
|
|
23 def label_callback( option, op_str, value, parser ):
|
|
24 labels = []
|
|
25 for arg in parser.rargs:
|
|
26 arg = arg.strip()
|
|
27 if arg[0] is "-":
|
|
28 break
|
|
29 else:
|
|
30 labels.append(arg)
|
|
31
|
|
32 setattr(parser.values, option.dest, labels)
|
|
33
|
|
34 def stop_err( msg ):
|
|
35 sys.stderr.write( "%s\n" % msg )
|
|
36 sys.exit()
|
|
37
|
|
38 # Copied from sam_to_bam.py:
|
|
39 def check_seq_file( dbkey, cached_seqs_pointer_file ):
|
|
40 seq_path = ''
|
|
41 for line in open( cached_seqs_pointer_file ):
|
|
42 line = line.rstrip( '\r\n' )
|
|
43 if line and not line.startswith( '#' ) and line.startswith( 'index' ):
|
|
44 fields = line.split( '\t' )
|
|
45 if len( fields ) < 3:
|
|
46 continue
|
|
47 if fields[1] == dbkey:
|
|
48 seq_path = fields[2].strip()
|
|
49 break
|
|
50 return seq_path
|
|
51
|
|
52 def __main__():
|
|
53 #Parse Command Line
|
|
54 parser = optparse.OptionParser()
|
|
55
|
|
56 # Cuffdiff options.
|
|
57 parser.add_option( '-s', '--inner-dist-std-dev', dest='inner_dist_std_dev', help='The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.' )
|
|
58 parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
|
|
59 parser.add_option( '-m', '--inner-mean-dist', dest='inner_mean_dist', help='This is the expected (mean) inner distance between mate pairs. \
|
|
60 For, example, for paired end runs with fragments selected at 300bp, \
|
|
61 where each end is 50bp, you should set -r to be 200. The default is 45bp.')
|
|
62 parser.add_option( '-c', '--min-alignment-count', dest='min_alignment_count', help='The minimum number of alignments in a locus for needed to conduct significance testing on changes in that locus observed between samples. If no testing is performed, changes in the locus are deemed not signficant, and the locus\' observed changes don\'t contribute to correction for multiple testing. The default is 1,000 fragment alignments (up to 2,000 paired reads).' )
|
|
63 parser.add_option( '--FDR', dest='FDR', help='The allowed false discovery rate. The default is 0.05.' )
|
|
64 parser.add_option( '-u', '--multi-read-correct', dest='multi_read_correct', action="store_true", help='Tells Cufflinks to do an initial estimation procedure to more accurately weight reads mapping to multiple locations in the genome')
|
|
65
|
|
66 # Advanced Options:
|
|
67 parser.add_option( '--num-importance-samples', dest='num_importance_samples', help='Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000' )
|
|
68 parser.add_option( '--max-mle-iterations', dest='max_mle_iterations', help='Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000' )
|
|
69
|
|
70 # Wrapper / Galaxy options.
|
|
71 parser.add_option( '-f', '--files', dest='groups', action="callback", callback=group_callback, help="Groups to be processed, groups are separated by spaces, replicates in a group comma separated. group1_rep1,group1_rep2 group2_rep1,group2_rep2, ..., groupN_rep1, groupN_rep2" )
|
|
72 parser.add_option( '-A', '--inputA', dest='inputA', help='A transcript GTF file produced by cufflinks, cuffcompare, or other source.')
|
|
73 parser.add_option( '-1', '--input1', dest='input1', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' )
|
|
74 parser.add_option( '-2', '--input2', dest='input2', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' )
|
|
75
|
|
76 # Label options
|
|
77 parser.add_option('-L', '--labels', dest='labels', action="callback", callback=label_callback, help="Labels for the groups the replicates are in.")
|
|
78
|
|
79 # Normalization options.
|
|
80 parser.add_option( "-N", "--quartile-normalization", dest="do_normalization", action="store_true" )
|
|
81
|
|
82 # Bias correction options.
|
|
83 parser.add_option( '-b', dest='do_bias_correction', action="store_true", help='Providing Cufflinks with a multifasta file via this option instructs it to run our new bias detection and correction algorithm which can significantly improve accuracy of transcript abundance estimates.')
|
|
84 parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' )
|
|
85 parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' )
|
|
86 parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' )
|
|
87
|
|
88 # Outputs.
|
|
89 parser.add_option( "--isoforms_fpkm_tracking_output", dest="isoforms_fpkm_tracking_output" )
|
|
90 parser.add_option( "--genes_fpkm_tracking_output", dest="genes_fpkm_tracking_output" )
|
|
91 parser.add_option( "--cds_fpkm_tracking_output", dest="cds_fpkm_tracking_output" )
|
|
92 parser.add_option( "--tss_groups_fpkm_tracking_output", dest="tss_groups_fpkm_tracking_output" )
|
|
93 parser.add_option( "--isoforms_read_group_tracking_output", dest="isoforms_read_group_tracking_output", default=None)
|
|
94 parser.add_option( "--genes_read_group_tracking_output", dest="genes_read_group_tracking_output", default=None )
|
|
95 parser.add_option( "--cds_read_group_tracking_output", dest="cds_read_group_tracking_output", default=None )
|
|
96 parser.add_option( "--tss_groups_read_group_tracking_output", dest="tss_groups_read_group_tracking_output", default=None )
|
|
97 parser.add_option( "--isoforms_exp_output", dest="isoforms_exp_output" )
|
|
98 parser.add_option( "--genes_exp_output", dest="genes_exp_output" )
|
|
99 parser.add_option( "--tss_groups_exp_output", dest="tss_groups_exp_output" )
|
|
100 parser.add_option( "--cds_exp_fpkm_tracking_output", dest="cds_exp_fpkm_tracking_output" )
|
|
101 parser.add_option( "--isoforms_count_tracking_output", dest="isoforms_count_tracking_output" )
|
|
102 parser.add_option( "--genes_count_tracking_output", dest="genes_count_tracking_output" )
|
|
103 parser.add_option( "--cds_count_tracking_output", dest="cds_count_tracking_output" )
|
|
104 parser.add_option( "--tss_groups_count_tracking_output", dest="tss_groups_count_tracking_output" )
|
|
105 parser.add_option( "--splicing_diff_output", dest="splicing_diff_output" )
|
|
106 parser.add_option( "--cds_diff_output", dest="cds_diff_output" )
|
|
107 parser.add_option( "--promoters_diff_output", dest="promoters_diff_output" )
|
|
108 parser.add_option( "--run_info_output", dest="run_info_output" )
|
|
109 parser.add_option( "--read_groups_info_output", dest="read_groups_info_output" )
|
|
110 parser.add_option( "--cuffdatadir", dest="cuffdatadir", default=None)
|
|
111 parser.add_option( "--cummeRbund_db_output", dest="cummeRbund_db_output", default=None)
|
|
112
|
|
113 (options, args) = parser.parse_args()
|
|
114
|
|
115 # output version # of tool
|
|
116 try:
|
|
117 tmp = tempfile.NamedTemporaryFile().name
|
|
118 tmp_stdout = open( tmp, 'wb' )
|
|
119 proc = subprocess.Popen( args='cuffdiff --no-update-check 2>&1', shell=True, stdout=tmp_stdout )
|
|
120 tmp_stdout.close()
|
|
121 returncode = proc.wait()
|
|
122 stdout = None
|
|
123 for line in open( tmp_stdout.name, 'rb' ):
|
|
124 if line.lower().find( 'cuffdiff v' ) >= 0:
|
|
125 stdout = line.strip()
|
|
126 break
|
|
127 if stdout:
|
|
128 sys.stdout.write( '%s\n' % stdout )
|
|
129 else:
|
|
130 raise Exception
|
|
131 except:
|
|
132 sys.stdout.write( 'Could not determine Cuffdiff version\n' )
|
|
133
|
|
134 # Make temp directory for output.
|
|
135 tmp_output_dir = tempfile.mkdtemp()
|
|
136 cuffdatadir = options.cuffdatadir if options.cuffdatadir else tmp_output_dir
|
|
137 if not os.path.exists( cuffdatadir ):
|
|
138 os.makedirs( cuffdatadir )
|
|
139
|
|
140
|
|
141 # If doing bias correction, set/link to sequence file.
|
|
142 if options.do_bias_correction:
|
|
143 if options.ref_file != 'None':
|
|
144 # Sequence data from history.
|
|
145 # Create symbolic link to ref_file so that index will be created in working directory.
|
|
146 seq_path = os.path.join( cuffdatadir, "ref.fa" )
|
|
147 os.symlink( options.ref_file, seq_path )
|
|
148 else:
|
|
149 # Sequence data from loc file.
|
|
150 cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' )
|
|
151 if not os.path.exists( cached_seqs_pointer_file ):
|
|
152 stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file )
|
|
153 # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa,
|
|
154 # and the equCab2.fa file will contain fasta sequences.
|
|
155 seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file )
|
|
156 if seq_path == '':
|
|
157 stop_err( 'No sequence data found for dbkey %s, so bias correction cannot be used.' % options.dbkey )
|
|
158
|
|
159 # Build command.
|
|
160
|
|
161 # Base; always use quiet mode to avoid problems with storing log output.
|
|
162 cmd = "cuffdiff --no-update-check -q"
|
|
163
|
|
164 # Add options.
|
|
165 if options.inner_dist_std_dev:
|
|
166 cmd += ( " -s %i" % int ( options.inner_dist_std_dev ) )
|
|
167 if options.num_threads:
|
|
168 cmd += ( " -p %i" % int ( options.num_threads ) )
|
|
169 if options.inner_mean_dist:
|
|
170 cmd += ( " -m %i" % int ( options.inner_mean_dist ) )
|
|
171 if options.min_alignment_count:
|
|
172 cmd += ( " -c %i" % int ( options.min_alignment_count ) )
|
|
173 if options.FDR:
|
|
174 cmd += ( " --FDR %f" % float( options.FDR ) )
|
|
175 if options.multi_read_correct:
|
|
176 cmd += ( " -u" )
|
|
177 if options.num_importance_samples:
|
|
178 cmd += ( " --num-importance-samples %i" % int ( options.num_importance_samples ) )
|
|
179 if options.max_mle_iterations:
|
|
180 cmd += ( " --max-mle-iterations %i" % int ( options.max_mle_iterations ) )
|
|
181 if options.do_normalization:
|
|
182 cmd += ( " -N" )
|
|
183 if options.do_bias_correction:
|
|
184 cmd += ( " -b %s" % seq_path )
|
|
185
|
|
186 # Add inputs.
|
|
187 # For replicate analysis: group1_rep1,group1_rep2 groupN_rep1,groupN_rep2
|
|
188 if options.groups:
|
|
189 cmd += " --labels "
|
|
190 for label in options.labels:
|
|
191 cmd += label + ","
|
|
192 cmd = cmd[:-1]
|
|
193
|
|
194 cmd += " " + options.inputA + " "
|
|
195
|
|
196 for group in options.groups:
|
|
197 for filename in group:
|
|
198 cmd += filename + ","
|
|
199 cmd = cmd[:-1] + " "
|
|
200 else:
|
|
201 cmd += " " + options.inputA + " " + options.input1 + " " + options.input2
|
|
202
|
|
203 # Debugging.
|
|
204 print cmd
|
|
205
|
|
206 # Run command.
|
|
207 try:
|
|
208 tmp_name = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name
|
|
209 tmp_stderr = open( tmp_name, 'wb' )
|
|
210 proc = subprocess.Popen( args=cmd, shell=True, cwd=cuffdatadir, stderr=tmp_stderr.fileno() )
|
|
211 returncode = proc.wait()
|
|
212 tmp_stderr.close()
|
|
213
|
|
214 # Get stderr, allowing for case where it's very large.
|
|
215 tmp_stderr = open( tmp_name, 'rb' )
|
|
216 stderr = ''
|
|
217 buffsize = 1048576
|
|
218 try:
|
|
219 while True:
|
|
220 stderr += tmp_stderr.read( buffsize )
|
|
221 if not stderr or len( stderr ) % buffsize != 0:
|
|
222 break
|
|
223 except OverflowError:
|
|
224 pass
|
|
225 tmp_stderr.close()
|
|
226
|
|
227 # Error checking.
|
|
228 if returncode != 0:
|
|
229 raise Exception, stderr
|
|
230
|
|
231 # check that there are results in the output file
|
|
232 if len( open( os.path.join( cuffdatadir, "isoforms.fpkm_tracking" ), 'rb' ).read().strip() ) == 0:
|
|
233 raise Exception, 'The main output file is empty, there may be an error with your input file or settings.'
|
|
234 except Exception, e:
|
|
235 stop_err( 'Error running cuffdiff. ' + str( e ) )
|
|
236
|
|
237
|
|
238 # Copy output files from tmp directory to specified files.
|
|
239 try:
|
|
240 try:
|
|
241 if options.isoforms_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "isoforms.fpkm_tracking" )):
|
|
242 shutil.copyfile( os.path.join( cuffdatadir, "isoforms.fpkm_tracking" ), options.isoforms_fpkm_tracking_output )
|
|
243 if options.genes_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "genes.fpkm_tracking" )):
|
|
244 shutil.copyfile( os.path.join( cuffdatadir, "genes.fpkm_tracking" ), options.genes_fpkm_tracking_output )
|
|
245 if options.cds_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "cds.fpkm_tracking" )):
|
|
246 shutil.copyfile( os.path.join( cuffdatadir, "cds.fpkm_tracking" ), options.cds_fpkm_tracking_output )
|
|
247 if options.tss_groups_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "tss_groups.fpkm_tracking" )):
|
|
248 shutil.copyfile( os.path.join( cuffdatadir, "tss_groups.fpkm_tracking" ), options.tss_groups_fpkm_tracking_output )
|
|
249
|
|
250 if options.isoforms_read_group_tracking_output and os.path.exists(os.path.join( cuffdatadir, "isoforms.read_group_tracking" )):
|
|
251 shutil.copyfile( os.path.join( cuffdatadir, "isoforms.read_group_tracking" ), options.isoforms_read_group_tracking_output )
|
|
252 if options.genes_read_group_tracking_output and os.path.exists(os.path.join( cuffdatadir, "genes.read_group_tracking" )):
|
|
253 shutil.copyfile( os.path.join( cuffdatadir, "genes.read_group_tracking" ), options.genes_read_group_tracking_output )
|
|
254 if options.cds_read_group_tracking_output and os.path.exists(os.path.join( cuffdatadir, "cds.read_group_tracking" )):
|
|
255 shutil.copyfile( os.path.join( cuffdatadir, "cds.read_group_tracking" ), options.cds_read_group_tracking_output )
|
|
256 if options.tss_groups_read_group_tracking_output and os.path.exists(os.path.join( cuffdatadir, "tss_groups.read_group_tracking" )):
|
|
257 shutil.copyfile( os.path.join( cuffdatadir, "tss_groups.read_group_tracking" ), options.tss_groups_read_group_tracking_output )
|
|
258
|
|
259 if options.isoforms_exp_output and os.path.exists(os.path.join( cuffdatadir, "isoform_exp.diff" )):
|
|
260 shutil.copyfile( os.path.join( cuffdatadir, "isoform_exp.diff" ), options.isoforms_exp_output )
|
|
261 if options.genes_exp_output and os.path.exists(os.path.join( cuffdatadir, "gene_exp.diff" )):
|
|
262 shutil.copyfile( os.path.join( cuffdatadir, "gene_exp.diff" ), options.genes_exp_output )
|
|
263 if options.cds_exp_fpkm_tracking_output and os.path.exists(os.path.join( cuffdatadir, "cds_exp.diff" )):
|
|
264 shutil.copyfile( os.path.join( cuffdatadir, "cds_exp.diff" ), options.cds_exp_fpkm_tracking_output )
|
|
265 if options.tss_groups_exp_output and os.path.exists(os.path.join( cuffdatadir, "tss_group_exp.diff" )):
|
|
266 shutil.copyfile( os.path.join( cuffdatadir, "tss_group_exp.diff" ), options.tss_groups_exp_output )
|
|
267
|
|
268 if options.isoforms_count_tracking_output and os.path.exists(os.path.join( cuffdatadir, "isoforms.count_tracking" )):
|
|
269 shutil.copyfile( os.path.join( cuffdatadir, "isoforms.count_tracking" ), options.isoforms_count_tracking_output )
|
|
270 if options.genes_count_tracking_output and os.path.exists(os.path.join( cuffdatadir, "genes.count_tracking" )):
|
|
271 shutil.copyfile( os.path.join( cuffdatadir, "genes.count_tracking" ), options.genes_count_tracking_output )
|
|
272 if options.cds_count_tracking_output and os.path.exists(os.path.join( cuffdatadir, "cds.count_tracking" )):
|
|
273 shutil.copyfile( os.path.join( cuffdatadir, "cds.count_tracking" ), options.cds_count_tracking_output )
|
|
274 if options.tss_groups_count_tracking_output and os.path.exists(os.path.join( cuffdatadir, "tss_groups.count_tracking" )):
|
|
275 shutil.copyfile( os.path.join( cuffdatadir, "tss_groups.count_tracking" ), options.tss_groups_count_tracking_output )
|
|
276
|
|
277 if options.cds_diff_output and os.path.exists(os.path.join( cuffdatadir, "cds.diff" )):
|
|
278 shutil.copyfile( os.path.join( cuffdatadir, "cds.diff" ), options.cds_diff_output )
|
|
279
|
|
280 if options.splicing_diff_output and os.path.exists(os.path.join( cuffdatadir, "splicing.diff" )):
|
|
281 shutil.copyfile( os.path.join( cuffdatadir, "splicing.diff" ), options.splicing_diff_output )
|
|
282 if options.promoters_diff_output and os.path.exists(os.path.join( cuffdatadir, "promoters.diff" )):
|
|
283 shutil.copyfile( os.path.join( cuffdatadir, "promoters.diff" ), options.promoters_diff_output )
|
|
284
|
|
285 if options.run_info_output and os.path.exists(os.path.join( cuffdatadir, "run.info" )):
|
|
286 shutil.copyfile( os.path.join( cuffdatadir, "run.info" ), options.run_info_output )
|
|
287 if options.read_groups_info_output and os.path.exists(os.path.join( cuffdatadir, "read_groups.info" )):
|
|
288 shutil.copyfile( os.path.join( cuffdatadir, "read_groups.info" ), options.read_groups_info_output )
|
|
289
|
|
290 except Exception, e:
|
|
291 stop_err( 'Error in cuffdiff:\n' + str( e ) )
|
|
292 if options.cummeRbund_db_output:
|
|
293 try:
|
|
294 dbFile = 'cuffData.db'
|
|
295 rscript = tempfile.NamedTemporaryFile( dir=tmp_output_dir,suffix='.r' ).name
|
|
296 rscript_fh = open( rscript, 'wb' )
|
|
297 rscript_fh.write('library(cummeRbund)\n')
|
|
298 if options.inputA and options.ref_file:
|
|
299 rscript_fh.write('cuff<-readCufflinks(dir = "%s", dbFile = "%s", gtfFile = "%s", genome = "%s", rebuild = T)\n' % (cuffdatadir,dbFile,options.inputA,options.ref_file))
|
|
300 else:
|
|
301 rscript_fh.write('cuff<-readCufflinks(dir = "%s", dbFile = "%s", rebuild = T)\n' % (cuffdatadir,dbFile))
|
|
302 rscript_fh.close()
|
|
303 cmd = ( "Rscript --vanilla %s" % rscript )
|
|
304 tmp_name = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name
|
|
305 tmp_stderr = open( tmp_name, 'wb' )
|
|
306 proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
|
|
307 #proc = subprocess.Popen( args=cmd, shell=True)
|
|
308 returncode = proc.wait()
|
|
309 tmp_stderr.close()
|
|
310 if os.path.exists(os.path.join( cuffdatadir, dbFile )):
|
|
311 shutil.copyfile( os.path.join( cuffdatadir, dbFile ), options.cummeRbund_db_output )
|
|
312 shutil.rmtree(os.path.join( cuffdatadir, dbFile ))
|
|
313 except Exception, e:
|
|
314 stop_err( 'Error generating cummeRbund cuffData.db:\n' + str( e ) )
|
|
315 finally:
|
|
316 # Clean up temp dirs
|
|
317 if os.path.exists( tmp_output_dir ):
|
|
318 shutil.rmtree( tmp_output_dir )
|
|
319
|
|
320 if __name__=="__main__": __main__()
|