0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import optparse, os, shutil, subprocess, sys, tempfile
|
|
4
|
|
5 def group_callback( option, op_str, value, parser ):
|
|
6 groups = []
|
|
7 flist = []
|
|
8 for arg in parser.rargs:
|
|
9 arg = arg.strip()
|
|
10 if arg[0] is "-":
|
|
11 break
|
|
12 elif arg[0] is ",":
|
|
13 groups.append(flist)
|
|
14 flist = []
|
|
15 else:
|
|
16 flist.append(arg)
|
|
17 groups.append(flist)
|
|
18
|
|
19 setattr(parser.values, option.dest, groups)
|
|
20
|
|
21 def label_callback( option, op_str, value, parser ):
|
|
22 labels = []
|
|
23 for arg in parser.rargs:
|
|
24 arg = arg.strip()
|
|
25 if arg[0] is "-":
|
|
26 break
|
|
27 else:
|
|
28 labels.append(arg)
|
|
29
|
|
30 setattr(parser.values, option.dest, labels)
|
|
31
|
|
32 def stop_err( msg ):
|
|
33 sys.stderr.write( "%s\n" % msg )
|
|
34 sys.exit()
|
|
35
|
|
36 # Copied from sam_to_bam.py:
|
|
37 def check_seq_file( dbkey, cached_seqs_pointer_file ):
|
|
38 seq_path = ''
|
|
39 for line in open( cached_seqs_pointer_file ):
|
|
40 line = line.rstrip( '\r\n' )
|
|
41 if line and not line.startswith( '#' ) and line.startswith( 'index' ):
|
|
42 fields = line.split( '\t' )
|
|
43 if len( fields ) < 3:
|
|
44 continue
|
|
45 if fields[1] == dbkey:
|
|
46 seq_path = fields[2].strip()
|
|
47 break
|
|
48 return seq_path
|
|
49
|
|
50 def __main__():
|
|
51 #Parse Command Line
|
|
52 parser = optparse.OptionParser()
|
|
53
|
|
54 # Cuffdiff options.
|
|
55 parser.add_option( '-s', '--inner-dist-std-dev', dest='inner_dist_std_dev', help='The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.' )
|
|
56 parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
|
|
57 parser.add_option( '-m', '--inner-mean-dist', dest='inner_mean_dist', help='This is the expected (mean) inner distance between mate pairs. \
|
|
58 For, example, for paired end runs with fragments selected at 300bp, \
|
|
59 where each end is 50bp, you should set -r to be 200. The default is 45bp.')
|
|
60 parser.add_option( '-c', '--min-alignment-count', dest='min_alignment_count', help='The minimum number of alignments in a locus for needed to conduct significance testing on changes in that locus observed between samples. If no testing is performed, changes in the locus are deemed not signficant, and the locus\' observed changes don\'t contribute to correction for multiple testing. The default is 1,000 fragment alignments (up to 2,000 paired reads).' )
|
|
61 parser.add_option( '--FDR', dest='FDR', help='The allowed false discovery rate. The default is 0.05.' )
|
|
62
|
|
63 # Advanced Options:
|
|
64 parser.add_option( '--num-importance-samples', dest='num_importance_samples', help='Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000' )
|
|
65 parser.add_option( '--max-mle-iterations', dest='max_mle_iterations', help='Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000' )
|
|
66
|
|
67 # Wrapper / Galaxy options.
|
|
68 parser.add_option( '-f', '--files', dest='groups', action="callback", callback=group_callback, help="Groups to be processed, groups are separated by spaces, replicates in a group comma separated. group1_rep1,group1_rep2 group2_rep1,group2_rep2, ..., groupN_rep1, groupN_rep2" )
|
|
69 parser.add_option( '-A', '--inputA', dest='inputA', help='A transcript GTF file produced by cufflinks, cuffcompare, or other source.')
|
|
70 parser.add_option( '-1', '--input1', dest='input1', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' )
|
|
71 parser.add_option( '-2', '--input2', dest='input2', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' )
|
|
72
|
|
73 # Label options
|
|
74 parser.add_option('-L', '--labels', dest='labels', action="callback", callback=label_callback, help="Labels for the groups the replicates are in.")
|
|
75
|
|
76 # Normalization options.
|
|
77 parser.add_option( "-N", "--quartile-normalization", dest="do_normalization", action="store_true" )
|
|
78
|
|
79 # Bias correction options.
|
|
80 parser.add_option( '-b', dest='do_bias_correction', action="store_true", help='Providing Cufflinks with a multifasta file via this option instructs it to run our new bias detection and correction algorithm which can significantly improve accuracy of transcript abundance estimates.')
|
|
81 parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' )
|
|
82 parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' )
|
|
83 parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' )
|
|
84
|
|
85 # Outputs.
|
|
86 parser.add_option( "--isoforms_fpkm_tracking_output", dest="isoforms_fpkm_tracking_output" )
|
|
87 parser.add_option( "--genes_fpkm_tracking_output", dest="genes_fpkm_tracking_output" )
|
|
88 parser.add_option( "--cds_fpkm_tracking_output", dest="cds_fpkm_tracking_output" )
|
|
89 parser.add_option( "--tss_groups_fpkm_tracking_output", dest="tss_groups_fpkm_tracking_output" )
|
|
90 parser.add_option( "--isoforms_exp_output", dest="isoforms_exp_output" )
|
|
91 parser.add_option( "--genes_exp_output", dest="genes_exp_output" )
|
|
92 parser.add_option( "--tss_groups_exp_output", dest="tss_groups_exp_output" )
|
|
93 parser.add_option( "--cds_exp_fpkm_tracking_output", dest="cds_exp_fpkm_tracking_output" )
|
|
94 parser.add_option( "--splicing_diff_output", dest="splicing_diff_output" )
|
|
95 parser.add_option( "--cds_diff_output", dest="cds_diff_output" )
|
|
96 parser.add_option( "--promoters_diff_output", dest="promoters_diff_output" )
|
|
97
|
|
98 (options, args) = parser.parse_args()
|
|
99
|
|
100 # output version # of tool
|
|
101 try:
|
|
102 tmp = tempfile.NamedTemporaryFile().name
|
|
103 tmp_stdout = open( tmp, 'wb' )
|
|
104 proc = subprocess.Popen( args='cuffdiff --no-update-check 2>&1', shell=True, stdout=tmp_stdout )
|
|
105 tmp_stdout.close()
|
|
106 returncode = proc.wait()
|
|
107 stdout = None
|
|
108 for line in open( tmp_stdout.name, 'rb' ):
|
|
109 if line.lower().find( 'cuffdiff v' ) >= 0:
|
|
110 stdout = line.strip()
|
|
111 break
|
|
112 if stdout:
|
|
113 sys.stdout.write( '%s\n' % stdout )
|
|
114 else:
|
|
115 raise Exception
|
|
116 except:
|
|
117 sys.stdout.write( 'Could not determine Cuffdiff version\n' )
|
|
118
|
|
119 # Make temp directory for output.
|
|
120 tmp_output_dir = tempfile.mkdtemp()
|
|
121
|
|
122 # If doing bias correction, set/link to sequence file.
|
|
123 if options.do_bias_correction:
|
|
124 cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' )
|
|
125 if not os.path.exists( cached_seqs_pointer_file ):
|
|
126 stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file )
|
|
127 # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa,
|
|
128 # and the equCab2.fa file will contain fasta sequences.
|
|
129 seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file )
|
|
130 if options.ref_file != 'None':
|
|
131 # Create symbolic link to ref_file so that index will be created in working directory.
|
|
132 seq_path = os.path.join( tmp_output_dir, "ref.fa" )
|
|
133 os.symlink( options.ref_file, seq_path )
|
|
134
|
|
135 # Build command.
|
|
136
|
|
137 # Base; always use quiet mode to avoid problems with storing log output.
|
|
138 cmd = "cuffdiff --no-update-check -q"
|
|
139
|
|
140 # Add options.
|
|
141 if options.inner_dist_std_dev:
|
|
142 cmd += ( " -s %i" % int ( options.inner_dist_std_dev ) )
|
|
143 if options.num_threads:
|
|
144 cmd += ( " -p %i" % int ( options.num_threads ) )
|
|
145 if options.inner_mean_dist:
|
|
146 cmd += ( " -m %i" % int ( options.inner_mean_dist ) )
|
|
147 if options.min_alignment_count:
|
|
148 cmd += ( " -c %i" % int ( options.min_alignment_count ) )
|
|
149 if options.FDR:
|
|
150 cmd += ( " --FDR %f" % float( options.FDR ) )
|
|
151 if options.num_importance_samples:
|
|
152 cmd += ( " --num-importance-samples %i" % int ( options.num_importance_samples ) )
|
|
153 if options.max_mle_iterations:
|
|
154 cmd += ( " --max-mle-iterations %i" % int ( options.max_mle_iterations ) )
|
|
155 if options.do_normalization:
|
|
156 cmd += ( " -N" )
|
|
157 if options.do_bias_correction:
|
|
158 cmd += ( " -b %s" % seq_path )
|
|
159
|
|
160 # Add inputs.
|
|
161 # For replicate analysis: group1_rep1,group1_rep2 groupN_rep1,groupN_rep2
|
|
162 if options.groups:
|
|
163 cmd += " --labels "
|
|
164 for label in options.labels:
|
|
165 cmd += label + ","
|
|
166 cmd = cmd[:-1]
|
|
167
|
|
168 cmd += " " + options.inputA + " "
|
|
169
|
|
170 for group in options.groups:
|
|
171 for filename in group:
|
|
172 cmd += filename + ","
|
|
173 cmd = cmd[:-1] + " "
|
|
174 else:
|
|
175 cmd += " " + options.inputA + " " + options.input1 + " " + options.input2
|
|
176
|
|
177 # Debugging.
|
|
178 print cmd
|
|
179
|
|
180 # Run command.
|
|
181 try:
|
|
182 tmp_name = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name
|
|
183 tmp_stderr = open( tmp_name, 'wb' )
|
|
184 proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_output_dir, stderr=tmp_stderr.fileno() )
|
|
185 returncode = proc.wait()
|
|
186 tmp_stderr.close()
|
|
187
|
|
188 # Get stderr, allowing for case where it's very large.
|
|
189 tmp_stderr = open( tmp_name, 'rb' )
|
|
190 stderr = ''
|
|
191 buffsize = 1048576
|
|
192 try:
|
|
193 while True:
|
|
194 stderr += tmp_stderr.read( buffsize )
|
|
195 if not stderr or len( stderr ) % buffsize != 0:
|
|
196 break
|
|
197 except OverflowError:
|
|
198 pass
|
|
199 tmp_stderr.close()
|
|
200
|
|
201 # Error checking.
|
|
202 if returncode != 0:
|
|
203 raise Exception, stderr
|
|
204
|
|
205 # check that there are results in the output file
|
|
206 if len( open( os.path.join( tmp_output_dir, "isoforms.fpkm_tracking" ), 'rb' ).read().strip() ) == 0:
|
|
207 raise Exception, 'The main output file is empty, there may be an error with your input file or settings.'
|
|
208 except Exception, e:
|
|
209 stop_err( 'Error running cuffdiff. ' + str( e ) )
|
|
210
|
|
211
|
|
212 # Copy output files from tmp directory to specified files.
|
|
213 try:
|
|
214 try:
|
|
215 shutil.copyfile( os.path.join( tmp_output_dir, "isoforms.fpkm_tracking" ), options.isoforms_fpkm_tracking_output )
|
|
216 shutil.copyfile( os.path.join( tmp_output_dir, "genes.fpkm_tracking" ), options.genes_fpkm_tracking_output )
|
|
217 shutil.copyfile( os.path.join( tmp_output_dir, "cds.fpkm_tracking" ), options.cds_fpkm_tracking_output )
|
|
218 shutil.copyfile( os.path.join( tmp_output_dir, "tss_groups.fpkm_tracking" ), options.tss_groups_fpkm_tracking_output )
|
|
219 shutil.copyfile( os.path.join( tmp_output_dir, "isoform_exp.diff" ), options.isoforms_exp_output )
|
|
220 shutil.copyfile( os.path.join( tmp_output_dir, "gene_exp.diff" ), options.genes_exp_output )
|
|
221 shutil.copyfile( os.path.join( tmp_output_dir, "tss_group_exp.diff" ), options.tss_groups_exp_output )
|
|
222 shutil.copyfile( os.path.join( tmp_output_dir, "splicing.diff" ), options.splicing_diff_output )
|
|
223 shutil.copyfile( os.path.join( tmp_output_dir, "cds.diff" ), options.cds_diff_output )
|
|
224 shutil.copyfile( os.path.join( tmp_output_dir, "cds_exp.diff" ), options.cds_exp_fpkm_tracking_output )
|
|
225 shutil.copyfile( os.path.join( tmp_output_dir, "promoters.diff" ), options.promoters_diff_output )
|
|
226 except Exception, e:
|
|
227 stop_err( 'Error in cuffdiff:\n' + str( e ) )
|
|
228 finally:
|
|
229 # Clean up temp dirs
|
|
230 if os.path.exists( tmp_output_dir ):
|
|
231 shutil.rmtree( tmp_output_dir )
|
|
232
|
|
233 if __name__=="__main__": __main__()
|