comparison tools/ngs_rna/cuffdiff_wrapper.xml @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 <tool id="cuffdiff" name="Cuffdiff" version="0.0.5">
2 <!-- Wrapper supports Cuffdiff versions v1.0.0-v1.0.3 -->
3 <description>find significant changes in transcript expression, splicing, and promoter use</description>
4 <requirements>
5 <requirement type="package">cufflinks</requirement>
6 </requirements>
7 <command interpreter="python">
8 cuffdiff_wrapper.py
9 --FDR=$fdr
10 --num-threads="4"
11 --min-alignment-count=$min_alignment_count
12
13 --isoforms_fpkm_tracking_output=$isoforms_fpkm_tracking
14 --genes_fpkm_tracking_output=$genes_fpkm_tracking
15 --cds_fpkm_tracking_output=$cds_fpkm_tracking
16 --tss_groups_fpkm_tracking_output=$tss_groups_fpkm_tracking
17 --isoforms_exp_output=$isoforms_exp
18 --genes_exp_output=$genes_exp
19 --tss_groups_exp_output=$tss_groups_exp
20 --cds_exp_fpkm_tracking_output=$cds_exp_fpkm_tracking
21 --splicing_diff_output=$splicing_diff
22 --cds_diff_output=$cds_diff
23 --promoters_diff_output=$promoters_diff
24
25 ## Paired-end parameters: pass -m (mean inner distance) and -s (its standard deviation) only when the user opted in.
26 #if $singlePaired.sPaired == "Yes":
27 -m $singlePaired.mean_inner_distance
28 -s $singlePaired.inner_distance_std_dev
29 #end if
30
31 ## Quartile normalization: add -N only when the user selected "Yes".
32 #if str($do_normalization) == "Yes":
33 -N
34 #end if
35
36
37 ## Bias correction: add -b, the reference file (a history dataset, or the literal "None" otherwise), the GTF's dbkey, and the Galaxy data index directory.
38 #if $bias_correction.do_bias_correction == "Yes":
39 -b
40 #if $bias_correction.seq_source.index_source == "history":
41 --ref_file=$bias_correction.seq_source.ref_file
42 #else:
43 --ref_file="None"
44 #end if
45 --dbkey=${gtf_input.metadata.dbkey}
46 --index_dir=${GALAXY_DATA_INDEX_DIR}
47 #end if
48
49 ## Inputs: the transcript GTF, followed by either two alignment files or labelled replicate groups.
50 --inputA=$gtf_input
51 #if $group_analysis.do_groups == "No":
52 --input1=$aligned_reads1
53 --input2=$aligned_reads2
54 #else:
55 ## Replicates: emit every group name after the labels flag, then each group's files followed by a lone comma after the files flag.
56 --labels
57 #for $group in $group_analysis.groups
58 ${group.group}
59 #end for
60 --files
61 #for $group in $group_analysis.groups
62 #for $file in $group.files:
63 ${file.file}
64 #end for
65 ,
66 #end for
67 #end if
68
69 </command>
70 <inputs> <!-- User-facing parameters; each name below is referenced as a variable by the command template. -->
71 <param format="gtf" name="gtf_input" type="data" label="Transcripts" help="A transcript GTF file produced by cufflinks, cuffcompare, or other source."/>
72 <conditional name="group_analysis"> <!-- Chooses between a two-file comparison ("No") and labelled replicate groups ("Yes"). -->
73 <param name="do_groups" type="select" label="Perform replicate analysis" help="Perform cuffdiff with replicates in each group.">
74 <option value="No">No</option>
75 <option value="Yes">Yes</option>
76 </param>
77 <when value="Yes">
78 <repeat name="groups" title="Group"> <!-- One entry per group: the group name is emitted as a label and its replicate files are listed together. -->
79 <param name="group" type="text" label="Group name (no spaces or commas)"/>
80 <repeat name="files" title="Replicate">
81 <param name="file" label="Add file" type="data" format="sam,bam"/>
82 </repeat>
83 </repeat>
84 </when>
85 <when value="No">
86 <param format="sam,bam" name="aligned_reads1" type="data" label="SAM or BAM file of aligned RNA-Seq reads" help=""/>
87 <param format="sam,bam" name="aligned_reads2" type="data" label="SAM or BAM file of aligned RNA-Seq reads" help=""/>
88 </when>
89 </conditional>
90
91 <param name="fdr" type="float" value="0.05" label="False Discovery Rate" help="The allowed false discovery rate."/>
92 <param name="min_alignment_count" type="integer" value="1000" label="Min Alignment Count" help="The minimum number of alignments in a locus needed to conduct significance testing on changes in that locus observed between samples."/>
93 <param name="do_normalization" type="select" label="Perform quartile normalization" help="Removes top 25% of genes from FPKM denominator to improve accuracy of differential expression calls for low abundance transcripts.">
94 <option value="No">No</option>
95 <option value="Yes">Yes</option>
96 </param>
97 <conditional name="bias_correction"> <!-- When "Yes", the command adds -b together with the reference-sequence options. -->
98 <param name="do_bias_correction" type="select" label="Perform Bias Correction" help="Bias detection and correction can significantly improve accuracy of transcript abundance estimates.">
99 <option value="Yes">Yes</option>
100 <option value="No">No</option>
101 </param>
102 <when value="Yes">
103 <conditional name="seq_source">
104 <param name="index_source" type="select" label="Reference sequence data">
105 <option value="cached">Locally cached</option>
106 <option value="history">History</option>
107 </param>
108 <when value="cached"></when> <!-- No extra input: the command passes the literal "None" as the reference file. -->
109 <when value="history">
110 <param name="ref_file" type="data" format="fasta" label="Using reference file" />
111 </when>
112 </conditional>
113 </when>
114 <when value="No"></when>
115 </conditional>
116 <conditional name="singlePaired"> <!-- Optional -m / -s overrides; neither option is passed when "No" is selected. -->
117 <param name="sPaired" type="select" label="Set Parameters for Paired-end Reads? (not recommended)">
118 <option value="No">No</option>
119 <option value="Yes">Yes</option>
120 </param>
121 <when value="No"></when>
122 <when value="Yes">
123 <param name="mean_inner_distance" type="integer" value="20" label="Mean Inner Distance between Mate Pairs"/>
124 <param name="inner_distance_std_dev" type="integer" value="20" label="Standard Deviation for Inner Distance between Mate Pairs"/>
125 </when>
126 </conditional>
127 </inputs>
128
129 <outputs> <!-- One tabular dataset per cuffdiff result file; each name is the variable given to the matching *_output option in the command. -->
130 <data format="tabular" name="splicing_diff" label="${tool.name} on ${on_string}: splicing differential expression testing"/>
131 <data format="tabular" name="promoters_diff" label="${tool.name} on ${on_string}: promoters differential expression testing"/>
132 <data format="tabular" name="cds_diff" label="${tool.name} on ${on_string}: CDS overloading differential expression testing"/>
133 <data format="tabular" name="cds_exp_fpkm_tracking" label="${tool.name} on ${on_string}: CDS FPKM differential expression testing"/>
134 <data format="tabular" name="cds_fpkm_tracking" label="${tool.name} on ${on_string}: CDS FPKM tracking"/>
135 <data format="tabular" name="tss_groups_exp" label="${tool.name} on ${on_string}: TSS groups differential expression testing"/>
136 <data format="tabular" name="tss_groups_fpkm_tracking" label="${tool.name} on ${on_string}: TSS groups FPKM tracking" />
137 <data format="tabular" name="genes_exp" label="${tool.name} on ${on_string}: gene differential expression testing"/>
138 <data format="tabular" name="genes_fpkm_tracking" label="${tool.name} on ${on_string}: gene FPKM tracking"/>
139 <data format="tabular" name="isoforms_exp" label="${tool.name} on ${on_string}: transcript differential expression testing"/>
140 <data format="tabular" name="isoforms_fpkm_tracking" label="${tool.name} on ${on_string}: transcript FPKM tracking"/>
141 </outputs>
142
143 <tests>
144 <test> <!-- Two-sample run without replicate groups, bias correction, normalization, or paired-end overrides. -->
145 <!--
146 cuffdiff cuffcompare_out5.gtf cuffdiff_in1.sam cuffdiff_in2.sam
147 -->
148 <param name="gtf_input" value="cuffcompare_out5.gtf" ftype="gtf" />
149 <param name="do_groups" value="No" />
150 <param name="aligned_reads1" value="cuffdiff_in1.sam" ftype="sam" />
151 <param name="aligned_reads2" value="cuffdiff_in2.sam" ftype="sam" />
152 <!-- Remaining parameters. min_alignment_count is 0 here rather than the form default of 1000; ftype is only given for dataset inputs. -->
153 <param name="fdr" value="0.05" />
154 <param name="min_alignment_count" value="0" />
155 <param name="do_bias_correction" value="No" />
156 <param name="do_normalization" value="No" />
157 <param name="sPaired" value="No" />
158 <!--
159 Line diffs are needed because cuffdiff does not produce deterministic output.
160 TODO: can we find datasets that lead to deterministic behavior?
161 -->
162 <output name="splicing_diff" file="cuffdiff_out9.txt"/>
163 <output name="promoters_diff" file="cuffdiff_out10.txt"/>
164 <output name="cds_diff" file="cuffdiff_out11.txt"/>
165 <output name="cds_exp_fpkm_tracking" file="cuffdiff_out4.txt"/>
166 <output name="cds_fpkm_tracking" file="cuffdiff_out8.txt"/>
167 <output name="tss_groups_exp" file="cuffdiff_out3.txt"/>
168 <output name="tss_groups_fpkm_tracking" file="cuffdiff_out7.txt"/>
169 <output name="genes_exp" file="cuffdiff_out2.txt" lines_diff="200"/>
170 <output name="genes_fpkm_tracking" file="cuffdiff_out6.txt" lines_diff="200"/>
171 <output name="isoforms_exp" file="cuffdiff_out1.txt" lines_diff="200"/>
172 <output name="isoforms_fpkm_tracking" file="cuffdiff_out5.txt" lines_diff="200"/>
173 </test>
174 </tests>
175
176 <help>
177 **Cuffdiff Overview**
178
179 Cuffdiff is part of Cufflinks_. Cuffdiff finds significant changes in transcript expression, splicing, and promoter use. Please cite: Trapnell C, Williams BA, Pertea G, Mortazavi AM, Kwan G, van Baren MJ, Salzberg SL, Wold B, Pachter L. Transcript assembly and abundance estimation from RNA-Seq reveals thousands of new transcripts and switching among isoforms. Nature Biotechnology doi:10.1038/nbt.1621
180
181 .. _Cufflinks: http://cufflinks.cbcb.umd.edu/
182
183 ------
184
185 **Know what you are doing**
186
187 .. class:: warningmark
188
189 There is no such thing (yet) as an automated gearshift in expression analysis. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.
190
191 .. __: http://cufflinks.cbcb.umd.edu/manual.html#cuffdiff
192
193 ------
194
195 **Input format**
196
197 Cuffdiff takes Cufflinks or Cuffcompare GTF files as input along with two or more SAM or BAM files containing the fragment alignments for two or more samples.
198
199 ------
200
201 **Outputs**
202
203 Cuffdiff produces many output files:
204
205 1. Transcript FPKM expression tracking.
206 2. Gene FPKM expression tracking; tracks the summed FPKM of transcripts sharing each gene_id
207 3. Primary transcript FPKM tracking; tracks the summed FPKM of transcripts sharing each tss_id
208 4. Coding sequence FPKM tracking; tracks the summed FPKM of transcripts sharing each p_id, independent of tss_id
209 5. Transcript differential FPKM.
210 6. Gene differential FPKM. Tests differences in the summed FPKM of transcripts sharing each gene_id
211 7. Primary transcript differential FPKM. Tests differences in the summed FPKM of transcripts sharing each tss_id
212 8. Coding sequence differential FPKM. Tests differences in the summed FPKM of transcripts sharing each p_id independent of tss_id
213 9. Differential splicing tests: this tab delimited file lists, for each primary transcript, the amount of overloading detected among its isoforms, i.e. how much differential splicing exists between isoforms processed from a single primary transcript. Only primary transcripts from which two or more isoforms are spliced are listed in this file.
214 10. Differential promoter tests: this tab delimited file lists, for each gene, the amount of overloading detected among its primary transcripts, i.e. how much differential promoter use exists between samples. Only genes producing two or more distinct primary transcripts (i.e. multi-promoter genes) are listed here.
215 11. Differential CDS tests: this tab delimited file lists, for each gene, the amount of overloading detected among its coding sequences, i.e. how much differential CDS output exists between samples. Only genes producing two or more distinct CDS (i.e. multi-protein genes) are listed here.
216
217 -------
218
219 **Settings**
220
221 All of the options have a default value. You can change any of them. Most of the options in Cuffdiff have been implemented here.
222
223 ------
224
225 **Cuffdiff parameter list**
226
227 This is a list of implemented Cuffdiff options::
228
229 -m INT This is the expected (mean) inner distance between mate pairs. For example, for paired end runs with fragments selected at 300bp, where each end is 50bp, you should set -m to be 200. The default is 45bp.
230 -s INT The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.
231 -c INT The minimum number of alignments in a locus needed to conduct significance testing on changes in that locus observed between samples. If no testing is performed, changes in the locus are deemed not significant, and the locus' observed changes don't contribute to correction for multiple testing. The default is 1,000 fragment alignments (up to 2,000 paired reads).
232 --FDR FLOAT The allowed false discovery rate. The default is 0.05.
233 --num-importance-samples INT Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000
234 --max-mle-iterations INT Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000
235 -N With this option, Cufflinks excludes the contribution of the top 25 percent most highly expressed genes from the number of mapped fragments used in the FPKM denominator. This can improve robustness of differential expression calls for less abundant genes and transcripts.
236
237 </help>
238 </tool>