comparison featurecounts.xml @ 0:ab0323782bb5 draft

planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools/raw/master/featurecounts commit cc900436bad9c6cca1f73d438c1f158d3bfc4318-dirty
author yhoogstrate
date Mon, 18 May 2015 04:49:19 -0400
parents
children 63bd455ed299
comparison
equal deleted inserted replaced
-1:000000000000 0:ab0323782bb5
1 <?xml version="1.0" encoding="UTF-8"?>
2 <tool id="featurecounts" name="featureCounts" version="1.4.6.p1">
3 <description>Measure gene expression in RNA-Seq experiments from SAM or BAM files.</description>
4 <requirements>
5 <requirement type="package" version="1.4.6.p1">featurecounts</requirement>
6 <requirement type="package" version="1.0.0">featurecounts2bed</requirement>
7 </requirements>
8 <version_command>featureCounts -v</version_command>
9 <command>
10 #*
11 The following script is written in the "Cheetah" language:
12 http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html
13 *#
14
15 ## Check 01: do the alignments have a dbkey and is the option set to use it?
16 #if $reference_gene_sets_source.source_select == "attribute" and len({ alignment.metadata.dbkey:True for alignment in $alignments }.keys()) != 1
17 echo "Invalid number of dbkeys are found: ${ len({ alignment.metadata.dbkey:True for alignment in $alignments }.keys()) }, while only one should be used. Make sure that the alignments are done on the same reference genome and that 'tool-data/gene_sets.loc' is configured properly!" >&amp;2
18 #else
19 ## Check 02: are all alignments of the same type (bam || sam)
20 #if len({ alignment.extension:True for alignment in $alignments }.keys()) != 1
21 echo "Either all files must be SAM or all files must be BAM, no mixture is allowed." >&amp;2
22 #else
23 featureCounts
24 -a
25 #if $reference_gene_sets_source.source_select == "indexed_filtered"
26 "$reference_gene_sets_source.reference_gene_sets"
27 #else if $reference_gene_sets_source.source_select == "indexed_all"
28 "$reference_gene_sets_source.reference_gene_sets"
29 #else if $reference_gene_sets_source.source_select == "history"
30 "$reference_gene_sets_source.reference_gene_sets"
31 #else
32 #*
33 This is a workaround to obtain the gene set (GTF/GFF) file that
34 corresponds to the dbkey of the alignments.
35 Because this file is "calculated" during run-time, it can
36 be used in a workflow.
37 *#
38 "${ filter( lambda x: str( x[0] ) == str( { alignment.metadata.dbkey:True for alignment in $alignments }.keys()[0] ), $__app__.tool_data_tables[ 'gene_sets' ].get_fields() )[0][2] }"
39 #end if
40
41 -o "$output"
42 -T $threads
43
44 #if $extended_parameters.parameters == "extended"
45 -t $extended_parameters.gff_feature_type
46 -g $extended_parameters.gff_feature_attribute
47 $extended_parameters.summarization_level
48 $extended_parameters.contribute_to_multiple_features
49 $extended_parameters.protocol
50 $extended_parameters.multimapping_counts
51 -Q $extended_parameters.mapping_quality
52 $extended_parameters.fragment_counting
53 $extended_parameters.check_distance
54 -d $extended_parameters.minimum_fragment_length
55 -D $extended_parameters.maximum_fragment_length
56 $extended_parameters.only_both_ends
57 $extended_parameters.exclude_chimerics
58 $extended_parameters.namesort
59 #end if
60
61 #for $alignment in $alignments
62 ${alignment}
63 #end for
64
65 2>&amp;1
66
67 #set $columns = [str(i+7) for i, alignment in enumerate($alignments)]
68 #set $columns=",".join($columns)
69
70 #if $format == "tabdel_default" or $format.value == "tabdel_default"
71 ; cp $output tmp.txt
72 ; egrep -v "^#" tmp.txt > tmp2.txt
73 ; cut -f 1,$columns tmp2.txt > tmp_left.txt
74 ; cut -f 6 tmp2.txt > tmp_right.txt
75 ; paste tmp_left.txt tmp_right.txt > $output
76 #elif $format == "tabdel_short" or $format.value == "tabdel_short"
77 ; cp $output tmp.txt
78 ; egrep -v "^#" tmp.txt | cut -f 1,$columns > $output
79 #end if
80
81 ## For every alignment, replace its filename for: "hid: sample name"
82 #for $alignment in $alignments
83 #set $alignment_escaped = str($alignment).replace('/', '\/').replace('.', '\.')
84 #set $alignment_name_escaped = str(alignment.hid)+": "+str($alignment.name).replace('\t',' ').replace('\\','\\\\').replace("'","\\'").replace('/','\/')
85
86 #if $format.value == "tabdel_default" or $format.value == "tabdel_short"
87 ; sed -e '1 s/$alignment_escaped/${alignment_name_escaped}/g' $output > tmp.txt
88 #elif $format.value == "bed":
89 ; featurecounts2bed.sh -f "$output" > tmp.txt
90 #else
91 ; sed -e '1,2 s/$alignment_escaped/${alignment_name_escaped}/g' $output > tmp.txt
92 #end if
93
94 ; mv tmp.txt $output
95
96 ; sed -e '1 s/$alignment_escaped/${alignment_name_escaped}/g' $output".summary" > tmp.txt
97 ; mv tmp.txt $output".summary"
98 #end for
99 ; mv $output".summary" $output_summary
100 #end if
101 #end if
102 </command>
103
104 <inputs>
105 <param name="alignments" type="data" format="bam,sam" label="Alignment file" help="The input alignment file(s) where the gene expression has to be counted. The file can have a SAM or BAM format; but ALL files in the series must be in THE SAME format." multiple="true" />
106
107 <!-- Find out how to access the GTF/GFF file(s) -->
108 <conditional name="reference_gene_sets_source">
109 <param name="source_select" type="select" label="GFF/GTF Source">
110 <option value="indexed_filtered">Use a built-in index (which fits your reference)</option>
111 <option value="history">Use reference from the history</option>
112 <option value="indexed_all">Use a built-in index (entire list) - avoid this option if possible; only useful if you design a workflow</option>
113 <option value="attribute">Use a built-in index based on the 'metadata.dbkey' attribute; ideal in workflows</option>
114 </param>
115 <when value="indexed_filtered">
116 <param name="reference_gene_sets" type="select" label="Reference Gene Sets used during alignment (GFF/GTF)" >
117 <options from_data_table="gene_sets"><!-- replaces 'from_file="gene_sets"' - more strict -->
118 <column name="name" index="0"/>
119 <column name="dbkey" index="1"/>
120 <column name="value" index="2"/>
121 <filter type="data_meta" ref="alignments" multiple="false" key="dbkey" column="1" />
122 <validator type="no_options" message="No indexes are available for the selected input dataset" />
123 </options>
124 </param>
125 </when>
126 <when value="history">
127 <param name="reference_gene_sets" format="gff" type="data" label="Gene annotation file" help="The program assumes that the provided annotation file is in GTF format. Make sure that the gene annotation file corresponds to the same reference genome as used for the alignment." />
128 </when>
129 <when value="indexed_all">
130 <param name="reference_gene_sets" type="select" label="Reference Gene Sets used during alignment (GFF/GTF)" >
131 <options from_data_table="gene_sets"><!-- replaces 'from_file="gene_sets"' - more strict -->
132 <column name="name" index="0"/>
133 <column name="dbkey" index="1"/>
134 <column name="value" index="2"/>
135 <validator type="no_options" message="No indexes are available for the selected input dataset" />
136 </options>
137 </param>
138 </when>
139 <when value="attribute">
140 <!-- Do nothing, determine GTF/GFF file at runtime -->
141 </when>
142 </conditional>
143
144 <param name="format" type="select" label="Output format">
145 <option value="complex">featureCounts 1.4.0+ default (extensive; complex)</option>
146 <option value="tabdel_default" selected="true">Gene-name "\t" gene-count "\t" gene-length (tab-delimited)</option>
147 <option value="tabdel_short">Gene-name "\t" gene-count (tab-delimited)</option>
148 <option value="bed">BED format (line per exon): chr "\t" start "\t" stop "\t" description "\t" readcount (tab-delimited)</option>
149 </param>
150
151 <param name="threads" type="integer" value="2" min="1" label="Number of the CPU threads. Higher numbers only make sense with a higher number of samples." />
152
153 <conditional name="extended_parameters">
154 <param name="parameters" type="select" label="featureCounts parameters" help="For more advanced featureCounts settings.">
155 <option value="default">Default settings</option>
156 <option value="extended">Extended settings</option>
157 </param>
158 <when value="default">
159 </when>
160 <when value="extended">
161 <param name="gff_feature_type" type="text" value="exon" label="GFF feature type filter" help="Specify the feature type. Only rows which have the matched feature type in the provided GTF annotation file will be included for read counting. `exon' by default." />
162
163 <param name="gff_feature_attribute" type="text" value="gene_id" label="GFF gene identifier" help="Specify the attribute type used to group features (eg. exons) into meta-features (eg. genes), when GTF annotation is provided. `gene_id' by default. This attribute type is usually the gene identifier. This argument is useful for the meta-feature level summarization." />
164
165 <param name ="summarization_level" type="boolean" truevalue=" -f" falsevalue="" label="On feature level" help="If specified, read summarization will be performed at the feature level. By default (-f is not specified), the read summarization is performed at the meta-feature level." />
166
167 <param name ="contribute_to_multiple_features" type="boolean" truevalue=" -O" falsevalue="" label="Allow read to contribute to multiple features" help="If specified, reads (or fragments if -p is specified) will be allowed to be assigned to more than one matched meta-feature (or matched feature if -f is specified)" />
168
169 <param name="protocol" type="select" label="Strand specific protocol" help="Indicate if strand-specific read counting should be performed. It has three possible values: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). 0 by default.">
170 <option value=" -s 0" selected="true">Unstranded</option>
171 <option value=" -s 1">Stranded (forwards)</option>
172 <option value=" -s 2">Stranded (reverse)</option>
173 </param>
174
175 <param name="multimapping_counts" type="boolean" truevalue=" -M" falsevalue="" label="Count multi-mapping reads/fragments" help="If specified, multi-mapping reads/fragments will be counted (ie. a multi-mapping read will be counted up to N times if it has N reported mapping locations). The program uses the `NH' tag to find multi-mapping reads." />
176
177 <param name="mapping_quality" type="integer" value="0" label="Minimum read quality" help="The minimum mapping quality score a read must satisfy in order to be counted. For paired-end reads, at least one end should satisfy this criterion. 0 by default." />
178
179 <param name="fragment_counting" type="boolean" truevalue=" -p" falsevalue="" label="PE: Count fragments instead of reads" help="Paired-end specific: If specified, fragments (or templates) will be counted instead of reads. The two reads from the same fragment must be adjacent to each other in the provided SAM/BAM file. If SAM/BAM input does not meet this requirement, the -S (sorting) option should be provided as well." />
180
181 <param name="check_distance" type="boolean" truevalue=" -P" falsevalue="" label="PE: Check paired-end distance" help="Paired-end specific: If specified, paired-end distance will be checked when assigning fragments to meta-features or features. This option is only applicable when -p (Count fragments instead of reads) is specified. The distance thresholds should be specified using -d and -D (minimum and maximum fragment/template length) options." />
182
183 <param name="minimum_fragment_length" type="integer" value="50" label="PE: Minimum fragment/template length." />
184 <param name="maximum_fragment_length" type="integer" value="600" label="PE: Maximum fragment/template length." />
185
186 <param name="only_both_ends" type="boolean" truevalue=" -B" falsevalue="" label="PE: only allow fragments with both reads aligned" help="Paired-end specific: If specified, only fragments that have both ends successfully aligned will be considered for summarization. This option is only applicable for paired-end reads." />
187
188 <param name="exclude_chimerics" type="boolean" truevalue=" -C" falsevalue="" label="PE: Exclude chimeric fragments" help="Paired-end specific: If specified, the chimeric fragments (those fragments that have their two ends aligned to different chromosomes) will NOT be included for summarization. This option is only applicable for paired-end read data." />
189
190 <param name="namesort" type="boolean" truevalue=" -S" falsevalue="" label="PE: Name-sort reads (slow!)" help="Paired-end specific: If specified, the program will reorder input reads according to their names and make reads from the same pair be adjacent to each other. This option should be provided when reads from the same pair are not adjacent to each other in input SAM/BAM files (for instance sorting reads by chromosomal locations could decouple reads from the same pair)." />
191 </when>
192 </conditional>
193 </inputs>
194
195 <outputs>
196 <data format="tabular" name="output" label="${tool.name} on ${', '.join([ str(a.hid)+': '+a.name for a in $alignments ])}" />
197 <data format="tabular" name="output_summary" label="${tool.name} on ${', '.join([ str(a.hid)+': '+a.name for a in $alignments ])} summary" />
198 </outputs>
199
200 <tests>
201 <test>
202 <param name="alignments" value="featureCounts_input1.bam,featureCounts_input2.bam" ftype="bam" />
203 <param name="source_select" value="history" />
204 <param name="reference_gene_sets" value="featureCounts_guide.gff" ftype="gff" />
205 <output name="output" file="output.tab"/>
206 <output name="output_summary" file="output_summary.tab"/>
207 </test>
208 </tests>
209
210 <help>
211 featureCounts
212 #############
213
214 Overview
215 --------
216 FeatureCounts is a light-weight read counting program written entirely in the C programming language. It can be used to count both gDNA-seq and RNA-seq reads for genomic features in SAM/BAM files.
217 It has a variety of advanced parameters but its major strength is its outstanding performance: analysis of a 10GB SE BAM file takes about 7 minutes on a single average CPU (Homo Sapiens genome) [1].
218
219 Input formats
220 -------------
221 Alignments should be provided in either:
222
223 - SAM format, http://samtools.sourceforge.net/samtools.shtml#5
224 - BAM format
225
226 Gene regions should be provided in the GFF/GTF format:
227
228 - http://genome.ucsc.edu/FAQ/FAQformat.html#format3
229 - http://www.ensembl.org/info/website/upload/gff.html
230
231 Installation
232 ------------
233
234 1. Make sure you have proper GFF/GTF files (corresponding to your reference genome used for the alignment) uploaded to your history.
235
236 2. Make sure that your gene_sets.loc is configured properly as data table. This is generally done by copying the right information into: tool_data_table_conf.xml. More info at: https://wiki.galaxyproject.org/Admin/Tools/Data%20Tables
237
238 License
239 -------
240
241 **featureCounts / subread package**:
242
243 - GNU General Public License version 3.0 (GPLv3)
244
245 Contact
246 -------
247
248 The tool wrapper has been written by Youri Hoogstrate from the Erasmus
249 Medical Center (Rotterdam, Netherlands) on behalf of the Translational
250 Research IT (TraIT) project:
251
252 http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch
253
254 More tools by the Translational Research IT (TraIT) project can be found
255 in the following toolsheds:
256
257 http://toolshed.dtls.nl/
258
259 http://toolshed.g2.bx.psu.edu/
260
261 http://testtoolshed.g2.bx.psu.edu/
262
263 References
264 ----------
265 **featureCounts: an efficient general purpose program for assigning sequence reads to genomic features.**
266
267 *Liao Y1, Smyth GK, Shi W.* - Bioinformatics. 2014 Apr 1;30(7):923-30.
268
269 - http://www.ncbi.nlm.nih.gov/pubmed/24227677
270 - http://dx.doi.org/10.1093/bioinformatics/btt656
271
272
273 Acknowledgements
274 ----------------
275
276 I would like to thank Marius van den Beek for his contributions to this project.
277 </help>
278 <citations>
279 <citation type="doi">10.1093/bioinformatics/btt656</citation>
280 </citations>
281 </tool>