comparison pyCRAC/pyReadCounters.xml @ 0:19b20927172d draft

Uploaded
author swebb
date Tue, 18 Jun 2013 09:11:00 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:19b20927172d
1 <tool id ="pyReadCounters" name="pyReadCounters" force_history_refresh="True">
2 <requirements>
3 <requirement type="package">pyCRAC</requirement>
4 </requirements>
<!--
  Cheetah template that assembles the pyReadCounters.pl call (run through perl).
  Always passed: input file, file type, GTF annotation, the stats / hittable /
  intronUTRoverlap output paths and the id of the stats dataset.
  Only for novo or sam input: the discarded-reads output (when switched ON), the
  alignment options (when set to Edit) and the count output file.
  The max flag is emitted only when the entered value is greater than 0.
  The unique / blocks / mutations / ignore selects carry the literal flag text
  (or an empty string) as their value, so they are inserted verbatim.
-->
5 <command interpreter="perl">
6 pyReadCounters.pl
7 -f $ftype.input
8 --file_type $ftype.file_type
9 --gtf $addGTF.gtf
10 #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.disc.discard == "discard":
11 --discarded $discarded
12 #end if#
13 #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.addAlignOpt.alignoptions == "edit":
14 --alignOpt
15 --align_quality $ftype.addAlignOpt.align_quality
16 --align_score $ftype.addAlignOpt.align_score
17 #if int($ftype.addAlignOpt.max) > 0:
18 --max $ftype.addAlignOpt.max
19 #end if#
20 --distance $ftype.addAlignOpt.d
21 --length $ftype.addAlignOpt.length
22 $ftype.addAlignOpt.unique
23 $ftype.addAlignOpt.blocks
24 $ftype.addAlignOpt.mutations
25 #end if#
26 #if $addOpt.options == "edit":
27 --options
28 --range $addOpt.range
29 $addOpt.ignore
30 --overlap $addOpt.overlap
31 #end if#
32
33 --stats $stats
34 --hittable $hittable
35 --intronUTRoverlap $intronUTRoverlap
36
37 #if $ftype.file_type == "novo" or $ftype.file_type == "sam":
38 --countoutput $countoutput
39 #end if#
40
41 --id $stats.id
42 </command>
<!-- NOTE(review): the version is queried from the Python script at a fixed /usr/local/bin path, whereas the command above runs the pyReadCounters.pl wrapper; confirm this path exists on the target installation. -->
43 <version_command>/usr/local/bin/pyReadCounters.py --version</version_command>
44 <inputs>
<!-- Annotation source: either an entry from the pycrac_gtf data table ("Defaults") or a GTF dataset picked from the history. Both branches expose the choice as "gtf". -->
45 <conditional name="addGTF">
46 <param name="gtfFile" type="select" label="Choose GTF File from">
47 <option value="default" selected="true">Defaults</option>
48 <option value="other">History</option>
49 </param>
50 <when value="default">
51 <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates">
52 <options from_data_table="pycrac_gtf"/>
53 </param>
54 </when>
55 <when value="other">
<!-- Datatype extension written in lowercase, consistent with the other format="gtf" attributes in this tool. -->
56 <param format="gtf" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/>
57 </when>
58 </conditional>
59 <conditional name="ftype">
<!-- Selects the input parser; the help text lists every file type offered by the options below. -->
60 <param name="file_type" type="select" label="Input File Type --file_type" help="Use .novo, .sam/.bam or .gtf input files">
61 <option value="novo" selected="true">Novo</option>
62 <option value="sam">Sam/Bam</option>
63 <option value="gtf">GTF</option>
64 </param>
65 <when value="novo">
66 <param format="tabular" name="input" type="data" label="Input File --input_file" help="Alignment file of type .novo" />
67 <conditional name="disc">
68 <param name="discard" type="select" label="Print discarded reads to a separate file">
69 <option value="" selected="true">OFF</option>
70 <option value="discard">ON</option>
71 </param>
72 <when value="discard">
73 </when>
74 <when value="">
75 </when>
76 </conditional>
77 <conditional name="addAlignOpt">
78 <param name="alignoptions" type="select" label="Alignment Options">
79 <option value="default" selected="true">Default</option>
80 <option value="edit">Edit</option>
81 </param>
82 <when value="edit">
83 <param name="mutations" type="select" label="Option for selecting type of mutations to report --mutations" help="cross-linking sites are often highlighted by deletions and/or substitutions in the reads. You can use this option to select specific mutations that you want to have reported in the GTF output file.">
84 <option value="" selected="true">Off</option>
85 <option value="--mutations delsonly">deletions</option>
86 <option value="--mutations subsonly">substitutions</option>
87 <option value="--mutations TC">T->C substitutions</option>
88 <option value="--mutations nomuts">no mutations</option>
89 </param>
90 <param format="integer" name="align_quality" type="integer" label="Align Quality --align_quality " value="0" size="5" >
91 <validator type="in_range" min="0" message="Please enter a value >= 0"/>
92 </param>
93 <param format="integer" name="align_score" type="integer" label="Align Score --align_score " value="0" size="5" >
94 <validator type="in_range" min="0" message="Please enter a value >= 0"/>
95 </param>
96 <param format="integer" name="max" type="integer" label="Mapped reads to read from input file --max" help="Set to 0 to align all reads." value="0" size="10" >
97 <validator type="in_range" min="0" max="100000000" message="Please enter a value between 1 and 100000000 or 0 to align all reads"/>
98 </param>
<!-- Maximum distance between two non-overlapping paired reads; the validator rejects values below 1, so the message states that bound. -->
99 <param format="integer" name="d" type="integer" label="Distance --distance " value="1000" size="6" help="Set the maximum number of bp allowed between two non-overlapping paired reads">
100 <validator type="in_range" min="1" message="Please enter a value >= 1"/>
101 </param>
102 <param format="integer" name="length" type="integer" label="Set the maximum length of reads --length" value="1000" size="7" help="Set the read length threshold between 15 and 1000">
103 <validator type="in_range" min="15" max="1000" message="Please enter a value between 15 and 1000"/>
104 </param>
105 <param name="unique" type="select" label="Remove reads with multiple alignment locations --unique">
106 <option value="" selected="true">OFF</option>
107 <option value="--unique">ON</option>
108 </param>
109 <param name="blocks" type="select" label="Only count reads with same start and end coords once --blocks">
110 <option value="" selected="true">OFF</option>
111 <option value="--blocks">ON</option>
112 </param>
113 </when>
114 <when value="default">
115 </when>
116 </conditional>
117 </when>
118 <when value="sam">
119 <param format="sam,bam" name="input" type="data" label="Input File --input_file" help="Alignment file of type .sam or .bam" />
120 <conditional name="disc">
121 <param name="discard" type="select" label="Print discarded reads to a separate file">
122 <option value="" selected="true">OFF</option>
123 <option value="discard">ON</option>
124 </param>
125 <when value="discard">
126 </when>
127 <when value="">
128 </when>
129 </conditional>
130 <conditional name="addAlignOpt">
131 <param name="alignoptions" type="select" label="Alignment Options">
132 <option value="default" selected="true">Default</option>
133 <option value="edit">Edit</option>
134 </param>
135 <when value="edit">
<!-- Mutation filter for SAM/BAM input. Each option value is the literal flag text inserted into the command line; labels match the Novo branch of this conditional. -->
136 <param name="mutations" type="select" label="Option for selecting type of mutations to report --mutations" help="cross-linking sites are often highlighted by deletions and/or substitutions in the reads. You can use this option to select specific mutations that you want to have reported in the GTF output file.">
137 <option value="" selected="true">Off</option>
138 <option value="--mutations delsonly">deletions</option>
139 <option value="--mutations subsonly">substitutions</option>
140 <option value="--mutations TC">T->C substitutions</option>
141 <option value="--mutations nomuts">no mutations</option>
142 </param>
143 <param format="integer" name="align_quality" type="integer" label="Align Quality --align_quality " value="0" size="5" >
144 <validator type="in_range" min="0" message="Please enter a value >= 0"/>
145 </param>
146 <param format="integer" name="align_score" type="integer" label="Align Score --align_score " value="0" size="5" >
147 <validator type="in_range" min="0" message="Please enter a value >= 0"/>
148 </param>
149 <param format="integer" name="max" type="integer" label="Mapped reads to read from input file --max" help="Set to 0 to align all reads." value="0" size="10" >
150 <validator type="in_range" min="0" max="100000000" message="Please enter a value between 1 and 100000000 or 0 to align all reads"/>
151 </param>
<!-- Maximum distance between two non-overlapping paired reads; the validator rejects values below 1, so the message states that bound. -->
152 <param format="integer" name="d" type="integer" label="Distance --distance " value="1000" size="6" help="Set the maximum number of bp allowed between two non-overlapping paired reads">
153 <validator type="in_range" min="1" message="Please enter a value >= 1"/>
154 </param>
155 <param format="integer" name="length" type="integer" label="Set the maximum length of reads --length" value="1000" size="7" help="Set the read length threshold between 15 and 1000">
156 <validator type="in_range" min="15" max="1000" message="Please enter a value between 15 and 1000"/>
157 </param>
158 <param name="unique" type="select" label="Remove reads with multiple alignment locations --unique">
159 <option value="" selected="true">OFF</option>
160 <option value="--unique">ON</option>
161 </param>
162 <param name="blocks" type="select" label="Only count reads with same start and end coords once --blocks">
163 <option value="" selected="true">OFF</option>
164 <option value="--blocks">ON</option>
165 </param>
166 </when>
167 <when value="default">
168 </when>
169 </conditional>
170 </when>
171 <when value="gtf">
172 <param format="gtf" name="input" type="data" label="Input File --input_file" help="File of type .gtf" />
173 </when>
174 </conditional>
<!-- General options. With "Default" nothing is added to the command line; with "Edit" the command template adds the options flag followed by range, the strand-handling flag (if chosen) and overlap. -->
175 <conditional name="addOpt">
176 <param name="options" type="select" label="Standard Options">
177 <option value="default" selected="true">Default</option>
178 <option value="edit">Edit</option>
179 </param>
180 <when value="edit">
181 <param format="integer" name="range" type="integer" label="Range --range" value="0" size="5" help="Manually set the length of the 5' and 3' UTRs 0>50000">
182 <validator type="in_range" min="0" max="50000" message="Please enter a value between 0 and 50000"/>
183 </param>
<!-- The selected value is the literal flag text (or an empty string) inserted into the command line. -->
184 <param name="ignore" type="select" label="Ignore strand information? --ignorestrand">
185 <option value="" selected="true">No</option>
186 <option value="--ignorestrand">Yes</option>
187 </param>
188 <param format="integer" name="overlap" type="integer" label="Overlap --overlap" value="1" size="5" help="Sets the number of nucleotides a read has to overlap with a gene before it is considered a hit. ">
189 <validator type="in_range" min="1" message="Please enter a positive integer"/>
190 </param>
191 </when>
192 <when value="default">
193 </when>
194 </conditional>
<!-- Prefix for the names of the output datasets in the history. NOTE(review): the command template in this file does not reference this parameter, so no -o option reaches the script despite the label text; confirm this is intended. -->
195 <param name="label" type="text" format="txt" size="30" value="pyReadCounters" label="Enter output file label -o" />
196 </inputs>
<!--
  Output datasets, all named with the user-supplied label as prefix.
  stats, hittable and intronUTRoverlap are always created.
  countoutput is created only for novo or sam input.
  discarded is created only for novo or sam input when "Print discarded reads" is ON.
-->
197 <outputs>
198 <data format="tabular" name="stats" label="${label.value}_file_statistics.txt"/>
199 <data format="tabular" name="hittable" label="${label.value}_hittable.txt"/>
200 <data format="gtf" name="intronUTRoverlap" label="${label.value}_intron_and_UTR_overlap.txt"/>
201 <data format="gtf" name="countoutput" label="${label.value}_count_output.gtf">
202 <filter>ftype['file_type'] == "novo" or ftype['file_type'] == "sam"</filter>
203 </data>
204 <data format="txt" name="discarded" label="${label.value}_discarded.txt">
205 <filter>(ftype['file_type'] == "novo" or ftype['file_type'] == "sam") and ftype['disc']['discard'] == "discard"</filter>
206 </data>
207 </outputs>
208 <help>
209
210 .. class:: infomark
211
212 **pyReadCounters**
213
214 pyReadCounters is part of the pyCRAC_ package. It produces a gene hittable file and two GTF output files showing which genomic features the reads overlap.
215 Finally, the tool produces a read statistics file that provides information about the complexity of your dataset.
216
217 **Output file examples**
218
219 A hittable file::
220
221 # generated by pyReadCounters version 1.1.0, Mon Apr 16 20:34:22 2012
222 # /usr/local/bin/pyReadCounters.py -f RNAseq_data.novo -c 1 --unique
223 # total number of reads 12534556
224 # total number of paired reads 10947376
225 # total number of single reads 483095
226 # total number of mapped reads: 11430471
227 # total number of overlapping genomic features 7019550
228 # sense 5960669
229 # anti-sense 1058881
230 # feature sense_overlap anti-sense_overlap number of reads
231
232 ## protein_coding 3190701
233 YEF3 49930 3629 24221
234 PMA1 32621 2650 21776
235 COX1 24559 1037 15174
236 TFP1 21539 1689 13506
237 HSC82 21177 1458 12729
238 ADH1 20245 1467 11351
239 AI5_ALPHA 20022 918 13101
240 AI4 19390 886 12638
241 AI3 17823 798 11473
242 AI2 17590 790 11297
243 RPL10 16822 1113 8797
244 ENO2 16336 1125 8913
245 TEF1 15578 1333 5450
246
247 An example of a GTF 'count_output' file::
248
249 ##gff-version 2
250 # generated by Counters version 1.2.0, Tue Jan 8 22:47:29 2013
251 # pyReadCounters.py -f PAR_CLIP_unique.novo --mutations=TC -v
252 # total number of reads: 2455251
253 # total number of paired reads: 0
254 # total number of single reads: 2455251
255 # total number of mapped reads: 2455251
256 # total number of overlapping genomic features: 5153943
257 # sense: 2640600
258 # anti-sense: 2513343
259 chrXIV reads exon 661572 661605 2 + . gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661596S;
260 chrXIV reads exon 661720 661738 1 + . gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661726S;
261 chrXIV reads exon 661839 661878 4 + . gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661875S;
262
263 This output file also reports whether a read contains a mutation.
264
265 For example::
266
267 # 661596S
268
269 Indicates that the read had a nucleotide substitution ("S") at genomic coordinate 661596. The chromosome name can be found in the first column.
270
271 .. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
272
273 ------
274
275 **Parameter list**
276
277 File input options::
278
279 -f FILE, --input_file=FILE
280 provide the path to your novo, SAM/BAM or gtf data
281 file. Default is standard input. Make sure to specify
282 the file type of the file you want to have analyzed
283 using the --file_type option!
284 -o OUTPUT_FILE, --output_file=OUTPUT_FILE
285 Use this flag to override the standard file names. Do
286 NOT add an extension.
287 --file_type=FILE_TYPE
288 use this option to specify the file type (i.e.
289 'novo','sam' or 'gtf'). This will tell the program
290 which parsers to use for processing the files. Default
291 = 'novo'
292 --gtf=annotation_file.gtf
293 type the path to the gtf annotation file that you want
294 to use
295
296 Common pyCRAC options::
297
298 --ignorestrand
299 To ignore strand information and all reads overlapping
300 with genomic features will be considered sense reads.
301 Useful for analysing ChIP or RIP data
302 --overlap=1
303 sets the number of nucleotides a read has to overlap
304 with a gene before it is considered a hit. Default =
305 1 nucleotide
306 -r 100, --range=100
307 allows you to add regions flanking the genomic
308 feature. If you set '-r 50' or '--range=50', then the
309 program will add 50 nucleotides to each feature on
310 each side regardless of whether the GTF file has genes
311 with annotated UTRs
312
313 Options for SAM/BAM and Novo files::
314
315 --mutations=delsonly
316 Use this option to only track mutations that are of
317 interest. For CRAC data this is usually deletions
318 (--mutations=delsonly). For PAR-CLIP data this is
319 usually T-C mutations (--mutations=TC). Other options
320 are: do not report any mutations: --mutations=nomuts.
321 Only report specific base mutations, for example only
322 in T's, C's and G's :--mutations=[TCG]. The brackets
323 are essential. Other nucleotide combinations are also
324 possible
325 --align_quality=100, --mapping_quality=100
326 with these options you can set the alignment quality
327 (Novoalign) or mapping quality (SAM) threshold. Reads
328 with qualities lower than the threshold will be
329 ignored. Default = 0
330 --align_score=100
331 with this option you can set the alignment score
332 threshold. Reads with alignment scores lower than the
333 threshold will be ignored. Default = 0
334 --unique
335 with this option reads with multiple alignment
336 locations will be removed. Default = Off
337 --blocks
338 with this option reads with the same start and end
339 coordinates on a chromosome will be counted as one
340 cDNA. Default = Off
341 -m 100000, --max=100000
342 maximum number of mapped reads that will be analyzed.
343 Default = All
344 -d 1000, --distance=1000
345 this option allows you to set the maximum number of
346 base-pairs allowed between two non-overlapping paired
347 reads. Default = 1000
348 --discarded=FILE
349 prints the lines from the alignments file that were
350 discarded by the parsers. This file contains reads
351 that were unmapped (NM), of poor quality (i.e. QC) or
352 paired reads that were mapped to different chromosomal
353 locations or were too far apart on the same
354 chromosome. Useful for debugging purposes
355 -l 100, --length=100
356 to set read length threshold. Default = 1000
357
358 </help>
359 </tool>