comparison grinder.xml @ 5:5ba7c9ac056a

Uploaded
author fangly
date Tue, 04 Oct 2011 02:01:58 -0400
parents
children 68576b1d2d8b
comparison
equal deleted inserted replaced
4:8c1cbee38ffd 5:5ba7c9ac056a
1 <tool id="grinder" name="Grinder" version="0.3.8" force_history_refresh="True">
2
3 <!--
4 Author: florent.angly@gmail.com
5 TODO:
6 • See bfast tool (tools/sr_mapping/bfast_wrapper.xml) for how to use datatables easily
7 • Basic tests
8 • Link to full manual
9 • Better sync with Grinder parameters, defaults and help
10 -->
11
12 <description>genomic, metagenomic and amplicon read simulator</description>
13 <!-- The tool declares a binary requirement: a `grinder` executable -->
14 <requirements>
15 <requirement type="binary">grinder</requirement>
16 </requirements>
17 <!-- NOTE(review): presumably the command Galaxy runs to record the Grinder version; confirm against the Galaxy tool schema -->
18 <version_string>grinder --version</version_string>
19
20 <command>
21 #set $tool_dir = os.path.join( os.path.abspath($__root_dir__), 'tools', 'ngs_simulation' )
22 #set $script1 = os.path.join( $tool_dir, 'stderr_wrapper.py' )
23 #set $script2 = os.path.join( $tool_dir, 'grinder_multiple_outputs.py' )
24 <!-- First command: grinder run through stderr_wrapper.py. Each option below is emitted only when the guard on its own parameter holds -->
25 $script1
26 grinder
27 #if $reference_file.specify == "builtin":
28 -reference_file ${ filter( lambda x: str( x[0] ) == str( $reference_file.value ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][-1] }
29 #else if $reference_file.specify == "uploaded":
30 -reference_file $reference_file.value
31 #end if
32 #if str($coverage_fold):
33 -coverage_fold $coverage_fold
34 #end if
35 #if str($total_reads):
36 -total_reads $total_reads
37 #end if
38 #if str($read_dist):
39 -read_dist $read_dist
40 #end if
41 #if str($insert_dist):
42 -insert_dist $insert_dist
43 #end if
44 #if str($exclude_chars):
45 -exclude_chars $exclude_chars
46 #end if
47 #if str($delete_chars):
48 -delete_chars $delete_chars
49 #end if
50 #if str($forward_reverse) != "None":
51 -forward_reverse $forward_reverse
52 #end if
53 #if str($unidirectional):
54 -unidirectional $unidirectional
55 #end if
56 #if str($length_bias):
57 -length_bias $length_bias
58 #end if
59 #if str($copy_bias):
60 -copy_bias $copy_bias
61 #end if
62 #if str($mutation_dist):
63 -mutation_dist $mutation_dist
64 #end if
65 #if str($mutation_ratio):
66 -mutation_ratio $mutation_ratio
67 #end if
68 #if str($homopolymer_dist):
69 -homopolymer_dist $homopolymer_dist
70 #end if
71 #if str($chimera_perc):
72 -chimera_perc $chimera_perc
73 #end if
74 #if str($abundance_file) != "None":
75 -abundance_file $abundance_file
76 #end if
77 #if str($abundance_model):
78 -abundance_model $abundance_model
79 #end if
80 #if str($num_libraries):
81 -num_libraries $num_libraries
82 #end if
83 #if str($multiplex_ids) != "None":
84 -multiplex_ids $multiplex_ids
85 #end if
86 #if str($diversity):
87 -diversity $diversity
88 #end if
89 #if str($shared_perc):
90 -shared_perc $shared_perc
91 #end if
92 #if str($permuted_perc):
93 -permuted_perc $permuted_perc
94 #end if
95 #if str($random_seed):
96 -random_seed $random_seed
97 #end if
98 #if str($desc_track):
99 -desc_track $desc_track
100 #end if
101 #if str($qual_levels):
102 -qual_levels $qual_levels
103 #end if
104 #if str($fastq_output):
105 -fastq_output $fastq_output
106 #end if
107 #if str($profile_file) != "None":
108 -profile_file $profile_file.value
109 #end if
110 <!-- When Galaxy bug #661 is resolved, then we can use the same method to check for all optional argument -->
111 <!-- i.e. either if str($param) != "None": or if str($param): -->
112 <!-- URL: https://bitbucket.org/galaxy/galaxy-central/issue/661/optional-arguments-problems#comment-655611 -->
113
114 #set $output_dir = $__new_file_path__
115 -output_dir $output_dir
116 <!-- The id of the declared output dataset is passed to Grinder as its base name -->
117 #set $base_name = $output.id
118 -base_name $base_name
119 ;
120 <!-- Second command: grinder_multiple_outputs.py receives the same output directory and base name -->
121 $script2 $output_dir $base_name
122
123 </command>
124
125 <inputs>
126 <!-- Reference sequences: either an entry of the all_fasta data table or a FASTA dataset; the command branches on reference_file.specify -->
127 <conditional name="reference_file">
128 <param name="specify" type="select" label="Specify">
129 <option value="builtin">Built-in file</option>
130 <option value="uploaded">Uploaded file</option>
131 </param>
132 <when value="builtin">
133 <param name="value" type="select" label="Reference sequences" help="Galaxy built-in FASTA file">
134 <options from_data_table="all_fasta" />
135 </param>
136 </when>
137 <when value="uploaded">
138 <param name="value" type="data" format="fasta" label="Reference sequences" help="FASTA file that contains the input reference sequences" />
139 </when>
140 </conditional>
141
142 <param name="total_reads" type="text" value="100" optional="true" label="Number of reads" help="Number of shotgun or amplicon reads to generate for each library. Do not specify this if you specify the fold coverage." />
143
144 <param name="coverage_fold" type="text" optional="true" label="Coverage fold" help="Generate the number of reads needed to achieve the specified fold coverage of the input reference sequences for each library (the output FASTA length divided by the input FASTA length). Do not specify this if you specify the number of reads directly" />
145
146 <param name="read_dist" type="text" value="100" optional="true" label="Sequence length distribution" help="Desired sequence length distribution specified as:
147 average length, distribution ('uniform' or 'normal') and standard deviation
148 Only the first element is required.
149 Examples:
150 1/ All sequences exactly 250 bp long: 250
151 2/ Uniform distribution around 100+-10 bp: 100 uniform 10
152 3/ Read normally distributed with an average of 800 and a standard deviation
153 of 100 bp: 800 normal 100" />
154
155 <param name="insert_dist" type="text" value="0" optional="true" label="Insert size distribution" help="Create shotgun paired end reads (mate pairs) spanning the given insert length (the reads are interior to the insert):
156 0 : off,
157 or: insert size distribution in bp, in the same format as the read length
158 distribution (a typical value is 2,500 bp)
159 Two distinct reads are generated whether or not the mate pair overlaps.
160 Default: 0" />
161
162 <param name="exclude_chars" type="text" optional="true" label="Characters to exclude" help="Do not create reads containing any of the specified characters (case insensitive), e.g. 'N-' to prevent reads with gaps (-) or ambiguities (N)." />
163
164 <param name="delete_chars" type="text" optional="true" label="Characters to delete" help="Remove the specified characters from the reference sequences (case insensitive), e.g. 'N-' to remove gaps (-) and ambiguities (N)." />
165
166 <param name="forward_reverse" type="data" format="fasta" optional="true" label="Amplicon primers" help="Use amplicon sequencing using the given forward and reverse PCR primer sequences (in a FASTA file, in this order). The second sequence in the FASTA file (the reverse primer) is optional. The sequences should use the IUPAC convention for degenerate residues. Example: AAACTYAAAKGAATTGRCGG and ACGGGCGGTGTGTRC for the 926F and 1392R primers respectively (primers that target the v6 to v9 region of the 16S rRNA gene). Genome sequences that do not match the specified primers are excluded. It is recommended to use the unidirectional and no genome length bias options to generate amplicon reads." />
167
168 <param name="unidirectional" type="select" display="radio" value="0" label="Sequencing direction" help="Produce reads just from one strand, by opposition to the reference strand and its reverse complement.">
169 <option value="0">both strands</option>
170 <option value="1">forward strand only</option>
171 <option value="-1">reverse strand only</option>
172 </param>
173
174 <param name="length_bias" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Length bias" help="In shotgun libraries, sample species proportionally to their genome length: at the same relative abundance, larger genomes contribute more reads than smaller genomes." />
175
176 <param name="copy_bias" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Copy number bias" help="In amplicon libraries, sample species proportionally to the number of copies of the target gene: at equal relative abundance, genomes that have multiple copies of the target gene contribute more amplicon reads than genomes that have a single copy. Note: you should use full genomes in the reference file to make use of this option." />
177
178 <param name="mutation_dist" type="text" value="0" optional="true" label="Mutation distribution" help="Introduce sequencing errors in the reads, under the form of mutations (substitutions, insertions and deletions) using a specified frequency distribution:
179 average probability (%),
180 model (uniform, linear),
181 value at 3&apos; end (not applicable for uniform model).
182 For example, for Sanger-type errors, use:
183 1.5 linear 2." />
184
185 <param name="mutation_ratio" type="text" value="80 20" optional="true" label="Mutation ratio" help="Indicate the percentage of substitutions and indels (insertions and deletions). For example, use 80 20 (4 substitutions for each indel) for Sanger reads. Note that this parameter has no effect unless you specify the mutation distribution option." />
186
187 <param name="homopolymer_dist" type="text" value="0" optional="true" label="Homopolymer distribution" help="Introduce sequencing errors in the reads under the form of homopolymeric stretches (e.g. AAA, CCCCC) using a specified model (n: homopolymer length).
188 Margulies: N(n, 0.15 * n), Margulies et al. 2005.
189 Richter: N(n, 0.15 * sqrt(n)), Richter et al. 2008.
190 Balzer: N(n, 0.03494 + n * 0.06856), Balzer et al. 2010." />
191
192 <param name="chimera_perc" type="text" value="0" optional="true" label="Percentage of chimeras" help="Specify the percent of reads in amplicon libraries that should be chimeric sequences. A typical value is 10%." />
193
194 <param name="abundance_file" type="data" format="tabular" optional="true" label="Abundance file" help="Specify the relative abundance of the genomes manually in an input file. Each line of the file should contain a sequence name and its relative abundance (%), e.g. 'seqABC 82.1' or 'seqABC 82.1 10.2' if you are specifying 2 different communities." />
195
196 <param name="abundance_model" type="text" value="uniform 1" optional="true" label="Rank abundance model" help="Relative abundance model for the input genomes:
197 uniform, linear, powerlaw, logarithmic or exponential.
198 Examples:
199 1/ uniform distribution: uniform,
200 2/ powerlaw distribution with parameter 0.1: powerlaw 0.1." />
201
202 <param name="num_libraries" type="text" value="1" optional="true" label="Number of libraries" help="Number of independent libraries to create. Specify how diverse and similar they should be using the options diversity, shared percent; and permuted percent. Assign them different MID tags with the multiplex mids option." />
203
204 <param name="multiplex_ids" type="data" format="fasta" optional="true" label="Specify MID tags file" help="Specify an optional FASTA file that contains sequence identifiers (a.k.a MIDs or barcodes) to add to the sequences (one per library)."/>
205
206 <!-- When Galaxy bug #661 is resolved, then we can really have optional parameters of type "integer" or "float" -->
207 <!-- URL: https://bitbucket.org/galaxy/galaxy-central/issue/661/optional-arguments-problems#comment-655611 -->
208 <!-- Affected params: diversity (int), shared_perc (float), permuted_perc (float), random_seed (int), num_libraries (int), chimera_perc (float) -->
209 <param name="diversity" type="text" optional="true" label="Diversity (richness)" help="Richness, or number of genomes to include in the shotgun libraries. Use 0 for the maximum diversity possible, i.e. all the genomes from the input file when a single independent library is requested." />
210
211 <param name="shared_perc" type="text" value="0" optional="true" label="Percent shared" help="For multiple libraries, percent of genomes they should have in common." />
212
213 <param name="permuted_perc" type="text" value="0" optional="true" label="Percent permuted" help="For multiple libraries, percent of the most-abundant genomes to permute in rank-abundance." />
214
215 <param name="random_seed" type="text" optional="true" label="Random seed" help="Seed number to use for the pseudo-random number generator." />
216
217 <param name="desc_track" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Read tracking" help="Track read information (reference sequence, position, errors, ...) by writing it in the FASTA read description." />
218
219 <param name="qual_levels" type="text" optional="true" label="Quality score levels" help="Generate basic quality scores for the simulated reads. Good residues are given a specified good score (e.g. 30) and residues that are the result of an insertion or substitution are given a specified bad score (e.g. 10). Specify first the good score and then the bad score, e.g. '30 10'" />
220
221 <param name="fastq_output" type="boolean" truevalue="1" falsevalue="0" checked="false" label="FASTQ output" help="
222 Write the generated reads in FASTQ format (Sanger variant) instead of FASTA and
223 QUAL. Quality score levels need to be specified for this option to be effective." />
224
225 <param name="profile_file" type="data" format="txt" optional="true" label="Profile file" help="A file that contains Grinder arguments. This is useful if you use many options or often use the same options. Lines with comments (#) are ignored. Consider the profile file, 'simple_profile.txt':
226
227 # A simple Grinder profile
228 -read_dist 105 normal 12
229 -total_reads 1000
230
231 Running: grinder -reference_file viral_genomes.fa -profile_file simple_profile.txt
232
233 Translates into: grinder -reference_file viral_genomes.fa -read_dist 105 normal 12 -total_reads 1000
234
235 Note that the arguments specified in the profile should not be specified again on the command line." />
236
237 </inputs>
238
239 <!--
240 <outputs>
241 <data format="tabular" name="ranks" from_work_dir="grinder-ranks.txt" label="${tool.name} ranks from ${on_string}" />
242 <conditional/>
243 <data format="fasta" name="fasta" from_work_dir="grinder-reads.fa" label="${tool.name} reads from ${on_string}" />
244 <data format="qual" name="qual" from_work_dir="grinder-reads.qual" label="${tool.name} read quals from ${on_string}" >
245 <filter>(str(qual_levels))</filter>
246 </data>
247 </outputs>
248 -->
249
250 <outputs>
251 <data format="text" name="output" /> <!-- the only declared dataset; the command passes its id to Grinder as -base_name -->
252 </outputs>
253
254 <tests>
255 <!-- no tests since they would not always return the same results -->
256 <!--
257 <test>
258 <param name="specify" value="uploaded" />
259 <param name="value" value="ngs_simulation_in1.fasta" ftype="fasta" />
260 <output name="ranks" file="" />
261 <output name="fasta" file="" />
262 <output name="qual" file="" />
263 </test>
264
265 <test>
266 <param name="specify" value="builtin" />
267 <param name="builtin" value="pUC18" />
268 <output name="ranks" file="" />
269 <output name="fasta" file="" />
270 <output name="qual" file="" />
271 </test>
272 -->
273
274 </tests>
275
276 <help>
277
278 **What it does**
279
280 Grinder is a program to create random shotgun and amplicon sequence libraries
281 based on reference sequences in a FASTA file. Features include:
282
283 * shotgun library or amplicon library
284 * arbitrary read length distribution and number of reads
285 * simulation of PCR and sequencing errors (chimeras, point mutations, homopolymers)
286 * support for creating paired-end (mate pair) datasets
287 * specific rank-abundance settings or manually given abundance for each genome
288 * creation of datasets with a given richness (alpha diversity)
289 * independent datasets can share a variable number of genomes (beta diversity)
290 * modeling of the bias created by varying genome lengths or gene copy number
291 * profile mechanism to store preferred options
292 * API to automate the creation of a large number of simulated datasets
293
294
295 **Input**
296
297 A variety of FASTA databases containing genes or genomes can be used as input
298 for Grinder, such as the NCBI RefSeq collection (ftp://ftp.ncbi.nih.gov/refseq/release/microbial/),
299 the GreenGenes 16S rRNA database (http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Isolated_named_strains_16S_aligned.fasta), the human genome and transcriptome (ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/, ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.rna.fna.gz), ...
300
301 These input files can either be provided as a Galaxy dataset, or can be uploaded
302 by Galaxy users in their history.
303
304
305 **Output**
306
307 For each library requested, a first file contains the abundance of the species
308 in the simulated community created, e.g.::
309
310 # rank seqID rel. abundance
311 1 86715_Lachnospiraceae 0.367936925098555
312 2 6439_Neisseria_polysaccharea 0.183968462549277
313 3 103712_Fusobacterium_nucleatum 0.122645641699518
314 4 103024_Frigoribacterium 0.0919842312746386
315 5 129066_Streptococcus_pyogenes 0.0735873850197109
316 6 106485_Pseudomonas_aeruginosa 0.0613228208497591
317 7 13824_Veillonella_criceti 0.0525624178712221
318 8 28044_Lactosphaera 0.0459921156373193
319
320 The second file is a FASTA file containing shotgun or amplicon reads, e.g.::
321
322 >1 reference=13824_Veillonella_criceti position=89-1088 strand=+
323 ACCAACCTGCCCTTCAGAGGGGGATAACAACGGGAAACCGTTGCTAATACCGCGTACGAA
324 TGGACTTCGGCATCGGAGTTCATTGAAAGGTGGCCTCTATTTATAAGCTATCGCTGAAGG
325 AGGGGGTTGCGTCTGATTAGCTAGTTGGAGGGGTAATGGCCCACCAAGGCAA
326
327 >2 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+
328 TGAACGAAGAGTTTGATCCTGGCTCAGGATGAACGCTGACAGAATGCTTAACACATGCAA
329 GTCAACTTGAATTTGGGTTTTTAACTTAGGTTTGGG
330
331 If you specify the quality score levels option, a third file representing the
332 quality scores of the reads is created::
333
334 >1 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+
335 30 30 30 10 30 30 ...
336
337
338 </help>
339
340 </tool>
341