comparison tools/ngs_simulation/grinder.xml @ 1:7d26d64539b2

Uploaded
author fangly
date Mon, 19 Sep 2011 01:07:28 -0400
parents
children 27a15723d4f0
comparison
equal deleted inserted replaced
0:b35ec780aac1 1:7d26d64539b2
1 <tool id="grinder" name="Grinder" version="0.3.7" force_history_refresh="True">
2
3 <!--
4 Author: florent.angly@gmail.com
5 TODO:
6 • See bfast tool (tools/sr_mapping/bfast_wrapper.xml) for how to use datatables easily
7 • Basic tests
8 • Link to full manual
9 • Better sync with Grinder parameters, defaults and help
10 -->
11
12 <description>genomic, metagenomic and amplicon read simulator (BETA)</description>
13
14 <requirements>
15 <requirement type="binary">grinder</requirement>
16 </requirements>
17
18 <version_string>grinder --version</version_string>
19
20 <command>
21 #set $tool_dir = os.path.join( os.path.abspath($__root_dir__), 'tools', 'ngs_simulation' )
22 #set $script1 = os.path.join( $tool_dir, 'stderr_wrapper.py' )
23 #set $script2 = os.path.join( $tool_dir, 'grinder_multiple_outputs.py' )
24
25 $script1
26 grinder
27 #if $reference_file.specify == "builtin":
28 -reference_file ${ filter( lambda x: str( x[0] ) == str( $reference_file.value ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][-1] }
29 #else if $reference_file.specify == "uploaded":
30 -reference_file $reference_file.value
31 #end if
32 #if str($coverage_fold):
33 -coverage_fold $coverage_fold
34 #end if
35 #if str($total_reads):
36 -total_reads $total_reads
37 #end if
38 #if str($read_dist):
39 -read_dist $read_dist
40 #end if
41 #if str($insert_dist):
42 -insert_dist $insert_dist
43 #end if
44 #if str($exclude_chars):
45 -exclude_chars $exclude_chars
46 #end if
47 #if str($delete_chars):
48 -delete_chars $delete_chars
49 #end if
50 #if str($forward_reverse) != "None":
51 -forward_reverse $forward_reverse
52 #end if
53 #if str($unidirectional):
54 -unidirectional $unidirectional
55 #end if
56 #if str($length_bias):
57 -length_bias $length_bias
58 #end if
59 #if str($copy_bias):
60 -copy_bias $copy_bias
61 #end if
62 #if str($mutation_dist):
63 -mutation_dist $mutation_dist
64 #end if
65 #if str($mutation_ratio):
66 -mutation_ratio $mutation_ratio
67 #end if
68 #if str($homopolymer_dist):
69 -homopolymer_dist $homopolymer_dist
70 #end if
71 #if str($chimera_perc):
72 -chimera_perc $chimera_perc
73 #end if
74 #if str($abundance_file) != "None":
75 -abundance_file $abundance_file
76 #end if
77 #if str($abundance_model):
78 -abundance_model $abundance_model
79 #end if
80 #if str($num_libraries):
81 -num_libraries $num_libraries
82 #end if
83 #if str($multiplex_ids) != "None":
84 -multiplex_ids $multiplex_ids
85 #end if
86 #if str($diversity):
87 -diversity $diversity
88 #end if
89 #if str($shared_perc):
90 -shared_perc $shared_perc
91 #end if
92 #if str($permuted_perc):
93 -permuted_perc $permuted_perc
94 #end if
95 #if str($random_seed):
96 -random_seed $random_seed
97 #end if
98 #if str($desc_track):
99 -desc_track $desc_track
100 #end if
101 #if str($qual_levels):
102 -qual_levels $qual_levels
103 #end if
104 #if str($profile_file) != "None":
105 -profile_file $profile_file.value
106 #end if
107 <!-- When Galaxy bug #661 is resolved, then we can use the same method to check for all optional argument -->
108 <!-- i.e. either if str($param) != "None": or if str($param): -->
109 <!-- URL: https://bitbucket.org/galaxy/galaxy-central/issue/661/optional-arguments-problems#comment-655611 -->
110
111 #set $output_dir = $__new_file_path__
112 -output_dir $output_dir
113
114 #set $base_name = $output.id
115 -base_name $base_name
116 ;
117
118 $script2 $output_dir $base_name
119
120 </command>
121
122 <inputs>
123
124 <conditional name="reference_file">
125 <param name="specify" type="select" label="Specify">
126 <option value="builtin">Built-in file</option>
127 <option value="uploaded">Uploaded file</option>
128 </param>
129 <when value="builtin">
130 <param name="value" type="select" label="Reference sequences" help="Galaxy built-in FASTA file">
131 <options from_data_table="all_fasta" />
132 </param>
133 </when>
134 <when value="uploaded">
135 <param name="value" type="data" format="fasta" label="Reference sequences" help="FASTA file that contains the input reference sequences" />
136 </when>
137 </conditional>
138
139 <param name="total_reads" type="text" value="100" optional="true" label="Number of reads" help="Number of shotgun or amplicon reads to generate for each library. Do not specify this if you specify the fold coverage." />
140
141 <param name="coverage_fold" type="text" optional="true" label="Coverage fold" help="Generate the number of reads needed to achieve the specified fold coverage of the input reference sequences for each library (the output FASTA length divided by the input FASTA length). Do not specify this if you specify the number of reads directly" />
142
143 <param name="read_dist" type="text" value="100" optional="true" label="Sequence length distribution" help="Desired sequence length distribution specified as:
144 average length, distribution ('uniform' or 'normal') and standard deviation
145 Only the first element is required.
146 Examples:
147 1/ All sequences exactly 250 bp long: 250
148 2/ Uniform distribution around 100+-10 bp: 100 uniform 10
149 3/ Read normally distributed with an average of 800 and a standard deviation
150 of 100 bp: 800 normal 100" />
151
152 <param name="insert_dist" type="text" value="0" optional="true" label="Insert size distribution" help="Create shotgun paired end reads (mate pairs) spanning the given insert length (the reads are interior to the insert):
153 0 : off,
154 or: insert size distribution in bp, in the same format as the read length
155 distribution (a typical value is 2,500 bp)
156 Two distinct reads are generated whether or not the mate pair overlaps.
157 Default: 0" />
158
159 <param name="exclude_chars" type="text" optional="true" label="Characters to exclude" help="Do not create reads containing any of the specified characters (case insensitive), e.g. 'N-' to prevent reads with gaps (-) or ambiguities (N)." />
160
161 <param name="delete_chars" type="text" optional="true" label="Characters to delete" help="Remove the specified characters from the reference sequences (case insensitive), e.g. 'N-' to remove gaps (-) and ambiguities (N)." />
162
163 <param name="forward_reverse" type="data" format="fasta" optional="true" label="Amplicon primers" help="Use amplicon sequencing using the given forward and reverse PCR primer sequences (in a FASTA file, in this order). The second sequence in the FASTA file (the reverse primer) is optional. The sequences should use the IUPAC convention for degenerate residues. Example: AAACTYAAAKGAATTGRCGG and ACGGGCGGTGTGTRC for the 926F and 1392R primers respectively (primers that target the v6 to v9 region of the 16S rRNA gene). Genome sequences that do not match the specified primers are excluded. It is recommended to use the unidirectional and no genome length bias options to generate amplicon reads." />
164
165 <param name="unidirectional" type="select" display="radio" value="0" label="Sequencing direction" help="Produce reads just from one strand, by opposition to the reference strand and its reverse complement.">
166 <option value="0">both strands</option>
167 <option value="1">forward strand only</option>
168 <option value="-1">reverse strand only</option>
169 </param>
170
171 <param name="length_bias" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Length bias" help="In shotgun libraries, sample species proportionally to their genome length: at the same relative abundance, larger genomes contribute more reads than smaller genomes." />
172
173 <param name="copy_bias" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Copy number bias" help="In amplicon libraries, sample species proportionally to the number of copies of the target gene: at equal relative abundance, genomes that have multiple copies of the target gene contribute more amplicon reads than genomes that have a single copy. Note: you should use full genomes in the reference file to make use of this option." />
174
175 <param name="mutation_dist" type="text" value="0" optional="true" label="Mutation distribution" help="Introduce sequencing errors in the reads, under the form of mutations (substitutions, insertions and deletions) using a specified frequency distribution:
176 average probability (%),
177 model (uniform, linear),
178 value at 3&apos; end (not applicable for uniform model).
179 For example, for Sanger-type errors, use:
180 1.5 linear 2." />
181
182 <param name="mutation_ratio" type="text" value="80 20" optional="true" label="Mutation ratio" help="Indicate the percentage of substitutions and indels (insertions and deletions). For example, use 80 20 (4 substitutions for each indel) for Sanger reads. Note that this parameter has no effect unless you specify the mutation distribution option." />
183
184 <param name="homopolymer_dist" type="text" value="0" optional="true" label="Homopolymer distribution" help="Introduce sequencing errors in the reads under the form of homopolymeric stretches (e.g. AAA, CCCCC) using a specified model (n: homopolymer length).
185 Margulies: N(n, 0.15 * n), Margulies et al. 2005.
186 Richter: N(n, 0.15 * sqrt(n)), Richter et al. 2008.
187 Balzer: N(n, 0.03494 + n * 0.06856), Balzer et al. 2010." />
188
189 <param name="chimera_perc" type="text" value="0" optional="true" label="Percentage of chimeras" help="Specify the percent of reads in amplicon libraries that should be chimeric sequences. A typical value is 10%." />
190
191 <param name="abundance_file" type="data" format="tabular" optional="true" label="Abundance file" help="Specify the relative abundance of the genomes manually in an input file. Each line of the file should contain a sequence name and its relative abundance (%), e.g. 'seqABC 82.1' or 'seqABC 82.1 10.2' if you are specifying 2 different communities." />
192
193 <param name="abundance_model" type="text" value="uniform 1" optional="true" label="Rank abundance model" help="Relative abundance model for the input genomes:
194 uniform, linear, powerlaw, logarithmic or exponential.
195 Examples:
196 1/ uniform distribution: uniform,
197 2/ powerlaw distribution with parameter 0.1: powerlaw 0.1." />
198
199 <param name="num_libraries" type="text" value="1" optional="true" label="Number of libraries" help="Number of independent libraries to create. Specify how diverse and similar they should be using the diversity, percent shared and percent permuted options. Assign them different MID tags with the MID tags file option." />
200
201 <param name="multiplex_ids" type="data" format="fasta" optional="true" label="Specify MID tags file" help="Specify an optional FASTA file that contains sequence identifiers (a.k.a MIDs or barcodes) to add to the sequences (one per library)."/>
202
203 <!-- When Galaxy bug #661 is resolved, then we can really have optional parameters of type "integer" or "float" -->
204 <!-- URL: https://bitbucket.org/galaxy/galaxy-central/issue/661/optional-arguments-problems#comment-655611 -->
205 <!-- Affected params: diversity (int), shared_perc (float), permuted_perc (float), random_seed (int), num_libraries (int), chimera_perc (float) -->
206 <param name="diversity" type="text" optional="true" label="Diversity (richness)" help="Richness, or number of genomes to include in the shotgun libraries. Use 0 for the maximum diversity possible, i.e. all the genomes from the input file when a single independent library is requested." />
207
208 <param name="shared_perc" type="text" value="0" optional="true" label="Percent shared" help="For multiple libraries, percent of genomes they should have in common." />
209
210 <param name="permuted_perc" type="text" value="0" optional="true" label="Percent permuted" help="For multiple libraries, percent of the most-abundant genomes to permute in rank-abundance." />
211
212 <param name="random_seed" type="text" optional="true" label="Random seed" help="Seed number to use for the pseudo-random number generator." />
213
214 <param name="desc_track" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Read tracking" help="Track read information (reference sequence, position, errors, ...) by writing it in the FASTA read description." />
215
216 <param name="qual_levels" type="text" optional="true" label="Quality score levels" help="Generate basic quality scores for the simulated reads. Good residues are given a specified good score (e.g. 30) and residues that are the result of an insertion or substitution are given a specified bad score (e.g. 10). Specify first the good score and then the bad score, e.g. '30 10'" />
217
218 <param name="profile_file" type="data" format="txt" optional="true" label="Profile file" help="A file that contains Grinder arguments. This is useful if you use many options or often use the same options. Lines with comments (#) are ignored. Consider the profile file, 'simple_profile.txt':
219
220 # A simple Grinder profile
221 -read_dist 105 normal 12
222 -total_reads 1000
223
224 Running: grinder -reference_file viral_genomes.fa -profile_file simple_profile.txt
225
226 Translates into: grinder -reference_file viral_genomes.fa -read_dist 105 normal 12 -total_reads 1000
227
228 Note that the arguments specified in the profile should not be specified again on the command line." />
229
230 </inputs>
231
232 <!--
233 <outputs>
234 <data format="tabular" name="ranks" from_work_dir="grinder-ranks.txt" label="${tool.name} ranks from ${on_string}" />
235 <conditional/>
236 <data format="fasta" name="fasta" from_work_dir="grinder-reads.fa" label="${tool.name} reads from ${on_string}" />
237 <data format="qual" name="qual" from_work_dir="grinder-reads.qual" label="${tool.name} read quals from ${on_string}" >
238 <filter>(str(qual_levels))</filter>
239 </data>
240 </outputs>
241 -->
242
243 <outputs>
244 <data format="text" name="output" />
245 </outputs>
246
247 <tests>
248 <!-- no tests since they would not always return the same results -->
249 <!--
250 <test>
251 <param name="specify" value="uploaded" />
252 <param name="value" value="ngs_simulation_in1.fasta" ftype="fasta" />
253 <output name="ranks" file="" />
254 <output name="fasta" file="" />
255 <output name="qual" file="" />
256 </test>
257
258 <test>
259 <param name="specify" value="builtin" />
260 <param name="builtin" value="pUC18" />
261 <output name="ranks" file="" />
262 <output name="fasta" file="" />
263 <output name="qual" file="" />
264 </test>
265 -->
266
267 </tests>
268
269 <help>
270
271 **What it does**
272
273 Grinder is a program to create random shotgun and amplicon sequence libraries
274 based on reference sequences in a FASTA file. Features include:
275
276 * shotgun library or amplicon library
277 * arbitrary read length distribution and number of reads
278 * simulation of PCR and sequencing errors (chimeras, point mutations, homopolymers)
279 * support for creating paired-end (mate pair) datasets
280 * specific rank-abundance settings or manually given abundance for each genome
281 * creation of datasets with a given richness (alpha diversity)
282 * independent datasets can share a variable number of genomes (beta diversity)
283 * modeling of the bias created by varying genome lengths or gene copy number
284 * profile mechanism to store preferred options
285 * API to automate the creation of a large number of simulated datasets
286
287
288 **Input**
289
290 A variety of FASTA databases containing genes or genomes can be used as input
291 for Grinder, such as the NCBI RefSeq collection (ftp://ftp.ncbi.nih.gov/refseq/release/microbial/),
292 the GreenGenes 16S rRNA database (http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Isolated_named_strains_16S_aligned.fasta), the human genome and transcriptome (ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/, ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.rna.fna.gz), ...
293
294 These input files can either be provided as a Galaxy dataset, or can be uploaded
295 by Galaxy users in their history.
296
297
298 **Output**
299
300 For each library requested, a first file contains the abundance of the species
301 in the simulated community created, e.g.::
302
303 # rank seqID rel. abundance
304 1 86715_Lachnospiraceae 0.367936925098555
305 2 6439_Neisseria_polysaccharea 0.183968462549277
306 3 103712_Fusobacterium_nucleatum 0.122645641699518
307 4 103024_Frigoribacterium 0.0919842312746386
308 5 129066_Streptococcus_pyogenes 0.0735873850197109
309 6 106485_Pseudomonas_aeruginosa 0.0613228208497591
310 7 13824_Veillonella_criceti 0.0525624178712221
311 8 28044_Lactosphaera 0.0459921156373193
312
313 The second file is a FASTA file containing shotgun or amplicon reads, e.g.::
314
315 >1 reference=13824_Veillonella_criceti position=89-1088 strand=+
316 ACCAACCTGCCCTTCAGAGGGGGATAACAACGGGAAACCGTTGCTAATACCGCGTACGAA
317 TGGACTTCGGCATCGGAGTTCATTGAAAGGTGGCCTCTATTTATAAGCTATCGCTGAAGG
318 AGGGGGTTGCGTCTGATTAGCTAGTTGGAGGGGTAATGGCCCACCAAGGCAA
319
320 >2 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+
321 TGAACGAAGAGTTTGATCCTGGCTCAGGATGAACGCTGACAGAATGCTTAACACATGCAA
322 GTCAACTTGAATTTGGGTTTTTAACTTAGGTTTGGG
323
324 If you specify the quality score levels option, a third file representing the
325 quality scores of the reads is created::
326
327 >1 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+
328 30 30 30 10 30 30 ...
329
330
331 </help>
332
333 </tool>
334