annotate tools/sample_seqs/sample_seqs.xml @ 1:16ecf25d521f draft

Uploaded v0.0.1 with fixed README file
author peterjc
date Thu, 27 Mar 2014 12:13:22 -0400
parents 3a807e5ea6c8
children da64f6a9e32b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
1 <tool id="sample_seqs" name="Sub-sample sequences files" version="0.0.1">
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
2 <description>e.g. to reduce coverage</description>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
3 <requirements>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
4 <requirement type="package" version="1.63">biopython</requirement>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
5 <requirement type="python-module">Bio</requirement>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
6 </requirements>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
7 <version_command interpreter="python">sample_seqs.py --version</version_command>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
8 <command interpreter="python">
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
9 #if str($sampling.type) == "everyNth":
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
10 sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.every_n}"
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
11 #elif str($sampling.type) == "percentage":
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
12 sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.percent}"
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
13 #else:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
14 ##Should give an error about invalid sampling type:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
15 sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}"
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
16 #end if
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
17 </command>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
18 <stdio>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
19 <!-- Anything other than zero is an error -->
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
20 <exit_code range="1:" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
21 <exit_code range=":-1" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
22 </stdio>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
23 <inputs>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
24 <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file" help="FASTA, FASTQ, or SFF format." />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
25 <conditional name="sampling">
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
26 <param name="type" type="select" label="Sub-sampling approach">
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
27 <option value="everyNth">Take every N-th sequence (e.g. every fifth sequence)</option>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
28 <option value="percentage">Take some percentage of the sequences (e.g. 20% will take every fifth sequence)</option>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
29 <!-- TODO - target coverage etc -->
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
30 </param>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
31 <when value="everyNth">
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
32 <param name="every_n" value="5" type="integer" min="2" label="N" help="At least 2, e.g. 5 will take every 5th sequence (taking 20% of the sequences)" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
33 </when>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
34 <when value="percentage">
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
35 <param name="percent" value="20.0" type="float" min="0" max="100" label="Percentage" help="Between 0 and 100, e.g. 20% will take every 5th sequence" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
36 </when>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
37 </conditional>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
38 </inputs>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
39 <outputs>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
40 <data name="output_file" format="input" metadata_source="input_file" label="${input_file.name} (sub-sampled)"/>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
41 </outputs>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
42 <tests>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
43 <test>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
44 <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
45 <param name="type" value="everyNth" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
46 <param name="every_n" value="100" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
47 <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
48 </test>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
49 <test>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
50 <param name="input_file" value="ecoli.fastq" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
51 <param name="type" value="everyNth" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
52 <param name="every_n" value="100" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
53 <output name="output_file" file="ecoli.sample_N100.fastq" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
54 </test>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
55 <test>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
56 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
57 <param name="type" value="everyNth" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
58 <param name="every_n" value="5" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
59 <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
60 </test>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
61 <test>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
62 <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
63 <param name="type" value="percentage" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
64 <param name="percent" value="1.0" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
65 <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
66 </test>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
67 <test>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
68 <param name="input_file" value="ecoli.fastq" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
69 <param name="type" value="percentage" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
70 <param name="percent" value="1.0" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
71 <output name="output_file" file="ecoli.sample_N100.fastq" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
72 </test>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
73 <test>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
74 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
75 <param name="type" value="percentage" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
76 <param name="percent" value="20.0" />
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
77 <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
78 </test>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
79 </tests>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
80 <help>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
81 **What it does**
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
82
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
83 Takes an input file of sequences (typically FASTA or FASTQ, but
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
84 Standard Flowgram Format (SFF) is also supported), and returns a new sequence
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
85 file sub-sampled from it (in the same format).
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
86
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
87 Several sampling modes are supported, all designed to be non-random. This
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
88 allows reproducibility, and also works on paired sequence files. Also
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
89 note that by sampling uniformly through the file, this avoids any bias
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
90 should reads in any part of the file be of lesser quality (e.g. one part
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
91 of the slide).
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
92
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
93 The simplest mode is to take every N-th sequence, for example taking
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
94 every 2nd sequence would sample half the file - while taking every 5th
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
95 sequence would take 20% of the file.
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
96
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
97
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
98 **Example Usage**
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
99
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
100 Suppose you have some Illumina paired end data as files ``R1.fastq`` and
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
101 ``R2.fastq`` which give an estimated x200 coverage, and you wish to do a
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
102 *de novo* assembly with a tool like MIRA which recommends lower coverage.
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
103 Taking every 3rd read would reduce the estimated coverage to about x66,
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
104 and would preserve the pairing as well.
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
105
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
106
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
107 **Citation**
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
108
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
109 This tool uses Biopython, so if you use this Galaxy tool in work leading to a
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
110 scientific publication please cite the following paper:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
111
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
112 Cock et al. (2009). Biopython: freely available Python tools for computational
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
113 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
114 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
115
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
116 This tool is available to install into other Galaxy Instances via the Galaxy
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
117 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/sample_seqs
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
118 </help>
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
119 </tool>