0
|
1 <tool id="sample_seqs" name="Sub-sample sequences files" version="0.0.1">
|
|
2 <description>e.g. to reduce coverage</description>
|
|
3 <requirements>
|
|
4 <requirement type="package" version="1.63">biopython</requirement>
|
|
5 <requirement type="python-module">Bio</requirement>
|
|
6 </requirements>
|
|
7 <version_command interpreter="python">sample_seqs.py --version</version_command>
|
|
8 <command interpreter="python">
|
|
9 #if str($sampling.type) == "everyNth":
|
|
10 sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.every_n}"
|
|
11 #elif str($sampling.type) == "percentage":
|
|
12 sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.percent}"
|
|
13 #else:
|
|
14 ##Should give an error about invalid sampling type:
|
|
15 sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}"
|
|
16 #end if
|
|
17 </command>
|
|
18 <stdio>
|
|
19 <!-- Anything other than zero is an error -->
|
|
20 <exit_code range="1:" />
|
|
21 <exit_code range=":-1" />
|
|
22 </stdio>
|
|
23 <inputs>
|
|
24 <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file" help="FASTA, FASTQ, or SFF format." />
|
|
25 <conditional name="sampling">
|
|
26 <param name="type" type="select" label="Sub-sampling approach">
|
|
27 <option value="everyNth">Take every N-th sequence (e.g. every fifth sequence)</option>
|
|
28 <option value="percentage">Take some percentage of the sequences (e.g. 20% will take every fifth sequence)</option>
|
|
29 <!-- TODO - target coverage etc -->
|
|
30 </param>
|
|
31 <when value="everyNth">
|
|
32 <param name="every_n" value="5" type="integer" min="2" label="N" help="At least 2, e.g. 5 will take every 5th sequence (taking 20% of the sequences)" />
|
|
33 </when>
|
|
34 <when value="percentage">
|
|
35 <param name="percent" value="20.0" type="float" min="0" max="100" label="Percentage" help="Between 0 and 100, e.g. 20% will take every 5th sequence" />
|
|
36 </when>
|
|
37 </conditional>
|
|
38 </inputs>
|
|
39 <outputs>
|
|
40 <data name="output_file" format_source="input_file" metadata_source="input_file" label="${input_file.name} (sub-sampled)"/> <!-- Output datatype and metadata are both copied from input_file; format_source names the source input explicitly instead of the legacy format="input" -->
|
|
41 </outputs>
|
|
42 <tests>
|
|
43 <test>
|
|
44 <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" />
|
|
45 <param name="type" value="everyNth" />
|
|
46 <param name="every_n" value="100" />
|
|
47 <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" />
|
|
48 </test>
|
|
49 <test>
|
|
50 <param name="input_file" value="ecoli.fastq" />
|
|
51 <param name="type" value="everyNth" />
|
|
52 <param name="every_n" value="100" />
|
|
53 <output name="output_file" file="ecoli.sample_N100.fastq" />
|
|
54 </test>
|
|
55 <test>
|
|
56 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
|
|
57 <param name="type" value="everyNth" />
|
|
58 <param name="every_n" value="5" />
|
|
59 <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/>
|
|
60 </test>
|
|
61 <test>
|
|
62 <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" />
|
|
63 <param name="type" value="percentage" />
|
|
64 <param name="percent" value="1.0" />
|
|
65 <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" />
|
|
66 </test>
|
|
67 <test>
|
|
68 <param name="input_file" value="ecoli.fastq" />
|
|
69 <param name="type" value="percentage" />
|
|
70 <param name="percent" value="1.0" />
|
|
71 <output name="output_file" file="ecoli.sample_N100.fastq" />
|
|
72 </test>
|
|
73 <test>
|
|
74 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
|
|
75 <param name="type" value="percentage" />
|
|
76 <param name="percent" value="20.0" />
|
|
77 <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/>
|
|
78 </test>
|
|
79 </tests>
|
|
80 <help>
|
|
81 **What it does**
|
|
82
|
|
83 Takes an input file of sequences (typically FASTA or FASTQ, but also
|
|
84 Standard Flowgram Format (SFF) is supported), and returns a new sequence
|
|
85 file sub-sampling from this (in the same format).
|
|
86
|
|
87 Several sampling modes are supported, all designed to be non-random. This
|
|
88 allows reproducibility, and also works on paired sequence files. Also
|
|
89 note that by sampling uniformly through the file, this avoids any bias
|
|
90 should reads in any part of the file be of lesser quality (e.g. one part
|
|
91 of the slide).
|
|
92
|
|
93 The simplest mode is to take every N-th sequence, for example taking
|
|
94 every 2nd sequence would sample half the file - while taking every 5th
|
|
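95 sequence would take 20% of the file. Alternatively, you can give a percentage of the sequences to take, for example 20% will take every 5th sequence.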
|
|
96
|
|
97
|
|
98 **Example Usage**
|
|
99
|
|
100 Suppose you have some Illumina paired end data as files ``R1.fastq`` and
|
|
101 ``R2.fastq`` which give an estimated x200 coverage, and you wish to do a
|
|
102 *de novo* assembly with a tool like MIRA which recommends lower coverage.
|
|
103 Taking every 3rd read would reduce the estimated coverage to about x66,
|
|
104 and would preserve the pairing as well.
|
|
105
|
|
106
|
|
107 **Citation**
|
|
108
|
|
109 This tool uses Biopython, so if you use this Galaxy tool in work leading to a
|
|
110 scientific publication please cite the following paper:
|
|
111
|
|
112 Cock et al (2009). Biopython: freely available Python tools for computational
|
|
113 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
|
|
114 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
|
|
115
|
|
116 This tool is available to install into other Galaxy Instances via the Galaxy
|
|
117 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/sample_seqs
|
|
118 </help>
|
|
119 </tool>
|