Mercurial > repos > peterjc > sample_seqs
diff tools/sample_seqs/sample_seqs.xml @ 2:da64f6a9e32b draft
Uploaded v0.2.0, adds desired count mode
author | peterjc |
---|---|
date | Fri, 06 Mar 2015 11:48:09 -0500 |
parents | 3a807e5ea6c8 |
children | 02c13ef1a669 |
line wrap: on
line diff
--- a/tools/sample_seqs/sample_seqs.xml Thu Mar 27 12:13:22 2014 -0400 +++ b/tools/sample_seqs/sample_seqs.xml Fri Mar 06 11:48:09 2015 -0500 @@ -1,18 +1,21 @@ -<tool id="sample_seqs" name="Sub-sample sequences files" version="0.0.1"> +<tool id="sample_seqs" name="Sub-sample sequences files" version="0.2.0"> <description>e.g. to reduce coverage</description> <requirements> - <requirement type="package" version="1.63">biopython</requirement> + <requirement type="package" version="1.65">biopython</requirement> <requirement type="python-module">Bio</requirement> </requirements> <version_command interpreter="python">sample_seqs.py --version</version_command> <command interpreter="python"> +sample_seqs.py -f "$input_file.ext" -i "$input_file" -o "$output_file" #if str($sampling.type) == "everyNth": -sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.every_n}" +-n "${sampling.every_n}" #elif str($sampling.type) == "percentage": -sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.percent}" -#else: -##Should give an error about invalid sampling type: -sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" +-p "${sampling.percent}" +#else +-c "${sampling.count}" +#end if +#if $interleaved +--interleaved #end if </command> <stdio> @@ -24,8 +27,9 @@ <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file" help="FASTA, FASTQ, or SFF format." /> <conditional name="sampling"> <param name="type" type="select" label="Sub-sampling approach"> - <option value="everyNth">Take every N-th sequence (e.g. every fifth sequence)</option> - <option value="percentage">Take some percentage of the sequences (e.g. 20% will take every fifth sequence)</option> + <option value="everyNth">Take every N-th sequence (or pair, e.g. every fifth sequence)</option> + <option value="percentage">Take some percentage of the sequences (or pairs, e.g. 
20% will take every fifth sequence)</option> + <option value="desired_count">Take exactly N sequences (or pairs, e.g. 1000 sequences)</option> <!-- TODO - target coverage etc --> </param> <when value="everyNth"> @@ -34,7 +38,11 @@ <when value="percentage"> <param name="percent" value="20.0" type="float" min="0" max="100" label="Percentage" help="Between 0 and 100, e.g. 20% will take every 5th sequence" /> </when> + <when value="desired_count"> + <param name="count" value="1000" type="integer" min="1" label="N" help="Number of unique sequences to pick (between 1 and the total number in the input file)" /> + </when> </conditional> + <param name="interleaved" type="boolean" label="Interleaved paired reads" help="This mode keeps paired reads together (e.g. take every 5th read pair)" /> </inputs> <outputs> <data name="output_file" format="input" metadata_source="input_file" label="${input_file.name} (sub-sampled)"/> @@ -53,6 +61,13 @@ <output name="output_file" file="ecoli.sample_N100.fastq" /> </test> <test> + <param name="input_file" value="ecoli.fastq" /> + <param name="type" value="everyNth" /> + <param name="every_n" value="100" /> + <param name="interleaved" value="true" /> + <output name="output_file" file="ecoli.pair_sample_N100.fastq" /> + </test> + <test> <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> <param name="type" value="everyNth" /> <param name="every_n" value="5" /> @@ -65,35 +80,108 @@ <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" /> </test> <test> + <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> + <param name="type" value="everyNth" /> + <param name="every_n" value="100" /> + <param name="interleaved" value="true" /> + <output name="output_file" file="get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta" /> + </test> + <test> + <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> + <param name="type" value="desired_count" /> + <param name="count" 
value="2910" /> + <output name="output_file" file="get_orf_input.Suis_ORF.prot.fasta" /> + </test> + <test> + <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> + <param name="type" value="desired_count" /> + <param name="count" value="10" /> + <param name="interleaved" value="true" /> + <output name="output_file" file="get_orf_input.Suis_ORF.prot.pair_sample_C10.fasta" /> + </test> + <test> <param name="input_file" value="ecoli.fastq" /> <param name="type" value="percentage" /> <param name="percent" value="1.0" /> <output name="output_file" file="ecoli.sample_N100.fastq" /> </test> <test> + <param name="input_file" value="ecoli.fastq" /> + <param name="type" value="desired_count" /> + <param name="count" value="10" /> + <output name="output_file" file="ecoli.sample_C10.fastq" /> + </test> + <test> + <param name="input_file" value="ecoli.sample_C10.fastq" /> + <param name="type" value="desired_count" /> + <param name="count" value="10" /> + <output name="output_file" file="ecoli.sample_C10.fastq" /> + </test> + <test> <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> <param name="type" value="percentage" /> <param name="percent" value="20.0" /> <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/> </test> + <test> + <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> + <param name="type" value="everyNth" /> + <param name="percent" value="5" /> + <param name="interleaved" value="true" /> + <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff" ftype="sff"/> + </test> + <test> + <param name="input_file" value="MID4_GLZRM4E04_rnd30.sff" ftype="sff" /> + <param name="type" value="desired_count" /> + <param name="count" value="30" /> + <output name="output_file" file="MID4_GLZRM4E04_rnd30.sff" ftype="sff"/> + </test> + <test> + <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> + <param name="type" 
value="desired_count" /> + <param name="count" value="1" /> + <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_C1.sff" ftype="sff"/> + </test> </tests> <help> **What it does** Takes an input file of sequences (typically FASTA or FASTQ, but also Standard Flowgram Format (SFF) is supported), and returns a new sequence -file sub-sampling from this (in the same format). +file sub-sampling uniformly from this (in the same format, preserving the +input order and selecting sequences evenly through the input file). -Several sampling modes are supported, all designed to be non-random. This -allows reproducibility, and also works on paired sequence files. Also -note that by sampling uniformly through the file, this avoids any bias -should reads in any part of the file are of lesser quality (e.g. one part -of the slide). +Several sampling modes are supported, all designed to do non-random +uniform sampling (i.e. evenly through the input file). This allows +reproducibility, and also works on paired sequence files (run the tool +twice, once on each file using the same settings). -The simplest mode is to take every N-th sequence, for example taking +By sampling uniformly (evenly) through the file, this avoids any bias +should reads in any part of the file be of lesser quality (e.g. for +high throughput sequencing the reads at the start and end of the file +can be of lower quality). + +The simplest mode is to take every *N*-th sequence, for example taking every 2nd sequence would sample half the file - while taking every 5th sequence would take 20% of the file. +The target count method picks *N* sequences from the input file, which +again will be distributed uniformly (evenly) through the file. This works +by first counting the number of records, then calculating the desired +percentage of sequences to take. Note if your input file has exactly +*N* sequences this selects them all (effectively copying the input file). 
+If your input file has fewer than *N* sequences, this is treated as an +error. + +If you tick the interleaved option, the file is processed as pairs of +records to ensure your read pairs are not separated by sampling. +For example using 20% would take every 5th pair of records, or you +could request 1000 read pairs. + +.. class:: warningmark + +Note interleaved/pair mode does *not* actually check your read names +match a known pair naming scheme! **Example Usage** @@ -103,6 +191,14 @@ Taking every 3rd read would reduce the estimated coverage to about x66, and would preserve the pairing as well. +Similarly, if you had some Illumina paired end data interleaved into one +file with an estimated x200 coverage, you would run this tool in +interleaved mode, taking every 3rd read pair. This would again reduce +the estimated coverage to about x66, while preserving the read pairing. + +Suppose you have a transcriptome assembly, and wish to look at the +species distribution of the top BLAST hits for an initial quality check. +Rather than using all your sequences, you could pick 1000 only for this. **Citation** @@ -116,4 +212,7 @@ This tool is available to install into other Galaxy Instances via the Galaxy Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/sample_seqs </help> + <citations> + <citation type="doi">10.1093/bioinformatics/btp163</citation> + </citations> </tool>