Mercurial > repos > mvdbeek > dedup_hash
diff dedup_hash.xml @ 0:f33e9e6a6c88 draft default tip
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
author | mvdbeek |
---|---|
date | Wed, 23 Nov 2016 07:49:05 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dedup_hash.xml Wed Nov 23 07:49:05 2016 -0500 @@ -0,0 +1,114 @@ +<tool id="dedup_hash" name="Deduplicate FASTQ files" version="0.1.1"> + <description>with fast and memory-efficient sequence hashes</description> + <requirements> + <requirement type="package" version="0.150.1">smhasher</requirement> + </requirements> + <command><![CDATA[ + python '$__tool_directory__/dedup_hash/dedup_hash.py' + #if str($readtype.single_or_paired) == "se": + --r1_in '${readtype.input_single}' + --r1_out '$output_single' + #elif str($readtype.single_or_paired) == "pe_sep": + --r1_in '${readtype.input_paired1}' + --r2_in '${readtype.input_paired2}' + --r1_out '$output_paired1' + --r2_out '$output_paired2' + #else + --r1_in '${readtype.input_paired.forward}' + --r2_in '${readtype.input_paired.reverse}' + --r1_out '${output_paired_coll.forward}' + --r2_out '${output_paired_coll.reverse}' + #end if + $compress_fastq + ]]></command> + <inputs> + <conditional name="readtype"> + <param name="single_or_paired" type="select" label="Single-end or paired-end reads?"> + <option value="se" selected="true">Single-end</option> + <option value="pe_sep">Paired-end (two separate input files)</option> + <option value="pe_collection">Paired-end (as collection)</option> + </param> + <when value="se"> + <param format="fastq,fastq.gz" name="input_single" type="data" label="Single-end FASTQ reads" help="(-f)" /> + </when> + <when value="pe_sep"> + <param format="fastq,fastq.gz" name="input_paired1" type="data" label="Paired-end forward strand FASTQ reads" help="(-f)" /> + <param format="fastq,fastq.gz" name="input_paired2" type="data" label="Paired-end reverse strand FASTQ reads" help="(-r)" /> + </when> + <when value="pe_collection"> + <param name="input_paired" format="fastq,fastq.gz" type="data_collection" collection_type="paired" label="Paired-end FASTQ reads as paired collection" /> + </when> + </conditional> + <param name="compress_fastq" type="boolean" checked="true" truevalue="--write_gzip" falsevalue="" label="Produce compressed fastq?"/> + </inputs> + <outputs> + <data name="output_single" format="fastq" label="Single-end output of ${tool.name} on ${on_string}"> + <filter>readtype['single_or_paired'] == 'se'</filter> + <change_format> + <when input="compress_fastq" value="--write_gzip" format="fastq.gz" /> + </change_format> + </data> + <data name="output_paired1" format="fastq" label="Paired-end forward strand output of ${tool.name} on ${on_string}"> + <filter>readtype['single_or_paired'] == 'pe_sep'</filter> + <change_format> + <when input="compress_fastq" value="--write_gzip" format="fastq.gz" /> + </change_format> + </data> + <data name="output_paired2" format="fastq" label="Paired-end reverse strand output of ${tool.name} on ${on_string}"> + <filter>readtype['single_or_paired'] == 'pe_sep'</filter> + <change_format> + <when input="compress_fastq" value="--write_gzip" format="fastq.gz" /> + </change_format> + </data> + <collection name="output_paired_coll" type="paired" structured_like="readtype.pe_collection" label="Paired-end output of ${tool.name} on ${on_string}"> + <filter>readtype['single_or_paired'] == 'pe_collection'</filter> + <data name="forward" format="fastq"> + <change_format> + <when input="compress_fastq" value="--write_gzip" format="fastq.gz" /> + </change_format> + </data> + <data name="reverse" format="fastq"> + <change_format> + <when input="compress_fastq" value="--write_gzip" format="fastq.gz" /> + </change_format> + </data> + </collection> + </outputs> + <tests> + <test> + <param name="single_or_paired" value="pe_sep"/> + <param name="input_paired1" value="r1.fastq.gz" ftype="fastq.gz"/> + <param name="input_paired2" value="r2.fastq.gz" ftype="fastq.gz"/> + <param name="compress_fastq" value="--write_gzip"/> + <output name="output_paired1" file="r1_dedup.fastq.gz" ftype="fastq.gz" compare="sim_size"/> + <output name="output_paired2" file="r2_dedup.fastq.gz" ftype="fastq.gz" compare="sim_size"/> + </test> + <test> + <param name="single_or_paired" value="pe_sep"/> + <param name="input_paired1" value="r1.fastq" ftype="fastq"/> + <param name="input_paired2" value="r2.fastq" ftype="fastq"/> + <param name="compress_fastq" value=""/> + <output name="output_paired1" file="r1_dedup.fastq" ftype="fastq"/> + <output name="output_paired2" file="r2_dedup.fastq" ftype="fastq"/> + </test> + </tests> + <help> <![CDATA[ +**Deduplicate paired fastq** is a fast and memory-efficient tool for removal of duplicates in paired short DNA sequence reads in fastq format. +It identifies duplicates by concatenating the sequence of a readpair and calculating a short hash that uniquely identifies the concatenated sequence. +Sequences that are not unique (i.e a hash of the concatenated sequence has been seen previously) are being discarded. + +Compared to fastuniq this tool requires only a fraction of the memory, but does not identify pairs that are identical, +except for a switch of R1 and R2. Such reads may nevertheless align to different places based on the seed-searching of the aligner, +so this may or may not be a problem for your application. + +Fastuniq consumed 76 GB of memory and took 4:01.52 on a typical dataset of 100nt 25 x 10^6 paired end reads, +while this tool took 4.7GB of memory and 3:23.27 for the same dataset. + +Both tools produced the exact same result, arguing that, at least before quality and/or adapter trimming, +the previously mentioned limitations are of theoretical concern. + + ]]> </help> + <citations> + <citation type="doi">doi:10.1371/journal.pone.0052249</citation> + </citations> +</tool>