Mercurial > repos > iuc > samblaster
changeset 0:aa72470e14f7 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/samblaster commit 82097013a9eb5a6161d400e5b6c493113c440687
author | iuc |
---|---|
date | Mon, 19 Dec 2016 15:18:40 -0500 |
parents | |
children | cf4ad98037d9 |
files | samblaster.xml test-data/output.bam test-data/splitters.bam test-data/sr.input.bam test-data/sr.input.sam.gz |
diffstat | 5 files changed, 211 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/samblaster.xml Mon Dec 19 15:18:40 2016 -0500 @@ -0,0 +1,211 @@ +<tool id="samblaster" name="samblaster" version="0.1.24"> + <description>marks duplicates, outputs split reads, discordant read pairs and unmapped reads</description> + <requirements> + <requirement type="package" version="0.1.24">samblaster</requirement> + <requirement type="package" version="0.6.5">sambamba</requirement> + </requirements> + <version_command>samblaster --version</version_command> + <command detect_errors="exit_code"><![CDATA[ + #if $input.is_of_type('sam'): + #set stream="<(sambamba view -S -f bam -t ${GALAXY_SLOTS:-4} -h '%s')" % $input + #else: + #set stream="'%s'" % $input + #end if + sambamba view -t \${GALAXY_SLOTS:-4} -h <(sambamba sort -t \${GALAXY_SLOTS:-4} -n $stream -o /dev/stdout) | + samblaster + $output + $discordantFile + $splitterFile + $unmappedFile + $acceptDupMarks + $excludeDups + $removeDups + $addMateTags + $compatibility_mode + --maxSplitCount '$maxSplitCount' + --maxUnmappedBases '$maxUnmappedBases' + --minIndelSize '$minIndelSize' + --minNonOverlap '$minNonOverlap' + --minClipSize '$minClipSize' + #if $output != "-o /dev/null": + && sambamba sort -o output.bam -l 6 -t \${GALAXY_SLOTS:-4} <(sambamba view -S -f bam output.sam) + #end if + #if $discordantFile: + && sambamba sort -o discordant.bam -l 6 -t \${GALAXY_SLOTS:-4} <(sambamba view -S -f bam discordant.sam) + #end if + #if $splitterFile: + && sambamba sort -o splitter.bam -l 6 -t \${GALAXY_SLOTS:-4} <(sambamba view -S -f bam splitter.sam) + #end if + ]]></command> + <inputs> + <param argument="--input" type="data" format="bam,sam"/> + <param argument="--output" label="Output bam file for all input alignments" type="boolean" checked="true" truevalue="-o output.sam" falsevalue="-o /dev/null"/> + <param argument="--discordantFile" label="Output discordant read pairs?" type="boolean" truevalue="-d discordant.sam" falsevalue=""/> + <param argument="--splitterFile" label="Output split reads?" type="boolean" truevalue="-s splitter.sam" falsevalue=""/> + <param argument="--unmappedFile" label="Output unmapped/clipped reads as FASTQ?" type="boolean" truevalue="-u unmapped.fastq" falsevalue=""/> + <param argument="--acceptDupMarks" label="Accept duplicate marks already in input file instead of looking for duplicates in the input?" type="boolean" truevalue="-a" falsevalue=""/> + <param argument="--excludeDups" label="Exclude reads marked as duplicates from discordant, splitter, and/or unmapped file?" type="boolean" truevalue="-a" falsevalue=""/> + <param argument="--removeDups" label="Remove duplicates reads from all output files?" help="(Implies --excludeDups)" type="boolean" truevalue="-e" falsevalue=""/> + <param argument="--addMateTags" label="Add MC and MQ tags?" type="boolean" truevalue="--addMateTags" falsevalue=""/> + <param name="compatibility_mode" argument="-M" label="Run in compatibility mode?" help="Both 0x100 and 0x800 are considered chimeric. Similar to BWA MEM -M option." type="boolean" truevalue="-M" falsevalue="" /> + <param argument="--maxSplitCount" label="Maximum number of split alignments for a read to be included in splitter file." type="integer" value="2"/> + <param argument="--maxUnmappedBases" label="Maximum number of un-aligned bases between two alignments to be included in splitter file." type="integer" value="50" min="1"/> + <param argument="--minIndelSize" label="Minimum structural variant feature size for split alignments to be included in splitter file." type="integer" value="50" min="1"/> + <param argument="--minNonOverlap" label="Minimum non-overlaping base pairs between two alignments for a read to be included in splitter file." type="integer" value="20" min="1"/> + <param argument="--minClipSize" label="Minumum number of bases a mapped read must be clipped to be included in unmapped file." type="integer" value="20" min="1"/> + </inputs> + <outputs> + <data name="output_bam" format="bam" label="samblaster alignments on ${on_string}" from_work_dir="output.bam"> + <filter>output</filter> + </data> + <data name="discordant_bam" format="bam" label="samblaster discordant alignments on ${on_string}" from_work_dir="discordant.bam"> + <filter>discordantFile</filter> + </data> + <data name="splitter_bam" format="bam" label="samblaster split alignments on ${on_string}" from_work_dir="splitter.bam"> + <filter>splitterFile</filter> + </data> + <data name="unmapped_fastq" format="fastqsanger" label="samblaster unmapped fastq on ${on_string}" from_work_dir="unmapped.fastq"> + <filter>unmappedFile</filter> + </data> + </outputs> + <tests> + <test> + <param name="input" value="sr.input.bam"/> + <param name="output" value="true"/> + <param name="discordandFile" value="false"/> + <param name="splitterFile" value="true"/> + <param name="unmappedFile" value="true"/> + <output name="output_bam" file="output.bam" compare="sim_size"/> + <output name="splitter_bam" file="splitters.bam" compare="sim_size"/> + <output name="unmapped_fastq"> + <assert_contents> + <has_line line="@M00860:26:000000000-A6UGV:1:1101:10000:6072" /> + </assert_contents> + </output> + </test> + <test> + <param name="input" value="sr.input.sam.gz" ftype="sam"/> + <param name="output" value="true"/> + <param name="discordandFile" value="false"/> + <param name="splitterFile" value="true"/> + <param name="unmappedFile" value="true"/> + <output name="output_bam" file="output.bam" compare="sim_size"/> + <output name="splitter_bam" file="splitters.bam" compare="sim_size"/> + <output name="unmapped_fastq"> + <assert_contents> + <has_line line="@M00860:26:000000000-A6UGV:1:1101:10000:6072" /> + </assert_contents> + </output> + </test> + </tests> + <help><![CDATA[ + +*samblaster* +============ + +Summary +------- + +*samblaster* is a fast and flexible program for marking duplicates in +**read-id grouped** paired-end SAM files. It can also optionally output +discordant read pairs and/or split read mappings to separate SAM files, +and/or unmapped/clipped reads to a separate FASTQ file. When marking +duplicates, *samblaster* will require approximately 20MB of memory per +1M read pairs. + +Usage +----- + +See the `SAM File Format +Specification <http://samtools.sourceforge.net/SAMv1.pdf>`__ for details +about the SAM alignment format. + +By default, samblaster marks duplicates with SAM FLAG 0x400. The +**--removeDups** option will instead remove duplicate alignments from the +output file. + +**ALIGNMENT TYPE DEFINITIONS:** Below, we will use the following +definitions for alignment types. Starting with *samblaster* release +0.1.22, these definitions are affected by the use of the **-M** option. +By default, *samblaster* will use the current definitions of alignment +types as specified in the `SAM +Specification <http://samtools.sourceforge.net/SAMv1.pdf>`__. Namely, +alignments marked with FLAG 0x100 are considered *secondary*, while +those marked with FLAG 0x800 are considered *supplemental*. If the +**-M** option is specified, alignments marked with either FLAG 0x100 or +0x800 are considered *supplemental*, and no alignments are considered +*secondary*. A *primary* alignment is always one that is neither +*secondary* nor *supplemental*. Only *primary* and *supplemental* +alignments are used to find chimeric (split-read) mappings. The **-M** +flag is used for backward compatibility with older SAM/BAM files in +which "chimeric" alignments were marked with FLAG 0x100, and should also +be used with output from more recent runs of *bwa mem* using its **-M** +option. + +**DISCORDANT READ PAIR IDENTIFICATION:** A **discordant** read pair is +one which meets all of the following criteria: + +1. Both side of the read pair are mapped (neither FLAG 0x4 or 0x8 is + set). +2. The *properly paired* FLAG (0x2) is not set. +3. *Secondary* or *supplemental* alignments are never output as + discordant, although a discordant read pair can have such alignments + associated with them. +4. Duplicate read pairs that meet the above criteria will be output as + discordant unless the **-e** option is used. + +**UNMAPPED/CLIPPED READ IDENTIFICATION:** An **unmapped** or **clipped** +read is a *primary* alignment that is unaligned over all or part of its +length respectively. The lack of a full alignment may be caused by a SV +breakpoint that falls within the read. Therefore, *samblaster* will +optionally output such reads to a FASTQ file for re-alignment by a tool, +such as `YAHA <https://github.com/GregoryFaust/yaha/>`__, geared toward +finding split-read mappings. *samblaster* applies the following strategy +to identify and output unmapped/clipped reads: + +1. An **unmapped** read has the *unmapped read* FLAG set (0x4). +2. A **clipped** read is a mapped read with a CIGAR string that begins + or ends with at least **--minClipSize** unaligned bases (CIGAR code S + and/or H), and is not from a read that has one or more *supplemental* + alignments. +3. In order for *samblaster* to output the entire sequence for clipped + reads, the input SAM file must have soft clipped primary alignments. +4. *samblaster* will output unmapped/clipped reads into a FASTQ file if + QUAL information is available in the input file, and a FASTA file if + not. +5. Unmapped/clipped reads that are part of a duplicate read pair will be + output unless the **-e** option is used. + + +**Written by:** Greg Faust (gf4ea@virginia.edu) `Ira Hall Lab, +University of Virginia <http://faculty.virginia.edu/irahall/>`__ + +**Please cite:** `Faust, G.G. and Hall, I.M., “\ *SAMBLASTER*: fast +duplicate marking and structural variant read extraction,” +*Bioinformatics* Sept. 2014; **30**\ (17): +2503-2505. <http://bioinformatics.oxfordjournals.org/content/30/17/2503>`__ + +**Also see:** `SAMBLASTER\_Supplemental.pdf +<https://github.com/GregoryFaust/samblaster/raw/master/SAMBLASTER_Supplemental.pdf>`__ +for additonal discussion and statistics about the duplicates marked by +*samblaster* vs. *Picard* using the NA12878 sample dataset. Click the +preceeding link or download the file from this repository. +**Written by:** Greg Faust (gf4ea@virginia.edu) `Ira Hall Lab, +University of Virginia <http://faculty.virginia.edu/irahall/>`__ + +**Please cite:** `Faust, G.G. and Hall, I.M., “\ *SAMBLASTER*: fast +duplicate marking and structural variant read extraction,” +*Bioinformatics* Sept. 2014; **30**\ (17): +2503-2505. <http://bioinformatics.oxfordjournals.org/content/30/17/2503>`__ + +**Also see:** `SAMBLASTER\_Supplemental.pdf +<https://github.com/GregoryFaust/samblaster/raw/master/SAMBLASTER_Supplemental.pdf>`__ +for additonal discussion and statistics about the duplicates marked by +*samblaster* vs. *Picard* using the NA12878 sample dataset. Click the +preceeding link or download the file from this repository. + + ]]></help> + <citations> + <citation type="doi">10.1093/bioinformatics/btu314</citation> + </citations> +</tool>