Mercurial > repos > iuc > bbtools_bbmap

<tool id="bbtools_bbmap" name="BBTools: BBMap" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>short-read aligner</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="edam_ontology"/>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
#import os
#import re

## Resolve the reference: either the path of a cached entry or a history dataset.
#if str($ref_source_cond.ref_source) == 'cached'
    #set ref = str($ref_source_cond.reference.fields.path)
#else:
    #set ref = $ref_source_cond.reference
#end if

## Stage the reads under names with a .fastq[.gz] extension via symlinks.
#if str($input_type_cond.input_type) in ['single', 'pair']:
    #set read1 = $input_type_cond.read1
    ## bbmap uses the file extension to determine the input format.
    #set ext = '.fastq'
    #if $read1.ext.endswith('.gz'):
        #set ext = $ext + '.gz'
    #end if
    #set read1_file = 'forward' + $ext
    ln -s '${read1}' '${read1_file}' &&
    #if str($input_type_cond.input_type) == 'pair':
        #set read2 = $input_type_cond.read2
        #set read2_file = 'reverse' + $ext
        ln -s '${read2}' '${read2_file}' &&
    #end if
#else:
    ## Paired collection: link names are built from the sanitized element identifiers.
    #set read1 = $input_type_cond.reads_collection['forward']
    #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
    ## bbmap uses the file extension to determine the input format.
    ## ext holds only the extension, so the identifier appears once in each link name.
    #set ext = '.fastq'
    #if $read1.ext.endswith('.gz'):
        #set ext = $ext + '.gz'
    #end if
    #set read1_file = $read1_identifier + $ext
    ln -s '${read1}' '${read1_file}' &&
    #set read2 = $input_type_cond.reads_collection['reverse']
    #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
    #set read2_file = $read2_identifier + $ext
    ln -s '${read2}' '${read2_file}' &&
#end if

bbmap.sh

#### Indexing Parameters
nodisk=f
ref='${ref}'
k=13
usemodulo=f
rebuild=f

#### Input Parameters
#if str($input_type_cond.input_type) == 'single':
    in='${read1_file}'
#else:
    in='${read1_file}' in2='${read2_file}'
#end if
fastareadlen=500
unpigz=f
touppercase=t

#### Sampling Parameters
reads=-1
samplerate=1
skipreads=0

#### Mapping Parameters
maxindel=$mapping_options.maxindel
strictmaxindel='$mapping_options.strictmaxindel'
tipsearch=$mapping_options.tipsearch
minid=$mapping_options.minid
minhits=$mapping_options.minhits
local='$mapping_options.local'
perfectmode='$mapping_options.perfectmode'
semiperfectmode='$mapping_options.semiperfectmode'
threads=\${GALAXY_SLOTS:-4}
ambiguous='$mapping_options.ambiguous'
samestrandpairs='$mapping_options.samestrandpairs'
requirecorrectstrand='$mapping_options.requirecorrectstrand'
killbadpairs='$mapping_options.killbadpairs'
pairedonly='$mapping_options.pairedonly'
rcomp='$mapping_options.rcomp'
rcompmate='$mapping_options.rcompmate'
pairlen=$mapping_options.pairlen
rescuedist=$mapping_options.rescuedist
rescuemismatches=$mapping_options.rescuemismatches
averagepairdist=$mapping_options.averagepairdist
deterministic='$mapping_options.deterministic'
bandwidthratio='$mapping_options.bandwidthratio'
bandwidth='$mapping_options.bandwidth'
usejni='f'
maxsites2=$mapping_options.maxsites2
ignorefrequentkmers='$mapping_options.ignorefrequentkmers'
excludefraction=$mapping_options.excludefraction
greedy='$mapping_options.greedy'
kfilter=$mapping_options.kfilter

#### Quality and Trimming Parameters
qin='auto'
qout='auto'
qtrim='$qt_options.qtrim'
untrim='$qt_options.untrim'
trimq=$qt_options.trimq
mintrimlength=$qt_options.mintrimlength
fakefastaquality=$qt_options.fakefastaquality
ignorebadquality='$qt_options.ignorebadquality'
usequality='$qt_options.usequality'
minaveragequality=$qt_options.minaveragequality
maqb=$qt_options.maqb

#### Post-Filtering options
idfilter=$pf_options.idfilter
subfilter=$pf_options.subfilter
insfilter=$pf_options.insfilter
delfilter=$pf_options.delfilter
indelfilter=$pf_options.indelfilter
editfilter=$pf_options.editfilter
inslenfilter=$pf_options.inslenfilter
dellenfilter=$pf_options.dellenfilter
nfilter=$pf_options.nfilter

#### Output parameters
secondary='$output_options.secondary'
maxsites=$output_options.maxsites
sssr=$output_options.sssr
ssao='$output_options.ssao'
quickmatch='$output_options.quickmatch'
trimreaddescriptions='$output_options.trimreaddescriptions'
machineout='$output_options.machineout'
printunmappedcount='$output_options.printunmappedcount'
renamebyinsert='$output_options.renamebyinsert'
## bbmap.sh writes the same three intermediate BAM files in every sorting mode;
## only the post-processing step below differs.
out='all_reads.bam'
outu='unmapped_reads.bam'
outm='mapped_reads.bam'
#if str($output_options.output_sort) == 'coordinate':
    && samtools sort --no-PG -@\${GALAXY_SLOTS:-4} -T "\${TMPDIR:-.}" -O bam -o '$output_all_reads' 'all_reads.bam'
    && samtools sort --no-PG -@\${GALAXY_SLOTS:-4} -T "\${TMPDIR:-.}" -O bam -o '$output_unmapped_reads' 'unmapped_reads.bam'
    && samtools sort --no-PG -@\${GALAXY_SLOTS:-4} -T "\${TMPDIR:-.}" -O bam -o '$output_mapped_reads' 'mapped_reads.bam'
#elif str($output_options.output_sort) == 'name':
    ## The temp prefix is double-quoted so the shell expands TMPDIR (single quotes would pass it literally).
    && samtools sort --no-PG -n -@\${GALAXY_SLOTS:-4} -T "\${TMPDIR:-.}" -O bam -o '$output_all_reads' 'all_reads.bam'
    && samtools sort --no-PG -n -@\${GALAXY_SLOTS:-4} -T "\${TMPDIR:-.}" -O bam -o '$output_unmapped_reads' 'unmapped_reads.bam'
    && samtools sort --no-PG -n -@\${GALAXY_SLOTS:-4} -T "\${TMPDIR:-.}" -O bam -o '$output_mapped_reads' 'mapped_reads.bam'
#else:
    ## No sorting requested: move the files written by bbmap.sh to the outputs as they are.
    && mv 'all_reads.bam' '$output_all_reads'
    && mv 'unmapped_reads.bam' '$output_unmapped_reads'
    && mv 'mapped_reads.bam' '$output_mapped_reads'
#end if
]]></command>
    <inputs>
        <expand macro="input_type_cond"/>
        <expand macro="reference_source_cond"/>
        <section name="mapping_options" title="Mapping options">
            <!-- Each parameter here is passed to bbmap.sh as <name>=<value> in the "Mapping Parameters" part of the command. -->
            <param argument="maxindel" type="integer" value="16000" label="Maximum indel length" help="Don't look for indels longer than this (lower is faster)"/>
            <param argument="strictmaxindel" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Do not allow indels longer than the specified maximum indel length?" help="By default these are not sought, but may be found anyway"/>
            <param argument="tipsearch" type="integer" value="100" label="Look this far for read-end deletions with anchors shorter than K using brute force"/>
            <param argument="minid" type="float" value="0.76" label="Approximate minimum alignment identity to look for" help="Higher is faster and less sensitive"/>
            <param argument="minhits" type="integer" value="1" label="Minimum number of seed hits required for candidate sites" help="Higher is faster"/>
            <param argument="local" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Use local (rather than global) alignments?"/>
            <param argument="perfectmode" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Allow only perfect mappings?"/>
            <param argument="semiperfectmode" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Allow only perfect and semiperfect (perfect except for N's in the reference) mappings?"/>
            <param name="ambiguous" type="select" label="Specify behavior on ambiguously-mapped reads with multiple top-scoring mapping locations">
                <option value="best" selected="true">Use the first best site</option>
                <option value="toss">Consider unmapped</option>
                <option value="random">Select one top-scoring site randomly</option>
                <option value="all">Retain all top-scoring sites</option>
            </param>
            <param argument="samestrandpairs" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Map paired reads to the same strand?" help="No value will map to opposite strands"/>
            <param argument="requirecorrectstrand" type="boolean" truevalue="t" falsevalue="f" checked="true" label="Forbid pairing of reads without correct strand orientation?" help="Select No for long-mate-pair libraries"/>
            <param argument="killbadpairs" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Mark the read with the lower mapping quality as unmapped if a read pair is mapped with an inappropriate insert size or orientation?"/>
            <param argument="pairedonly" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Treat unpaired reads as unmapped?"/>
            <param argument="rcomp" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Reverse complement both reads prior to mapping (for LMP outward-facing libraries)?"/>
            <param argument="rcompmate" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Reverse complement read2 prior to mapping?"/>
            <param argument="pairlen" type="integer" value="32000" label="Maximum allowed distance between paired reads"/>
            <param argument="rescuedist" type="integer" value="1200" label="Maximum average insert size for trying to rescue paired reads" help="Lower is faster"/>
            <param argument="rescuemismatches" type="integer" value="32" label="Maximum mismatches allowed in a rescued read" help="Lower is faster"/>
            <param argument="averagepairdist" type="integer" value="100" label="Initial average distance between paired reads" help="Varies dynamically; does not need to be specified"/>
            <param argument="deterministic" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Run in deterministic mode?" help="BBMap is deterministic without this flag if using single-ended reads"/>
            <!-- bandwidthratio is a fraction of the read length, so it must accept non-integer values. -->
            <param argument="bandwidthratio" type="float" value="0" label="Restrict alignment band to this fraction of read length" help="Zero value ignores"/>
            <param argument="bandwidth" type="integer" value="0" label="Set the alignment bandwidth directly" help="Faster but less accurate"/>
            <param argument="maxsites2" type="integer" value="800" label="Maximum number of alignments per read to output"/>
            <param argument="ignorefrequentkmers" type="boolean" truevalue="t" falsevalue="f" checked="true" label="Discard low-information kmers that occur often?"/>
            <param argument="excludefraction" type="float" value="0.03" label="Fraction of kmers to ignore" help="For example, 0.03 will ignore the most common 3% of kmers"/>
            <param argument="greedy" type="boolean" truevalue="t" falsevalue="f" checked="true" label="Use a greedy algorithm to discard the least-useful kmers on a per-read basis?"/>
            <param argument="kfilter" type="integer" value="0" label="Potential mapping sites must have at least this many consecutive exact matches" help="Zero value ignores"/>
        </section>
        <section name="qt_options" title="Quality and trimming options">
            <!-- Values from this section fill the "Quality and Trimming Parameters" part of the command. -->
            <param type="select"
                   name="qtrim"
                   label="Select option for quality trimming ends before mapping">
                <option value="f" selected="true">No trimming</option>
                <option value="l">Trim left</option>
                <option value="r">Trim right</option>
                <option value="lr">Trim both</option>
            </param>
            <param type="boolean"
                   argument="untrim"
                   checked="false"
                   truevalue="t"
                   falsevalue="f"
                   label="Undo trimming after mapping?"
                   help="Untrimmed bases will be soft-clipped in cigar strings"/>
            <param type="integer"
                   argument="trimq"
                   value="6"
                   label="Trim regions with average quality below this value"/>
            <param type="integer"
                   argument="mintrimlength"
                   value="60"
                   label="Don't trim reads to be shorter than this value"/>
            <param type="integer"
                   argument="fakefastaquality"
                   value="-1"
                   max="50"
                   label="Set to a positive number 1-50 to generate fake quality strings for fasta input reads"/>
            <param type="boolean"
                   argument="ignorebadquality"
                   checked="false"
                   truevalue="t"
                   falsevalue="f"
                   label="Keep going, rather than crashing, if a read has out-of-range quality values?"/>
            <param type="boolean"
                   argument="usequality"
                   checked="true"
                   truevalue="t"
                   falsevalue="f"
                   label="Use quality scores when determining which read kmers to use as seeds?"/>
            <param type="integer"
                   argument="minaveragequality"
                   value="0"
                   label="Do not map reads with average quality below this value"/>
            <param type="integer"
                   argument="maqb"
                   value="0"
                   label="Calculate maq from this many initial bases"
                   help="Zero value ignores"/>
        </section>
        <section name="output_options" title="Output options">
            <!-- Values from this section fill the "Output parameters" part of the command; output_sort also selects the samtools post-processing step. -->
            <param argument="secondary" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Output secondary alignments?"/>
            <param argument="maxsites" type="integer" value="5" label="Maximum number of total alignments to output per read" help="Relevant only when outputting secondary alignments"/>
            <param argument="sssr" type="float" value="0.95" label="Output only secondary alignments with score of at least this fraction of primary"/>
            <param argument="ssao" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Only output secondary alignments for ambiguously-mapped reads?"/>
            <param argument="quickmatch" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Generate cigar strings more quickly?"/>
            <param argument="trimreaddescriptions" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Truncate read and ref names at the first whitespace, assuming that the remainder is a comment or description?"/>
            <param argument="machineout" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Output statistics in machine-friendly (key=value) format?"/>
            <param argument="printunmappedcount" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Output the total number of unmapped reads and bases?" help="If input is paired, the number will be of pairs for which both reads are unmapped"/>
            <param argument="renamebyinsert" type="boolean" truevalue="t" falsevalue="f" checked="false" label="Rename reads based on their mapped insert size?"/>
            <!-- The sorting modes run samtools sort on each of the three BAM files; "unsorted" only moves the files into place. -->
            <param name="output_sort" type="select" label="BAM sorting mode" help="The sorting options run samtools sort on every output after mapping; the 'Not sorted' option skips this step">
                <option value="coordinate" selected="true">Sort by chromosomal coordinates</option>
                <option value="name">Sort by read names</option>
                <option value="unsorted">Not sorted (sorted as input)</option>
            </param>
        </section>
        <section name="pf_options" title="Post-filtering options">
            <!-- Values from this section fill the "Post-Filtering options" part of the command. -->
            <!-- idfilter is an identity fraction between 0 and 1, so it has to be a float; an integer would only allow 0 or 1. -->
            <param argument="idfilter" type="float" value="0" min="0" max="1" label="Specify exact minimum identity allowed for alignments to be output" help="Independent of approximate minimum alignment identity to look for"/>
            <param argument="subfilter" type="integer" value="-1" label="Ban alignments with more than this many substitutions" help="Negative value ignores"/>
            <param argument="insfilter" type="integer" value="-1" label="Ban alignments with more than this many insertions" help="Negative value ignores"/>
            <param argument="delfilter" type="integer" value="-1" label="Ban alignments with more than this many deletions" help="Negative value ignores"/>
            <param argument="indelfilter" type="integer" value="-1" label="Ban alignments with more than this many indels" help="Negative value ignores"/>
            <param argument="editfilter" type="integer" value="-1" label="Ban alignments with more than this many edits" help="Negative value ignores"/>
            <param argument="inslenfilter" type="integer" value="-1" label="Ban alignments with an insertion longer than this" help="Negative value ignores"/>
            <param argument="dellenfilter" type="integer" value="-1" label="Ban alignments with a deletion longer than this" help="Negative value ignores"/>
            <param argument="nfilter" type="integer" value="-1" label="Ban alignments with more than this many ns (includes nocall, noref, and off scaffold ends)" help="Negative value ignores"/>
        </section>
    </inputs>
    <outputs>
        <!-- All three datasets default to coordinate-sorted "bam"; change_format switches the datatype for the other sorting modes. -->
        <!-- Receives the file written through bbmap's out= argument. -->
        <data name="output_all_reads" format="bam" label="${tool.name} on ${on_string} (all reads)">
            <expand macro="dbKeyActionsBBMap"/>
            <change_format>
                <when format="qname_sorted.bam" input="output_options.output_sort" value="name"/>
                <when format="qname_input_sorted.bam" input="output_options.output_sort" value="unsorted"/>
            </change_format>
        </data>
        <!-- Receives the file written through bbmap's outu= argument. -->
        <data name="output_unmapped_reads" format="bam" label="${tool.name} on ${on_string} (unmapped reads)">
            <expand macro="dbKeyActionsBBMap"/>
            <change_format>
                <when format="qname_sorted.bam" input="output_options.output_sort" value="name"/>
                <when format="qname_input_sorted.bam" input="output_options.output_sort" value="unsorted"/>
            </change_format>
        </data>
        <!-- Receives the file written through bbmap's outm= argument. -->
        <data name="output_mapped_reads" format="bam" label="${tool.name} on ${on_string} (mapped reads)">
            <expand macro="dbKeyActionsBBMap"/>
            <change_format>
                <when format="qname_sorted.bam" input="output_options.output_sort" value="name"/>
                <when format="qname_input_sorted.bam" input="output_options.output_sort" value="unsorted"/>
            </change_format>
        </data>
    </outputs>
    <tests>
        <!-- Test 1: one gzipped FASTQ dataset (input_type "single"), output sorted by coordinate.
             ref_source is not set here, so the tool default applies (presumably the cached
             reference; confirm in macros.xml). Outputs are checked by datatype, dbkey and
             approximate file size only. -->
        <test expect_num_outputs="3">
            <param name="input_type" value="single"/>
            <param name="read1" value="13-1941-6_S4_L001_R1_600000.fastq.gz"/>
            <param name="output_sort" value="coordinate"/>
            <output name="output_all_reads" ftype="bam">
                <metadata name="dbkey" value="89"/>
                <assert_contents>
                    <has_size value="9310" delta="300"/>
                </assert_contents>
            </output>
            <output name="output_unmapped_reads" ftype="bam">
                <metadata name="dbkey" value="89"/>
                <assert_contents>
                    <has_size value="9310" delta="300"/>
                </assert_contents>
            </output>
            <output name="output_mapped_reads" ftype="bam">
                <metadata name="dbkey" value="89"/>
                <assert_contents>
                    <has_size value="851" delta="100"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 2: forward and reverse reads given as two separate datasets (input_type "pair"),
             output sorted by read name, so the expected datatype is qname_sorted.bam.
             ref_source is not set here, so the tool default applies (presumably the cached
             reference; confirm in macros.xml). -->
        <test expect_num_outputs="3">
            <param name="input_type" value="pair"/>
            <param name="read1" value="13-1941-6_S4_L001_R1_600000.fastq.gz"/>
            <param name="read2" value="13-1941-6_S4_L001_R2_600000.fastq.gz"/>
            <param name="output_sort" value="name"/>
            <output name="output_all_reads" ftype="qname_sorted.bam">
                <metadata name="dbkey" value="89"/>
                <assert_contents>
                    <has_size value="16893" delta="600"/>
                </assert_contents>
            </output>
            <output name="output_unmapped_reads" ftype="qname_sorted.bam">
                <metadata name="dbkey" value="89"/>
                <assert_contents>
                    <has_size value="16893" delta="600"/>
                </assert_contents>
            </output>
            <output name="output_mapped_reads" ftype="qname_sorted.bam">
                <metadata name="dbkey" value="89"/>
                <assert_contents>
                    <has_size value="860" delta="100"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 3: reads supplied as a paired collection (input_type "paired", which takes the
             collection branch of the command), reference FASTA taken from the history with
             dbkey 89, and no sorting, so the expected datatype is qname_input_sorted.bam. -->
        <test expect_num_outputs="3">
            <param name="input_type" value="paired"/>
            <param name="reads_collection">
                <collection type="paired">
                    <element name="forward" value="13-1941-6_S4_L001_R1_600000.fastq.gz"/>
                    <element name="reverse" value="13-1941-6_S4_L001_R2_600000.fastq.gz"/>
                </collection>
            </param>
            <param name="ref_source" value="history"/>
            <param name="reference" value="NC_002945v4.fasta" dbkey="89" ftype="fasta"/>
            <param name="output_sort" value="unsorted"/>
            <output name="output_all_reads" ftype="qname_input_sorted.bam">
                <metadata name="dbkey" value="89"/>
                <assert_contents>
                    <has_size value="16928" delta="600"/>
                </assert_contents>
            </output>
            <output name="output_unmapped_reads" ftype="qname_input_sorted.bam">
                <metadata name="dbkey" value="89"/>
                <assert_contents>
                    <has_size value="16928" delta="600"/>
                </assert_contents>
            </output>
            <output name="output_mapped_reads" ftype="qname_input_sorted.bam">
                <metadata name="dbkey" value="89"/>
                <assert_contents>
                    <has_size value="878" delta="100"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
**What it does**

BBMap is a splice-aware global aligner for DNA and RNA sequencing reads.  It is fast and extremely accurate, particularly
with highly mutated genomes or reads with long indels, even whole-gene deletions over 100kbp long. It has no upper limit
to genome size or number of contigs and has been successfully used for mapping to an 85 gigabase soil metagenome with over
200 million contigs. The indexing phase is very fast compared to other aligners.

BBMap can output many different statistics files: an empirical read quality histogram, insert-size distribution, and genome
coverage with or without generating a sam file.  It is useful in quality control of libraries and sequencing runs or
evaluating new sequencing platforms.

**Options**

  *BAM sorting mode* - the generated BAM files can be sorted according to three criteria: coordinates, names and input order.

    * Sort by chromosomal coordinates - the file is sorted by coordinates (i.e., the reads from the beginning of the first
      chromosome are first in the file).
    * Sort by read names - the file is sorted by the read name (i.e., the QNAME field).
    * Not sorted (sorted as input) - the file is sorted in the order of the reads in the input file.

    </help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Mon, 06 Feb 2023 18:06:17 +0000
parents	858a65b52cf9
children	7f19559b8b55