Mercurial > repos > iuc > minimap2
view minimap2.xml @ 2:4070f129540a draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/minimap2 commit 79e4c4d7182a4d3f7b6d9a7e444a2bb34be588c0
author | iuc |
---|---|
date | Wed, 08 Nov 2017 16:48:49 -0500 |
parents | b103bc946f57 |
children | 1650a97189be |
line wrap: on
line source
<?xml version="1.0"?>
<!--
    Galaxy tool wrapper for minimap2.

    The command maps reads (single, paired, paired collection or interleaved)
    against a reference taken from the history or from the all_fasta data
    table, then pipes the SAM stream into samtools sort to produce a
    coordinate-sorted BAM or CRAM dataset.
-->
<tool id="minimap2" name="Map with minimap2" version="2.4.1" profile="17.01">
    <description>A fast pairwise aligner for genomic and spliced nucleotide sequences</description>
    <requirements>
        <requirement type="package" version="2.4">minimap2</requirement>
        <requirement type="package" version="1.6">samtools</requirement>
    </requirements>
    <version_command>minimap2 --version</version_command>
    <command>
<![CDATA[
    ## Expose the reference under a fixed local name; the same path is reused
    ## by samtools below when CRAM output is requested.
    #if $reference_source.reference_source_selector == 'history':
        ln -f -s '$reference_source.ref_file' reference.fa &&
    #else:
        ln -f -s '$reference_source.ref_file.fields.path' reference.fa &&
    #end if
    minimap2
    -a
    -x $analysis_type_selector

    ## Optional numeric parameters are tested on their string form: several of
    ## them accept 0, which a plain truthiness test would silently drop.

    ## indexing options
    #if str($indexing_options.k):
        -k $indexing_options.k
    #end if
    #if str($indexing_options.w):
        -w $indexing_options.w
    #end if
    #if str($indexing_options.I):
        -I $indexing_options.I
    #end if

    ## Mapping options
    #if str($mapping_options.f):
        -f $mapping_options.f
    #end if
    #if str($mapping_options.g):
        -g $mapping_options.g
    #end if
    #if str($mapping_options.G):
        -G $mapping_options.G
    #end if
    #if str($mapping_options.F):
        -F $mapping_options.F
    #end if
    #if str($mapping_options.r):
        -r $mapping_options.r
    #end if
    #if str($mapping_options.n):
        -n $mapping_options.n
    #end if
    #if str($mapping_options.m):
        -m $mapping_options.m
    #end if
    ## boolean: expands to "-X" or to nothing
    $mapping_options.X
    #if str($mapping_options.p):
        -p $mapping_options.p
    #end if
    #if str($mapping_options.N):
        -N $mapping_options.N
    #end if

    ## Alignment options
    #if str($alignment_options.A):
        -A $alignment_options.A
    #end if
    #if str($alignment_options.B):
        -B $alignment_options.B
    #end if
    ## -O and -E take either one value or a "deletion,insertion" pair; emit
    ## exactly one form. The second value is only used when the first is set.
    #if str($alignment_options.O):
        #if str($alignment_options.O2):
            -O ${alignment_options.O},${alignment_options.O2}
        #else:
            -O ${alignment_options.O}
        #end if
    #end if
    #if str($alignment_options.E):
        #if str($alignment_options.E2):
            -E ${alignment_options.E},${alignment_options.E2}
        #else:
            -E ${alignment_options.E}
        #end if
    #end if
    #if str($alignment_options.z):
        -z $alignment_options.z
    #end if
    #if str($alignment_options.s):
        -s $alignment_options.s
    #end if
    #if $alignment_options.u:
        -u $alignment_options.u
    #end if

    ## Output options
    ## booleans: expand to the flag or to nothing
    $io_options.Q
    $io_options.L
    ## The cs style is an optional argument and has to be attached with "=";
    ## a space-separated value would be read as an input file name.
    ## Choosing "no" simply leaves the option off.
    #if $io_options.cs and str($io_options.cs) != 'none':
        --cs=$io_options.cs
    #end if
    #if str($io_options.K):
        -K $io_options.K
    #end if

    -t \${GALAXY_SLOTS:-4}
    reference.fa
    #if $fastq_input.fastq_input_selector in ['single', 'paired_iv']:
        '$fastq_input.fastq_input1'
    #else if $fastq_input.fastq_input_selector == 'paired':
        '$fastq_input.fastq_input1' '$fastq_input.fastq_input2'
    #else if $fastq_input.fastq_input_selector == 'paired_collection':
        '$fastq_input.fastq_input1.forward' '$fastq_input.fastq_input1.reverse'
    #end if
    ## Sort the SAM stream and write it in the requested container format.
    | samtools sort
    -@\${GALAXY_SLOTS:-2}
    -O $io_options.output_format
    #if $io_options.output_format == 'CRAM':
        --reference reference.fa
        --output-fmt-option no_ref
    #end if
    -o '$alignment_output'
]]>
    </command>
    <inputs>
        <!-- Reference: either a data-table entry or a FASTA dataset from the history. -->
        <conditional name="reference_source">
            <param name="reference_source_selector" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options. See `Indexes` section of help below">
                <option value="cached">Use a built-in genome index</option>
                <option value="history">Use a genome from history and build index</option>
            </param>
            <when value="cached">
                <param name="ref_file" type="select" label="Using reference genome" help="Select genome from the list">
                    <options from_data_table="all_fasta">
                        <filter type="sort_by" column="2" />
                        <validator type="no_options" message="No reference genomes are available" />
                    </options>
                    <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
                </param>
            </when>
            <when value="history">
                <param name="ref_file" type="data" format="fasta" label="Use the following dataset as the reference sequence" help="You can upload a FASTA sequence to the history and use it as reference" />
            </when>
        </conditional>
        <section name="indexing_options" title="Indexing options">
            <!-- Homopolymer setting seems to not properly overwrite sr preset
            <param argument="-H" name="H" type="boolean" optional="true" truevalue="-H" falsevalue="" label="Use homopolymer-compressed k-mer ?"/>
            -->
            <param argument="-k" type="integer" min="4" max="28" optional="true" label="k-mer size" help=""/>
            <param argument="-w" type="integer" min="1" optional="true" label="minimizer window size" help=""/>
            <param argument="-I" type="integer" min="1" optional="true" label="split index for every N input gigabases" help=""/>
        </section>
        <!-- start unchanged copy from bwa-mem -->
        <conditional name="fastq_input">
            <param name="fastq_input_selector" type="select" label="Single or Paired-end reads" help="Select between paired and single end data">
                <option value="single">Single</option>
                <option value="paired">Paired</option>
                <option value="paired_collection">Paired Collection</option>
                <option value="paired_iv">Paired Interleaved</option>
            </param>
            <when value="paired">
                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select first set of reads" help="Specify dataset with forward reads"/>
                <param name="fastq_input2" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select second set of reads" help="Specify dataset with reverse reads"/>
            </when>
            <when value="single">
                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select fastq dataset" help="Specify dataset with single reads"/>
            </when>
            <when value="paired_collection">
                <param name="fastq_input1" format="fastqsanger,fastqsanger.gz,fasta" type="data_collection" collection_type="paired" label="Select a paired collection" help="See help section for an explanation of dataset collections"/>
            </when>
            <when value="paired_iv">
                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select fastq dataset" help="Specify dataset with interleaved reads"/>
            </when>
        </conditional>
        <!-- end unchanged copy from bwa-mem -->
        <!-- The selected value is passed verbatim to the -x preset option. -->
        <param name="analysis_type_selector" type="select" label="Select analysis mode (sets default)">
            <option value="map-pb">-Hk19 (PacBio vs reference mapping)</option>
            <option value="map-ont">-k15 (Oxford Nanopore vs reference mapping)</option>
            <option value="asm5">-k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5% div.)</option>
            <option value="asm10">-k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10% div.)</option>
            <option value="ava-pb">-Hk19 -w5 -Xp0 -m100 -g10000 --max-chain-skip 25 (PacBio read overlap)</option>
            <option value="ava-ont">-k15 -w5 -Xp0 -m100 -g10000 --max-chain-skip 25 (ONT read overlap)</option>
            <option value="splice">long-read spliced alignment</option>
            <option value="sr">short single-end reads without splicing</option>
        </param>
        <section name="mapping_options" title="Set advanced mapping options" help="Sets -f, -g, -G, -F, -r, -n, -m, -X, -p and -N options." expanded="False">
            <param argument="-f" type="float" value="" optional="true" label="filter out top FLOAT fraction of repetitive minimizers" help="default=0.0002"/>
            <param argument="-g" type="integer" value="" optional="true" label="stop chain enlongation if there are no minimizers in INT-bp" help="default=5000"/>
            <param argument="-G" type="integer" value="" optional="true" label="max intron length in thousand (effective with -xsplice; changing -r)" help="default=200"/>
            <param argument="-F" type="integer" value="" optional="true" label="max fragment length (effective with -xsr or in the fragment mode)" help="default=800" />
            <param argument="-r" type="integer" value="" optional="true" label="bandwidth used in chaining and DP-based alignment" help="default=500" />
            <param argument="-n" type="integer" value="" optional="true" label="minimal number of minimizers on a chain" help="default=3"/>
            <param argument="-m" type="integer" value="" optional="true" label="minimal chaining score (matching bases minus log gap penalty)" help="default=40"/>
            <param argument="-X" type="boolean" truevalue="-X" falsevalue="" optional="true" label="skip self and dual mappings (for the all-vs-all mode)"/>
            <param argument="-p" type="float" value="" max="1" optional="true" label="min secondary-to-primary score ratio" help="default=0.8"/>
            <param argument="-N" type="integer" min="0" optional="true" label="retain at most INT secondary alignments" help="default=5"/>
        </section>
        <section name="alignment_options" title="Set advanced alignment options" help="Sets -A, -B, -O, -E, -z, -s and -u options." expanded="False">
            <param argument="-A" type="integer" optional="true" label="Score for a sequence match" help="default=2"/>
            <param argument="-B" type="integer" optional="true" label="Penalty for a mismatch" help="-B; default=4" />
            <param argument="-O" type="integer" min="0" optional="true" label="Gap open penalties for deletions" help="-O; default=4"/>
            <!-- Named O2 (no leading dash) so the command template can read it as $alignment_options.O2. -->
            <param name="O2" type="integer" min="0" optional="true" label="Gap open penalties for insertions" help="-O; default=24"/>
            <param argument="-E" type="integer" min="0" optional="true" label="Gap extension penalties; a gap of size k cost '-O + -E*k'. If two numbers are specified, the first is the penalty of extending a deletion and the second for extending an insertion" help="-E; default=2"/>
            <param name="E2" type="integer" min="0" optional="true" label="Gap extension penalty for extending an insertion; if left empty uses the value specified for Gap extension penalties above" help="-E; default=1"/>
            <param argument="-z" type="integer" optional="true" label="Z-drop score" help="default=400"/>
            <param argument="-s" type="integer" optional="true" label="minimal peak DP alignment score" help="default=80"/>
            <param argument="-u" type="select" optional="true" label="how to find GT-AG">
                <option value="n">don't match GT-AG</option>
                <option value="f">transcript strand</option>
                <option value="b">both strands</option>
            </param>
        </section>
        <section name="io_options" title="Set advanced output options" help="Sets -Q, -L, -R, -c, --cs and -K options." expanded="False">
            <!-- Passed to samtools sort as its output format and used to pick the dataset type. -->
            <param name="output_format" type="select" label="Produce BAM or CRAM file?">
                <option value="BAM">BAM</option>
                <option value="CRAM">CRAM</option>
            </param>
            <param argument="-Q" type="boolean" truevalue="-Q" falsevalue="" optional="true" label="don't output base quality"/>
            <param argument="-L" type="boolean" truevalue="-L" falsevalue="" optional="true" label="write CIGAR with >65535 ops to the CG tag" help="Useful for very long reads in SAM/BAM format"/>
            <param argument="-K" type="integer" optional="true" label="minibatch size for mapping (in megabyte)" help="default=500M"/>
            <param argument="--cs" type="select" optional="true" label="Output cs tag?" help="The cs tag is a more compact standalone representation of the MD tag, see help below.">
                <option value="none">no</option>
                <option value="short">short</option>
                <option value="long">long</option>
            </param>
        </section>
    </inputs>
    <outputs>
        <!-- Declared as BAM; switched to CRAM by change_format when that output format is chosen. -->
        <data format="bam" name="alignment_output" label="${tool.name} on ${on_string} (mapped reads in ${io_options.output_format} format)">
            <actions>
                <!-- Propagate the reference build (dbkey) to the output dataset. -->
                <conditional name="reference_source.reference_source_selector">
                    <when value="cached">
                        <action type="metadata" name="dbkey">
                            <option type="from_data_table" name="all_fasta" column="1" offset="0">
                                <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
                                <filter type="param_value" ref="reference_source.ref_file" column="0"/>
                            </option>
                        </action>
                    </when>
                    <when value="history">
                        <action type="metadata" name="dbkey">
                            <option type="from_param" name="reference_source.ref_file" param_attribute="dbkey" />
                        </action>
                    </when>
                </conditional>
            </actions>
            <change_format>
                <when input="io_options.output_format" value="CRAM" format="cram" />
            </change_format>
        </data>
    </outputs>
    <tests>
        <test>
            <!-- test single input -->
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
            <param name="fastq_input_selector" value="single"/>
            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
            <param name="analysis_type_selector" value="sr"/>
            <output name="alignment_output" ftype="bam" file="minimap2-test1-fasta.bam" lines_diff="2" />
        </test>
        <test>
            <!-- test cram output -->
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
            <param name="fastq_input_selector" value="single"/>
            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
            <param name="analysis_type_selector" value="sr"/>
            <param name="output_format" value="CRAM"/>
            <output name="alignment_output" ftype="cram" file="minimap2-test1-fasta.cram" compare="sim_size" />
        </test>
        <test>
            <!-- test paired input -->
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
            <param name="fastq_input_selector" value="paired"/>
            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/>
            <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
            <param name="analysis_type_selector" value="sr"/>
            <output name="alignment_output" ftype="bam" file="minimap2-test1.bam" lines_diff="2" />
        </test>
        <test>
            <!-- test paired input with one pair compressed -->
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
            <param name="fastq_input_selector" value="paired"/>
            <param name="fastq_input1" ftype="fastqsanger.gz" value="bwa-mem-fastq1.fq.gz"/>
            <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
            <param name="analysis_type_selector" value="sr"/>
            <output name="alignment_output" ftype="bam" file="minimap2-test1.bam" lines_diff="2" />
        </test>
        <test>
            <!-- test collection input -->
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
            <param name="fastq_input_selector" value="paired_collection"/>
            <param name="fastq_input1">
                <collection type="paired">
                    <element name="forward" value="bwa-mem-fastq1.fq" />
                    <element name="reverse" value="bwa-mem-fastq2.fq" />
                </collection>
            </param>
            <param name="analysis_type_selector" value="sr"/>
            <output name="alignment_output" ftype="bam" file="minimap2-test2.bam" lines_diff="2" />
        </test>
        <test>
            <!-- test data table reference -->
            <param name="reference_source_selector" value="cached" />
            <param name="ref_file" value="bwa-mem-mt-genome"/>
            <param name="fastq_input_selector" value="single"/>
            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
            <param name="analysis_type_selector" value="sr"/>
            <output name="alignment_output" ftype="bam" file="minimap2-test1-fasta.bam" lines_diff="2" />
        </test>
    </tests>
    <help>
Users’ Guide
------------

Minimap2 is a versatile sequence alignment program that aligns DNA or mRNA sequences against a large reference database. Typical use cases include: (1) mapping PacBio or Oxford Nanopore genomic reads to the human genome; (2) finding overlaps between long reads with error rate up to ~15%; (3) splice-aware alignment of PacBio Iso-Seq or Nanopore cDNA or Direct RNA reads against a reference genome; (4) aligning Illumina single- or paired-end reads; (5) assembly-to-assembly alignment; (6) full-genome alignment between two closely related species with divergence below ~15%.

For ~10kb noisy reads sequences, minimap2 is tens of times faster than mainstream long-read mappers such as BLASR, BWA-MEM, NGMLR and GMAP. It is more accurate on simulated long reads and produces biologically meaningful alignment ready for downstream analyses. For >100bp Illumina short reads, minimap2 is three times as fast as BWA-MEM and Bowtie2, and as accurate on simulated data. Detailed evaluations are available from the `minimap2 preprint`.

General usage
~~~~~~~~~~~~~

Minimap2 seamlessly works with gzip’d FASTA and FASTQ formats as input. You don’t need to convert between FASTA and FASTQ or decompress gzip’d files first.

For the human reference genome, minimap2 takes a few minutes to generate a minimizer index for the reference before mapping. To reduce indexing time, you can optionally save the index with option **-d** and replace the reference sequence file with the index file on the minimap2 command line:

***Importantly***, it should be noted that once you build the index, indexing parameters such as **-k**, **-w**, **-H** and **-I** can’t be changed during mapping. If you are running minimap2 for different data types, you will probably need to keep multiple indexes generated with different parameters. This makes minimap2 different from BWA which always uses the same index regardless of query data types.

Use cases
~~~~~~~~~

Minimap2 uses the same base algorithm for all applications. However, due to the different data types it supports (e.g. short vs long reads; DNA vs mRNA reads), minimap2 needs to be tuned for optimal performance and accuracy. It is usually recommended to choose a preset with option **-x**, which sets multiple parameters at the same time. The default setting is the same as ``map-ont``.

Map long noisy genomic reads
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The difference between ``map-pb`` and ``map-ont`` is that ``map-pb`` uses homopolymer-compressed (HPC) minimizers as seeds, while ``map-ont`` uses ordinary minimizers as seeds. Empirical evaluation suggests HPC minimizers improve performance and sensitivity when aligning PacBio reads, but hurt when aligning Nanopore reads.

Map long mRNA/cDNA reads
^^^^^^^^^^^^^^^^^^^^^^^^

There are different long-read RNA-seq technologies, including traditional full-length cDNA, EST, PacBio Iso-seq, Nanopore 2D cDNA-seq and Direct RNA-seq. They produce data of varying quality and properties. By default, ``-x splice`` assumes the read orientation relative to the transcript strand is unknown. It tries two rounds of alignment to infer the orientation and write the strand to the ``ts`` SAM/PAF tag if possible. For Iso-seq, Direct RNA-seq and traditional full-length cDNAs, it would be desired to apply ``-u f`` to force minimap2 to consider the forward transcript strand only. This speeds up alignment with slight improvement to accuracy. For noisy Nanopore Direct RNA-seq reads, it is recommended to use a smaller k-mer size for increased sensitivity to the first or the last exons.

It is worth noting that by default ``-x splice`` prefers GT[A/G]..[C/T]AG over GT[C/T]..[A/G]AG, and then over other splicing signals. Considering one additional base improves the junction accuracy for noisy reads, but reduces the accuracy when aligning against the widely used SIRV control data. This is because SIRV does not honor the evolutionarily conservative splicing signal. If you are studying SIRV, you may apply ``--splice-flank=no`` to let minimap2 only model GT..AG, ignoring the additional base.

Find overlaps between long reads
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Similarly, ``ava-pb`` uses HPC minimizers while ``ava-ont`` uses ordinary minimizers. It is usually not recommended to perform base-level alignment in the overlapping mode because it is slow and may produce false positive overlaps. However, if performance is not a concern, you may try to add ``-a`` or ``-c`` anyway.

Map short accurate genomic reads
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When two read files are specified, minimap2 reads from each file in turn and merges them into an interleaved stream internally. Two reads are considered to be paired if they are adjacent in the input stream and have the same name (with the ``/[0-9]`` suffix trimmed if present). Single- and paired-end reads can be mixed.

Minimap2 does not work well with short spliced reads. There are many capable RNA-seq mappers for short reads.

Full genome/assembly alignment
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

For cross-species full-genome alignment, the scoring system needs to be tuned according to the sequence divergence.

Advanced features
~~~~~~~~~~~~~~~~~

Working with >65535 CIGAR operations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Due to a design flaw, BAM does not work with CIGAR strings with >65535 operations (SAM and CRAM work). However, for ultra-long nanopore reads minimap2 may align ~1% of read bases with long CIGARs beyond the capability of BAM. If you convert such SAM/CRAM to BAM, Picard and recent samtools will throw an error and abort. Older samtools and other tools may create corrupted BAM.

To avoid this issue, you can add option ``-L`` at the minimap2 command line. This option moves a long CIGAR to the ``CG`` tag and leaves a fully clipped CIGAR at the SAM CIGAR column. Current tools that don’t read CIGAR (e.g. merging and sorting) still work with such BAM records; tools that read CIGAR will effectively ignore these records. It has been decided that future tools will seamlessly recognize long-cigar records generated by option `-L`.

**TL;DR**: if you work with ultra-long reads and use tools that only process BAM files, please add option ``-L``.

The cs optional tag
^^^^^^^^^^^^^^^^^^^

The ``cs`` SAM/PAF tag encodes bases at mismatches and INDELs. It matches regular expression ``/(:[0-9]+|\*[a-z][a-z]|[=\+\-][A-Za-z]+)+/``. Like CIGAR, ``cs`` consists of series of operations. Each leading character specifies the operation; the following sequence is the one involved in the operation.

The ``cs`` tag is enabled by command line option ``--cs``. The following alignment, for example:

.. code::

    CGATCGATAAATAGAGTAG---GAATAGCA
    ||||||   ||||||||||   |||| |||
    CGATCG---AATAGAGTAGGTCGAATtGCA

is represented as ``:6-ata:10+gtc:4*at:3``, where ``:[0-9]+`` represents an identical block, ``-ata`` represents a deletion, ``+gtc`` an insertion and ``*at`` indicates reference base ``a`` is substituted with a query base ``t``. It is similar to the ``MD`` SAM tag but is standalone and easier to parse.

If ``--cs=long`` is used, the ``cs`` string also contains identical sequences in the alignment. The above example will become ``=CGATCG-ata=AATAGAGTAG+gtc=GAAT*at=GCA``. The long form of ``cs`` encodes both reference and query sequences in one string.

Algorithm overview
~~~~~~~~~~~~~~~~~~

In the following, minimap2 command line options have a dash ahead and are highlighted in bold. The description may help to tune minimap2 parameters.

1. Read **-I** [=*4G*] reference bases, extract (**-k**,\ **-w**)-minimizers and index them in a hash table.

2. Read **-K** [=*200M*] query bases. For each query sequence, do step 3 through 7:

3. For each (**-k**,\ **-w**)-minimizer on the query, check against the reference index. If a reference minimizer is not among the top **-f** [=*2e-4*] most frequent, collect its occurrences in the reference, which are called *seeds*.

4. Sort seeds by position in the reference. Chain them with dynamic programming. Each chain represents a potential mapping. For read overlapping, report all chains and then go to step 8. For reference mapping, do step 5 through 7:

5. Let *P* be the set of primary mappings, which is an empty set initially. For each chain from the best to the worst according to their chaining scores: if on the query, the chain overlaps with a chain in *P* by **--mask-level** [=*0.5*] or higher fraction of the shorter chain, mark the chain as *secondary* to the chain in *P*; otherwise, add the chain to *P*.

6. Retain all primary mappings. Also retain up to **-N** [=*5*] top secondary mappings if their chaining scores are higher than **-p** [=*0.8*] of their corresponding primary mappings.

7. If alignment is requested, filter out an internal seed if it potentially leads to both a long insertion and a long deletion. Extend from the left-most seed. Perform global alignments between internal seeds. Split the chain if the accumulative score along the global alignment drops by **-z** [=*400*], disregarding long gaps. Extend from the right-most seed. Output chains and their alignments.

8. If there are more query sequences in the input, go to step 2 until no more queries are left.

9. If there are more reference sequences, reopen the query file from the start and go to step 1; otherwise stop.

Limitations
-----------

- Minimap2 may produce suboptimal alignments through long low-complexity regions where seed positions may be suboptimal. This should not be a big concern because even the optimal alignment may be wrong in such regions.
    </help>
    <citations>
        <citation type="doi">10.1093/bioinformatics/btp324</citation>
        <citation type="doi">10.1093/bioinformatics/btp698</citation>
        <citation type="bibtex">@misc{1708.01492,
Author = {Heng Li},
Title = {Minimap2: fast pairwise alignment for long nucleotide sequences},
Year = {2017},
Eprint = {arXiv:1708.01492},
url = {https://arxiv.org/abs/1708.01492},
}</citation>
    </citations>
</tool>