Mercurial > repos > devteam > bwa
changeset 2:e29bc5c169bc draft
Uploaded
author | devteam |
---|---|
date | Fri, 20 Mar 2015 12:09:08 -0400 |
parents | c71dd035971e |
children | 607ca4b95837 |
files | bwa-mem.xml bwa.xml bwa_macros.xml shed_upload.tar.gz test-data/bwa-aln-test3.bam test-data/bwa-mem-test2.bam tool_dependencies.xml |
diffstat | 7 files changed, 403 insertions(+), 407 deletions(-) [+] |
line wrap: on
line diff
--- a/bwa-mem.xml Wed Jan 14 13:51:07 2015 -0500 +++ b/bwa-mem.xml Fri Mar 20 12:09:08 2015 -0400 @@ -1,31 +1,29 @@ <?xml version="1.0"?> -<tool id="bwa_mem" name="BWA-MEM" version="0.1"> - +<tool id="bwa_mem" name="Map with BWA-MEM" version="0.2.1"> + <description>- map medium and long reads (> 100 bp) against reference genome</description> <macros> <import>bwa_macros.xml</import> </macros> - <requirements> <requirement type="package" version="0.7.10.039ea20639">bwa</requirement> <requirement type="package" version="1.1">samtools</requirement> </requirements> - <description>- map medium and long reads (> 100 bp) against reference genome</description> + <stdio> + <exit_code range="1:" /> + </stdio> <command> - #set $reference_fasta_filename = "localref.fa" - + #if str( $reference_source.reference_source_selector ) == "history": - ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" && - + ## The following shell commands decide with of the BWA indexing algorithms (IS or BWTSW) will be run ## depending ob the size of the input FASTA dataset - ( size=`stat -c %s "${reference_fasta_filename}" 2>/dev/null`; ## Linux - if [ $? -eq 0 ]; + if [ $? -eq 0 ]; then - if [ \$size -lt 2000000000 ]; + if [ "\$size" -lt 2000000000 ]; then bwa index -a is "${reference_fasta_filename}"; echo "Generating BWA index with is algorithm"; @@ -35,10 +33,10 @@ fi; fi; - eval \$(stat -s "${reference_fasta_filename}"); ## OSX - if [ $? 
-eq 0 ]; + eval \$(stat -s "${reference_fasta_filename}" 2>/dev/null); ## OSX + if [ -n "\$st_size" ]; then - if [ \$st_size -lt 2000000000 ]; + if [ "\$st_size" -lt 2000000000 ]; then bwa index -a is "${reference_fasta_filename}"; echo "Generating BWA index with is algorithm"; @@ -48,31 +46,28 @@ fi; fi; ) && - + #else: #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path ) #end if - + ## Begin BWA-MEM command line - + bwa mem -t "\${GALAXY_SLOTS:-1}" - -v 1 ## Verbosity is set to 1 (errors only) - + -v 1 ## Verbosity is set to 1 (errors only) + #if str( $fastq_input.fastq_input_selector ) == "paired_iv": ## For interleaved fastq files set -p option -p #if str( $fastq_input.iset_stats ): ## check that insert statistics is used -I "${fastq_input.iset_stats}" #end if #end if - + #if str( $analysis_type.analysis_type_selector ) == "pacbio": - -x - + -x pacbio #elif str( $analysis_type.analysis_type_selector ) == "full": - - #if str( $analysis_type.algorithmic_options.algorithmic_options_selector ) == "True": ## Algorithmic options - + #if str( $analysis_type.algorithmic_options.algorithmic_options_selector ) == "set": ## Algorithmic options -k "${analysis_type.algorithmic_options.k}" -w "${analysis_type.algorithmic_options.w}" -d "${analysis_type.algorithmic_options.d}" @@ -85,22 +80,18 @@ ${analysis_type.algorithmic_options.S} ${analysis_type.algorithmic_options.P} ${analysis_type.algorithmic_options.e} - #end if - - #if str( $analysis_type.scoring_options.scoring_options_selector ) == "True": ## Scoring options - + + #if str( $analysis_type.scoring_options.scoring_options_selector ) == "set": ## Scoring options -A "${analysis_type.scoring_options.A}" -B "${analysis_type.scoring_options.B}" -O "${analysis_type.scoring_options.O}" -E "${analysis_type.scoring_options.E}" -L "${analysis_type.scoring_options.L}" -U "${analysis_type.scoring_options.U}" - #end if - - #if str( $analysis_type.io_options.io_options_selector ) == "True": ## IO options 
- + + #if str( $analysis_type.io_options.io_options_selector ) == "set": ## IO options -T "${analysis_type.io_options.T}" -h "${analysis_type.io_options.h}" ${analysis_type.io_options.a} @@ -108,51 +99,39 @@ ${analysis_type.io_options.V} ${analysis_type.io_options.Y} ${analysis_type.io_options.M} - #end if - + #end if - - #if str( $rg.rg_selector ) == "True": - -R "@RG\tID:$rg.ID\tSM:$rg.SM" - #end if - + + #if str( $rg.rg_selector ) == "set": + @set_rg_string@ + -R '$rg_string' + #end if + #if str( $fastq_input.fastq_input_selector ) == "paired": - - #if str( $fastq_input.iset_stats ): ## check that insert statistics is used - -I "${fastq_input.iset_stats}" - #end if - - "${reference_fasta_filename}" - - "${fastq_input.fastq_input1}" "${fastq_input.fastq_input2}" - - #elif str( $fastq_input.fastq_input_selector ) == "paired_collection": - #if str( $fastq_input.iset_stats ): ## check that insert statistics is used -I "${fastq_input.iset_stats}" #end if - "${reference_fasta_filename}" - - "${fastq_input.fastq_input1.forward}" "${fastq_input.fastq_input1.reverse}" - - #else: - - + "${fastq_input.fastq_input1}" "${fastq_input.fastq_input2}" + #elif str( $fastq_input.fastq_input_selector ) == "paired_collection": + #if str( $fastq_input.iset_stats ): ## check that insert statistics is used + -I "${fastq_input.iset_stats}" + #end if + "${reference_fasta_filename}" - + "${fastq_input.fastq_input1.forward}" "${fastq_input.fastq_input1.reverse}" + #else: + "${reference_fasta_filename}" "${fastq_input.fastq_input1}" - #end if - + | samtools view -Sb - > temporary_bam_file.bam && - + samtools sort -f temporary_bam_file.bam ${bam_output} - </command> - + <inputs> <conditional name="reference_source"> @@ -169,7 +148,7 @@ <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/> </param> </when> - <when value="history"> + <when value="history"> <param name="ref_file" type="data" format="fasta" 
label="Use the following dataset as the reference sequence" help="You can upload a FASTA sequence to the history and use it as reference" /> </when> </conditional> @@ -188,7 +167,7 @@ <valid initial="string.digits"><add value=","/> </valid> </sanitizer> </param> - </when> + </when> <when value="single"> <param name="fastq_input1" type="data" format="fastqsanger" label="Select fastq dataset" help="Specify dataset with single reads"/> </when> @@ -201,37 +180,22 @@ </param> </when> <when value="paired_iv"> - <param name="fastq_input1" type="data" format="fastqsanger" label="Select fastq dataset" help="Specify dataset with interleaved reads"/> + <param name="fastq_input1" type="data" format="fastqsanger" label="Select fastq dataset" help="Specify dataset with interleaved reads"/> <param name="iset_stats" type="text" optional="True" size="10" label="Enter mean, standerd deviation, max, and min for insert lengths." help="-I; This parameter is only used for paired reads. Only mean is required while sd, max, and min will be inferred. Examples: both "250" and "250,25" will work while "250,,10" will not. See below for details."> <sanitizer invalid_char=""> <valid initial="string.digits"><add value=","/> </valid> </sanitizer> - </param> + </param> </when> </conditional> - - - <conditional name="rg"> - <param name="rg_selector" type="select" label="Set read groups information?" help="Specifying readgroup information can greatly simplify your downstream analyses by allowing combining multiple datasets. 
See help below for more details"> - <option value="set">Set</option> - <option value="do_not_set" selected="True">Do not set</option> - </param> - <when value="set"> - <param name="ID" type="text" value="readgroup1" size="20" label="Specify readgroup ID" help="This value must be unique among multiple samples in your experiment"> - </param> - <param name="SM" type="text" value="blood" size="20" label="Specify readgroup sample name (SM)" help="This value should be descriptive"> - </param> - </when> - <when value="do_not_set"> - <!-- do nothing --> - </when> - </conditional> - + + <expand macro="readgroup_params" /> + <conditional name="analysis_type"> <param name="analysis_type_selector" type="select" label="Select analysis mode"> - <option value="illumina">1.Simple Illumina mode</option> - <option value="pacbio">2.PacBio mode</option> - <option value="full">3.Full list of options</option> + <option value="illumina">Simple Illumina mode</option> + <option value="pacbio">PacBio mode (-x pacbio)</option> + <option value="full">Full list of options</option> </param> <when value="illumina"> <!-- do nothing --> @@ -246,67 +210,67 @@ <option value="do_not_set" selected="True">Do not set</option> </param> <when value="set"> - <param name="k" type="integer" value="19" label="minimum seed length" help="-k; default=19"/> - <param name="w" type="integer" value="100" label="band width for banded alignment" help="-w; default=100"/> - <param name="d" type="integer" value="100" label="off-diagonal X-dropoff" help="-d; default=100"/> - <param name="r" type="float" value="1.5" label="look for internal seeds inside a seed longer than -k * THIS VALUE" help="-r; default=1.5"/> - <param name="y" type="integer" value="0" label="find maximum exact matches (MEMs) longer than -k * -r with size less than THIS VALUE" help="-y; default=0"/> - <param name="c" type="integer" value="500" label="skip seeds with more than that many occurrences" help="-c; default=500"/> - <param name="D" type="float" 
value="0.5" label="drop chains shorter than this fraction of the longest overlapping chain" help="-D; default=0.5"/> - <param name="W" type="integer" value="0" label="discard a chain if seeded bases shorter than" help="-W; default=0"/> - <param name="m" type="integer" value="50" label="perform at most this many rounds of mate rescues for each read" help="-m; default=50"/> - <param name="S" type="boolean" truevalue="-S" falsevalue="" label="skip mate rescue" help="-S"/> - <param name="P" type="boolean" truevalue="-P" falsevalue="" label="skip pairing; mate rescue performed unless -S also in use" help="-P"/> - <param name="e" type="boolean" truevalue="-e" falsevalue="" label="discard full-length exact matches" help="-e"/> + <param name="k" type="integer" value="19" label="Minimum seed length" help="-k; default=19"/> + <param name="w" type="integer" value="100" label="Band width for banded alignment" help="-w; default=100"/> + <param name="d" type="integer" value="100" label="Off-diagonal X-dropoff" help="-d; default=100"/> + <param name="r" type="float" value="1.5" label="Look for internal seeds inside a seed longer than -k * THIS VALUE" help="-r; default=1.5; This is a key heuristic parameter for tuning the performance. 
Larger value yields fewer seeds, which leads to faster alignment speed but lower accuracy" /> + <param name="y" type="integer" value="20" label="Seed occurrence for the 3rd round seeding" help="-y; default=20" /> + <param name="c" type="integer" value="500" label="Skip seeds with more than that many occurrences" help="-c; default=500"/> + <param name="D" type="float" value="0.5" label="Drop chains shorter than this fraction of the longest overlapping chain" help="-D; default=0.5"/> + <param name="W" type="integer" value="0" label="Discard a chain if seeded bases shorter than THIS VALUE" help="-W; default=0"/> + <param name="m" type="integer" value="50" label="Perform at most this many rounds of mate rescues for each read" help="-m; default=50"/> + <param name="S" type="boolean" truevalue="-S" falsevalue="" label="Skip mate rescue" help="-S"/> + <param name="P" type="boolean" truevalue="-P" falsevalue="" label="Skip pairing; mate rescue performed unless -S also in use" help="-P"/> + <param name="e" type="boolean" truevalue="-e" falsevalue="" label="Discard full-length exact matches" help="-e"/> </when> <when value="do_not_set"> <!-- do nothing --> </when> </conditional> - + <conditional name="scoring_options"> <param name="scoring_options_selector" type="select" label="Set scoring options?" 
help="Sets -A, -B, -O, -E, -L, and -U options."> <option value="set">Set</option> <option value="do_not_set" selected="True">Do not set</option> </param> <when value="set"> - <param name="A" type="integer" value="1" label="score for a sequence match" help="-A; scales options -T, -d, -B, -O, -E, -L, and -U; default=1"/> - <param name="B" type="integer" value="4" label="penalty for mismatch" help="-B; default=4"/> - <param name="O" type="text" value="6,6" label="gap open penalty for deletions and insertions" help="-O; default=6,6"> + <param name="A" type="integer" value="1" label="Score for a sequence match" help="-A; scales options -T, -d, -B, -O, -E, -L, and -U unless overridden; default=1"/> + <param name="B" type="integer" value="4" label="Penalty for a mismatch" help="-B; default=4"/> + <param name="O" type="text" value="6,6" label="Gap open penalties for deletions and insertions" help="-O; default=6,6"> <sanitizer invalid_char=""> <valid initial="string.digits"><add value=","/> </valid> </sanitizer> </param> - <param name="E" type="text" value="1,1" label="gap extension penalty; a gap of size k cost '-O + -E*k' " help="-E; default=1,1"> + <param name="E" type="text" value="1,1" label="Gap extension penalties; a gap of size k cost '-O + -E*k'. If two numbers are specified, the first is the penalty of extending a deletion and the second for extending an insertion" help="-E; default=1,1"> <sanitizer invalid_char=""> <valid initial="string.digits"><add value=","/> </valid> </sanitizer> </param> - <param name="L" type="text" value="5,5" label="penalty for 5'-end and 3'-end clipping" help="-L; default=5,5"> + <param name="L" type="text" value="5,5" label="Penalties for 5'-end and 3'-end clipping" help="-L; default=5,5; When performing Smith-Waterman extension, BWA-MEM keeps track of the best score reaching the end of query. If this score is larger than the best Smith-Waterman score minus the clipping penalty, clipping will not be applied. 
Note that in this case, the SAM AS tag reports the best Smith-Waterman score; clipping penalty is not deduced"> <sanitizer invalid_char=""> <valid initial="string.digits"><add value=","/> </valid> </sanitizer> </param> - <param name="U" type="integer" value="17" label="penalty for an unpaired read pair" help="-U; default=17"/> + <param name="U" type="integer" value="17" label="Penalty for an unpaired read pair" help="-U; default=17"/> </when> <when value="do_not_set"> <!-- do nothing --> </when> </conditional> - + <conditional name="io_options"> <param name="io_options_selector" type="select" label="Set input/output options" help="Sets -T, -h, -a, -C, -V, -Y, and -M options."> <option value="set">Set</option> <option value="do_not_set" selected="True">Do not set</option> </param> <when value="set"> - <param name="T" type="integer" value="30" label="minimum score to output" help="-T; default=30"/> - <param name="h" type="integer" value="5" label="if there are this many hits with score >80% of the max score, output all in XA tag" help="-h; default=5"/> - <param name="a" type="boolean" truevalue="-a" falsevalue="" label="output all alignments for single-ends or unpaired paired-ends" help="-a"/> - <param name="C" type="boolean" truevalue="-C" falsevalue="" label="append FASTA/FASTQ comment to BAM output" help="-C"/> - <param name="V" type="boolean" truevalue="-V" falsevalue="" label="output the reference FASTA header in the XR tag" help="-C"/> - <param name="Y" type="boolean" truevalue="-Y" falsevalue="" label="use soft clipping for supplementary alignments" help="-Y"/> - <param name="M" type="boolean" truevalue="-M" falsevalue="" label="mark shorter split hits as secondary" help="-M"/> + <param name="T" type="integer" value="30" label="Minimum score to output" help="-T; default=30"/> + <param name="h" type="integer" value="5" label="If there are less than THIS VALUE hits with score >80% of the max score, output them all in the XA tag" help="-h; default=5" /> + <param 
name="a" type="boolean" truevalue="-a" falsevalue="" label="Output all alignments for single-ends or unpaired paired-ends" help="-a; These alignments will be flagged as secondary alignments"/> + <param name="C" type="boolean" truevalue="-C" falsevalue="" label="Append FASTA/FASTQ comment to BAM output" help="-C"/> + <param name="V" type="boolean" truevalue="-V" falsevalue="" label="Output the reference FASTA header in the XR tag" help="-C"/> + <param name="Y" type="boolean" truevalue="-Y" falsevalue="" label="Use soft clipping for supplementary alignments" help="-Y; By default, BWA-MEM uses soft clipping for the primary alignment and hard clipping for supplementary alignments" /> + <param name="M" type="boolean" truevalue="-M" falsevalue="" label="Mark shorter split hits of a chimeric alignment in the FLAG field as 'secondary alignment' instead of 'supplementary alignment'" help="-M; For Picard<1.96 compatibility" /> </when> <when value="do_not_set"> <!-- do nothing --> @@ -315,11 +279,11 @@ </when> </conditional> </inputs> - + <outputs> <data format="bam" name="bam_output" label="${tool.name} on ${on_string} (mapped reads in BAM format)"/> </outputs> - + <tests> <test> <param name="reference_source_selector" value="history" /> @@ -330,12 +294,19 @@ <param name="analysis_type_selector" value="illumina"/> <output name="bam_output" ftype="bam" file="bwa-mem-test1.bam" lines_diff="2" /> </test> + <test> + <param name="reference_source_selector" value="history" /> + <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/> + <param name="fastq_input_selector" value="paired"/> + <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/> + <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/> + <param name="rg_selector" value="set"/> + <param name="ID" value="rg1"/> + <param name="analysis_type_selector" value="illumina"/> + <output name="bam_output" ftype="bam" file="bwa-mem-test2.bam" lines_diff="2" /> + </test> </tests> - 
<stdio> - <exit_code range="1:" /> - </stdio> <help> - **What is does** From http://arxiv.org/abs/1303.3997: @@ -358,7 +329,7 @@ 1. *Simple Illumina mode*: The simplest possible bwa mem application in which it alignes single or paired-end data to reference using default parameters. It is equivalent to the following command: bwa mem <reference index> <fastq dataset1> [fastq dataset2] 2. *PacBio mode*: The mode adjusted specifically for mapping of long PacBio subreads. Equivalent to the following command: bwa mem -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 <reference index> <PacBio dataset in fastq format> 3. *Full list of options*: Allows access to all options through Galaxy interface. - + ------ **BWA MEM options** @@ -407,16 +378,12 @@ specify the mean, standard deviation (10% of the mean if absent), max (4 sigma from the mean if absent) and min of the insert size distribution. FR orientation only. [inferred] - @dataset_collections@ @RG@ @info@ - - - </help> <citations> <citation type="doi">10.1093/bioinformatics/btp324</citation>
--- a/bwa.xml Wed Jan 14 13:51:07 2015 -0500 +++ b/bwa.xml Fri Mar 20 12:09:08 2015 -0400 @@ -1,220 +1,10 @@ <?xml version="1.0"?> -<tool id="bwa" name="BWA" version="0.1"> - - <requirements> - <requirement type="package" version="0.7.10.039ea20639">bwa</requirement> - <requirement type="package" version="1.1">samtools</requirement> - </requirements> +<tool id="bwa" name="Map with BWA" version="0.1"> <description>- map short reads (< 100 bp) against reference genome</description> - <command> - - #set $reference_fasta_filename = "localref.fa" - - #if str( $reference_source.reference_source_selector ) == "history": - - ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" && - - ## The following shell commands decide with of the BWA indexing algorithms (IS or BWTSW) will be run - ## depending ob the size of the input FASTA dataset - - ( - size=`stat -c %s "${reference_fasta_filename}" 2>/dev/null`; ## Linux - if [ $? -eq 0 ]; - then - if [ \$size -lt 2000000000 ]; - then - bwa index -a is "${reference_fasta_filename}"; - else - bwa index -a bwtsw "${reference_fasta_filename}"; - fi; - fi; - - eval \$(stat -s "${reference_fasta_filename}"); ## OSX - if [ $? 
-eq 0 ]; - then - if [ \$st_size -lt 2000000000 ]; - then - bwa index -a is "${reference_fasta_filename}"; - echo "Generating BWA index with is algorithm"; - else - bwa index -a bwtsw "${reference_fasta_filename}"; - echo "Generating BWA index with bwtsw algorithm"; - fi; - fi; - ) && - - #else: - #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path ) - #end if - - ## Begin bwa command line - -####### Fastq paired - - #if str( $input_type.input_type_selector ) == "paired" or str( $input_type.input_type_selector ) == "paired_collection": - - bwa aln - -t "\${GALAXY_SLOTS:-1}" - - @command_options@ - - "${reference_fasta_filename}" - - #if str( $input_type.input_type_selector ) == "paired_collection": - "${input_type.fastq_input1.forward}" - #else - "${input_type.fastq_input1}" - #end if - - > first.sai && - - bwa aln - -t "\${GALAXY_SLOTS:-1}" - - @command_options@ - - "${reference_fasta_filename}" - - #if str( $input_type.input_type_selector ) == "paired_collection": - "${input_type.fastq_input1.reverse}" - #else - "${input_type.fastq_input2}" - #end if - - > second.sai && - - bwa sampe - - #if str( $input_type.adv_pe_options.adv_pe_options_selector) == "True": - - -a ${$input_type.adv_pe_options.a} - -o ${$input_type.adv_pe_options.o} - -n ${$input_type.adv_pe_options.n} - -N ${$input_type.adv_pe_options.N} - - #end if - - @read_group_options@ - - #if str( $input_type.input_type_selector ) == "paired_collection": - - "${reference_fasta_filename}" first.sai second.sai "${input_type.fastq_input1.forward}" "${input_type.fastq_input1.reverse}" - - #else: - - "${reference_fasta_filename}" first.sai second.sai "${input_type.fastq_input1}" "${input_type.fastq_input2}" - - #end if - -####### Fastq single - - #elif str( $input_type.input_type_selector ) == "single": - - bwa aln - -t "\${GALAXY_SLOTS:-1}" - - @command_options@ - - "${reference_fasta_filename}" - "${input_type.fastq_input1}" - > first.sai && - - bwa samse - - #if str( 
$input_type.adv_se_options.adv_se_options_selector) == "True": - - -n ${$input_type.adv_se_options.n} - - #end if - - @read_group_options@ - - "${reference_fasta_filename}" first.sai "${input_type.fastq_input1}" - -####### BAM paired - - #elif str( $input_type.input_type_selector ) == "paired_bam": - - bwa aln - -t "\${GALAXY_SLOTS:-1}" - -b - -1 - - @command_options@ - - "${reference_fasta_filename}" - "${input_type.bam_input}" - > first.sai && - - bwa aln - -t "\${GALAXY_SLOTS:-1}" - -b - -2 - @command_options@ - "${reference_fasta_filename}" - "${input_type.bam_input}" - > second.sai && - - bwa sampe - - #if str( $input_type.adv_bam_pe_options.adv_pe_options_selector) == "True": - - -a ${$input_type.adv_bam_pe_options.a} - -o ${$input_type.adv_bam_pe_options.o} - -n ${$input_type.adv_bam_pe_options.n} - -N ${$input_type.adv_bam_pe_options.N} - - #end if - - @read_group_options@ - - "${reference_fasta_filename}" first.sai second.sai "${input_type.bam_input}" "${input_type.bam_input}" - -####### Fastq single ------------ to do next - - #elif str( $input_type.input_type_selector ) == "single_bam": - - bwa aln - -t "\${GALAXY_SLOTS:-1}" - -b - -0 - - @command_options@ - - "${reference_fasta_filename}" - "${input_type.bam_input}" - > first.sai && - - bwa samse - - #if str( $input_type.adv_bam_se_options.adv_se_options_selector) == "True": - - -n ${$input_type.adv_bam_se_options.n} - - #end if - - @read_group_options@ - - "${reference_fasta_filename}" first.sai "${input_type.bam_input}" - - #end if - - | samtools view -Sb - > temporary_bam_file.bam && - - samtools sort -f temporary_bam_file.bam ${bam_output} - - - </command> - <macros> <import>bwa_macros.xml</import> - <token name="@command_options@"> - #if str( $analysis_type.analysis_type_selector ) == "illumina": - - ## do nothing -> just align with default parameters - - #elif str( $analysis_type.analysis_type_selector ) == "full": - + <token name="@command_options@"> + #if str( 
$analysis_type.analysis_type_selector ) == "full": -n ${analysis_type.n} -o ${analysis_type.o} -e ${analysis_type.e} @@ -228,25 +18,23 @@ -E ${analysis_type.E} -R ${analysis_type.R} -q ${analysis_type.q} - + #if str( $analysis_type.B ): -B ${analysis_type.B} #end if - + #if str( $analysis_type.L ): -B ${analysis_type.L} #end if - #end if + #end if </token> <token name="@read_group_options@"> - - #if str( $rg.rg_selector ) == "True": - - -r "@RG\tID:$rg.ID\tSM:$rg.SM" - + #if str( $rg.rg_selector ) == "set": + @set_rg_string@ + -r '$rg_string' #end if </token> - + <xml name="advanced_pe_options"> <param name="adv_pe_options_selector" type="select" label="Set advanced paired end options?" help="Provides additional controls"> <option value="set">Set</option> @@ -277,6 +65,190 @@ </xml> </macros> + <requirements> + <requirement type="package" version="0.7.10.039ea20639">bwa</requirement> + <requirement type="package" version="1.1">samtools</requirement> + </requirements> + <stdio> + <exit_code range="1:" /> + </stdio> + <command> + #set $reference_fasta_filename = "localref.fa" + + #if str( $reference_source.reference_source_selector ) == "history": + ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" && + + ## The following shell commands decide with of the BWA indexing algorithms (IS or BWTSW) will be run + ## depending ob the size of the input FASTA dataset + ( + size=`stat -c %s "${reference_fasta_filename}" 2>/dev/null`; ## Linux + if [ $? 
-eq 0 ]; + then + if [ "\$size" -lt 2000000000 ]; + then + bwa index -a is "${reference_fasta_filename}"; + else + bwa index -a bwtsw "${reference_fasta_filename}"; + fi; + fi; + + eval \$(stat -s "${reference_fasta_filename}" 2>/dev/null); ## OSX + if [ -n "\$st_size" ]; + then + if [ "\$st_size" -lt 2000000000 ]; + then + bwa index -a is "${reference_fasta_filename}"; + echo "Generating BWA index with is algorithm"; + else + bwa index -a bwtsw "${reference_fasta_filename}"; + echo "Generating BWA index with bwtsw algorithm"; + fi; + fi; + ) && + #else: + #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path ) + #end if + + ## Begin bwa command line + +####### Fastq paired + + #if str( $input_type.input_type_selector ) == "paired" or str( $input_type.input_type_selector ) == "paired_collection": + bwa aln + -t "\${GALAXY_SLOTS:-1}" + + @command_options@ + + "${reference_fasta_filename}" + + #if str( $input_type.input_type_selector ) == "paired_collection": + "${input_type.fastq_input1.forward}" + #else + "${input_type.fastq_input1}" + #end if + + > first.sai && + + bwa aln + -t "\${GALAXY_SLOTS:-1}" + + @command_options@ + + "${reference_fasta_filename}" + + #if str( $input_type.input_type_selector ) == "paired_collection": + "${input_type.fastq_input1.reverse}" + #else + "${input_type.fastq_input2}" + #end if + + > second.sai && + + bwa sampe + + #if str( $input_type.adv_pe_options.adv_pe_options_selector) == "True": + -a ${$input_type.adv_pe_options.a} + -o ${$input_type.adv_pe_options.o} + -n ${$input_type.adv_pe_options.n} + -N ${$input_type.adv_pe_options.N} + #end if + + @read_group_options@ + + #if str( $input_type.input_type_selector ) == "paired_collection": + "${reference_fasta_filename}" first.sai second.sai "${input_type.fastq_input1.forward}" "${input_type.fastq_input1.reverse}" + #else: + "${reference_fasta_filename}" first.sai second.sai "${input_type.fastq_input1}" "${input_type.fastq_input2}" + #end if + +####### Fastq 
single + + #elif str( $input_type.input_type_selector ) == "single": + bwa aln + -t "\${GALAXY_SLOTS:-1}" + + @command_options@ + + "${reference_fasta_filename}" + "${input_type.fastq_input1}" + > first.sai && + + bwa samse + + #if str( $input_type.adv_se_options.adv_se_options_selector) == "True": + -n ${$input_type.adv_se_options.n} + #end if + + @read_group_options@ + + "${reference_fasta_filename}" first.sai "${input_type.fastq_input1}" + +####### BAM paired + + #elif str( $input_type.input_type_selector ) == "paired_bam": + bwa aln + -t "\${GALAXY_SLOTS:-1}" + -b + -1 + + @command_options@ + + "${reference_fasta_filename}" + "${input_type.bam_input}" + > first.sai && + + bwa aln + -t "\${GALAXY_SLOTS:-1}" + -b + -2 + @command_options@ + "${reference_fasta_filename}" + "${input_type.bam_input}" + > second.sai && + + bwa sampe + + #if str( $input_type.adv_bam_pe_options.adv_pe_options_selector) == "True": + -a ${$input_type.adv_bam_pe_options.a} + -o ${$input_type.adv_bam_pe_options.o} + -n ${$input_type.adv_bam_pe_options.n} + -N ${$input_type.adv_bam_pe_options.N} + #end if + + @read_group_options@ + + "${reference_fasta_filename}" first.sai second.sai "${input_type.bam_input}" "${input_type.bam_input}" + +####### Fastq single ------------ to do next + + #elif str( $input_type.input_type_selector ) == "single_bam": + bwa aln + -t "\${GALAXY_SLOTS:-1}" + -b + -0 + + @command_options@ + + "${reference_fasta_filename}" + "${input_type.bam_input}" + > first.sai && + + bwa samse + + #if str( $input_type.adv_bam_se_options.adv_se_options_selector) == "True": + -n ${$input_type.adv_bam_se_options.n} + #end if + + @read_group_options@ + + "${reference_fasta_filename}" first.sai "${input_type.bam_input}" + #end if + + | samtools view -Sb - > temporary_bam_file.bam && + + samtools sort -f temporary_bam_file.bam ${bam_output} + </command> + <inputs> <conditional name="reference_source"> @@ -293,7 +265,7 @@ <validator type="no_options" message="A built-in reference genome 
is not available for the build associated with the selected input file"/> </param> </when> - <when value="history"> + <when value="history"> <param name="ref_file" type="data" format="fasta" label="Use the following dataset as the reference sequence" help="You can upload a FASTA sequence to the history and use it as reference" /> </when> </conditional> @@ -309,69 +281,54 @@ <param name="fastq_input1" type="data" format="fastqsanger" label="Select first set of reads" help="Specify dataset with forward reads"/> <param name="fastq_input2" type="data" format="fastqsanger" label="Select second set of reads" help="Specify dataset with reverse reads"/> <conditional name="adv_pe_options"> - + <expand macro="advanced_pe_options" /> - + </conditional> </when> - + <when value="paired_collection"> <param name="fastq_input1" format="fastqsanger" type="data_collection" collection_type="paired" label="Select a paired collection" help="See help section for an explanation of dataset collections"/> <conditional name="adv_pe_options"> - + <expand macro="advanced_pe_options" /> - + </conditional> </when> - - + <when value="single"> <param name="fastq_input1" type="data" format="fastqsanger" label="Select fastq dataset" help="Specify dataset with single reads"/> <conditional name="adv_se_options"> - + <expand macro="advanced_se_options" /> - + </conditional> </when> - + <!-- the difference between single and paired bams is in the <command> tag portion and realated to -0, -1, and -2 options --> - + <when value="paired_bam"> <param name="bam_input" type="data" format="bam" label="Select BAM dataset" help="Specify BAM dataset with paired reads"/> <conditional name="adv_bam_pe_options"> - + <expand macro="advanced_pe_options" /> - + </conditional> </when> - + <when value="single_bam"> <param name="bam_input" type="data" format="bam" label="Select BAM dataset" help="Specify BAM dataset with single reads"/> <conditional name="adv_bam_se_options"> - + <expand macro="advanced_se_options" /> - 
+ </conditional> </when> - + </conditional> - - <conditional name="rg"> - <param name="rg_selector" type="select" label="Set readgroups information?" help="Specifying readgroup information can greatly simplify your downstream analyses by allowing combining multiple datasets. See help below for more details"> - <option value="set">Set</option> - <option value="do_not_set" selected="True">Do not set</option> - </param> - <when value="set"> - <param name="ID" type="text" value="readgroup1" size="20" label="Specify readgroup ID" help="This value must be unique among multiple samples in your experiment"> - </param> - <param name="SM" type="text" value="blood" size="20" label="Specify readgroup sample name (SM)" help="This value should be descriptive"> - </param> - </when> - <when value="do_not_set"> - <!-- do nothing --> - </when> - </conditional> - + + <expand macro="readgroup_params" /> + <conditional name="analysis_type"> <param name="analysis_type_selector" type="select" label="Select analysis mode"> <option value="illumina">1.Simple Illumina mode</option> @@ -380,7 +337,7 @@ <when value="illumina"> <!-- do nothing --> </when> - <when value="full"> + <when value="full"> <param name="n" type="text" value="0.04" label="maximum edit distance if the value is integer, or the fraction of missing alignments given 2% uniform base error rate if float. In the latter case, the maximum edit distance is automatically chosen for different read lengths." 
help="aln -n; default=0.04"/> <param name="o" type="integer" value="1" label="maximum number or gap openings" help="aln -o; default=1"/> <param name="e" type="integer" value="-1" label="maximum number of gap extensions" help="aln -e; -1 disables long gaps and invokes k-difference mode; default=-1"/> @@ -395,15 +352,15 @@ <param name="R" type="integer" value="30" label="stop searching when there are more than this value of equally best hits" help="aln -R; default=30"/> <param name="q" type="integer" value="0" label="quality threshold for read trimming down to 35bp" help="aln -q; default=0"/> <param name="B" type="integer" optional="True" label="length of barcode" help="aln -B; optional parameter"/> - <param name="L" type="float" optional="True" label="log-scaled gap penalty for long deletions" help="aln -L; optional parameter"/> + <param name="L" type="float" optional="True" label="log-scaled gap penalty for long deletions" help="aln -L; optional parameter"/> </when> </conditional> </inputs> - + <outputs> <data format="bam" name="bam_output" label="${tool.name} on ${on_string} (mapped reads in BAM format)"/> </outputs> - + <tests> <test> <param name="reference_source_selector" value="history" /> @@ -422,12 +379,19 @@ <param name="analysis_type_selector" value="illumina"/> <output name="bam_output" ftype="bam" file="bwa-aln-test2.bam" lines_diff="2" /> </test> + <test> + <param name="reference_source_selector" value="history" /> + <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/> + <param name="input_type_selector" value="paired"/> + <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/> + <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/> + <param name="rg_selector" value="set"/> + <param name="ID" value="rg1"/> + <param name="analysis_type_selector" value="illumina"/> + <output name="bam_output" ftype="bam" file="bwa-aln-test3.bam" lines_diff="2" /> + </test> </tests> - <stdio> - <exit_code range="1:" /> 
- </stdio> <help> - **What it does** BWA is a software package for mapping low-divergent sequences against a large reference genome, such as the human genome. The bwa-aln algorithm is designed for Illumina sequence reads up to 100bp. For longer reads use BWA-MEM algorithm distributed as separate Galaxy tool. @@ -437,7 +401,7 @@ - bwa aln - actual mapper placing reads onto the reference sequence - bwa samse - post-processor converting suffix array coordinates into genome coordinates in SAM format for single reads - bwa sampe - post-processor for paired reads - + Galaxy implementation takes fastq or BAM (unaligned BAM) datasets as input and produces output in BAM (not SAM; in reality SAM produced by the bwa is converted to BAM on the fly by samtools view command) format, which can be further processed using various BAM utilities existing in Galaxy (BAMTools, SAMTools, Picard). ----- @@ -448,7 +412,7 @@ 1. *Simple Illumina mode*: The simplest possible bwa mem application in which it aligns single or paired-end data to reference using default parameters. It is equivalent to the following command: bwa mem <reference index> <fastq dataset1> [fastq dataset2] 2. *Full list of options*: Allows access to all options through Galaxy interface. - + ------ **bwa-aln options** @@ -490,14 +454,12 @@ -n INT maximum hits to output for paired reads [3] -r STR read group header line [null] - @dataset_collections@ @RG@ @info@ - </help> <citations> <citation type="doi">10.1093/bioinformatics/btp324</citation>
--- a/bwa_macros.xml Wed Jan 14 13:51:07 2015 -0500 +++ b/bwa_macros.xml Fri Mar 20 12:09:08 2015 -0400 @@ -1,6 +1,37 @@ <macros> + + <token name="@set_rg_string@"> + #set $rg_string = "@RG\tID:" + str($rg.ID) + "\tSM:" + str($rg.SM) + "\tPL:" + str($rg.PL) + #if $rg.LB + #set $rg_string += "\tLB:$rg.LB" + #end if + #if $rg.CN + #set $rg_string += "\tCN:$rg.CN" + #end if + #if $rg.DS + #set $rg_string += "\tDS:$rg.DS" + #end if + #if $rg.DT + #set $rg_string += "\tDT:$rg.DT" + #end if + #if $rg.FO + #set $rg_string += "\tFO:$rg.FO" + #end if + #if $rg.KS + #set $rg_string += "\tKS:$rg.KS" + #end if + #if $rg.PG + #set $rg_string += "\tPG:$rg.PG" + #end if + #if str($rg.PI) + #set $rg_string += "\tPI:$rg.PI" + #end if + #if $rg.PU + #set $rg_string += "\tPU:$rg.PU" + #end if + </token> - <token name="@RG@"> + <token name="@RG@"> ----- .. class:: warningmark @@ -8,9 +39,9 @@ **Read Groups are Important!** One of the recommended best practices in NGS analysis is adding read group information to BAM files. You can do thid directly in BWA interface using the -**Specify readgroup information?** widget. If you are not familiar with readgroups you shold know that this is effectively a way to tag reads with an additional ID. +**Specify read group information?** widget. If you are not familiar with read groups you shold know that this is effectively a way to tag reads with an additional ID. This allows you to combine BAM files from, for example, multiple BWA runs into a single dataset. This significantly simplifies downstream processing as -instead of dealing with multiple datasets you only have to handle only one. This is possible because the readgroup information allows you to identify +instead of dealing with multiple datasets you only have to handle only one. This is possible because the read group information allows you to identify data from different experiments even if they are combined in one file. 
Many downstream analysis tools such as variant callers (e.g., FreeBayes or Naive Variant Caller present in Galaxy) are aware of read groups and will automatically generate calls for each individual sample even if they are combined within a single file. @@ -51,8 +82,8 @@ @RG ID:FLOWCELL2.LANE4 PL:illumina LB:LIB-KID-2 SM:KID PI:400 Note the hierarchical relationship between read groups (unique for each lane) to libraries (sequenced on two lanes) and samples (across four lanes, two lanes for each library). - </token> - <token name="@info@"> + </token> + <token name="@info@"> ----- .. class:: infomark @@ -66,9 +97,9 @@ 3. https://github.com/lh3/bwa 4. http://bio-bwa.sourceforge.net/ - </token> + </token> - <token name="@dataset_collections@"> + <token name="@dataset_collections@"> ------ **Dataset collections - processing large numbers of datasets at once** @@ -76,7 +107,43 @@ This will be added shortly - </token> - + </token> + <xml name="readgroup_params"> + <conditional name="rg"> + <param name="rg_selector" type="select" label="Set read groups information?" help="-R; Specifying read group information can greatly simplify your downstream analyses by allowing combining multiple datasets. See help below for more details"> + <option value="set">Set</option> + <option value="do_not_set" selected="True">Do not set</option> + </param> + <when value="set"> + <param name="ID" type="text" value="" size="20" label="Read group identifier (ID)" help="This value must be unique among multiple samples in your experiment"> + <validator type="empty_field" /> + </param> + <param name="SM" type="text" value="" size="20" label="Read group sample name (SM)" help="This value should be descriptive. 
Use pool name where a pool is being sequenced" /> + <param name="PL" type="select" label="Platform/technology used to produce the reads (PL)"> + <option value="CAPILLARY">CAPILLARY</option> + <option value="LS454">LS454</option> + <option value="ILLUMINA">ILLUMINA</option> + <option value="SOLID">SOLID</option> + <option value="HELICOS">HELICOS</option> + <option value="IONTORRENT">IONTORRENT</option> + <option value="PACBIO">PACBIO</option> + </param> + <param name="LB" type="text" size="25" label="Library name (LB)" /> + <param name="CN" type="text" size="25" label="Sequencing center that produced the read (CN)" /> + <param name="DS" type="text" size="25" label="Description (DS)" /> + <param name="DT" type="text" size="25" label="Date that run was produced (DT)" help="ISO8601 format date or date/time, like YYYY-MM-DD" /> + <param name="FO" type="text" size="25" optional="true" label="Flow order (FO)" help="The array of nucleotide bases that correspond to the nucleotides used for each flow of each read. Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by various other characters. Format: /\*|[ACMGRSVTWYHKDBN]+/"> + <validator type="regex" message="Invalid flow order">\*|[ACMGRSVTWYHKDBN]+$</validator> + </param> + <param name="KS" type="text" size="25" label="The array of nucleotide bases that correspond to the key sequence of each read (KS)" /> + <param name="PG" type="text" size="25" label="Programs used for processing the read group (PG)" /> + <param name="PI" type="integer" optional="true" label="Predicted median insert size (PI)" /> + <param name="PU" type="text" size="25" label="Platform unit (PU)" help="Unique identifier (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD)" /> + </when> + <when value="do_not_set"> + <!-- do nothing --> + </when> + </conditional> + </xml> </macros>
--- a/tool_dependencies.xml Wed Jan 14 13:51:07 2015 -0500 +++ b/tool_dependencies.xml Fri Mar 20 12:09:08 2015 -0400 @@ -1,9 +1,9 @@ <?xml version="1.0"?> <tool_dependency> <package name="bwa" version="0.7.10.039ea20639"> - <repository changeset_revision="5b9aca1e1c07" name="package_bwa_0_7_10_039ea20639" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="5b9aca1e1c07" name="package_bwa_0_7_10_039ea20639" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> <package name="samtools" version="1.1"> - <repository changeset_revision="43f2fbec5d52" name="package_samtools_1_1" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="43f2fbec5d52" name="package_samtools_1_1" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> </tool_dependency>