# HG changeset patch # User devteam # Date 1426867748 14400 # Node ID e29bc5c169bcb568bcfd2b042a7238ee90bbcc8a # Parent c71dd035971efedee892d5411dcfe3b4c83eefe6 Uploaded diff -r c71dd035971e -r e29bc5c169bc bwa-mem.xml --- a/bwa-mem.xml Wed Jan 14 13:51:07 2015 -0500 +++ b/bwa-mem.xml Fri Mar 20 12:09:08 2015 -0400 @@ -1,31 +1,29 @@ - - + + - map medium and long reads (> 100 bp) against reference genome bwa_macros.xml - bwa samtools - - map medium and long reads (> 100 bp) against reference genome + + + - #set $reference_fasta_filename = "localref.fa" - + #if str( $reference_source.reference_source_selector ) == "history": - ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" && - + ## The following shell commands decide with of the BWA indexing algorithms (IS or BWTSW) will be run ## depending ob the size of the input FASTA dataset - ( size=`stat -c %s "${reference_fasta_filename}" 2>/dev/null`; ## Linux - if [ $? -eq 0 ]; + if [ $? -eq 0 ]; then - if [ \$size -lt 2000000000 ]; + if [ "\$size" -lt 2000000000 ]; then bwa index -a is "${reference_fasta_filename}"; echo "Generating BWA index with is algorithm"; @@ -35,10 +33,10 @@ fi; fi; - eval \$(stat -s "${reference_fasta_filename}"); ## OSX - if [ $? -eq 0 ]; + eval \$(stat -s "${reference_fasta_filename}" 2>/dev/null); ## OSX + if [ -n "\$st_size" ]; then - if [ \$st_size -lt 2000000000 ]; + if [ "\$st_size" -lt 2000000000 ]; then bwa index -a is "${reference_fasta_filename}"; echo "Generating BWA index with is algorithm"; @@ -48,31 +46,28 @@ fi; fi; ) && - + #else: #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path ) #end if - + ## Begin BWA-MEM command line - + bwa mem -t "\${GALAXY_SLOTS:-1}" - -v 1 ## Verbosity is set to 1 (errors only) - + -v 1 ## Verbosity is set to 1 (errors only) + #if str( $fastq_input.fastq_input_selector ) == "paired_iv": ## For interleaved fastq files set -p option -p #if str( $fastq_input.iset_stats ): ## check that insert statistics is used -I "${fastq_input.iset_stats}" #end if #end if - + #if str( $analysis_type.analysis_type_selector ) == "pacbio": - -x - + -x pacbio #elif str( $analysis_type.analysis_type_selector ) == "full": - - #if str( $analysis_type.algorithmic_options.algorithmic_options_selector ) == "True": ## Algorithmic options - + #if str( $analysis_type.algorithmic_options.algorithmic_options_selector ) == "set": ## Algorithmic options -k "${analysis_type.algorithmic_options.k}" -w "${analysis_type.algorithmic_options.w}" -d "${analysis_type.algorithmic_options.d}" @@ -85,22 +80,18 @@ ${analysis_type.algorithmic_options.S} ${analysis_type.algorithmic_options.P} ${analysis_type.algorithmic_options.e} - #end if - - #if str( $analysis_type.scoring_options.scoring_options_selector ) == "True": ## Scoring options - + + #if str( $analysis_type.scoring_options.scoring_options_selector ) == "set": ## Scoring options -A "${analysis_type.scoring_options.A}" -B "${analysis_type.scoring_options.B}" -O "${analysis_type.scoring_options.O}" -E "${analysis_type.scoring_options.E}" -L "${analysis_type.scoring_options.L}" -U "${analysis_type.scoring_options.U}" - #end if - - #if str( $analysis_type.io_options.io_options_selector ) == "True": ## IO options - + + #if str( $analysis_type.io_options.io_options_selector ) == "set": ## IO options -T "${analysis_type.io_options.T}" -h "${analysis_type.io_options.h}" ${analysis_type.io_options.a} @@ -108,51 +99,39 @@ ${analysis_type.io_options.V} ${analysis_type.io_options.Y} ${analysis_type.io_options.M} - #end if - + #end if - - #if str( $rg.rg_selector ) == "True": - -R "@RG\tID:$rg.ID\tSM:$rg.SM" - #end if - + + #if str( $rg.rg_selector ) == "set": + @set_rg_string@ + -R '$rg_string' + #end if + #if str( $fastq_input.fastq_input_selector ) == "paired": - - #if str( $fastq_input.iset_stats ): ## check that insert statistics is used - -I "${fastq_input.iset_stats}" - #end if - - "${reference_fasta_filename}" - - "${fastq_input.fastq_input1}" "${fastq_input.fastq_input2}" - - #elif str( $fastq_input.fastq_input_selector ) == "paired_collection": - #if str( $fastq_input.iset_stats ): ## check that insert statistics is used -I "${fastq_input.iset_stats}" #end if - "${reference_fasta_filename}" - - "${fastq_input.fastq_input1.forward}" "${fastq_input.fastq_input1.reverse}" - - #else: - - + "${fastq_input.fastq_input1}" "${fastq_input.fastq_input2}" + #elif str( $fastq_input.fastq_input_selector ) == "paired_collection": + #if str( $fastq_input.iset_stats ): ## check that insert statistics is used + -I "${fastq_input.iset_stats}" + #end if + "${reference_fasta_filename}" - + "${fastq_input.fastq_input1.forward}" "${fastq_input.fastq_input1.reverse}" + #else: + "${reference_fasta_filename}" "${fastq_input.fastq_input1}" - #end if - + | samtools view -Sb - > temporary_bam_file.bam && - + samtools sort -f temporary_bam_file.bam ${bam_output} - - + @@ -169,7 +148,7 @@ - + @@ -188,7 +167,7 @@ - + @@ -201,37 +180,22 @@ - + - + - - - - - - - - - - - - - - - - - - + + + - - - + + + @@ -246,67 +210,67 @@ - - - - - - - - - - - - + + + + + + + + + + + + - + - - - + + + - + - + - + - + - - - - - - - + + + + + + + @@ -315,11 +279,11 @@ - + - + @@ -330,12 +294,19 @@ + + + + + + + + + + + - - - - **What is does** From http://arxiv.org/abs/1303.3997: @@ -358,7 +329,7 @@ 1. *Simple Illumina mode*: The simplest possible bwa mem application in which it alignes single or paired-end data to reference using default parameters. It is equivalent to the following command: bwa mem <reference index> <fastq dataset1> [fastq dataset2] 2. *PacBio mode*: The mode adjusted specifically for mapping of long PacBio subreads. Equivalent to the following command: bwa mem -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 <reference index> <PacBio dataset in fastq format> 3. *Full list of options*: Allows access to all options through Galaxy interface. - + ------ **BWA MEM options** @@ -407,16 +378,12 @@ specify the mean, standard deviation (10% of the mean if absent), max (4 sigma from the mean if absent) and min of the insert size distribution. FR orientation only. [inferred] - @dataset_collections@ @RG@ @info@ - - - 10.1093/bioinformatics/btp324 diff -r c71dd035971e -r e29bc5c169bc bwa.xml --- a/bwa.xml Wed Jan 14 13:51:07 2015 -0500 +++ b/bwa.xml Fri Mar 20 12:09:08 2015 -0400 @@ -1,220 +1,10 @@ - - - - bwa - samtools - + - map short reads (< 100 bp) against reference genome - - - #set $reference_fasta_filename = "localref.fa" - - #if str( $reference_source.reference_source_selector ) == "history": - - ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" && - - ## The following shell commands decide with of the BWA indexing algorithms (IS or BWTSW) will be run - ## depending ob the size of the input FASTA dataset - - ( - size=`stat -c %s "${reference_fasta_filename}" 2>/dev/null`; ## Linux - if [ $? -eq 0 ]; - then - if [ \$size -lt 2000000000 ]; - then - bwa index -a is "${reference_fasta_filename}"; - else - bwa index -a bwtsw "${reference_fasta_filename}"; - fi; - fi; - - eval \$(stat -s "${reference_fasta_filename}"); ## OSX - if [ $? -eq 0 ]; - then - if [ \$st_size -lt 2000000000 ]; - then - bwa index -a is "${reference_fasta_filename}"; - echo "Generating BWA index with is algorithm"; - else - bwa index -a bwtsw "${reference_fasta_filename}"; - echo "Generating BWA index with bwtsw algorithm"; - fi; - fi; - ) && - - #else: - #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path ) - #end if - - ## Begin bwa command line - -####### Fastq paired - - #if str( $input_type.input_type_selector ) == "paired" or str( $input_type.input_type_selector ) == "paired_collection": - - bwa aln - -t "\${GALAXY_SLOTS:-1}" - - @command_options@ - - "${reference_fasta_filename}" - - #if str( $input_type.input_type_selector ) == "paired_collection": - "${input_type.fastq_input1.forward}" - #else - "${input_type.fastq_input1}" - #end if - - > first.sai && - - bwa aln - -t "\${GALAXY_SLOTS:-1}" - - @command_options@ - - "${reference_fasta_filename}" - - #if str( $input_type.input_type_selector ) == "paired_collection": - "${input_type.fastq_input1.reverse}" - #else - "${input_type.fastq_input2}" - #end if - - > second.sai && - - bwa sampe - - #if str( $input_type.adv_pe_options.adv_pe_options_selector) == "True": - - -a ${$input_type.adv_pe_options.a} - -o ${$input_type.adv_pe_options.o} - -n ${$input_type.adv_pe_options.n} - -N ${$input_type.adv_pe_options.N} - - #end if - - @read_group_options@ - - #if str( $input_type.input_type_selector ) == "paired_collection": - - "${reference_fasta_filename}" first.sai second.sai "${input_type.fastq_input1.forward}" "${input_type.fastq_input1.reverse}" - - #else: - - "${reference_fasta_filename}" first.sai second.sai "${input_type.fastq_input1}" "${input_type.fastq_input2}" - - #end if - -####### Fastq single - - #elif str( $input_type.input_type_selector ) == "single": - - bwa aln - -t "\${GALAXY_SLOTS:-1}" - - @command_options@ - - "${reference_fasta_filename}" - "${input_type.fastq_input1}" - > first.sai && - - bwa samse - - #if str( $input_type.adv_se_options.adv_se_options_selector) == "True": - - -n ${$input_type.adv_se_options.n} - - #end if - - @read_group_options@ - - "${reference_fasta_filename}" first.sai "${input_type.fastq_input1}" - -####### BAM paired - - #elif str( $input_type.input_type_selector ) == "paired_bam": - - bwa aln - -t "\${GALAXY_SLOTS:-1}" - -b - -1 - - @command_options@ - - "${reference_fasta_filename}" - "${input_type.bam_input}" - > first.sai && - - bwa aln - -t "\${GALAXY_SLOTS:-1}" - -b - -2 - @command_options@ - "${reference_fasta_filename}" - "${input_type.bam_input}" - > second.sai && - - bwa sampe - - #if str( $input_type.adv_bam_pe_options.adv_pe_options_selector) == "True": - - -a ${$input_type.adv_bam_pe_options.a} - -o ${$input_type.adv_bam_pe_options.o} - -n ${$input_type.adv_bam_pe_options.n} - -N ${$input_type.adv_bam_pe_options.N} - - #end if - - @read_group_options@ - - "${reference_fasta_filename}" first.sai second.sai "${input_type.bam_input}" "${input_type.bam_input}" - -####### Fastq single ------------ to do next - - #elif str( $input_type.input_type_selector ) == "single_bam": - - bwa aln - -t "\${GALAXY_SLOTS:-1}" - -b - -0 - - @command_options@ - - "${reference_fasta_filename}" - "${input_type.bam_input}" - > first.sai && - - bwa samse - - #if str( $input_type.adv_bam_se_options.adv_se_options_selector) == "True": - - -n ${$input_type.adv_bam_se_options.n} - - #end if - - @read_group_options@ - - "${reference_fasta_filename}" first.sai "${input_type.bam_input}" - - #end if - - | samtools view -Sb - > temporary_bam_file.bam && - - samtools sort -f temporary_bam_file.bam ${bam_output} - - - - bwa_macros.xml - - #if str( $analysis_type.analysis_type_selector ) == "illumina": - - ## do nothing -> just align with default parameters - - #elif str( $analysis_type.analysis_type_selector ) == "full": - + + #if str( $analysis_type.analysis_type_selector ) == "full": -n ${analysis_type.n} -o ${analysis_type.o} -e ${analysis_type.e} @@ -228,25 +18,23 @@ -E ${analysis_type.E} -R ${analysis_type.R} -q ${analysis_type.q} - + #if str( $analysis_type.B ): -B ${analysis_type.B} #end if - + #if str( $analysis_type.L ): -B ${analysis_type.L} #end if - #end if + #end if - - #if str( $rg.rg_selector ) == "True": - - -r "@RG\tID:$rg.ID\tSM:$rg.SM" - + #if str( $rg.rg_selector ) == "set": + @set_rg_string@ + -r '$rg_string' #end if - + @@ -277,6 +65,190 @@ + + bwa + samtools + + + + + + #set $reference_fasta_filename = "localref.fa" + + #if str( $reference_source.reference_source_selector ) == "history": + ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" && + + ## The following shell commands decide with of the BWA indexing algorithms (IS or BWTSW) will be run + ## depending ob the size of the input FASTA dataset + ( + size=`stat -c %s "${reference_fasta_filename}" 2>/dev/null`; ## Linux + if [ $? -eq 0 ]; + then + if [ "\$size" -lt 2000000000 ]; + then + bwa index -a is "${reference_fasta_filename}"; + else + bwa index -a bwtsw "${reference_fasta_filename}"; + fi; + fi; + + eval \$(stat -s "${reference_fasta_filename}" 2>/dev/null); ## OSX + if [ -n "\$st_size" ]; + then + if [ "\$st_size" -lt 2000000000 ]; + then + bwa index -a is "${reference_fasta_filename}"; + echo "Generating BWA index with is algorithm"; + else + bwa index -a bwtsw "${reference_fasta_filename}"; + echo "Generating BWA index with bwtsw algorithm"; + fi; + fi; + ) && + #else: + #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path ) + #end if + + ## Begin bwa command line + +####### Fastq paired + + #if str( $input_type.input_type_selector ) == "paired" or str( $input_type.input_type_selector ) == "paired_collection": + bwa aln + -t "\${GALAXY_SLOTS:-1}" + + @command_options@ + + "${reference_fasta_filename}" + + #if str( $input_type.input_type_selector ) == "paired_collection": + "${input_type.fastq_input1.forward}" + #else + "${input_type.fastq_input1}" + #end if + + > first.sai && + + bwa aln + -t "\${GALAXY_SLOTS:-1}" + + @command_options@ + + "${reference_fasta_filename}" + + #if str( $input_type.input_type_selector ) == "paired_collection": + "${input_type.fastq_input1.reverse}" + #else + "${input_type.fastq_input2}" + #end if + + > second.sai && + + bwa sampe + + #if str( $input_type.adv_pe_options.adv_pe_options_selector) == "True": + -a ${$input_type.adv_pe_options.a} + -o ${$input_type.adv_pe_options.o} + -n ${$input_type.adv_pe_options.n} + -N ${$input_type.adv_pe_options.N} + #end if + + @read_group_options@ + + #if str( $input_type.input_type_selector ) == "paired_collection": + "${reference_fasta_filename}" first.sai second.sai "${input_type.fastq_input1.forward}" "${input_type.fastq_input1.reverse}" + #else: + "${reference_fasta_filename}" first.sai second.sai "${input_type.fastq_input1}" "${input_type.fastq_input2}" + #end if + +####### Fastq single + + #elif str( $input_type.input_type_selector ) == "single": + bwa aln + -t "\${GALAXY_SLOTS:-1}" + + @command_options@ + + "${reference_fasta_filename}" + "${input_type.fastq_input1}" + > first.sai && + + bwa samse + + #if str( $input_type.adv_se_options.adv_se_options_selector) == "True": + -n ${$input_type.adv_se_options.n} + #end if + + @read_group_options@ + + "${reference_fasta_filename}" first.sai "${input_type.fastq_input1}" + +####### BAM paired + + #elif str( $input_type.input_type_selector ) == "paired_bam": + bwa aln + -t "\${GALAXY_SLOTS:-1}" + -b + -1 + + @command_options@ + + "${reference_fasta_filename}" + "${input_type.bam_input}" + > first.sai && + + bwa aln + -t "\${GALAXY_SLOTS:-1}" + -b + -2 + @command_options@ + "${reference_fasta_filename}" + "${input_type.bam_input}" + > second.sai && + + bwa sampe + + #if str( $input_type.adv_bam_pe_options.adv_pe_options_selector) == "True": + -a ${$input_type.adv_bam_pe_options.a} + -o ${$input_type.adv_bam_pe_options.o} + -n ${$input_type.adv_bam_pe_options.n} + -N ${$input_type.adv_bam_pe_options.N} + #end if + + @read_group_options@ + + "${reference_fasta_filename}" first.sai second.sai "${input_type.bam_input}" "${input_type.bam_input}" + +####### Fastq single ------------ to do next + + #elif str( $input_type.input_type_selector ) == "single_bam": + bwa aln + -t "\${GALAXY_SLOTS:-1}" + -b + -0 + + @command_options@ + + "${reference_fasta_filename}" + "${input_type.bam_input}" + > first.sai && + + bwa samse + + #if str( $input_type.adv_bam_se_options.adv_se_options_selector) == "True": + -n ${$input_type.adv_bam_se_options.n} + #end if + + @read_group_options@ + + "${reference_fasta_filename}" first.sai "${input_type.bam_input}" + #end if + + | samtools view -Sb - > temporary_bam_file.bam && + + samtools sort -f temporary_bam_file.bam ${bam_output} + + @@ -293,7 +265,7 @@ - + @@ -309,69 +281,54 @@ - + - + - + - + - + - - + - + - + - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - - - - + + + @@ -380,7 +337,7 @@ - + @@ -395,15 +352,15 @@ - + - + - + @@ -422,12 +379,19 @@ + + + + + + + + + + + - - - - **What is does** BWA is a software package for mapping low-divergent sequences against a large reference genome, such as the human genome. The bwa-aln algorithm is designed for Illumina sequence reads up to 100bp. For longer reads use BWA-MEM algorithm distributed as separate Galaxy tool. @@ -437,7 +401,7 @@ - bwa aln - actual mapper placing reads onto the reference sequence - bwa samse - post-processor converting suffix array coordinates into genome coordinates in SAM format for single reads - bam sampe - post-processor for paired reads - + Galaxy implementation takes fastq or BAM (unaligned BAM) datasets as input and produces output in BAM (not SAM; in reality SAM produced by the bwa is converted to BAM on the fly by samtools view command) format, which can be further processed using various BAM utilities exiting in Galaxy (BAMTools, SAMTools, Picard). ----- @@ -448,7 +412,7 @@ 1. *Simple Illumina mode*: The simplest possible bwa mem application in which it alignes single or paired-end data to reference using default parameters. It is equivalent to the following command: bwa mem <reference index> <fastq dataset1> [fastq dataset2] 2. *Full list of options*: Allows access to all options through Galaxy interface. - + ------ **bwa-aln options** @@ -490,14 +454,12 @@ -n INT maximum hits to output for paired reads [3] -r STR read group header line [null] - @dataset_collections@ @RG@ @info@ - 10.1093/bioinformatics/btp324 diff -r c71dd035971e -r e29bc5c169bc bwa_macros.xml --- a/bwa_macros.xml Wed Jan 14 13:51:07 2015 -0500 +++ b/bwa_macros.xml Fri Mar 20 12:09:08 2015 -0400 @@ -1,6 +1,37 @@ + + + #set $rg_string = "@RG\tID:" + str($rg.ID) + "\tSM:" + str($rg.SM) + "\tPL:" + str($rg.PL) + #if $rg.LB + #set $rg_string += "\tLB:$rg.LB" + #end if + #if $rg.CN + #set $rg_string += "\tCN:$rg.CN" + #end if + #if $rg.DS + #set $rg_string += "\tDS:$rg.DS" + #end if + #if $rg.DT + #set $rg_string += "\tDT:$rg.DT" + #end if + #if $rg.FO + #set $rg_string += "\tFO:$rg.FO" + #end if + #if $rg.KS + #set $rg_string += "\tKS:$rg.KS" + #end if + #if $rg.PG + #set $rg_string += "\tPG:$rg.PG" + #end if + #if str($rg.PI) + #set $rg_string += "\tPI:$rg.PI" + #end if + #if $rg.PU + #set $rg_string += "\tPU:$rg.PU" + #end if + - + ----- .. class:: warningmark @@ -8,9 +39,9 @@ **Read Groups are Important!** One of the recommended best practices in NGS analysis is adding read group information to BAM files. You can do thid directly in BWA interface using the -**Specify readgroup information?** widget. If you are not familiar with readgroups you shold know that this is effectively a way to tag reads with an additional ID. +**Specify read group information?** widget. If you are not familiar with read groups you shold know that this is effectively a way to tag reads with an additional ID. This allows you to combine BAM files from, for example, multiple BWA runs into a single dataset. This significantly simplifies downstream processing as -instead of dealing with multiple datasets you only have to handle only one. This is possible because the readgroup information allows you to identify +instead of dealing with multiple datasets you only have to handle only one. This is possible because the read group information allows you to identify data from different experiments even if they are combined in one file. Many downstream analysis tools such as varinat callers (e.g., FreeBayes or Naive Varinat Caller present in Galaxy) are aware of readgtroups and will automatically generate calls for each individual sample even if they are combined within a single file. @@ -51,8 +82,8 @@ @RG ID:FLOWCELL2.LANE4 PL:illumina LB:LIB-KID-2 SM:KID PI:400 Note the hierarchical relationship between read groups (unique for each lane) to libraries (sequenced on two lanes) and samples (across four lanes, two lanes for each library). - - + + ----- .. class:: infomark @@ -66,9 +97,9 @@ 3. https://github.com/lh3/bwa 4. http://bio-bwa.sourceforge.net/ - + - + ------ **Dataset collections - processing large numbers of datasets at once** @@ -76,7 +107,43 @@ This will be added shortly - - + + + + + + + + + + + + + + + + + + + + + + + + + + + \*|[ACMGRSVTWYHKDBN]+$ + + + + + + + + + + + diff -r c71dd035971e -r e29bc5c169bc shed_upload.tar.gz Binary file shed_upload.tar.gz has changed diff -r c71dd035971e -r e29bc5c169bc test-data/bwa-aln-test3.bam Binary file test-data/bwa-aln-test3.bam has changed diff -r c71dd035971e -r e29bc5c169bc test-data/bwa-mem-test2.bam Binary file test-data/bwa-mem-test2.bam has changed diff -r c71dd035971e -r e29bc5c169bc tool_dependencies.xml --- a/tool_dependencies.xml Wed Jan 14 13:51:07 2015 -0500 +++ b/tool_dependencies.xml Fri Mar 20 12:09:08 2015 -0400 @@ -1,9 +1,9 @@ - + - +