Mercurial > repos > devteam > samtools_slice_bam
changeset 2:2b474ebbfc7d draft
Uploaded
author | devteam |
---|---|
date | Tue, 21 Apr 2015 17:37:49 -0400 |
parents | 74a8d2d60258 |
children | a4a10c7924d1 |
files | macros.xml samtools_slice_bam.py samtools_slice_bam.xml test-data/bam-slice-input.bam test-data/bam-slice-test1.bam test-data/bam-slice-test2.bam test-data/bam-slice-test3.bam test-data/bam-slice.bed test-data/gatk/fake_phiX_reads_1.bam test-data/gatk/fake_phiX_variant_locations.bed tool_dependencies.xml |
diffstat | 11 files changed, 226 insertions(+), 112 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Tue Apr 21 17:37:49 2015 -0400 @@ -0,0 +1,70 @@ +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="1.2">samtools</requirement> + <yield/> + </requirements> + </xml> + <xml name="citations"> + <citations> + <citation type="bibtex"> + @misc{SAM_def, + title={Definition of SAM/BAM format}, + url = {https://samtools.github.io/hts-specs/SAMv1.pdf},} + </citation> + <citation type="doi">10.1093/bioinformatics/btp352</citation> + <citation type="doi">10.1093/bioinformatics/btr076</citation> + <citation type="doi">10.1093/bioinformatics/btr509</citation> + <citation type="bibtex"> + @misc{Danecek_et_al, + Author={Danecek, P., Schiffels, S., Durbin, R.}, + title={Multiallelic calling model in bcftools (-m)}, + url = {http://samtools.github.io/bcftools/call-m.pdf},} + </citation> + <citation type="bibtex"> + @misc{Durbin_VCQC, + Author={Durbin, R.}, + title={Segregation based metric for variant call QC}, + url = {http://samtools.github.io/bcftools/rd-SegBias.pdf},} + </citation> + <citation type="bibtex"> + @misc{Li_SamMath, + Author={Li, H.}, + title={Mathematical Notes on SAMtools Algorithms}, + url = {http://www.broadinstitute.org/gatk/media/docs/Samtools.pdf},} + </citation> + <citation type="bibtex"> + @misc{SamTools_github, + title={SAMTools GitHub page}, + url = {https://github.com/samtools/samtools},} + </citation> + </citations> + </xml> + <xml name="version_command"> + <version_command>samtools --version | head -n 1 | awk '{ print $2 }'</version_command> + </xml> + <xml name="stdio"> + <stdio> + <exit_code range="1:" level="fatal" description="Error" /> + </stdio> + </xml> + <token name="@no-chrom-options@"> +----- + +.. class:: warningmark + +**No options available? 
How to re-detect metadata** + +If you see a "No options available" within the "**Select references (chromosomes and contigs) you would like to restrict bam to**" drop-down, you need to re-detect metadata for the dataset you are trying to process. To do this, follow these steps: + +1. Click on the **pencil** icon adjacent to the dataset in the history +2. A new menu will appear in the center pane of the interface +3. Click the **Datatype** tab +4. Set **New Type** to **BAM** +5. Click **Save** + +The metadata will be re-detected and you will be able to see the list of reference sequences in the "**Select references (chromosomes and contigs) you would like to restrict bam to**" drop-down. + + </token> + +</macros>
--- a/samtools_slice_bam.py Thu Mar 27 15:28:06 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg - -""" -A wrapper script for slicing a BAM file by provided BED file using SAMTools. -%prog input_filename.sam output_filename.bam -""" -#TODO: Confirm that the sort is necessary e.g. if input regions are out of order - - -import sys, optparse, os, tempfile, subprocess, shutil - -CHUNK_SIZE = 2**20 #1mb - -def cleanup_before_exit( tmp_dir ): - if tmp_dir and os.path.exists( tmp_dir ): - shutil.rmtree( tmp_dir ) - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - (options, args) = parser.parse_args() - - assert len( args ) == 4, "Invalid command line: samtools_slice_bam.py input.bam input.bam.bai input.interval output.bam" - input_bam_filename, input_index_filename, input_interval_filename, output_bam_filename = args - - tmp_dir = tempfile.mkdtemp( prefix='tmp-samtools_slice_bam-' ) - - tmp_input_bam_filename = os.path.join( tmp_dir, 'input_bam.bam' ) - os.symlink( input_bam_filename, tmp_input_bam_filename ) - os.symlink( input_index_filename, "%s.bai" % tmp_input_bam_filename ) - - #Slice BAM - unsorted_bam_filename = os.path.join( tmp_dir, 'unsorted.bam' ) - unsorted_stderr_filename = os.path.join( tmp_dir, 'unsorted.stderr' ) - cmd = 'samtools view -b -L "%s" "%s" > "%s"' % ( input_interval_filename, tmp_input_bam_filename, unsorted_bam_filename ) - proc = subprocess.Popen( args=cmd, stderr=open( unsorted_stderr_filename, 'wb' ), shell=True, cwd=tmp_dir ) - return_code = proc.wait() - if return_code: - stderr_target = sys.stderr - else: - stderr_target = sys.stdout - stderr = open( unsorted_stderr_filename ) - while True: - chunk = stderr.read( CHUNK_SIZE ) - if chunk: - stderr_target.write( chunk ) - else: - break - stderr.close() - - #sort sam, so indexing will not fail - #TODO: confirm if sorting is necessary (is original BAM order maintained, or does the output follow the 
order of input intervals?) - sorted_stderr_filename = os.path.join( tmp_dir, 'sorted.stderr' ) - sorting_prefix = os.path.join( tmp_dir, 'sorted_bam' ) - cmd = 'samtools sort -o "%s" "%s" > "%s"' % ( unsorted_bam_filename, sorting_prefix, output_bam_filename ) - proc = subprocess.Popen( args=cmd, stderr=open( sorted_stderr_filename, 'wb' ), shell=True, cwd=tmp_dir ) - return_code = proc.wait() - - if return_code: - stderr_target = sys.stderr - else: - stderr_target = sys.stdout - stderr = open( sorted_stderr_filename ) - while True: - chunk = stderr.read( CHUNK_SIZE ) - if chunk: - stderr_target.write( chunk ) - else: - break - stderr.close() - - cleanup_before_exit( tmp_dir ) - -if __name__=="__main__": __main__()
--- a/samtools_slice_bam.xml Thu Mar 27 15:28:06 2014 -0400 +++ b/samtools_slice_bam.xml Tue Apr 21 17:37:49 2015 -0400 @@ -1,40 +1,123 @@ -<tool id="samtools_slice_bam" name="Slice BAM" version="0.0.2"> - <description>by provided regions</description> - <requirements> - <requirement type="package" version="0.1.19">samtools</requirement> - </requirements> - <command interpreter="python">samtools_slice_bam.py - "${input_bam}" - "${input_bam.metadata.bam_index}" - "${input_interval}" - "${output_bam}" - </command> - <inputs> - <param name="input_bam" type="data" format="bam" label="BAM file" /> - <param name="input_interval" type="data" format="bed" label="BED file" /> - </inputs> - <outputs> - <data format="bam" name="output_bam"/> - </outputs> - <tests> - <test> - <param name="input_bam" value="gatk/fake_phiX_reads_1.bam" ftype="bam" /> - <param name="input_interval" value="gatk/fake_phiX_variant_locations.bed" ftype="bed" /> - <output name="output_bam" file="gatk/fake_phiX_reads_1.bam" ftype="bam" /> - </test> - </tests> - <help> +<tool id="samtools_slice_bam" name="Slice" version="2.0"> + <description>BAM by genomic regions</description> + <macros> + <import>macros.xml</import> + </macros> + <!-- <code file="samtools_slice_options.py"/> --> + <expand macro="requirements"></expand> + <expand macro="stdio"></expand> + <expand macro="version_command"></expand> + <command> +<![CDATA[ + ln -s "${input_bam}" temp_input.bam && + ln -s "${input_bam.metadata.bam_index}" temp_input.bam.bai && + + #if str($slice_method.slice_method_selector) == "bed": + + samtools view -@ \${GALAXY_SLOTS:-1} -b -L "${input_interval}" -o unsorted_output.bam temp_input.bam && + + #elif str($slice_method.slice_method_selector) == "chr": + + samtools view -@ \${GALAXY_SLOTS:-1} -b -o unsorted_output.bam temp_input.bam + ${ ' '.join( map( lambda x:'"%s"' % ( x ), str( $slice_method.refs ).split(",") ) ) } && + + #elif str($slice_method.slice_method_selector) == "man": + + samtools view -@ 
\${GALAXY_SLOTS:-1} -b -o unsorted_output.bam temp_input.bam + + #for $region in $slice_method.regions: + "${region.chrom}:${region.start}-${region.end}" + #end for + + && + + #end if + + samtools sort -O bam -T sorted -@ \${GALAXY_SLOTS:-1} -o "${output_bam}" unsorted_output.bam +]]> + </command> + <inputs> + <param name="input_bam" format="bam" label="Select BAM dataset to slice" type="data" /> + <conditional name="slice_method"> + <param name="slice_method_selector" type="select" label="How do you want to slice your dataset?"> + <option value="bed">using a list of intervals from a BED dataset</option> + <option value="chr">by chromosomes/contigs present in the BAM dataset</option> + <option value="man">by chromosomes/contigs and coordinates</option> + </param> + <when value="bed"> + <param format="bed" label="BED file" name="input_interval" type="data" help="BED datasets can be obtained using &quot;Get Data -> UCSC Main&quot; datasource."/> + </when> + <when value="chr"> + <param name="refs" type="select" optional="False" multiple="True" label="Select references (chromosomes and contigs) you would like to restrict bam to" help="Click and type in the box above to see options. You can select multiple entries. If &quot;No options available&quot; is displayed, you need to re-detect metadata on the input dataset. See help section below."> + + <!-- The options tagset below extracts reference names from bam file metadata --> + <!-- This will not work with bam files with old style metadata. 
However this --> + <!-- Can be easily fixed by re-detecting metadata on a bam dataset by clicking --> + <!-- The pencil icon and setting datatype to "bam" --> + <!-- This change has been committed in the following pull request: --> + <!-- https://github.com/galaxyproject/galaxy/pull/107 --> + + <options> + <filter type="data_meta" ref="input_bam" key="reference_names" /> + </options> + </param> + </when> + <when value="man"> + <repeat name="regions" title="Regions" min="1"> + <param name="chrom" type="select" optional="False" label="Select references (chromosomes and contigs) you would like to restrict bam to" help="Select chromosome/contig from the list. If &quot;No options available&quot; is displayed, you need to re-detect metadata on the input dataset. See help section below."> + + <!-- See comments above --> + + <options> + <filter type="data_meta" ref="input_bam" key="reference_names" /> + </options> + </param> + <param name="start" type="integer" min="1" value="0" label="Enter START coordinate (1-based)"/> + <param name="end" type="integer" min="1" value="100" label="Enter END coordinate"/> + </repeat> + + + </when> + </conditional> + + </inputs> + <outputs> + <data format="bam" name="output_bam" /> + </outputs> + <tests> + <test> + <param ftype="bam" name="input_bam" value="bam-slice-input.bam" /> + <param name="slice_method_selector" value="bed"/> + <param ftype="bed" name="input_interval" value="bam-slice.bed" /> + <output file="bam-slice-test1.bam" ftype="bam" name="output_bam" /> + </test> + <test> + <param ftype="bam" name="input_bam" value="bam-slice-input.bam" /> + <param name="slice_method_selector" value="chr"/> + <param name="refs" value="chrM" /> + <output file="bam-slice-test2.bam" ftype="bam" name="output_bam" /> + </test> + <test> + <param ftype="bam" name="input_bam" value="bam-slice-input.bam" /> + <param name="slice_method_selector" value="man"/> + <param name="chrom" value="chrM" /> + <param name="start" value="1" /> + <param name="end" value="1000" 
/> + <output file="bam-slice-test3.bam" ftype="bam" name="output_bam" /> + </test> + </tests> + <help> +<![CDATA[ + **What it does** - Accepts an input BAM file and an input BED file and creates an output BAM file containing only those alignments that overlap the provided BED intervals. +Restricts (slices) an input BAM dataset to a list of intervals defined in a BED file, to individual chromosomes, or to a manually set list of coordinates. BED datasets can be obtained from **Get Data -> UCSC Main**. ------- - -**Citation** +This tool is based on the ``samtools view`` command. -For the underlying tool, please cite `Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. <http://www.ncbi.nlm.nih.gov/pubmed/19505943>`_ +@no-chrom-options@ -If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.* - +]]> </help> + <expand macro="citations"></expand> </tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/bam-slice.bed Tue Apr 21 17:37:49 2015 -0400 @@ -0,0 +1,38 @@ +chrM 5 1000 myInterval 0 + +chrM 577 647 TRNF 0 + +chrM 648 1601 RNR1 0 + +chrM 1602 1670 TRNV 0 + +chrM 1671 3229 RNR2 0 + +chrM 3230 3304 TRNL1 0 + +chrM 3307 4262 ND1 0 + +chrM 4263 4331 TRNI 0 + +chrM 4329 4400 TRNQ 0 - +chrM 4402 4469 TRNM 0 + +chrM 4470 5511 ND2 0 + +chrM 5512 5579 TRNW 0 + +chrM 5587 5655 TRNA 0 - +chrM 5657 5729 TRNN 0 - +chrM 5761 5826 TRNC 0 - +chrM 5826 5891 TRNY 0 - +chrM 5904 7445 COX1 0 + +chrM 7446 7514 TRNS1 0 - +chrM 7518 7585 TRND 0 + +chrM 7586 8269 COX2 0 + +chrM 8295 8364 TRNK 0 + +chrM 8366 8572 ATP8 0 + +chrM 8527 9207 ATP6 0 + +chrM 9207 9990 COX3 0 + +chrM 9991 10058 TRNG 0 + +chrM 10059 10404 ND3 0 + +chrM 10405 10469 TRNR 0 + +chrM 10470 10766 ND4L 0 + +chrM 10760 12137 ND4 0 + +chrM 12138 12206 TRNH 0 + +chrM 12207 12265 TRNS2 0 + +chrM 12266 12336 TRNL2 0 + +chrM 12337 14148 ND5 0 + +chrM 14149 14673 ND6 0 - +chrM 14674 14742 TRNE 0 - +chrM 14747 15887 CYTB 0 + +chrM 15888 15953 TRNT 0 + +chrM 15956 16023 TRNP 0 -
--- a/test-data/gatk/fake_phiX_variant_locations.bed Thu Mar 27 15:28:06 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -phiX174 1442 1443 -phiX174 1445 1446
--- a/tool_dependencies.xml Thu Mar 27 15:28:06 2014 -0400 +++ b/tool_dependencies.xml Tue Apr 21 17:37:49 2015 -0400 @@ -1,6 +1,6 @@ <?xml version="1.0"?> <tool_dependency> - <package name="samtools" version="0.1.19"> - <repository changeset_revision="1ef76f8d8e52" name="package_samtools_0_1_19" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" /> + <package name="samtools" version="1.2"> + <repository changeset_revision="6eea04363026" name="package_samtools_1_2" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> </tool_dependency>