view tools/picard/picard_MarkDuplicates.xml @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
line wrap: on
line source

<tool name="Mark Duplicates" id="picard_MarkDuplicates" version="0.01">
  <!--
    Command line for picard_wrapper.py. The text is a Cheetah template: the
    $names are replaced with parameter values and output dataset paths before
    the result is run by the shell.

    Quoting: the read name regex is free text typed by the user, so it is
    wrapped in SINGLE quotes. The sanitizer declared on the readRegex param
    replaces every single quote with __sq__, which means the value can never
    close the quoting, and the shell performs no expansion (dollar signs,
    backticks, backslashes) on the pattern. Do not switch it back to double
    quotes.

    Output selection: remDups renders as "true" or "false" and outputFormat
    renders as "bam" or "sam" (the truevalue / falsevalue of those params), so
    exactly one of the four alignment datasets is chosen below. The same
    output-sam option is used for sam and bam targets alike; presumably the
    wrapper relies on the output-format option to pick the encoding (confirm
    against picard_wrapper.py).
  -->
  <command interpreter="python">
    picard_wrapper.py
      --input="$input_file"
      --remove-dups="$remDups"
      --read-regex='$readRegex'
      --opt-dup-dist="$optDupeDist"
      --output-format="$outputFormat"
      --output-txt="$outMetrics"
      #if str( $outputFormat ) == "sam"
        #if str( $remDups ) == "true"
          --output-sam="$outFileSamRemoved"
        #else
          --output-sam="$outFileSamMarked"
        #end if
      #else if str( $outputFormat ) == "bam"
        #if str( $remDups ) == "true"
          --output-sam="$outFileBamRemoved"
        #else
          --output-sam="$outFileBamMarked"
        #end if
      #end if
      -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/"
      --picard-cmd="MarkDuplicates"
  </command>
  <inputs>
    <!-- Alignment to process; both sam and bam datasets are accepted. -->
    <param format="bam,sam" name="input_file" type="data" label="SAM/BAM dataset to mark duplicates in"
      help="If the select list is empty, you need to upload or import some aligned short read data from a shared library"/>
    <!-- Renders as the literal string "true" or "false" on the command line; unchecked by default. -->
    <param name="remDups" type="boolean" label="Remove duplicates from output file" truevalue="true" falsevalue="false" checked="False" 
      help="If true do not write duplicates to the output file instead of writing them with appropriate flags set" />
    <!--
      Free-text regular expression. Any printable character is accepted except
      the single quote, which the sanitizer below rewrites to __sq__ so that the
      value cannot terminate single-quoted shell text.
    -->
    <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" type="text" size="80"
      label="Regular expression that can be used to parse read names in the incoming SAM file" 
      help="Names are parsed to extract: tile/region, x coordinate and y coordinate, to estimate optical duplication rate" >
      <sanitizer>
        <valid initial="string.printable">
         <remove value="&apos;"/>
        </valid>
        <mapping initial="none">
          <add source="&apos;" target="__sq__"/>
        </mapping>
      </sanitizer>
    </param>
    <!--
      Pixel distance. Declared as an integer (it was free text) so that a
      non-numeric entry is rejected when the form is submitted instead of being
      handed on to Picard. Name and default value are unchanged.
    -->
    <param name="optDupeDist" value="100" type="integer"
      label="The maximum offset between two duplicate clusters in order to consider them optical duplicates" size="5" 
      help="Common range 5-10 pixels. Later Illumina software versions multiply pixel values by 10, in which case 50-100" />
    <!-- Checked (the default) renders as "bam", unchecked as "sam". -->
    <param name="outputFormat" type="boolean" checked="True" truevalue="bam" falsevalue="sam" label="Output bam instead of sam" help="Uncheck for sam output" />
  </inputs>
  <outputs>
    <!-- Metrics text file; it carries no filter. -->
    <data name="outMetrics" format="txt" label="${tool.name} on ${on_string}: metrics" />

    <!--
      Alignment outputs. Each dataset carries two filters, one on outputFormat
      (checked means bam) and one on remDups; the four combinations of those
      two checkboxes are each covered by exactly one dataset.
    -->

    <!-- sam, duplicates flagged but kept -->
    <data name="outFileSamMarked" format="sam" label="${tool.name} on ${on_string}: duplicates marked sam">
      <filter>outputFormat is False</filter>
      <filter>remDups is False</filter>
    </data>

    <!-- sam, duplicates dropped -->
    <data name="outFileSamRemoved" format="sam" label="${tool.name} on ${on_string}: duplicates removed sam">
      <filter>outputFormat is False</filter>
      <filter>remDups is True</filter>
    </data>

    <!-- bam, duplicates flagged but kept -->
    <data name="outFileBamMarked" format="bam" label="${tool.name} on ${on_string}: duplicates marked bam">
      <filter>outputFormat is True</filter>
      <filter>remDups is False</filter>
    </data>

    <!-- bam, duplicates dropped -->
    <data name="outFileBamRemoved" format="bam" label="${tool.name} on ${on_string}: duplicates removed bam">
      <filter>outputFormat is True</filter>
      <filter>remDups is True</filter>
    </data>
  </outputs>
  <tests>
    <!--
      DISABLED TEST (bam in, bam out, duplicates marked).
      Functional tests with Picard bam outputs currently aren't working.

      Command to run:
      java -jar MarkDuplicates.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_input_tiny_coord.bam METRICS_FILE=picard_MD_output1.txt OUTPUT=picard_MD_output2.bam REMOVE_DUPLICATES=false ASSUME_SORTED=true READ_NAME_REGEX="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" OPTICAL_DUPLICATE_PIXEL_DISTANCE=100
    -->
    <!--
    <test>
      <param name="input_file" value="picard_input_tiny_coord.bam" />
      <param name="remDups" value="false" />
      <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" />
      <param name="optDupeDist" value="100" />
      <param name="outputFormat" value="bam" />
      <output name="outMetrics" file="picard_MD_output1.txt" ftype="txt" lines_diff="4" />
      <output name="outFileBamMarked" file="picard_MD_output2.bam" ftype="bam" />
    </test>
    -->

    <!-- ACTIVE TEST (sam in, sam out, duplicates removed). -->
    <test>
      <!--
        Command to run:
        java -jar MarkDuplicates.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_input_tiny_coord.sam METRICS_FILE=picard_MD_output3.txt O=picard_MD_output4.sam REMOVE_DUPLICATES=true ASSUME_SORTED=true READ_NAME_REGEX="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" OPTICAL_DUPLICATE_PIXEL_DISTANCE=100
      -->
      <param name="input_file" value="picard_input_tiny_coord.sam" />
      <param name="remDups" value="true" />
      <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" />
      <param name="optDupeDist" value="100" />
      <param name="outputFormat" value="sam" />
      <!-- The metrics comparison tolerates up to four differing lines. -->
      <output name="outMetrics" file="picard_MD_output3.txt" ftype="txt" lines_diff="4" />
      <output name="outFileSamRemoved" file="picard_MD_output4.sam" ftype="sam" />
    </test>
  </tests>
  
  <help>

.. class:: infomark

**Purpose**

MarkDuplicates examines aligned records in the supplied sam or bam file to identify duplicate molecules.

**Picard documentation**

This is a Galaxy interface for MarkDuplicates, a part of Picard-tools_, which is closely related to SAMTools_.

 .. _Picard-tools: http://picard.sourceforge.net/index.shtml
 .. _SAMTools: http://samtools.sourceforge.net/

------

**Input**

Either a sam file or a bam file is required. If a bam file is used, it must be coordinate-sorted.

**Outputs**

This tool provides two outputs. The first contains the marked (or kept) records and is either bam (the default) or sam, according to user selection. Bam is recommended since it is smaller. The second output is the metrics file, which is a text file containing information about the duplicates. 

**MarkDuplicates parameters**

The two main parameters to be concerned with are the flag for removing duplicates and the regular expression needed to identify reads. If the flag is set to remove duplicates, they will not be written to the output file; otherwise they will appear in the output but will be flagged appropriately. The read name regular expression is used to parse read names from the input sam file. Read names are parsed to extract three variables: tile/region, x coordinate, and y coordinate. These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. The regular expression should contain three capture groups for the three variables, in order (capture groups are enclosed in parentheses). Default value: ``[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*``

One other parameter that can be set is the maximum offset between two duplicate clusters in order for them to be considered optical duplicates. Later versions of the Illumina pipeline that multiply pixel values by 10 should generally use 50-100 pixels; otherwise 5-10 is normal. The default is set to 100. 

One parameter that Picard's MarkDuplicates offers but that is set automatically by Galaxy is ASSUME_SORTED, which is set to true because Galaxy bam should always be coordinate-sorted.

**Note on the use of regular expressions for read name parsing**

The regular expression (regex) is used to parse the read names, so it's important to get it exactly right (so you probably don't want to edit this unless you know exactly what you're doing). The three parts of the read names identified are tile/region, x coordinate, and y coordinate, which are used to estimate the optical duplication rate and, from it, to more accurately estimate library size.



  </help>
</tool>