Mercurial > repos > iuc > umi_tools_dedup

<tool id="umi_tools_dedup" name="UMI-tools deduplicate" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Extract UMI from fastq files</description>
    <expand macro="bio_tools"/>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements">
        <requirement type="package" version="1.12">samtools</requirement>
    </expand>
    <command detect_errors="exit_code"><![CDATA[
        @LINK_SAM_BAM_INPUT@

        echo $input.ext &&
        umi_tools dedup
            #if $output_stats_bool
                --output-stats=stats_outputs
            #end if
            @GROUPDEDUP_OPTIONS@
            @BARCODE_OPTIONS@
            @UMI_GROUPING_OPTIONS@
            @SAMBAM_OPTIONS@
            @FULLSC_OPTIONS@
            @ADVANCED_OPTIONS@
            -I '$input_file' -S deduped.bam
            ## TODO using samtools sort is a workaround, for the following error that appears when Galaxy
            ## compares the generated file with the one in test-data
            ## `Converting history BAM to SAM failed: 'samtools returned with error 1: stdout=None, stderr=[main_samview] fail to read the header from "/tmp/tmpd8o61jykdedup_out6.bam".\n'. Will compare BAM files`
            ## problem seems to be the BAM file generated with pysam
            ## may be dropped in the future
            --no-sort-output
            @LOG@
            && samtools sort --no-PG deduped.bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o '$output' -O BAM

    ]]></command>
    <inputs>
        <param name="input" type="data" format="sam,bam" label="Reads to deduplicate in SAM or BAM format" />
        <param name="output_stats_bool" type="boolean" checked="false" label="Output UMI related statistics files?"/>
        <expand macro="groupdedup_options_macro"/>
        <expand macro="barcode_options_macro"/>
        <expand macro="umi_grouping_options_macro"/>
        <expand macro="sambam_options_macro"/>
        <expand macro="fullsc_options_macro"/>
        <expand macro="advanced_options_macro"/>
        <expand macro="log_input_macro"/>
    </inputs>
    <outputs>
        <data format="bam" name="output" />
        <collection name="output_stats" type="list" label="${tool.name} on ${on_string} stats">
            <filter>output_stats_bool</filter>
            <data name="edit_distance" format="tabular" from_work_dir="stats_outputs_edit_distance.tsv"/>
            <data name="per_umi" format="tabular" from_work_dir="stats_outputs_per_umi.tsv"/>
            <data name="per_umi_per_position" format="tabular" from_work_dir="stats_outputs_per_umi_per_position.tsv"/>
        </collection>
        <expand macro="log_output_macro"/>
    </outputs>
    <tests>
        <test expect_num_outputs="1">
            <param name="input" value="group_in1.sam" ftype="sam" />
            <section name="advanced">
                <param name="random_seed" value="0" />
            </section>
            <conditional name="bc">
                <param name="extract_umi_method" value="read_id" />
            </conditional>
            <section name="umi">
                <param name="method" value="unique" />
            </section>
            <output name="output" file="dedup_out1.bam" ftype="bam" lines_diff="2"/><!--lines_diff won't be needed in later versions since umitools use \-\-no-PG internally -->
        </test>
        <test expect_num_outputs="1">
            <param name="input" value="group_in2.sam" ftype="sam" />
            <section name="advanced">
                <param name="random_seed" value="0" />
            </section>
            <conditional name="bc">
                <param name="extract_umi_method" value="read_id" />
            </conditional>
            <section name="sambam">
                <param name="paired" value="true" />
            </section>
            <section name="umi">
                <param name="method" value="unique" />
            </section>
            <output name="output" file="dedup_out2.bam" ftype="bam" lines_diff="2" />
        </test>
        <test expect_num_outputs="1">
            <param name="input" value="group_in3.bam" ftype="bam" />
            <section name="advanced">
                <param name="random_seed" value="0" />
            </section>
            <conditional name="bc">
                <param name="extract_umi_method" value="read_id" />
            </conditional>
            <section name="umi">
                <param name="method" value="unique" />
            </section>
            <output name="output" file="dedup_out3.bam" ftype="bam" lines_diff="2" />
        </test>
        <test expect_num_outputs="1">
            <param name="input" value="group_in4.bam" ftype="bam" />
            <section name="advanced">
                <param name="random_seed" value="0" />
            </section>
            <conditional name="bc">
                <param name="extract_umi_method" value="tag" />
                <param name="umi_tag" value="BX" />
            </conditional>
            <section name="umi">
                <param name="method" value="unique" />
            </section>
            <output name="output" file="dedup_out4.bam" ftype="bam" lines_diff="2"/>
        </test>
        <test expect_num_outputs="1">
            <param name="input" value="group_in5.bam" ftype="bam" />
            <section name="advanced">
                <param name="random_seed" value="0" />
            </section>
            <conditional name="bc">
                <param name="extract_umi_method" value="read_id" />
                <param name="umi_tag" value="BX" />
            </conditional>
            <section name="umi">
                <param name="method" value="cluster" />
            </section>
            <output name="output" file="dedup_out5.bam" ftype="bam" lines_diff="2"/>
        </test>
        <test expect_num_outputs="1">
            <param name="input" value="group_in6.bam" ftype="bam" />
            <section name="advanced">
                <param name="random_seed" value="0" />
            </section>
            <conditional name="bc">
                <param name="extract_umi_method" value="read_id" />
                <param name="umi_tag" value="BX" />
            </conditional>
            <section name="umi">
                <param name="method" value="directional" />
            </section>
            <output name="output" file="dedup_out6.bam" ftype="bam" lines_diff="2"/>
        </test>
        <test expect_num_outputs="5">
            <param name="input" value="group_in6.bam" ftype="bam" />
            <section name="advanced">
                <param name="random_seed" value="0" />
            </section>
            <conditional name="bc">
                <param name="extract_umi_method" value="read_id" />
                <param name="umi_tag" value="BX" />
            </conditional>
            <section name="umi">
                <param name="method" value="directional" />
            </section>
            <param name="output_stats_bool" value="true"/>
            <output name="output" file="dedup_out6.bam" ftype="bam" lines_diff="2"/>
            <output_collection name="output_stats">
                <element name="edit_distance" file="stats_outputs_edit_distance.tsv" />
                <element name="per_umi" file="stats_outputs_per_umi.tsv" />
                <element name="per_umi_per_position" file="stats_outputs_per_umi_per_position.tsv" />
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
umi_tools dedup - Deduplicate reads based on their UMI and mapping coordinates
==============================================================================

Purpose
-------

The purpose of this command is to deduplicate BAM files based on the first
mapping co-ordinate and the UMI attached to the read.

@BARCODE_HELP@

@UMI_GROUPING_HELP@

Selecting the representative read
---------------------------------
For every group of duplicate reads, a single representative read is
retained.The following criteria are applied to select the read that
will be retained from a group of duplicated reads:

1. The read with the lowest number of mapping coordinates (see
``--multimapping-detection-method`` option)

2. The read with the highest mapping quality. Note that this is not
the read sequencing quality and that if two reads have the same
mapping quality then one will be picked at random regardless of the
read quality.

Otherwise a read is chosen at random.

Optional statistics output
--------------------------

One can use the edit distance between UMIs at the same position as an
quality control for the deduplication process by comparing with
a null expectation of random sampling. For the random sampling, the
observed frequency of UMIs is used to more reasonably model the null
expectation.

Use the option ``Output UMI related statistics files?`` generate stats outfiles:

edit_distance
  Reports the (binned) average edit distance between the UMIs at each
  position. Positions with a single UMI are reported seperately.  The
  edit distances are reported pre- and post-deduplication alongside
  the null expectation from random sampling of UMIs from the UMIs
  observed across all positions. Note that separate null
  distributions are reported since the null depends on the observed
  frequency of each UMI which is different pre- and
  post-deduplication. The post-duplication values should be closer to
  their respective null than the pre-deduplication vs null comparison

In addition, this option will trigger reporting of further summary
statistics for the UMIs which may be informative for selecting the
optimal deduplication method or debugging.

Each unique UMI sequence may be observed [0-many] times at multiple
positions in the BAM. The following files report the distribution for
the frequencies of each UMI.

per_umi_per_position
  The `_stats_per_umi_per_position.tsv` file simply tabulates the
  counts for unique combinations of UMI and position. E.g if prior to
  deduplication, we have two positions in the BAM (POSa, POSb), at
  POSa we have observed 2*UMIa, 1*UMIb and at POSb: 1*UMIc, 3*UMId,
  then the stats file is populated thus:

  ====== =============
  counts instances_pre
  ------ -------------
  1      2
  2      1
  3      1
  ====== =============

  If post deduplication, UMIb is grouped with UMIa such that POSa:
  3*UMIa, then the `instances_post` column is populated thus:

  ====== ============= ==============
  counts instances_pre instances_post
  ------ ------------- --------------
  1      2             1
  2      1             0
  3      1             2
  ====== ============= ==============

per_umi_per
  The `_stats_per_umi_per.tsv` table provides UMI-level summary
  statistics. Keeping in mind that each unique UMI sequence can be
  observed at [0-many] times across multiple positions in the BAM,

  :times_observed: How many positions the UMI was observed at
  :total_counts: The total number of times the UMI was observed across all positions
  :median_counts: The median for the distribution of how often the UMI was observed at each position (excluding zeros)

  Hence, whenever times_observed=1, total_counts==median_counts.]]></help>
    <expand macro="citations" />
</tool>
author	iuc
date	Mon, 13 Sep 2021 14:51:31 +0000
parents	7fa28eb10fed
children