view umi-tools_extract.xml @ 17:f3759eec3018 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit 31bad8c5bf75981eafcd19ec6b00593f184fdeb8"
author iuc
date Thu, 18 Nov 2021 08:24:00 +0000
parents 7accf7407811
children 7a7e33d28f62
line wrap: on
line source

<tool id="umi_tools_extract" name="UMI-tools extract" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Extract UMI from fastq files</description>
    <!-- Import the shared macro file first; the expansions below are not defined
         in this file and are presumably provided by macros.xml. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools"/>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
    ## COMMAND_LINK, FASTQ_BARCODE_EXTRACTION_OPTIONS and LOG are tokens defined
    ## outside this file. NOTE(review): COMMAND_LINK presumably stages the inputs
    ## under the fixed input_* names used below and sets the gz variable;
    ## confirm in macros.xml.
    @COMMAND_LINK@

    umi_tools extract

        @FASTQ_BARCODE_EXTRACTION_OPTIONS@
        #if $input_type_cond.input_type == 'single':
            #if $gz:
                ## Compressed output goes to a temporary name and is moved onto
                ## the Galaxy output dataset at the end of the command.
                --stdin=input_single.gz
                --stdout out.gz
            #else
                --stdin=input_single.txt
                --stdout '$out'
            #end if
        #else:
            ## Both paired variants (two datasets or a paired collection).
            #if $gz:
                --stdin=input_read1.gz
                --read2-in=input_read2.gz
                --stdout out1.gz
                --read2-out=out2.gz
            #else:
                --stdin=input_read1.txt
                --read2-in=input_read2.txt
                #if $input_type_cond.input_type == 'paired'
                    --stdout '$out'
                    --read2-out='$out2'
                #else
                    --stdout '$out_paired_collection.forward'
                    --read2-out='$out_paired_collection.reverse'
                #end if
            #end if
            $input_type_cond.reconcile_pairs
        #end if

        #if $whitelist
            --whitelist='$whitelist'
        #end if
        #if $blacklist
            --blacklist='$blacklist'
        #end if
        ## Reference the boolean parameter directly so it renders as its
        ## truevalue/falsevalue; its .value attribute is the raw Python boolean.
        $error_correct_cell

        #if $quality.quality_selector =='true':
            #if str($quality.quality_filter_threshold) != ''
                --quality-filter-threshold '$quality.quality_filter_threshold'
            #end if
            #if str($quality.quality_filter_mask) != ''
                --quality-filter-mask '$quality.quality_filter_mask'
            #end if
            ## The quality encoding is derived from the datatype of the first read.
            #if $input_type_cond.input_type != 'paired_collection'
                #set $quality_source = $input_type_cond.input_read1
            #else
                #set $quality_source = $input_type_cond.input_readpair.forward
            #end if
            --quality-encoding
            #if $quality_source.ext.startswith("fastqillumina")
                phred64
            #else if $quality_source.ext.startswith("fastqsolexa")
                solexa
            #else
                phred33
            #end if
        #end if
        @LOG@
        ## Move the temporary compressed outputs onto the Galaxy output datasets.
        #if $gz:
            #if $input_type_cond.input_type == 'single':
                && mv out.gz '$out'
            #else if $input_type_cond.input_type == 'paired'
                && mv out1.gz '$out'
                && mv out2.gz '$out2'
            #else
                && mv out1.gz '$out_paired_collection.forward'
                && mv out2.gz '$out_paired_collection.reverse'
            #end if
        #end if
    ]]></command>
    <inputs>
        <!-- Input selection comes from the input_types macro (defined outside this
             file); the reconcile_pairs flag is handed to that macro as its body. -->
        <expand macro="input_types">
                <param argument="--reconcile-pairs" type="boolean" truevalue="--reconcile-pairs" falsevalue="" checked="false" label="Allow unpaired reads" help="Allow the presence of reads in read2 input that are not present in read1 input. This allows cell barcode filtering of read1s without considering read2s" />
        </expand>
        <expand macro="fastq_barcode_extraction_options_macro"/>

        <!-- Optional barcode lists; the command only passes each option when a
             dataset has been selected. -->
        <param argument="--whitelist" type="data" optional="true" format="txt,tabular,tsv" label="Allowlist of accepted barcodes" />
        <param argument="--blacklist" type="data" optional="true" format="txt,tabular,tsv" label="Denylist of rejected barcodes" />
        <param argument="--error-correct-cell" type="boolean" truevalue="--error-correct-cell" falsevalue="" checked="false" label="Apply correction to cell barcodes?" help="This only applies if your barcode file has two columns output from the umi_tools whitelist command" />

        <!-- Quality filtering is opt-in; both thresholds are optional and are only
             passed on the command line when a value has been entered. -->
        <conditional name="quality">
            <param name="quality_selector" type="select" label="Enable quality filter?" >
                <option value="false">No</option>
                <option value="true">Yes</option>
            </param>
            <when value="false">
            </when>
            <when value="true">
                <param argument="--quality-filter-threshold" label="Phred score threshold"
                    type="integer" value="" optional="true"
                    help="Remove reads where any UMI base quality score falls below this threshold" />
                <param argument="--quality-filter-mask" label="Mask UMI bases below threshold"
                    type="integer" value="" optional="true"
                    help="If a UMI base has a quality below this threshold,
                    replace the base with 'N'" />
            </when>
        </conditional>
        <expand macro="log_input_macro"/>
    </inputs>
    <outputs>
        <!-- First (or only) read file: produced for single and paired dataset input. -->
        <data name="out"
              format_source="input_read1"
              label="${tool.name} on ${on_string}: Reads">
            <filter>input_type_cond['input_type'] in ['single', 'paired']</filter>
        </data>
        <!-- Second read file: produced only for paired dataset input. -->
        <data name="out2"
              format_source="input_read2"
              label="${tool.name} on ${on_string}: Reads2">
            <filter>input_type_cond['input_type'] == 'paired'</filter>
        </data>
        <!-- Paired collection: produced only when the input is a paired collection. -->
        <collection name="out_paired_collection"
                    type="paired"
                    label="${tool.name} on ${on_string}: Reads">
            <filter>input_type_cond['input_type'] == 'paired_collection'</filter>
            <data name="forward" format_source="input_readpair" />
            <data name="reverse" format_source="input_readpair" />
        </collection>
        <expand macro="log_output_macro"/>
    </outputs>
    <tests>
        <test expect_num_outputs="2">
            <!-- Single-end input with UMI quality filtering. The command template
                 derives the quality encoding from the input datatype, so it is not
                 set as a test parameter. -->
            <conditional name="input_type_cond">
                <param name="input_type" value="single" />
                <param name="input_read1" value="t_R1.fastq" ftype="fastqsanger" />
                <param name="bc_pattern" value="XXXNNN" />
            </conditional>
            <conditional name="extract_method_cond">
                <param name="prime3" value="true" />
            </conditional>
            <conditional name="quality">
                <param name="quality_selector" value="true" />
                <param name="quality_filter_threshold" value="10" />
            </conditional>
            <param name="log" value="true"/>
            <output name="out" file="out_SE.fastq" ftype="fastqsanger" />
            <output name="out_log" >
                <assert_contents>
                    <has_text text="Input Reads: 100" />
                    <has_text text="umi quality: 28" />
                    <has_text text="Reads output: 72" />
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="3">
            <!-- Paired, gzip-compressed datasets: both read outputs plus the log. -->
            <conditional name="input_type_cond">
                <param name="input_type" value="paired"/>
                <param name="input_read1" ftype="fastqsanger.gz" value="t_R1.fastq.gz"/>
                <param name="input_read2" ftype="fastqsanger.gz" value="t_R2.fastq.gz"/>
                <param name="bc_pattern" value="NNNXXX"/>
            </conditional>
            <param name="log" value="true"/>
            <output name="out" ftype="fastqsanger.gz" file="out_R1.fastq.gz" decompress="true" lines_diff="2"/>
            <output name="out2" ftype="fastqsanger.gz" file="out_R2.fastq.gz" decompress="true" lines_diff="2"/>
            <output name="out_log">
                <assert_contents>
                    <has_text text="Input Reads: 100"/>
                    <has_text text="Reads output: 100"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="4">
            <!-- Paired collection input; same as before, but uncompressed. -->
            <conditional name="input_type_cond">
                <param name="input_type" value="paired_collection"/>
                <param name="paired_type" value="no"/>
                <param name="input_readpair">
                    <collection type="paired">
                        <element name="forward" value="t_R1.fastq" ftype="fastqsanger"/>
                        <element name="reverse" value="t_R2.fastq" ftype="fastqsanger"/>
                    </collection>
                </param>
                <param name="bc_pattern" value="NNNXXX"/>
            </conditional>
            <param name="log" value="true"/>
            <output_collection name="out_paired_collection" type="paired">
                <element name="forward" ftype="fastqsanger" file="out_R1.fastq"/>
                <element name="reverse" ftype="fastqsanger" file="out_R2.fastq"/>
            </output_collection>
            <output name="out_log">
                <assert_contents>
                    <has_text text="Input Reads: 100"/>
                    <has_text text="Reads output: 100"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="3">
            <!-- Paired input, string barcode pattern, filtered against an allowlist;
                 only the second read output is compared. -->
            <conditional name="input_type_cond">
                <param name="input_type" value="paired"/>
                <param name="input_read1" ftype="fastqsanger.gz" value="scrb_seq_fastq.1.gz"/>
                <param name="input_read2" ftype="fastqsanger.gz" value="scrb_seq_fastq.2.gz"/>
                <param name="bc_pattern" value="CCCCCCNNNNNNNNNN"/>
            </conditional>
            <param name="extract_method" value="string"/>
            <param name="whitelist" value="scrb_seq_barcodes"/>
            <param name="log" value="true"/>
            <output name="out2" ftype="fastqsanger.gz" file="scrb_extract.fastq.gz" decompress="true"/>
        </test>
        <test expect_num_outputs="3">
            <!-- same as above but with regex barcode -->
            <conditional name="input_type_cond">
                <param name="input_type" value="paired"/>
                <param name="input_read1" ftype="fastqsanger.gz" value="scrb_seq_fastq.1.gz"/>
                <param name="input_read2" ftype="fastqsanger.gz" value="scrb_seq_fastq.2.gz"/>
                <param name="bc_pattern" value="^(?P&lt;cell_1&gt;.{6})(?P&lt;umi_1&gt;.{10})"/>
            </conditional>
            <param name="extract_method" value="regex"/>
            <param name="whitelist" value="scrb_seq_barcodes"/>
            <param name="log" value="true"/>
            <output name="out2" ftype="fastqsanger.gz" file="scrb_extract.fastq.gz" decompress="true"/>
        </test>
        <test expect_num_outputs="2">
            <!-- CelSeq2 example: paired gzip input, no log requested. -->
            <conditional name="input_type_cond">
                <param name="input_type" value="paired"/>
                <param name="input_read1" ftype="fastqsanger.gz" value="read_R1.200.gz"/>
                <param name="input_read2" ftype="fastqsanger.gz" value="read_R2.200.gz"/>
                <param name="bc_pattern" value="NNNNNNCCCCCC"/>
            </conditional>
            <param name="extract_method" value="string"/>
            <output name="out" file="read_R1.200_extracted.fastq.gz" decompress="true" lines_diff="1" ftype="fastqsanger.gz"/>
            <output name="out2" file="read_R2.200_extracted.fastq.gz" decompress="true" lines_diff="1" ftype="fastqsanger.gz"/>
        </test>
    </tests>
    <help><![CDATA[

extract - Extract UMI from fastq
================================

Extract UMI barcode from a read and add it to the read name, leaving
any sample barcode in place.

Can deal with paired end reads and UMIs
split across the paired ends. Can also optionally extract cell
barcodes and append these to the read name. See the section below
for an explanation of how to encode the barcode pattern(s) to
specify the position of the UMI +/- cell barcode.


Filtering and correcting cell barcodes
--------------------------------------

``umi_tools extract`` can optionally filter cell barcodes against a user-supplied
whitelist (``--whitelist``). If a whitelist is not available for your data,
e.g. if you have performed droplet-based scRNA-Seq, you can use the
whitelist tool.

Cell barcodes which do not match the whitelist (user-generated or
automatically generated) can also be optionally corrected using the
``--error-correct-cell`` option.

The whitelist should be in the following format (tab-separated)::

        AAAAAA    AGAAAA
        AAAATC
        AAACAT
        AAACTA    AAACTN,GAACTA
        AAATAC
        AAATCA    GAATCA
        AAATGT    AAAGGT,CAATGT

Where column 1 is the whitelisted cell barcodes and column 2 is
the list (comma-separated) of other cell barcodes which should be
corrected to the barcode in column 1. If the ``--error-correct-cell``
option is not used, this column will be ignored. Any additional columns
in the whitelist input, such as the counts columns from the output of
umi_tools whitelist, will be ignored.

@FASTQ_BARCODE_EXTRACTION_HELP@

    ]]></help>
    <expand macro="citations" />
</tool>