<!-- Mercurial > repos > lparsons > cutadapt -->

<tool id="cutadapt" name="Cutadapt" version="1.16.7" profile="17.09">
    <!-- Short human-readable summary of the tool. -->
    <description>Remove adapter sequences from Fastq/Fasta</description>
    <!-- External macro file. The @read1_options@ / @read2_options@ tokens used in
         the command and the <expand macro="..."/> elements used in inputs and
         outputs are presumably defined there (macros.xml is not shown here). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Runtime dependency: the cutadapt package, pinned to version 1.16. -->
    <requirements>
        <requirement type="package" version="1.16">cutadapt</requirement>
    </requirements>

    <!-- Command run to record the version of the underlying program. -->
    <version_command>cutadapt --version</version_command>

    <command detect_errors="exit_code"><![CDATA[
## Link in the input and output files, so Cutadapt can tell their type.
## Link names are the dataset/collection identifiers with every character that
## is not a word character, '-' or whitespace replaced by '_', followed by an
## extension chosen from the Galaxy datatype of the input.

#import re
#set read1 = "input_f"
#set read2 = "input_r"
#set paired = False
#set library_type = str($library.type)
#if $library_type == 'paired':
    #set paired = True
    #set read1 = re.sub('[^\w\-\s]', '_', str($library.input_1.element_identifier))
    #set read2 = re.sub('[^\w\-\s]', '_', str($library.input_2.element_identifier))
    #set input_1 = $library.input_1
    #set input_2 = $library.input_2
#else if $library_type == 'paired_collection':
    ## Both mates come from one paired collection; the link names share the
    ## collection name and are told apart by the _1 / _2 suffix.
    #set paired = True
    #set input_1 = $library.input_1.forward
    #set input_2 = $library.input_1.reverse
    #set read1 = re.sub('[^\w\-\s]', '_', str($library.input_1.name)) + "_1"
    #set read2 = re.sub('[^\w\-\s]', '_', str($library.input_1.name)) + "_2"
#else:
    #set input_1 = $library.input_1
    #set read1 = re.sub('[^\w\-\s]', '_', str($library.input_1.element_identifier))
#end if

## Extension for read 1 (and for every read 1 derived output file).
#if $input_1.is_of_type("fastq.gz", "fastqsanger.gz"):
    #set ext = ".fq.gz"
#else if $input_1.is_of_type("fastq.bz2", "fastqsanger.bz2"):
    #set ext = ".fq.bz2"
#else if $input_1.is_of_type('fasta'):
    #set ext = ".fa"
#else:
    #set ext = ".fq"
#end if
#set read1 = $read1 + $ext
## Work-dir file names; the <outputs> section picks them up via from_work_dir.
## NOTE(review): rest_output, wild_output, too_short_output, too_long_output and
## untrimmed_output (and the *_paired_output names below) are not referenced
## anywhere else in this command body - presumably the @read1_options@ /
## @read2_options@ tokens from macros.xml consume them; confirm.
#set out1 = "out1" + $ext
#set rest_output = "rest_output" + $ext
#set wild_output = "wild_output" + $ext
#set too_short_output = "too_short_output" + $ext
#set too_long_output = "too_long_output" + $ext
#set untrimmed_output = "untrimmed_output" + $ext
ln -f -s '${input_1}' '$read1' &&

#if $paired:
    ## Extension for read 2, chosen the same way as for read 1.
    #if $input_2.is_of_type("fastq.gz", "fastqsanger.gz"):
        #set ext2 = ".fq.gz"
    #else if $input_2.is_of_type("fastq.bz2", "fastqsanger.bz2"):
        ## bzip2 input must get a .bz2 name (this branch used to assign
        ## ".fq.gz", mislabelling the compression of read 2 and its outputs).
        #set ext2 = ".fq.bz2"
    #else if $input_2.is_of_type('fasta'):
        #set ext2 = ".fa"
    #else:
        #set ext2 = ".fq"
    #end if
    #set read2 = $read2 + $ext2
    #set out2 = "out2" + $ext2
    #set too_short_paired_output = "too_short_paired_output" + $ext2
    #set too_long_paired_output = "too_long_paired_output" + $ext2
    #set untrimmed_paired_output = "untrimmed_paired_output" + $ext2
    ln -f -s '${input_2}' '$read2' &&
#end if

## Run Cutadapt

#if $output_options.multiple_output:
    ## Per-adapter files are written here and collected as "split_output".
    mkdir split &&
#end if

cutadapt

## cutadapt (up to version 1.16) can't be run in multicore mode with these options
#if not any(($output_options.info_file, $output_options.rest_file, $output_options.wildcard_file, $output_options.too_short_file, $output_options.too_long_file, $output_options.untrimmed_file))
     -j \${GALAXY_SLOTS:-1}
#end if

#if str( $library.type ) == "single":
    @read1_options@
    #if $output_options.multiple_output:
        ## {name} is left as-is for cutadapt; only the extension is filled in here.
        --output='split/{name}.${input_1.ext}'
    #else:
        --output='$out1'
    #end if
#else:
    @read1_options@
    @read2_options@
    --output='$out1'
    --paired-output='$out2'
#end if

## Adapter options (booleans/selects expand to the flag itself or to nothing)
--error-rate=$adapter_options.error_rate
--times=$adapter_options.count
--overlap=$adapter_options.overlap
$adapter_options.no_indels
$adapter_options.match_read_wildcards
$adapter_options.no_trim
$adapter_options.mask_adapter

## Filter options
$filter_options.discard
$filter_options.discard_untrimmed
#if str($filter_options.min):
    --minimum-length=$filter_options.min
#end if
#if str($filter_options.max):
    --maximum-length=$filter_options.max
#end if
## Compare the string form, like min/max above, so that an explicit 0
## ("discard every read containing an N") is not dropped as a false value.
#if str($filter_options.max_n):
    --max-n=$filter_options.max_n
#end if
#if str( $library.type ) != "single":
    #if $filter_options.pair_filter:
        --pair-filter=$filter_options.pair_filter
    #end if
#end if


## Read modification options ('0' / empty string mean "not set")
#if str($read_mod_options.quality_cutoff) != '0':
   --quality-cutoff=$read_mod_options.quality_cutoff
#end if
#if str($read_mod_options.nextseq_trim) != '0':
    --nextseq-trim=$read_mod_options.nextseq_trim
#end if
$read_mod_options.trim_n
#if $read_mod_options.prefix != '':
    --prefix="$read_mod_options.prefix"
#end if
#if $read_mod_options.suffix != '':
    --suffix="$read_mod_options.suffix"
#end if
## The strip_suffix input was defined in the form but never passed on.
#if $read_mod_options.strip_suffix != '':
    --strip-suffix="$read_mod_options.strip_suffix"
#end if
#if str($read_mod_options.length) != '0':
    --length=$read_mod_options.length
#end if
#if $read_mod_options.length_tag != '':
    --length-tag="$read_mod_options.length_tag"
#end if

## Positional arguments: the linked input file(s)
'${read1}'
#if $paired:
    '${read2}'
#end if

## Capture cutadapt's standard output as the report only when requested.
#if $output_options.report:
    > report.txt
#end if
    ]]></command>
    <inputs>

        <!-- Reads -->
        <conditional name="library">
            <!-- Selects which of the three input layouts below is shown. -->
            <param name="type" type="select" label="Single-end or Paired-end reads?">
                <option value="single">Single-end</option>
                <option value="paired">Paired-end</option>
                <option value="paired_collection">Paired-end Collection</option>
            </param>

            <!-- One dataset; adapter options come from the single_end_options macro. -->
            <when value="single">
                <param name="input_1" format="fastq.gz,fastq,fastqsanger,fastqsanger.gz,fastqsanger.bz2,fasta" type="data" label="FASTQ/A file" help="Should be of datatype &quot;fastq.gz&quot; or &quot;fasta&quot;" />
                <expand macro="single_end_options" />
            </when>

            <!-- Two separate datasets, one per mate. -->
            <when value="paired">
                <param name="input_1" format="fastqsanger,fastqsanger.gz,fastqsanger.bz2,fasta" type="data" label="FASTQ/A file #1" help="Should be of datatype &quot;fastq.gz&quot; or &quot;fasta&quot;" />
                <param name="input_2" format="fastqsanger,fastqsanger.gz,fastqsanger.bz2,fasta" type="data" label="FASTQ/A file #2" help="Should be of datatype &quot;fastq.gz&quot; or &quot;fasta&quot;" />
                <expand macro="paired_end_options" />
            </when>

            <!-- A single paired collection; the command reads its forward and reverse elements. -->
            <when value="paired_collection">
                <param name="input_1" format="fastqsanger,fastqsanger.gz,fastqsanger.bz2,fasta" type="data_collection" collection_type="paired" label="Paired Collection" help="Should be of datatype &quot;fastq.gz&quot; or &quot;fasta&quot;" />
                <expand macro="paired_end_options" />
            </when>

        </conditional>

        <!-- Adapter Options -->
        <section name="adapter_options" title="Adapter Options">
            <!-- Numeric alignment settings; always passed on the command line. -->
            <param name="error_rate" type="float" argument="--error-rate"
                   min="0" max="1" value="0.1"
                   label="Maximum error rate"
                   help="Maximum allowed error rate (no. of errors divided by the length of the matching region)." />
            <!-- Boolean flags expand to the flag text when checked and to nothing otherwise. -->
            <param name="no_indels" type="boolean" argument="--no-indels"
                   value="False" truevalue="--no-indels" falsevalue=""
                   label="Do not allow indels (Use ONLY with anchored 5' (front) adapters)."
                   help="Do not allow indels in the alignments. That is, allow only mismatches. This option is currently only supported for anchored 5' adapters ('^ADAPTER') (default: both mismatches and indels are allowed)." />
            <param name="count" type="integer" argument="--times"
                   min="1" value="1"
                   label="Match times"
                   help="Try to remove adapters at most COUNT times. Useful when an adapter gets appended multiple times." />
            <param name="overlap" type="integer" argument="--overlap"
                   min="1" value="3"
                   label="Minimum overlap length"
                   help="Minimum overlap length. If the overlap between the adapter and the sequence is shorter than LENGTH, the read is not modified. This reduces the number of bases trimmed purely due to short random adapter matches." />
            <!-- The option values are inserted into the command verbatim. -->
            <param name="match_read_wildcards" type="select"
                   label="Match Wildcards"
                   help="Allow 'N's as matches. Default: In the adapters but not in the reads">
                <option value=" " selected="True">In the adapters but not in the reads</option>
                <option value="--match-read-wildcards">In the adapters and in the reads</option>
                <option value="--no-match-adapter-wildcards">Nowhere</option>
            </param>
            <param name="no_trim" type="boolean" argument="--no-trim"
                   value="False" truevalue="--no-trim" falsevalue=""
                   label="Do not trim adapters"
                   help="Match and redirect reads to output/untrimmed-output as usual, but don't remove the adapters (default: trim the adapters)." />
            <param name="mask_adapter" type="boolean" argument="--mask-adapter"
                   value="False" truevalue="--mask-adapter" falsevalue=""
                   label="Mask Adapters"
                   help="Mask adapter bases with 'N' instead of trimming them (default: trim adapters)." />
        </section>

        <!-- Filter Options -->
        <section name="filter_options" title="Filter Options">
            <!-- Boolean flags expand to the flag text when checked and to nothing otherwise. -->
            <param name="discard" argument="--discard-trimmed" type="boolean" value="False" truevalue="--discard-trimmed" falsevalue="" label="Discard Trimmed Reads" help="Discard reads that contain the adapter instead of trimming them. Use the 'Minimum overlap length' option in order to avoid throwing away too many randomly matching reads!" />
            <!-- The argument attribute now spells the flag with hyphens, matching truevalue. -->
            <param name="discard_untrimmed" argument="--discard-untrimmed" type="boolean" value="False" truevalue="--discard-untrimmed" falsevalue="" label="Discard Untrimmed Reads" help="Discard reads that do not contain the adapter." />
            <!-- Optional numeric filters; the command only emits them when a value is given. -->
            <param name="min" argument="--minimum-length" type="integer" min="0" optional="True" value="" label="Minimum length" help="Discard trimmed reads that are shorter than LENGTH.  Reads that are too short even before adapter removal are also discarded. In colorspace, an initial primer is not counted." />
            <param name="max" argument="--maximum-length" type="integer" min="0" optional="True" value="" label="Maximum length" help="Discard trimmed reads that are longer than LENGTH.  Reads that are too long even before adapter removal are also discarded. In colorspace, an initial primer is not counted." />
            <param name="max_n" argument="--max-n" type="float" min="0" optional="True" label="Max N" help="Discard reads with more than this number of 'N' bases. A number between 0 and 1 is interpreted as a fraction of the read length." />
            <!-- Only used by the command for paired-end library types. -->
            <param name="pair_filter" argument="--pair-filter" type="select" optional="True" label="Pair filter" help="Which of the reads in a paired-end read have to match the filtering criterion in order for the pair to be filtered. Default: any">
                <option value="any" selected="True">any</option>
                <option value="both">both</option>
            </param>
        </section>

        <!-- Read Modification Options -->
        <section name="read_mod_options" title="Read Modification Options">
            <!-- Free text restricted to digits and commas; the command treats "0" as not set. -->
            <param name="quality_cutoff" type="text" argument="--quality-cutoff"
                   value="0"
                   label="Quality cutoff"
                   help=" Trim low-quality bases from 5' and/or 3' ends of each read before adapter removal. Applied to both reads if data is paired. If one value is given, only the 3' end is trimmed. If two comma-separated cutoffs are given, the 5' end is trimmed with the first cutoff, the 3' end with the second.">
                <sanitizer>
                    <valid initial="string.digits">
                        <add value="," />
                    </valid>
                </sanitizer>
            </param>
            <param name="nextseq_trim" type="integer" argument="--nextseq-trim"
                   value="0"
                   label="NextSeq trimming"
                   help="Experimental option for quality trimming of NextSeq data. This is necessary because that machine cannot distinguish between G and reaching the end of the fragment (it encodes G as ‘black’). This option works like regular quality trimming (where one would use -q 20 instead), except that the qualities of G bases are ignored." />
            <param name="trim_n" type="boolean" argument="--trim-n"
                   truevalue="--trim-n" falsevalue="" checked="False"
                   label="Trim Ns"
                   help="Trim N's on ends of reads." />
            <!-- Read-name edits; an empty text value means the option is not used. -->
            <param name="prefix" type="text" argument="--prefix"
                   label="Prefix"
                   help="Add this prefix to read names" />
            <param name="suffix" type="text" argument="--suffix"
                   label="Suffix"
                   help="Add this suffix to read names" />
            <param name="strip_suffix" type="text" argument="--strip-suffix"
                   label="Strip suffix"
                   help="Remove this suffix from read names if present." />
            <param name="length" type="integer" argument="--length"
                   value="0"
                   label="Length"
                   help="Shorten reads to this length. This modification is applied after adapter trimming." />
            <param name="length_tag" type="text" argument="--length-tag"
                   label="Length Tag"
                   help="Search for TAG followed by a decimal number in the name of the read (description/comment field of the FASTA or FASTQ file). Replace the decimal number with the correct length of the trimmed read. For example, use --length-tag 'length=' to search for fields like 'length=123'." />
        </section>

        <!-- Output Options -->
        <section name="output_options" title="Output Options">
            <!-- Each switch below enables the output of the same purpose declared in the outputs section. -->
            <param name="report" type="boolean"
                   value="False"
                   label="Report"
                   help="Cutadapt's per-adapter statistics. You can use this file with MultiQC."/>
            <param name="info_file" type="boolean" argument="--info-file"
                   value="False"
                   label="Info File"
                   help="Write information about each read and its adapter matches to a file."/>
            <param name="rest_file" type="boolean" argument="--rest-file"
                   value="False"
                   label="Rest of Read"
                   help="When the adapter matches in the middle of a read, write the rest (after the adapter) into a file."/>
            <param name="wildcard_file" type="boolean" argument="--wildcard-file"
                   value="False"
                   label="Wildcard File"
                   help="When the adapter has wildcard bases ('N's) write adapter bases matching wildcard positions to file."/>
            <param name="too_short_file" type="boolean" argument="--too-short-output"
                   value="False"
                   label="Too Short Reads"
                   help="Write reads that are too short (according to minimum length specified) to a file. (default: discard reads)"/>
            <param name="too_long_file" type="boolean" argument="--too-long-output"
                   value="False"
                   label="Too Long Reads"
                   help="Write reads that are too long (according to maximum length specified) to a file. (default: discard reads)"/>
            <param name="untrimmed_file" type="boolean" argument="--untrimmed-output"
                   value="False"
                   label="Untrimmed Reads"
                   help="Write reads that do not contain the adapter to a separate file, instead of writing them to the regular output file.  (default: output to same file as trimmed)"/>
            <!-- When enabled, the command writes into the split directory and the regular Read 1 output is filtered out. -->
            <param name="multiple_output" type="boolean" argument=""
                   value="False"
                   label="Multiple output"
                   help="Create a separate file for each adapter trimmed  (default: all trimmed reads are in a single file)"/>
        </section>

    </inputs>

    <outputs>
        <!-- Main trimmed reads; read 1 is replaced by the split collection when multiple output is on. -->
        <data name="out1"
              label="${tool.name} on ${on_string}: Read 1 Output"
              format="fastqsanger" metadata_source="input_1" from_work_dir="out1*">
            <filter>(output_options['multiple_output'] is False)</filter>
            <expand macro="inherit_format_1" />
        </data>
        <data name="out2"
              label="${tool.name} on ${on_string}: Read 2 Output"
              format="fastqsanger" metadata_source="input_2" from_work_dir="out2*">
            <filter>(library['type'] == 'paired' or library['type'] == 'paired_collection')</filter>
            <expand macro="inherit_format_2" />
        </data>

        <!-- Text report, taken from the report.txt file the command redirects into. -->
        <data name="report"
              label="${tool.name} on ${on_string}: Report"
              format="txt" from_work_dir="report.txt">
            <filter>(output_options['report'] is True)</filter>
        </data>

        <data name="info_file"
              label="${tool.name} on ${on_string}: Info File"
              format="txt" metadata_source="input_1">
            <filter>(output_options['info_file'] is True)</filter>
        </data>

        <data name="rest_output"
              label="${tool.name} on ${on_string}: Rest of Reads (R1 only)"
              format="fastqsanger" metadata_source="input_1" from_work_dir="rest_output*">
            <filter>(output_options['rest_file'] is True)</filter>
            <expand macro="inherit_format_1" />
        </data>

        <data name="wild_output"
              label="${tool.name} on ${on_string}: Wildcard File"
              format="txt" metadata_source="input_1" from_work_dir="wild_output*">
            <filter>(output_options['wildcard_file'] is True)</filter>
        </data>

        <!-- Untrimmed reads; the read 2 variant additionally requires a paired library type. -->
        <data name="untrimmed_output"
              label="${tool.name} on ${on_string}: Untrimmed Read 1"
              format="fastqsanger" metadata_source="input_1" from_work_dir="untrimmed_output*">
            <filter>(output_options['untrimmed_file'] is True)</filter>
            <expand macro="inherit_format_1" />
        </data>
        <data name="untrimmed_paired_output"
              label="${tool.name} on ${on_string}: Untrimmed Read 2"
              format="fastqsanger" metadata_source="input_2" from_work_dir="untrimmed_paired_output*">
            <filter>(library['type'] == 'paired' or library['type'] == 'paired_collection')</filter>
            <filter>(output_options['untrimmed_file'] is True)</filter>
            <expand macro="inherit_format_2" />
        </data>

        <!-- Reads rejected by the minimum length filter. -->
        <data name="too_short_output"
              label="${tool.name} on ${on_string}: Too Short Read 1"
              format="fastqsanger" metadata_source="input_1" from_work_dir="too_short_output*">
            <filter>(output_options['too_short_file'] is True)</filter>
            <expand macro="inherit_format_1" />
        </data>
        <data name="too_short_paired_output"
              label="${tool.name} on ${on_string}: Too Short Read 2"
              format="fastqsanger" metadata_source="input_2" from_work_dir="too_short_paired_output*">
            <filter>(library['type'] == 'paired' or library['type'] == 'paired_collection')</filter>
            <filter>(output_options['too_short_file'] is True)</filter>
            <expand macro="inherit_format_2" />
        </data>

        <!-- Reads rejected by the maximum length filter. -->
        <data name="too_long_output"
              label="${tool.name} on ${on_string}: Too Long Read 1"
              format="fastqsanger" metadata_source="input_1" from_work_dir="too_long_output*">
            <filter>(output_options['too_long_file'] is True)</filter>
            <expand macro="inherit_format_1" />
        </data>
        <data name="too_long_paired_output"
              label="${tool.name} on ${on_string}: Too Long Read 2"
              format="fastqsanger" metadata_source="input_2" from_work_dir="too_long_paired_output*">
            <filter>(library['type'] == 'paired' or library['type'] == 'paired_collection')</filter>
            <filter>(output_options['too_long_file'] is True)</filter>
            <expand macro="inherit_format_2" />
        </data>

        <!-- Per-adapter files written to the split directory by the command. The file
             extension is the Galaxy datatype of the input, so the pattern accepts
             fasta as well as the fastq variants (single-end input may be fasta). -->
        <collection name="split_output" type="list" label="${tool.name} on ${on_string}: Split outputs"  >
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;fast[aq].*)" directory="split" />
            <filter>(output_options['multiple_output'] is True)</filter>
        </collection>

    </outputs>

    <tests>
        <!-- Ensure single-end, uncompressed fastq works with a user-supplied adapter -->
        <test>
            <param name="type" value="single" />
            <param name="input_1" ftype="fastq" value="cutadapt_small.fastq" />
            <param name="adapter_source_list" value="user"/>
            <param name="adapter" value="AGATCGGAAGAGC"/>
            <output name="out1" file="cutadapt_small.out" ftype="fastq"/>
        </test>
        <!-- Ensure single end fastq.gz works (output is decompressed before comparison) -->
        <test>
            <param name="type" value="single" />
            <param name="input_1" ftype="fastq.gz" value="bwa-mem-fastq1.fq.gz" />
            <param name="adapter_source_list" value="user"/>
            <param name="adapter" value="AGATCGGAAGAGC"/>
            <output name="out1" decompress="True" file="cutadapt_out1.fq.gz" ftype="fastq.gz"/>
        </test>
        <!-- Ensure paired end fastq.gz works -->
        <test>
            <param name="type" value="paired" />
            <param name="input_1" ftype="fastq.gz" value="bwa-mem-fastq1.fq.gz" />
            <param name="input_2" ftype="fastq.gz" value="bwa-mem-fastq2.fq.gz" />
            <param name="adapter_source_list" value="user"/>
            <param name="adapter" value="AGATCGGAAGAGC"/>
            <param name="adapter_source_list2" value="user"/>
            <param name="adapter2" value="AGATCGGAAGAGC"/>
            <output name="out1" decompress="True" file="cutadapt_out1.fq.gz" ftype="fastq.gz"/>
            <output name="out2" decompress="True" file="cutadapt_out2.fq.gz" ftype="fastq.gz"/>
            <!-- With no filter options set, no filter flag may appear on the command
                 line; the pair filter shows up with its preselected value "any". -->
            <assert_command>
                <not_has_text text="--discard-trimmed"/>
                <not_has_text text="--discard-untrimmed"/>
                <not_has_text text="--minimum-length"/>
                <not_has_text text="--maximum-length"/>
                <not_has_text text="--max-n"/>
                <has_text text="--pair-filter=any"/>
            </assert_command>
        </test>
        <!-- Ensure paired collection works (same data and expected outputs as the paired test) -->
        <test>
            <param name="type" value="paired_collection" />
            <param name="input_1">
                <collection type="paired">
                    <element name="forward" ftype="fastq.gz" value="bwa-mem-fastq1.fq.gz" />
                    <element name="reverse" ftype="fastq.gz" value="bwa-mem-fastq2.fq.gz" />
                </collection>
            </param>
            <param name="adapter_source_list" value="user"/>
            <param name="adapter" value="AGATCGGAAGAGC"/>
            <param name="adapter_source_list2" value="user"/>
            <param name="adapter2" value="AGATCGGAAGAGC"/>
            <output name="out1" decompress="True" file="cutadapt_out1.fq.gz" ftype="fastq.gz"/>
            <output name="out2" decompress="True" file="cutadapt_out2.fq.gz" ftype="fastq.gz"/>
        </test>
        <!-- Ensure built-in adapters work (adapter chosen from the built-in list instead of typed in) -->
        <test>
            <param name="type" value="single" />
            <param name="input_1" ftype="fastq" value="cutadapt_small.fastq" />
            <param name="adapter_source_list" value="builtin"/>
            <param name="adapter" value="TGTAGGCC"/>
            <output name="out1" file="cutadapt_builtin.out" ftype="fastq"/>
        </test>
        <!-- Ensure discard file output works -->
        <test>
            <param name="type" value="single" />
            <param name="input_1" ftype="fastq" value="cutadapt_small.fastq" />
            <param name="adapter_source_list" value="user"/>
            <param name="adapter" value="TTAGACATATCTCCGTCG"/>
            <section name="filter_options">
                <param name="discard" value="True"/>
            </section>
            <output name="out1" file="cutadapt_discard.out" ftype="fastq"/>
            <assert_command>
                <has_text text="--discard-trimmed"/>
            </assert_command>
        </test>
        <!-- Ensure rest file output works -->
        <test>
            <param name="type" value="single" />
            <param name="input_1" ftype="fasta" value="cutadapt_rest.fa" />
            <param name="adapter_source_list" value="user"/>
            <param name="adapter" value="AAAGATG"/>
            <param name="rest_file" value="True"/>
            <output name="out1" file="cutadapt_rest.out" ftype="fasta"/>
            <output name="rest_output" file="cutadapt_rest2.out" ftype="fasta"/>
        </test>
        <!-- Ensure nextseq-trim option works -->
        <test>
            <param name="type" value="single" />
            <param name="input_1" ftype="fastq.gz" value="bwa-mem-fastq1.fq.gz" />
            <param name="adapter_source_list" value="user"/>
            <param name="adapter" value="AGATCGGAAGAGC"/>
            <param name="nextseq_trim" value="20" />
            <output name="out1" decompress="True" file="cutadapt_nextseq_out.fq.gz" ftype="fastq.gz"/>
        </test>
        <!-- Ensure Report and Info file output work; the report is only checked for the word "Summary" -->
        <test>
            <param name="type" value="single" />
            <param name="input_1" ftype="fastq" value="cutadapt_small.fastq" />
            <param name="adapter_source_list" value="user"/>
            <param name="adapter" value="AGATCGGAAGAGC"/>
            <param name="report" value="True" />
            <param name="info_file" value="True" />
            <output name="out1" value="cutadapt_small.out" ftype="fastq"/>
            <output name="report">
                <assert_contents>
                    <has_text text="Summary"/>
                </assert_contents>
            </output>
            <output name="info_file" value="cutadapt_info_out.txt" ftype="txt"/>
        </test>


        <!-- Ensure multiple output works with two named, user-supplied front adapters:
             the split collection must hold A1, A2 and "unknown" -->
        <test>
            <conditional name="library">
                <param name="type" value="single" />
                <param name="input_1" ftype="fastq" value="cutadapt_in_split.fastq" />
                <section name="r1" >
                    <repeat name="front_adapters">
                        <conditional name="front_adapter_source">
                            <param name="front_adapter_source_list" value="user"/>
                            <param name="front_adapter_name" value="A1" />
                            <param name="front_adapter" value="^GTCGGTAA" />
                        </conditional>
                    </repeat>
                    <repeat name="front_adapters">
                        <conditional name="front_adapter_source">
                            <param name="front_adapter_source_list" value="user"/>
                            <param name="front_adapter_name" value="A2" />
                            <param name="front_adapter" value="^AGGTCACT" />
                        </conditional>
                    </repeat>
                </section>
            </conditional>
            <param name="report" value="False" />
            <param name="info_file" value="False" />
            <param name="multiple_output" value="True" />
            <output_collection name="split_output" type="list" count="3">
                   <element name="A1" value="A1.fastq" ftype="fastq">
                   </element>
                   <element name="A2" value="A2.fastq" ftype="fastq">
                   </element>
                   <element name="unknown" value="unknown.fastq" ftype="fastq">
                   </element>
             </output_collection>
        </test>

        <!-- Ensure multiple output works with gzipped input and front adapters read from a FASTA file -->
        <test>
            <conditional name="library">
                <param name="type" value="single" />
                <param name="input_1" ftype="fastq.gz" value="cutadapt_in_split.fastq.gz" />
                <section name="r1" >
                    <repeat name="front_adapters">
                        <conditional name="front_adapter_source">
                            <param name="front_adapter_source_list" value="file"/>
                            <param name="front_adapter_file" ftype="fasta"  value="barcodes.fasta" />
                        </conditional>
                    </repeat>
                </section>
            </conditional>
            <param name="report" value="False" />
            <param name="info_file" value="False" />
            <param name="multiple_output" value="True" />
            <output_collection name="split_output" type="list" count="3">
                   <element name="A1"  decompress="True" file="A1.fastq.gz" ftype="fastq.gz">
                   </element>
                   <element name="A2" decompress="True" file="A2.fastq.gz" ftype="fastq.gz">
                   </element>
                   <element name="unknown" decompress="True" file="unknown.fastq.gz" ftype="fastq.gz">
                   </element>
             </output_collection>
        </test>

        <!-- Ensure untrimmed file output works -->
        <test>
            <param name="type" value="single" />
            <param name="input_1" ftype="fastq" value="cutadapt_small.fastq" />
            <param name="adapter_source_list" value="user"/>
            <param name="adapter" value="AAAT"/>
            <param name="untrimmed_file" value="True" />
            <output name="out1" file="cutadapt_trimmed.out" ftype="fastq"/>
            <output name="untrimmed_output" file="cutadapt_untrimmed.out" ftype="fastq"/>
        </test>
        <!-- Ensure untrimmed gzip file output works -->
        <test>
            <param name="type" value="single" />
            <param name="input_1" ftype="fastq.gz" value="bwa-mem-fastq1.fq.gz" />
            <param name="adapter_source_list" value="user"/>
            <param name="adapter" value="AGATCGGAAGAGC"/>
            <param name="untrimmed_file" value="True" />
            <output name="out1" decompress="True" file="cutadapt_trimmed.out.gz" ftype="fastq.gz"/>
            <!--
                Do not use the decompress option for this assertion, since it does NOT test that the file is compressed.
                See discussion at https://github.com/galaxyproject/galaxy/issues/7671
                `delta="4000"` is more than the difference between gzip level 1 and gzip level 9, but much less than the
                difference between level 1 compression and no compression.
            -->
            <output name="untrimmed_output" file="cutadapt_untrimmed.out.gz" compare="sim_size" delta="4000" ftype="fastq.gz"/>
        </test>
        <!-- Same inputs as the paired-end fastq.gz test, plus filter options. Only the
             generated command line is asserted; because of discard_untrimmed no
             output comparison is done. -->
        <test>
            <param name="type" value="paired" />
            <param name="input_1" ftype="fastq.gz" value="bwa-mem-fastq1.fq.gz" />
            <param name="input_2" ftype="fastq.gz" value="bwa-mem-fastq2.fq.gz" />
            <param name="adapter_source_list" value="user"/>
            <param name="adapter" value="AGATCGGAAGAGC"/>
            <param name="adapter_source_list2" value="user"/>
            <param name="adapter2" value="AGATCGGAAGAGC"/>
            <section name="filter_options">
                <param name="discard_untrimmed" value="true"/>
                <param name="min" value="1"/>
                <param name="max" value="1000"/>
                <param name="max_n" value="1"/>
                <param name="pair_filter" value="both"/>
            </section>
            <assert_command>
                <has_text text="--discard-untrimmed"/>
                <has_text text="--minimum-length=1"/>
                <has_text text="--maximum-length=1000"/>
                <has_text text="--max-n=1"/>
                <has_text text="--pair-filter=both"/>
            </assert_command>
        </test>
    </tests>

    <help><![CDATA[

.. class:: infomark

**What it does**

-------------------

**Cutadapt** finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads.

Cleaning your data in this way is often required. Reads from small-RNA sequencing (such as microRNA or CRISPR data) contain the 3’ sequencing adapter because the read is longer than the molecule that is sequenced. Poly-A tails are useful for pulling out RNA from your sample, but often you don’t want them to be in your reads.

Cutadapt_ helps with these trimming tasks by finding the adapter or primer sequences in an error-tolerant way. It can also modify and filter reads in various ways. Cutadapt searches for the adapter in all reads and removes it when it finds it. Unless you use a filtering option, all reads that were present in the input file will also be present in the output file, some of them trimmed, some of them not. Even reads that were trimmed entirely (because the adapter was found in the very beginning) are output. All of this can be changed with options in the tool form above.

The tool is based on the **Open Source** Cutadapt_ tool. See the complete `Cutadapt documentation`_ for additional details. If you use Cutadapt, please cite *Martin, 2011* under **Citations** below.

-------------------

**Inputs**

-------------------

Input files for Cutadapt need to be:

- FASTQ.GZ, FASTQ.BZ2, FASTQ or FASTA

To trim an adapter, input the ADAPTER sequence in plain text or in a FASTA file e.g. AACCGGTT (with the characters: **$**, **^**, **...**, if anchored or linked).

    =============================================   ===================
    **Option**                                      **Sequence**
    ---------------------------------------------   -------------------
    3’ (End) Adapter                                ADAPTER
    Anchored 3’ Adapter                             ADAPTER$

    5’ (Front) Adapter                              ADAPTER
    Anchored 5’ Adapter                             ^ADAPTER

    5’ or 3’ (Both possible)                        ADAPTER

    Linked Adapter - 3' (End) only                  ADAPTER1...ADAPTER2
    Non-anchored Linked Adapter - 5' (Front) only   ADAPTER1...ADAPTER2
    =============================================   ===================

Below is an illustration of the allowed adapter locations relative to the read and depending on the adapter type:

.. image:: $PATH_TO_IMAGES/adapters.svg


-------------------

*Example: Illumina TruSeq Adapters*

-------------------

If you have reads containing Illumina TruSeq adapters, for example, follow these steps.


For Single-end reads as well as the first reads of Paired-end data:

**Read 1**

In the **3' (End) Adapters** option above, insert A + the “TruSeq Indexed Adapter” prefix that is common to all Indexed Adapter sequences, e.g. insert:

AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC


For the second reads of Paired-end data:

**Read 2**

In the **3' (End) Adapters** option above, insert the reverse complement of the “TruSeq Universal Adapter”:

AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT

The adapter sequences can be found in the document `Illumina TruSeq Adapters De-Mystified`_.

-----------

**Outputs**

-----------

- Trimmed reads

Optionally, under **Output Options** you can choose to output

    * Report
    * Info file


**Report**

Cutadapt can output per-adapter statistics if you select to output the report above.

Example::

        This is cutadapt 1.16 with Python 3.6.4

        Command line parameters: -j 1 --format=fastq -a AGATCGGAAGAGC --info-file=/tmp/tmpX0DlY1/files/000/dataset_21.dat --output=out1.fq --error-rate=0.1 --times=1 --overlap=3 input_f.fastq
        Running on 1 core
        Trimming 1 adapter with at most 10.0% errors in single-end mode ...
        Finished in 0.00 s (1426 us/read; 0.04 M reads/minute).

        === Summary ===

        Total reads processed:                       3
        Reads with adapters:                         0 (0.0%)
        Reads written (passing filters):             3 (100.0%)

        Total basepairs processed:           102 bp
        Total written (filtered):            102 bp (100.0%)

        === Adapter 1 ===

        Sequence: AGATCGGAAGAGC; Type: regular 3'; Length: 13; Trimmed: 0 times.


**Info file**

The info file contains information about the found adapters. The output is a tab-separated text file. Each line corresponds to one read of the input file.

Columns contain the following data:

    * **1st**:   Read name
    * **2nd**:   Number of errors
    * **3rd**:   0-based start coordinate of the adapter match
    * **4th**:   0-based end coordinate of the adapter match
    * **5th**:   Sequence of the read to the left of the adapter match (can be empty)
    * **6th**:   Sequence of the read that was matched to the adapter
    * **7th**:   Sequence of the read to the right of the adapter match (can be empty)
    * **8th**:   Name of the found adapter
    * **9th**:   Quality values corresponding to sequence left of the adapter match (can be empty)
    * **10th**:  Quality values corresponding to sequence matched to the adapter (can be empty)
    * **11th**:  Quality values corresponding to sequence to the right of the adapter (can be empty)

The concatenation of columns 5-7 yields the full read sequence. Column 8 identifies the found adapter. Adapters without a name are numbered starting from 1. Fields 9-11 are empty if quality values are not available. Concatenating them yields the full sequence of quality values.

If no adapter was found, the format is as follows:

     #. Read name
     #. The value -1
     #. The read sequence
     #. Quality values

When parsing the file, be aware that additional columns may be added in the future. Note also that some fields can be empty, resulting in consecutive tabs within a line.

If the ``--times`` option is set to a value greater than 1, each read can appear more than once in the info file. There will be one line for each found adapter, all with identical read names. Only for the first of those lines will the concatenation of columns 5-7 be identical to the original read sequence (and accordingly for columns 9-11). For subsequent lines, the sequences shown are the ones that were used in subsequent rounds of adapter trimming, that is, they get successively shorter.

--------------------

**More Information**

--------------------

See the excellent `Cutadapt documentation`_

.. _Cutadapt: https://cutadapt.readthedocs.io/en/stable/
.. _`Cutadapt documentation`: https://cutadapt.readthedocs.io/en/latest/index.html
.. _`Illumina TruSeq Adapters De-Mystified`: http://tucf-genomics.tufts.edu/documents/protocols/TUCF_Understanding_Illumina_TruSeq_Adapters.pdf


--------------------

**Galaxy Wrapper Development**

--------------------

Author: Lance Parsons <lparsons@princeton.edu>

    ]]></help>

    <!--
        Citation for the Cutadapt paper, shown under "Citations" in the tool form.
        The author of the paper is Marcel Martin, so the BibTeX author field is
        written in "surname, given name" order as "Martin, Marcel".
        Reference-manager leftovers (a local attachment path and a note that
        repeated the abstract) are intentionally not included.
    -->
    <citations>
        <citation type="bibtex">
@article{marcel_cutadapt_2011,
	title = {Cutadapt removes adapter sequences from high-throughput sequencing reads},
	volume = {17},
	copyright = {Authors who publish with this journal agree to the following terms:     Authors retain copyright and grant the journal right of first publication with the work simultaneously licensed under a  Creative Commons Attribution License  that allows others to share the work with an acknowledgement of the work's authorship and initial publication in this journal.   Authors  are able to enter into separate, additional contractual arrangements  for the non-exclusive distribution of the journal's published version of  the work (e.g., post it to an institutional repository or publish it in  a book), with an acknowledgement of its initial publication in this  journal.   Authors are permitted and encouraged to post their  work online (e.g., in institutional repositories or on their website)  prior to and during the submission process, as it can lead to productive  exchanges, as well as earlier and greater citation of published work  (See  The Effect of Open Access ).},
	url = {http://journal.embnet.org/index.php/embnetjournal/article/view/200},
	doi = {10.14806/ej.17.1.200},
	abstract = {When small RNA is sequenced on current sequencing machines, the resulting reads are usually longer than the RNA and therefore contain parts of the 3' adapter. That adapter must be found and removed error-tolerantly from each read before read mapping. Previous solutions are either hard to use or do not offer required features, in particular support for color space data. As an easy to use alternative, we developed the command-line tool cutadapt, which supports 454, Illumina and SOLiD (color space) data, offers two adapter trimming algorithms, and has other useful features.

Cutadapt, including its MIT-licensed source code, is available for download at http://code.google.com/p/cutadapt/},
	number = {1},
	pages = {10--12},
	urldate = {2011-08-02},
	journal = {EMBnet.journal},
	author = {Martin, Marcel},
	year = {2011},
	keywords = {Adapter removal, fastq, MicroRNA, Sequencing, Small RNA, software}
}
        </citation>
    </citations>

</tool>
author	iuc
date	Thu, 05 Mar 2020 06:45:31 -0500
parents	e4691e1589d3
children	093678460093