<!-- Mercurial > repos > petr-novak > re_utils -->

<tool id="single_fastq_filtering" name="Preprocessing of FASTQ reads" version="1.0.0.3">
    <!-- Any exit code of 1 or higher is reported as a fatal error. -->
    <stdio>
        <exit_code description="Error" level="fatal" range="1:"/>
    </stdio>
    <!-- Summary of what the tool does: trimming, quality filtering,
         cutadapt filtering and sampling of FASTQ reads. -->
    <description>
        Preprocessing of FASTQ read files
        including trimming, quality filtering, cutadapt filtering and sampling
    </description>
    <!-- Software packages required by the tool. Only cutadapt carries a version
         pin (4.4); the remaining packages are requested without a version. -->
    <requirements>
        <requirement type="package">blast</requirement>
        <requirement version="4.4" type="package">cutadapt</requirement>
        <requirement type="package">bioconductor-shortread</requirement>
        <requirement type="package">r-optparse</requirement>
    </requirements>
    <!-- Files from the tool directory that must accompany the job: the shell
         wrapper invoked by the command and the R script listed next to it. -->
    <required_files>
        <include path="single_fastq_filtering_wrapper.sh" type="literal"/>
        <include path="single_fastq_filtering.R" type="literal"/>
    </required_files>
    <!--
        Command template (Cheetah). Runs single_fastq_filtering_wrapper.sh from the
        tool directory with:
          -a  input dataset A            -o  FASTA output dataset
          -c  cut_off                    -p  percent_above
          -N  max_n                      -G  PNG output dataset
        Optional arguments are appended only when the matching conditional is on:
          -n  sample size                (sampling)
          -e / -s  trim end / start      (trimming)
          -C  custom cutadapt options    (cutadapt)
          -F  filter database dataset    (similarity_filtering)
        Dataset paths are single-quoted so that a path containing whitespace or
        shell metacharacters is still passed as one argument. The template lives in
        a CDATA section so shell characters never need XML escaping.
    -->
    <command><![CDATA[
        bash '$__tool_directory__'/single_fastq_filtering_wrapper.sh
        -a '${A}'
        -o '${output}'
        -c ${cut_off}
        -p ${percent_above}
        -N ${max_n}
        -G '${png_output}'

        #if $sampling.sequence_sampling:
            -n ${sampling.sample_size}
        #end if

        #if $trimming.sequence_trimming:
            -e ${trimming.trim_end} -s ${trimming.trim_start}
        #end if

        #if $cutadapt.use_custom:
            -C "${cutadapt.custom_options}"
        #end if

        #if $similarity_filtering.include:
            -F '${similarity_filtering.filter_database}'
        #end if
    ]]></command>

    <!--
        User-facing parameters. Each conditional exposes its nested parameters only
        when its boolean switch is "true"; the command template reads the same
        names (A, sampling.*, cut_off, percent_above, trimming.*, max_n,
        cutadapt.*, similarity_filtering.*).
    -->
    <inputs>
        <param format="fastq,fastq.gz" type="data" name="A"
               label="Reads in FASTQ format"/>

        <!-- Optional random sampling of the filtered reads. -->
        <conditional name="sampling">
            <param name="sequence_sampling" type="boolean" truevalue="true"
                   falsevalue="false" checked="False" label="Read sampling"/>
            <when value="false">
                <!-- do nothing here -->
            </when>
            <when value="true">
                <param name="sample_size" type="integer"
                       label="Sample size (number of reads)"
                       help="How many reads should be sampled" value="500000" min="0"/>
            </when>
        </conditional>

        <param type="integer" name="cut_off" label="Quality cutoff" value="10" min="0"
               help="See below how to correctly set the quality cutoff"/>
        <!-- A percentage, so it is bounded to 0-100. -->
        <param type="integer" name="percent_above" label="Percent above cutoff" value="95"
               min="0" max="100"
               help="Percentage of bases in the read that must have quality equal to or higher than the cutoff value"/>

        <!-- Optional trimming of reads to a start/end position. -->
        <conditional name="trimming">
            <param name="sequence_trimming" type="boolean" truevalue="true"
                   falsevalue="false" checked="False" label="Trim reads"/>
            <when value="false">
                <!-- do nothing here -->
            </when>
            <when value="true">
                <param type="integer" name="trim_start" label="Start position" value="1"
                       min="1"
                       help="Reads are trimmed at the specified start"/>
                <param type="integer" name="trim_end" label="End position" value="100"
                       min="1"
                       help="Reads are trimmed to the specified end position, shorter sequences are discarded"/>
            </when>

        </conditional>
        <param name="max_n" type="integer" label="maximum Ns"
               help="Maximal number of Ns allowed in reads" value="0" min="0" max="10"/>

        <!-- Optional free-text cutadapt options. -->
        <conditional name="cutadapt">
            <param name="use_custom" type="boolean" truevalue="true" falsevalue="false"
                   checked="False" label="Custom cutadapt options"/>
            <when value="false">
                <!-- do nothing here -->
            </when>
            <when value="true">
                <param name="custom_options" type="text" area="True" size="8x30"
                       label="Custom options" help="Consult cutadapt for usage" value="">
                    <!--
                        The value is interpolated inside double quotes on the command
                        line. Characters that could close that quoted argument or
                        start a command substitution (double quote, backtick,
                        backslash, parentheses) are dropped; all other printable
                        characters pass through unchanged.
                    -->
                    <sanitizer invalid_char="">
                        <valid initial="string.printable">
                            <remove value="&quot;"/>
                            <remove value="`"/>
                            <remove value="\"/>
                            <remove value="("/>
                            <remove value=")"/>
                        </valid>
                    </sanitizer>
                </param>
            </when>
        </conditional>

        <!-- Optional removal of reads similar to a user-supplied FASTA database. -->
        <conditional name="similarity_filtering">
            <param name="include" type="boolean" truevalue="true" falsevalue="false"
                   checked="False" label="Use similarity search filtering"/>
            <when value="false">
                <!-- do nothing here -->
            </when>
            <when value="true">

                <param name="filter_database" format="fasta" type="data"
                       label="Sequence filter database"
                       help="Provide DNA sequences in FASTA format. Reads that have at least 90% similarity over 90% of their length to sequence in the filter database will be removed. This option is suitable for removing organellar or other contaminating sequences."/>
            </when>
        </conditional>

    </inputs>


    <!-- Output datasets: the filtered reads as FASTA and a PNG plot of the
         nucleotide composition; both labels reference the input's history id. -->
    <outputs>
        <data format="fasta" name="output"
              label="Filtered FASTA reads from datasets ${A.hid}"/>
        <data format="png" name="png_output"
              label="Nucleotide composition after filtering of ${A.hid}"/>
    </outputs>

    <!-- Functional test: default quality settings on a gzipped FASTQ sample,
         with both outputs compared against stored reference files. -->
    <tests>
        <test>
            <param value="ERR215189_1_part.fastq.gz" name="A"/>
            <param value="10" name="cut_off"/>
            <param value="95" name="percent_above"/>
            <param value="0" name="max_n"/>
            <output value="single_output.fasta" name="output"/>
            <output value="single_output.png" name="png_output"/>
        </test>
    </tests>

    <help>
        **What it does**

        This tool is designed to perform preprocessing of a FASTQ file. Input files can be
        gzip-compressed (.gz extension). Reads are filtered based on quality,
        presence of N bases and adapters. All reads which pass the quality filter will
        be written into the output file. If sampling is specified, only a sample of the
        sequences will be returned.

        Cutadapt is run with these options::

            --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
            --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
            --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
            --anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
            --anywhere='CAAGCAGAAGACGGCATACGAGAT'
            --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
            --error-rate=0.05
            --times=1 --overlap=15 --discard


        **Order of fastq files processing**

        1. Trimming (optional)
        #. Filter by quality
        #. Cutadapt filtering
        #. Sampling (optional)
        #. Interlacing two fasta files

        **Quality setting cutoff**

        To correctly set quality cutoff, you need to know how the quality is encoded in
        your fastq file, default
        filtering which is suitable for Sanger and Illumina 1.8 encoding is shown below::


                       Default filtering cutoff
                       |
                       |
                       V
             SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
             ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
             ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
             .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
             LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
             !"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
             |                         |    |        |                              |                     |
            33                        59   64       73                            104                   126
             0........................26...31.......40
                                      -5....0........9.............................40
                                            0........9.............................40
                                               3.....9.............................40
             0.2......................26...31........41

             S - Sanger        Phred+33,  raw reads typically (0, 40)
             X - Solexa        Solexa+64, raw reads typically (-5, 40)
             I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
             J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40)
                 with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
                 (Note: See discussion above).
             L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)

    </help>
</tool>
<!--
author	petr-novak
date	Fri, 04 Aug 2023 08:09:40 +0000
parents	58807b35777a
children
-->