view stacks_refmap.xml @ 4:827c8c6ec4c1 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/stacks commit 643293a896a2ccfac10fe995f48c7f01c1a89a7f
author iuc
date Mon, 26 Sep 2016 11:45:14 -0400
parents 26ad8a52d9fd
children 12aa3ad29980
line wrap: on
line source

<tool id="stacks_refmap" name="Stacks: reference map" version="@WRAPPER_VERSION@.0">
    <!-- One-line summary of what this wrapper runs. -->
    <description>the Stacks pipeline with a reference genome (ref_map.pl)</description>
    <!-- macros.xml supplies the macros and @TOKENS@ expanded throughout this file. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Both macros are defined in macros.xml (not visible here); presumably they declare
         the tool dependencies and the exit-code / stderr handling. TODO confirm. -->
    <expand macro="requirements"/>
    <expand macro="stdio"/>
    <command><![CDATA[
        #from os.path import splitext
        #import re

        ## Each input dataset is symlinked into the working directory under a file
        ## name built from its element identifier: the last extension and a trailing
        ## ".1" are dropped, then ".sam" or ".bam" is appended according to the
        ## dataset type. Those file names are what ref_map.pl receives.
        ##
        ## The names are computed once, while linking, and remembered in the lists
        ## below so that the symlinks and the ref_map.pl arguments cannot disagree.
        ##
        ## Element identifiers are user-controlled text that ends up inside a
        ## double-quoted shell word, so every character other than word characters,
        ## "-" and "." is replaced by "_" (quotes, dollar signs, backticks, slashes,
        ## spaces...). Identifiers made only of safe characters are left unchanged.
        ## NOTE(review): names listed in the population map presumably have to match
        ## the resulting file names; confirm against the Stacks manual.
        #set $parent_files = []
        #set $progeny_files = []
        #set $sample_files = []

        #if str( $options_usage.rad_analysis_type ) == "genetic":
            #for $input_parent in $options_usage.parent_alignments:
                #if $input_parent.ext == "sam":
                    #set $link_ext = ".sam"
                #else:
                    #set $link_ext = ".bam"
                #end if
                #set $data_path = splitext($input_parent.element_identifier)[0]
                #set $data_path = re.sub(r'\.1$', '', $data_path)
                #set $data_path = re.sub(r'[^\w\-\.]', '_', $data_path) + $link_ext
                #silent $parent_files.append($data_path)

                ln -s "${input_parent}" "${data_path}" &&
            #end for

            #for $input_progeny in $options_usage.progeny_alignments:

                ## The progeny parameter is optional, so guard against an empty entry
                #if $input_progeny:
                    #if $input_progeny.ext == "sam":
                        #set $link_ext = ".sam"
                    #else:
                        #set $link_ext = ".bam"
                    #end if
                    #set $data_path = splitext($input_progeny.element_identifier)[0]
                    #set $data_path = re.sub(r'\.1$', '', $data_path)
                    #set $data_path = re.sub(r'[^\w\-\.]', '_', $data_path) + $link_ext
                    #silent $progeny_files.append($data_path)

                    ln -s "${input_progeny}" "${data_path}" &&
                #end if
            #end for
        #else:
            #for $input_indiv in $options_usage.individual_sample:

                #if $input_indiv.ext == "sam":
                    #set $link_ext = ".sam"
                #else:
                    #set $link_ext = ".bam"
                #end if
                #set $data_path = splitext($input_indiv.element_identifier)[0]
                #set $data_path = re.sub(r'\.1$', '', $data_path)
                #set $data_path = re.sub(r'[^\w\-\.]', '_', $data_path) + $link_ext
                #silent $sample_files.append($data_path)

                ln -s "${input_indiv}" "${data_path}" &&
            #end for
        #end if

        mkdir stacks_outputs

        &&

        ref_map.pl

            -T \${GALAXY_SLOTS:-1}

            ## Pass the linked files in the same order as they were linked above
            #if str( $options_usage.rad_analysis_type ) == "genetic":
                #for $parent_file in $parent_files:
                    -p "${parent_file}"
                #end for

                -A $options_usage.cross_type

                #for $progeny_file in $progeny_files:
                    -r "${progeny_file}"
                #end for
            #else:
                #for $sample_file in $sample_files:
                    -s "${sample_file}"
                #end for
                -O "$options_usage.popmap"
            #end if

            #if str($m):
                -m $m
            #end if
            #if str($P):
                -P $P
            #end if

            ## Batch description
            -b 1

            ## No SQL recording
            -S

            ## snp_model
            #if str( $snp_options.select_model.model_type) == "bounded":
                --bound_low $snp_options.select_model.bound_low
                --bound_high $snp_options.select_model.bound_high
                --alpha $snp_options.select_model.alpha
            #else if str( $snp_options.select_model.model_type) == "snp":
                --alpha $snp_options.select_model.alpha
            #end if

            -o stacks_outputs

            #if str( $options_usage.rad_analysis_type ) == "genetic":
                @NORM_GENOTYPES_OUTPUT_LIGHT@
            #end if

            ## If input is in bam format, stacks will output gzipped files (no option to control this)
            && if ls stacks_outputs/*.gz > /dev/null 2>&1; then gunzip stacks_outputs/*.gz; fi
    ]]></command>

    <inputs>
        <!-- Usage mode: selects which alignment inputs are requested from the user -->
        <conditional name="options_usage">
            <param name="rad_analysis_type" type="select" label="Select your usage">
                <option value="genetic" selected="true">Genetic map</option>
                <option value="population">Population</option>
            </param>

            <!-- Genetic map: parent alignments, optional progeny alignments and a cross type -->
            <when value="genetic">
                <param
                    name="parent_alignments"
                    argument="-p"
                    type="data"
                    format="sam,bam"
                    multiple="true"
                    label="Files containing parent alignments"
                    help="Dataset names will be used as sample name (no space allowed)." />
                <param
                    name="progeny_alignments"
                    argument="-r"
                    type="data"
                    format="sam,bam"
                    multiple="true"
                    optional="true"
                    label="Files containing progeny alignments"
                    help="Dataset names will be used as sample name (no space allowed)." />

                <param name="cross_type" argument="-A" type="select" label="Cross type">
                    <expand macro="cross_types"/>
                </param>
            </when>

            <!-- Population: one alignment file per individual plus a population map -->
            <when value="population">
                <param
                    name="individual_sample"
                    argument="-s"
                    type="data"
                    format="sam,bam"
                    multiple="true"
                    label="Files containing an individual sample from a population"
                    help="Dataset names will be used as sample name (no space allowed)." />
                <param
                    name="popmap"
                    argument="-O"
                    type="data"
                    format="tabular,txt"
                    label="Specify a population map" />
            </when>
        </conditional>

        <!-- Coverage thresholds; P is optional and empty by default -->
        <param
            name="m"
            argument="-m"
            type="integer"
            value="3"
            label="Minimum depth of coverage"
            help="specify the minimum depth of coverage to report a stack in pstacks" />
        <param
            name="P"
            argument="-P"
            type="integer"
            value=""
            optional="true"
            label="Minimum depth of coverage in 'progeny' individuals"
            help="specify the minimum depth of coverage to report a stack in pstacks for 'progeny' individuals" />

        <!-- SNP model settings, supplied by the snp_options macro -->
        <section name="snp_options" title="SNP_Model_Options" expanded="False">
            <expand macro="snp_options"/>
        </section>
    </inputs>
    <outputs>
        <!-- Pipeline log -->
        <data
            name="output_log"
            format="txt"
            label="ref_map.log with ${tool.name} on ${on_string}"
            from_work_dir="stacks_outputs/ref_map.log" />

        <!-- Catalog files of batch 1 -->
        <data
            name="catalogtags"
            format="tabular"
            label="Catalog assembled loci (tags) with ${tool.name} on ${on_string}"
            from_work_dir="stacks_outputs/batch_1.catalog.tags.tsv" />
        <data
            name="catalogsnps"
            format="tabular"
            label="Catalog model calls (snps) with ${tool.name} on ${on_string}"
            from_work_dir="stacks_outputs/batch_1.catalog.snps.tsv" />
        <data
            name="catalogalleles"
            format="tabular"
            label="Catalog haplotypes (alleles) with ${tool.name} on ${on_string}"
            from_work_dir="stacks_outputs/batch_1.catalog.alleles.tsv" />

        <!-- Genotypes / populations outputs come from macros.xml -->
        <expand macro="genotypes_output_light"/>
        <expand macro="populations_output_light"/>

        <!-- One list collection per file family found in stacks_outputs; the element name is the file name without ".tsv" -->
        <collection name="tags" type="list" label="Assembled loci (tags) from ${on_string}">
            <discover_datasets
                pattern="(?P&lt;name&gt;.+\.tags)\.tsv$"
                ext="tabular"
                directory="stacks_outputs" />
        </collection>

        <collection name="snps" type="list" label="Model calls (snps) from each locus on ${on_string}">
            <discover_datasets
                pattern="(?P&lt;name&gt;.+\.snps)\.tsv$"
                ext="tabular"
                directory="stacks_outputs" />
        </collection>

        <collection name="alleles" type="list" label="Haplotypes (alleles) recorded from each locus on ${on_string}">
            <discover_datasets
                pattern="(?P&lt;name&gt;.+\.alleles)\.tsv$"
                ext="tabular"
                directory="stacks_outputs" />
        </collection>

        <collection name="matches" type="list" label="Matches to the catalog on ${on_string}">
            <discover_datasets
                pattern="(?P&lt;name&gt;.+\.matches)\.tsv$"
                ext="tabular"
                directory="stacks_outputs" />
        </collection>

        <!-- Everything at once: the per-sample files, the summary tables and the genotypes .loc/.txt files -->
        <collection name="all_output" type="list" label="Full output from ref_map on ${on_string}">
            <discover_datasets
                pattern="(?P&lt;name&gt;.+\.(tags|snps|alleles|matches))\.tsv$"
                ext="tabular"
                directory="stacks_outputs" />
            <discover_datasets
                pattern="(?P&lt;name&gt;.+\.(haplotypes|genotypes|markers|hapstats|sumstats|sumstats_summary))\.tsv$"
                ext="tabular"
                directory="stacks_outputs" />
            <discover_datasets
                pattern="(?P&lt;name&gt;.+\.(genotypes))\.(loc|txt)$"
                ext="txt"
                directory="stacks_outputs" />
        </collection>
    </outputs>

    <tests>
        <test>
            <!-- Genetic map usage with a single parent alignment and no progeny -->
            <param name="options_usage|rad_analysis_type" value="genetic"/>
            <param name="options_usage|parent_alignments" value="refmap/PopA_01.bam" />
            <output name="output_log">
                <assert_contents>
                    <has_text text="ref_map.pl completed" />
                </assert_contents>
            </output>

            <!-- catalog -->
            <output name="catalogsnps">
                <assert_contents>
                    <has_text text="catalog generated" />
                </assert_contents>
            </output>
            <output name="catalogalleles">
                <assert_contents>
                    <has_text text="catalog generated" />
                </assert_contents>
            </output>
            <output name="catalogtags">
                <assert_contents>
                    <has_text text="catalog generated" />
                </assert_contents>
            </output>

            <!-- genotypes: each output is declared once; all of its assertions live in the same element -->
            <output name="out_generic_haplo">
                <assert_contents>
                    <has_text text="Catalog ID" />
                    <has_text text="Seg Dist" />
                </assert_contents>
            </output>
            <output name="out_sql_markers">
                <assert_contents>
                    <has_text text="Total Genotypes" />
                </assert_contents>
            </output>
            <output name="out_joinmap">
                <assert_contents>
                    <has_text text="batch_1.genotypes_" />
                </assert_contents>
            </output>
            <output name="out_sql_genotypes">
                <assert_contents>
                    <has_text text="SQL ID" />
                </assert_contents>
            </output>

            <!-- samples -->
            <output_collection name="tags">
                <element name="PopA_01.tags">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="snps">
                <element name="PopA_01.snps">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="alleles">
                <element name="PopA_01.alleles">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="matches">
                <element name="PopA_01.matches">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test>
            <!-- Genetic map usage with one parent alignment and one progeny alignment -->
            <param name="options_usage|rad_analysis_type" value="genetic"/>
            <param name="options_usage|parent_alignments" value="refmap/PopA_01.bam" />
            <param name="options_usage|progeny_alignments" value="refmap/PopA_02.bam" />
            <output name="output_log">
                <assert_contents>
                    <has_text text="ref_map.pl completed" />
                </assert_contents>
            </output>

            <!-- catalog -->
            <output name="catalogsnps">
                <assert_contents>
                    <has_text text="catalog generated" />
                </assert_contents>
            </output>
            <output name="catalogalleles">
                <assert_contents>
                    <has_text text="catalog generated" />
                </assert_contents>
            </output>
            <output name="catalogtags">
                <assert_contents>
                    <has_text text="catalog generated" />
                </assert_contents>
            </output>

            <!-- genotypes: each output is declared once; all of its assertions live in the same element -->
            <output name="out_generic_haplo">
                <assert_contents>
                    <has_text text="Catalog ID" />
                    <has_text text="Seg Dist" />
                </assert_contents>
            </output>
            <output name="out_sql_markers">
                <assert_contents>
                    <has_text text="Total Genotypes" />
                </assert_contents>
            </output>
            <output name="out_joinmap">
                <assert_contents>
                    <has_text text="batch_1.genotypes_" />
                </assert_contents>
            </output>
            <output name="out_sql_genotypes">
                <assert_contents>
                    <has_text text="SQL ID" />
                </assert_contents>
            </output>

            <!-- samples -->
            <output_collection name="tags">
                <element name="PopA_01.tags">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="snps">
                <element name="PopA_01.snps">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="alleles">
                <element name="PopA_01.alleles">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="matches">
                <element name="PopA_01.matches">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test>
            <!-- Population usage: eight individuals from two populations plus a population map -->
            <param name="options_usage|rad_analysis_type" value="population"/>
            <param name="options_usage|individual_sample" value="refmap/PopA_01.bam,refmap/PopA_02.bam,refmap/PopA_03.bam,refmap/PopA_04.bam,refmap/PopB_01.bam,refmap/PopB_02.bam,refmap/PopB_03.bam,refmap/PopB_04.bam" />
            <param name="options_usage|popmap" value="denovo_map/popmap.tsv" />

            <!-- the pipeline must report that it ran to the end -->
            <output name="output_log">
                <assert_contents>
                    <has_text text="ref_map.pl completed" />
                </assert_contents>
            </output>

            <!-- per-sample collections, checked on the first individual -->
            <output_collection name="tags">
                <element name="PopA_01.tags">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="snps">
                <element name="PopA_01.snps">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="alleles">
                <element name="PopA_01.alleles">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="matches">
                <element name="PopA_01.matches">
                    <assert_contents>
                        <has_text text="generated on " />
                    </assert_contents>
                </element>
            </output_collection>

            <!-- catalog files -->
            <output name="catalogtags">
                <assert_contents>
                    <has_text text="catalog generated on" />
                </assert_contents>
            </output>
            <output name="catalogsnps">
                <assert_contents>
                    <has_text text="catalog generated on" />
                </assert_contents>
            </output>
            <output name="catalogalleles">
                <assert_contents>
                    <has_text text="catalog generated on" />
                </assert_contents>
            </output>

            <!-- outputs of the populations step -->
            <output name="out_populations_log">
                <assert_contents>
                    <has_text text="populations version" />
                </assert_contents>
            </output>
            <output name="out_haplotypes">
                <assert_contents>
                    <has_text text="PopA_01" />
                </assert_contents>
            </output>
            <output name="out_hapstats">
                <assert_contents>
                    <has_text text="Smoothed Gene Diversity" />
                </assert_contents>
            </output>
            <output name="out_sumstats">
                <assert_contents>
                    <has_text text="Smoothed Pi" />
                </assert_contents>
            </output>
            <output name="out_sumstats_sum">
                <assert_contents>
                    <has_text text="Polymorphic Sites" />
                </assert_contents>
            </output>
        </test>
    </tests>

    <help>
<![CDATA[
.. class:: infomark

**What it does**

This program expects data that have been aligned to a reference genome, and can accept data directly from Bowtie, or from any aligner that can produce SAM format. To avoid dataset naming problems, we recommend using the *Map with BWA for STACKS tool*. This program will execute each of the Stacks components: first, running pstacks on each of the samples specified, building loci (based on the reference alignment) and calling SNPs in each. Second, cstacks will be run to create a catalog of all loci specified as 'parents' or 'samples' on the command line, again using alignment to match loci in the catalog. Finally, sstacks will be executed to match each sample against the catalog. The ref_map.pl program will also load the results of each stage of the analysis: individual loci, the catalog, and matches against the catalog into the database (although this can be disabled). After matching, the program will build a database index to speed up access (index_radtags.pl) and enable web-based filtering.

--------

**Input files**

- SAM, BAM

- Population map::

    indv_01    1
    indv_02    1
    indv_03    1
    indv_04    2
    indv_05    2
    indv_06    2

**Output files**

- XXX.tags.tsv file:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_

Notes: For the tags file, each stack will start in the file with a consensus sequence for the entire stack followed by the flags for that stack. Then, each individual read that was merged into that stack will follow. The next stack will start with another consensus sequence.


- XXX.snps.tsv file:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_

Notes: If a stack has two SNPs called within it, then there will be two lines in this file listing each one.


- XXX.alleles.tsv file:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_


- XXX.matches.tsv file:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_

Notes: Each line in this file records a match between a catalog locus and a locus in an individual, for a particular haplotype. The Batch ID plus the Catalog ID together represent a unique locus in the entire population, while the Sample ID and the Stack ID together represent a unique locus in an individual sample.


- other files:

See `Stacks output description <http://catchenlab.life.illinois.edu/stacks/manual/#files>`_

@STACKS_INFOS@
]]>
    </help>
    <expand macro="citation" />
</tool>