<!--
view smap_haplotype.xml @ 0:7d416d98d2c9 draft default tip

Uploaded
author ieguinoa
date Tue, 22 Mar 2022 13:49:39 +0000
parents
children
line wrap: on
line source
-->

<tool id="smap_haplotype" name="SMAP haplotype-sites (BETA)" version="4.5.0">
    <requirements>
        <!-- Pinned so that every job resolves the same release of the package.
             NOTE(review): 4.5.0 is assumed from this wrapper's own version
             attribute; confirm that this release exists in the dependency
             channels used by the target Galaxy instance. -->
        <requirement type="package" version="4.5.0">ngs-smap</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
        ## Runs `smap haplotype-sites` on the selected BAM files, then moves the
        ## results to the locations the <outputs> section reads them from.
        ## All shell steps are chained with && so that the first failing step
        ## stops the script and its exit code is the one Galaxy inspects.
        #import re
        #set $out_dir = 'haplotype_sites_output'
        #set $out_base = 'hs_base'
        #set $barplot_dir = 'barplots_out'
        #set $haplotype_out_dir = 'haplotype_tsv_dir'

        mkdir $out_dir &&

        ## Link every BAM and its index under a sanitised name inside one
        ## directory; that directory is the first positional argument of smap.
        mkdir alignments_dir &&
        #set $input_list = []
        #for $i, $input in enumerate($input_bams):
            #set $safename = re.sub('[^\w\-_]', '_', $input.element_identifier)
            #if $safename in $input_list:
                ## two inputs sanitise to the same name: make this one unique
                #set $safename = str($safename) + "." + str($i)
            #end if
            ## record the name, otherwise the duplicate check above never fires
            ## and `ln -sf` would silently replace the earlier sample
            #silent $input_list.append($safename)
            ln -sf '${input}' 'alignments_dir/${safename}.bam' &&
            ln -sf '${input.metadata.bam_index}' 'alignments_dir/${safename}.bai' &&
        #end for

        ## NOTE(review): -mapping_orientation and -partial keep the single-dash
        ## spelling of the original wrapper although the matching <param>
        ## elements declare them with two dashes; confirm against the smap CLI.
        smap haplotype-sites alignments_dir '$bed_sites_file' '$vcf_file'
        -mapping_orientation $mapping_orientation
        -partial $partial
        --min_mapping_quality $min_mapping_quality
        --min_read_count $min_read_count
        #if str($max_read_count).strip():
            --max_read_count '$max_read_count'
        #end if
        $no_indels
        --discrete_calls $discrete_calls
        --min_haplotype_frequency $min_haplotype_frequency
        --mask_frequency $mask_frequency
        #if str($undefined_representation).strip():
            --undefined_representation '$undefined_representation'
        #end if
        -p "\${GALAXY_SLOTS:-1}"
        --min_distinct_haplotypes $min_distinct_haplotypes
        #if str($max_distinct_haplotypes).strip():
            --max_distinct_haplotypes '$max_distinct_haplotypes'
        #end if
        --plot_type png
        --plot $plot
        #if $frequency_interval_bounds:
            ## NOTE(review): the value reaches smap as ONE quoted argument while
            ## the parameter help describes several space-separated thresholds;
            ## confirm that smap accepts this form.
            --frequency_interval_bounds '$frequency_interval_bounds'
        #end if
        #if $dosage_filter:
            --dosage_filter $dosage_filter
        #end if
        #if $locus_correctness:
            --locus_correctness $locus_correctness
        #end if
        -o $out_dir/$out_base &&

        ## Move the two single-file outputs onto their Galaxy datasets.
        ## The glob must stay outside the quotes to be expanded by the shell.
        mv $out_dir/${out_base}_coordinates.tsv '$coordinates' &&
        mv $out_dir/${out_base}_read_counts* '$read_counts' &&

        ## Barplots are moved out first so that the haplotype glob below does
        ## not also pick up the *.barplot.png files.
        mkdir $barplot_dir &&
        #if $plot != 'nothing':
            mv $out_dir/${out_base}_haplotype*.barplot.png $barplot_dir &&
        #end if
        mkdir $haplotype_out_dir &&
        mv $out_dir/${out_base}_haplotype* $haplotype_out_dir/

        ## With --plot all there is one *.frequency.histogram.png per BAM file;
        ## they are gathered in one directory so the outputs section can
        ## discover every PNG found there.
        #if $plot == 'all':
            && mkdir frequency_plots
            && mv $out_dir/*.frequency.histogram.png frequency_plots
        #end if
    ]]></command>
<inputs>
        <!-- Positional inputs of `smap haplotype-sites`: alignments, loci (BED) and variant positions (VCF). -->
        <param name="input_bams" type="data" optional="false" label="Select your alignments files" format="bam" multiple="true"/>
        <param argument="--mapping_orientation" type="select" label="Should strandedness of read mapping be considered for haplotyping?">
            <option value="ignore" selected="true">Ignore strandedness</option>
            <option value="stranded">Consider strandedness</option>
        </param>
        <param argument="--partial" type="select"
               label="Select if partial alignments should be excluded"
               help="Specify if reads are expected to be aligned at both outer positions of the locus (HiPlex, Shotgun SNPs in sliding frames) or if reads are expected to display read mapping polymorphisms along the locus (GBS, Shotgun SVs).">
            <option value="exclude" selected="true">Partially mapped reads are excluded</option>
            <option value="include">Include reads that only partially cover the locus</option>
        </param>
        <param name="bed_sites_file" type="data" optional="false" label="BED File" help="BED file" format="bed" multiple="false"/>
        <param name="vcf_file" type="data" optional="false" format="vcf" multiple="false"
               label="Variants positions File"
               help="Should be in VCFv4.2 format, containing variant positions. It should contain at least the first 9 columns listing the SNP positions; sample-specific genotype calls across the sample set are not required."/>

        <!-- Plotting -->
        <param argument="--plot" type="select" label="Select which plots are to be generated.">
            <option value="summary" selected="true">Summary (plots with information for all samples)</option>
            <option value="all">All (per sample plots)</option>
            <option value="nothing">Nothing</option>
        </param>

        <!-- Read and haplotype filtering -->
        <param argument="--undefined_representation" type="text" value="NaN" label="Value to use for non-existing or masked data"/>
        <param argument="--min_mapping_quality" type="integer" value="30" min="0" label="Minimum .bam mapping quality to retain reads for analysis"/>
        <param argument="--no_indels" type="boolean" truevalue="--no_indels" falsevalue="" checked="false"
               label="Exclude haplotypes that contain an InDel at the given SNP/SMAP positions"
               help="These reads are also ignored when evaluating the minimal read count."/>
        <param argument="--min_distinct_haplotypes" type="integer" value="0" min="0"
               label="Minimal number of distinct haplotypes per locus across all samples"
               help="Loci that do not fit this criterion are removed from the final output."/>
        <param argument="--max_distinct_haplotypes" type="text" value="inf"
               label="Maximal number of distinct haplotypes per locus across all samples"
               help="Loci that do not fit this criterion are removed from the final output."/>
        <param argument="--min_read_count" type="integer" value="0" min="0" label="Minimal total number of reads per locus per sample"/>
        <param argument="--max_read_count" type="text" value="inf"
               label="Maximal number of reads per locus per sample"
               help="The read count is calculated after filtering out the low frequency haplotypes."/>
        <param argument="--min_haplotype_frequency" type="float" value="0" min="0" max="100"
               label="Set minimal HF (in percentage) to retain the haplotype in the genotyping matrix"
               help="Haplotypes above this threshold in at least one of the samples are retained. Haplotypes that never reach this threshold in any of the samples are removed."/>
        <param argument="--mask_frequency" type="float" value="0" min="0"
               label="Mask haplotype frequency values below this threshold for individual samples to remove noise from the final output"
               help="Haplotype frequency values below this threshold are set to the undefined representation value (-u, see --undefined_representation). Haplotypes are not removed based on this value, use --min_haplotype_frequency for this purpose instead."/>

        <!-- Discrete calling -->
        <param argument="--discrete_calls" type="select" label="Discrete calls"
               help="Select dominant to transform haplotype frequency values into presence(1)/absence(0) calls per allele, or dosage to indicate the allele copy number.">
            <option value="dominant" selected="true">Dominant</option>
            <option value="dosage">Dosage</option>
        </param>
        <param argument="--frequency_interval_bounds" type="text" optional="true"
               label="Frequency interval bounds for classifying the read frequencies into discrete calls"
               help="Custom thresholds can be defined by passing one or more space-separated integers or floats which represent relative frequencies in percentage. For dominant calling, one value should be specified. For dosage calling, an even total number of four or more thresholds should be specified."/>
        <param argument="--dosage_filter" type="integer" optional="true"
               label="Mask dosage calls in the loci for which the total allele count for a given locus at a given sample differs from the defined value"
               help="For example, in diploid organisms the total allele copy number must be 2, and in tetraploids the total allele copy number must be 4 (default: no filtering)."/>
        <param argument="--locus_correctness" type="integer" optional="true" min="0" max="100"
               label="Threshold value: % of samples with locus correctness."
               help="Create a new .bed file defining only the loci that were correctly dosage called in at least the defined percentage of samples (default: no filtering)."/>
</inputs>
    <outputs>
        <!-- One element per .tsv file that the command moved into haplotype_tsv_dir/. -->
        <collection name="haplotype_frequencies" type="list" label="${tool.name} on ${on_string}: Haplotypes">
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.tsv" ext="tsv" directory="haplotype_tsv_dir/"/>
        </collection>
        <!-- Single files that the command moves directly onto these datasets. -->
        <data name="coordinates" format="tsv" label="${tool.name} on ${on_string}: Coordinates"/>
        <data name="read_counts" format="tsv" label="${tool.name} on ${on_string}: Read counts"/>
        <!-- Barplots are only collected when plotting is enabled ("summary" or "all"). -->
        <collection name="barplots" type="list" label="${tool.name} on ${on_string}: Barplots">
            <filter>plot != "nothing"</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.png" ext="png" directory="barplots_out/"/>
        </collection>
        <!-- The frequency_plots/ directory is only created by the command for the "all" plot mode. -->
        <collection name="frequencies_histograms" type="list" label="${tool.name} on ${on_string}: Frequencies histograms">
            <filter>plot == "all"</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.png" ext="png" directory="frequency_plots/"/>
        </collection>
    </outputs>
    <help><![CDATA[
**What it does**

Runs ``smap haplotype-sites`` (from the ``ngs-smap`` package) on a set of BAM
alignment files, using a BED file of loci and a VCF file of variant positions.

**Inputs**

- *Alignment files*: one or more BAM datasets. Each dataset is passed to the
  tool under a name derived from its identifier, with every character other
  than letters, digits, ``_`` and ``-`` replaced by ``_``.
- *BED file*: the loci to analyse.
- *Variant positions file*: a VCF (v4.2) file that contains at least the first
  9 columns listing the SNP positions. Sample-specific genotype calls are not
  required.

**Outputs**

- *Haplotypes*: a collection with the haplotype tables (TSV) written by the tool.
- *Coordinates*: a TSV file with the coordinates reported by the tool.
- *Read counts*: a TSV file with the read counts reported by the tool.
- *Barplots*: a collection of PNG files, produced when the plot option is
  *Summary* or *All*.
- *Frequencies histograms*: a collection of PNG files, one per BAM file,
  produced only when the plot option is *All*.

See the help text of each parameter for the filtering and discrete calling
options.
    ]]></help>
</tool>