view instrain_profile.xml @ 1:4b0418b1f58b draft

"planemo upload for repository https://github.com/MrOlm/inStrain commit 0b4dfba748b795669e01561d6ebf5f551201e503"
author iuc
date Tue, 09 Nov 2021 15:14:48 +0000
parents 1f3730540302
children c332659ecdcc
line wrap: on
line source

<tool id="instrain_profile" name="InStrain Profile" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!-- Short summary of what the tool does. -->
    <description>Creates an inStrain profile (microdiversity analysis) from a mapping file </description>
    <!-- Shared definitions are imported from macros.xml (not visible here); the
         @TOKEN@ placeholders on the tool element and the macros expanded below
         presumably come from that file. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="edam_ontology"/>
    <!-- Expands the "requirements" macro and hands it one extra requirement, zip 3.0;
         the command section calls zip to archive the inStrain.IS directory. -->
    <expand macro="requirements">
        <requirement type="package" version="3.0">zip</requirement>
    </expand>
    <!-- Command whose output is recorded as the version of the wrapped program. -->
    <version_command>inStrain profile --version</version_command>
    <command detect_errors="exit_code"><![CDATA[
## Stage every input under a stable name whose extension inStrain can interpret,
## run the profile, then zip the resulting inStrain.IS directory.
## All parameters that live inside a section are referenced by their fully
## qualified name (section.param).
#set ext=$mapping_input.datatype.file_ext
ln -s '$mapping_input' 'inputbam.$ext'
#if $gene_profiling.gene_file
## inStrain chooses the gene-file parser from the file extension, so a GenBank
## dataset must not be linked with the .fna extension.
    #if $gene_profiling.gene_file.is_of_type('genbank')
        #set gene_link = 'gene_file.gb'
    #else
        #set gene_link = 'gene_file.fna'
    #end if
&&
ln -s '$gene_profiling.gene_file' '$gene_link'
#end if
#if $read_filtering.priority_reads
## Compression of the priority reads is detected from a .gz suffix, which the
## raw Galaxy dataset path does not carry.
    #if $read_filtering.priority_reads.ext.endswith('.gz')
        #set priority_link = 'priority_reads.fastq.gz'
    #else
        #set priority_link = 'priority_reads.fastq'
    #end if
&&
ln -s '$read_filtering.priority_reads' '$priority_link'
#end if
#if $stb
&&
ln -s '$stb' 'stb_file.stb'
#end if
&&
inStrain profile
    'inputbam.$ext'
    '$sequence_input'
    --output 'inStrain.IS'
    $use_full_fasta_header
    --processes "\${GALAXY_SLOTS:-6}"
    --min_mapq $read_filtering.min_mapq
    --max_insert_relative $read_filtering.max_insert_relative
    --min_insert $read_filtering.min_insert
    --pairing_filter '$read_filtering.pairing_filter'
#if $read_filtering.priority_reads
    --priority_reads '$priority_link'
#end if
    $output.detailed_mapping_info
    --min_cov $variant_calling.min_cov
    --min_freq $variant_calling.min_freq
    --fdr $variant_calling.fdr
#if $gene_profiling.gene_file
    --gene_file '$gene_link'
#end if
#if $stb
    --stb 'stb_file.stb'
#end if
    $mm_level
## Outside database mode the read ANI cutoff, genome coverage cutoff and the
## mm-profiling switch are passed explicitly; in database mode only the
## database_mode flag is passed.
#if $profile.database_mode
    $profile.database_mode
#else
    --min_read_ani $read_filtering.min_read_ani
    --min_genome_coverage $profile.min_genome_coverage
    $skip_mm_profiling
#end if
    --min_scaffold_reads $profile.min_scaffold_reads
    --min_snp $profile.min_snp
    $profile.store_everything
#if $profile.scaffolds_to_profile
    --scaffolds_to_profile '$profile.scaffolds_to_profile'
#end if
    --rarefied_coverage $profile.rarefied_coverage
    --window_length $profile.window_length
    $output.skip_genome_wide
    $output.skip_plot_generation
&&
cd ./inStrain.IS && zip -r ../inStrain.IS.zip *
    ]]></command>
    <inputs>
        <!-- Required inputs: the read mapping and the reference it was mapped against. -->
        <param name="mapping_input" type="data" format="bam,sam" label="A file containing metagenomic reads mapped to a DNA sequence" help="Sorted BAM/SAM file"/>
        <param name="sequence_input" type="data" format="fasta" label="A file containing a DNA sequence."/>
        <param argument="--use_full_fasta_header" type="boolean" truevalue="--use_full_fasta_header" falsevalue="" checked="false" label="Use full fasta header" help="Instead of using the fasta ID (the part of the header before the first space), use the full header. Needed for some mapping tools (including bbMap)"/>
        <section name="read_filtering" title="Read Filtering" expanded="true">
            <param argument="--min_read_ani" type="float" value="0.95" min="0" max="1" label="Minimum percent identity" help="Minimum percent identity of read pairs to consensus to use the reads. Must be >, not >="/>
            <param argument="--min_mapq" type="integer" value="-1" label="Minimum mapq score" help="Minimum mapq score of EITHER read in a pair to use that pair. Must be >, not >="/>
            <param argument="--max_insert_relative" type="integer" value="3" label="Maximum insert relative" help="Multiplier to determine maximum insert size between two reads - default is to use 3x median insert size. Must be >, not >="/>
            <param argument="--min_insert" type="integer" value="50" label="Minimum insert" help="Minimum insert size between two reads - default is 50 bp. If two reads are 50bp each and overlap completely, their insert will be 50. Must be >, not >="/>
            <param argument="--pairing_filter" type="select" label="How should paired reads be handled?">
                <option value="paired_only" selected="true">Only paired reads are retained</option>
                <option value="non_discordant">Keep all paired reads and singleton reads that map to a single scaffold</option>
                <option value="all_reads">Keep all reads regardless of pairing status (NOT RECOMMENDED; See documentation for details)</option>
            </param>
            <param argument="--priority_reads" type="data" format="fastqsanger,fastqsanger.gz" optional="true" label="The location of a list of reads that should be retained regardless of pairing status" help="For example long reads or merged reads. This can be a .fastq file or text file with list of read names (will assume file is compressed if ends in .gz)"/>
        </section>
        <section name="variant_calling" title="Variant Calling" expanded="true">
            <param argument="--min_cov" type="integer" value="5" label="Minimum coverage" help="Minimum coverage to call a variant"/>
            <param argument="--min_freq" type="float" value="0.05" label="Minimum SNP frequency" help="Minimum SNP frequency to confirm a SNV (both this AND the FDR snp count cutoff must be true to call a SNP)."/>
            <param argument="--fdr" type="float" value="1e-06" min="0" max="1" label="FDR" help="SNP false discovery rate- based on simulation data with a 0.1 percent error rate (Q30)"/>
        </section>
        <section name="gene_profiling" title="Gene Profiling" expanded="true">
            <param argument="--gene_file" type="data" format="fasta,genbank" optional="true" label="Path to prodigal .fna genes file. If file ends in .gb or .gbk, will treat as a genbank file" help="EXPERIMENTAL; the name of the gene must be in the gene qualifier"/>
        </section>
        <!-- The next three parameters are not inside a section, so the command
             template refers to them without a section prefix. -->
        <param argument="--stb" type="data" format="tabular" optional="true" label="Scaffold to bin" help="This can be a file with each line listing a scaffold and a bin name, tab-separated. This can also be a space-separated list of .fasta files, with one genome per .fasta file. If nothing is provided, all scaffolds will be treated as belonging to the same genome"/>
        <param argument="--mm_level" type="boolean" truevalue="--mm_level" falsevalue="" checked="false" label="Create output files on the mm level"/>
        <param argument="--skip_mm_profiling" type="boolean" truevalue="--skip_mm_profiling" falsevalue="" checked="false" label="Skip mm profiling" help="Don't perform analysis on an mm level; saves RAM and time; impacts plots and raw_data"/>
        <section name="profile" title="Profile" expanded="true">
            <param argument="--database_mode" type="boolean" truevalue="--database_mode" falsevalue="" checked="false" label="Database mode" help="Set a number of parameters to values appropriate for mapping to a large fasta file."/>
            <param argument="--min_scaffold_reads" type="integer" value="1" label="Minimum scaffold reads" help="Minimum number of reads mapping to a scaffold to proceed with profiling it"/>
            <param argument="--min_genome_coverage" type="integer" value="0" label="Minimum genome coverage" help="Minimum number of reads mapping to a genome to proceed with profiling it. A scaffold-to-bin (.stb) file MUST be provided if this is set"/>
            <param argument="--min_snp" type="integer" value="20" label="Minimum SNP" help="Absolute minimum number of reads connecting two SNPs to calculate LD between them."/>
            <param argument="--store_everything" type="boolean" truevalue="--store_everything" falsevalue="" checked="false" label="Store everything" help="Store intermediate dictionaries in the pickle file; will result in significantly more RAM and disk usage"/>
            <param argument="--scaffolds_to_profile" type="data" format="fasta" optional="true" label="Scaffolds to profile" help="File containing a list of scaffolds to profile- if provided will ONLY profile those scaffolds"/>
            <param argument="--rarefied_coverage" type="integer" value="50" label="Rarefied coverage" help="When calculating nucleotide diversity, also calculate a rarefied version with this much coverage"/>
            <param argument="--window_length" type="integer" value="10000" label="Window length" help="Break scaffolds into windows of this length when profiling"/>
        </section>
        <section name="output" title="Set Output Parameters" expanded="true">
            <param argument="--detailed_mapping_info" type="boolean" truevalue="--detailed_mapping_info" falsevalue="" checked="false" label="Detailed mapping info" help="Make a detailed read report indicating details about each individual mapped read"/>
            <param argument="--skip_genome_wide" type="boolean" truevalue="--skip_genome_wide" falsevalue="" checked="false" label="Skip genome wide" help="Do not generate tables that consider groups of scaffolds belonging to genomes"/>
            <param argument="--skip_plot_generation" type="boolean" truevalue="--skip_plot_generation" falsevalue="" checked="false" label="Skip plot generation" help="Do not make plots"/>
        </section>
    </inputs>
    <outputs>
        <!-- Zip archive of the whole inStrain.IS directory, written by the last step of the command. -->
        <data name="inStrain_zip" format="zip" from_work_dir="inStrain.IS.zip" label="inStrain Profile IS zip"/>
        <!-- Individual tables collected from inStrain.IS/output. -->
        <data name="scaffold_info" format="tabular" from_work_dir="inStrain.IS/output/inStrain.IS_scaffold_info.tsv" label="Scaffold Info, This gives basic information about the scaffolds in your sample at the highest allowed level of read identity."/>
        <data name="mapping_info" format="tabular" from_work_dir="inStrain.IS/output/inStrain.IS_mapping_info.tsv" label="Mapping Info, This provides an overview of the number of reads that map to each scaffold, and some basic metrics about their quality."/>
        <data name="SNVs" format="tabular" from_work_dir="inStrain.IS/output/inStrain.IS_SNVs.tsv" label="SNV, This describes the SNVs and SNSs that are detected in this mapping."/>
        <data name="linkage" format="tabular" from_work_dir="inStrain.IS/output/inStrain.IS_linkage.tsv" label="Linkage, This describes the linkage between pairs of SNPs in the mapping that are found on the same read pair at least min_snp times."/>
        <data name="gene_info" format="tabular" from_work_dir="inStrain.IS/output/inStrain.IS_gene_info.tsv" label="Gene Info, This describes some basic information about the genes being profiled"/>
        <!-- Only created when the "Skip genome wide" option is left unchecked. -->
        <data name="genome_info" format="tabular" from_work_dir="inStrain.IS/output/inStrain.IS_genome_info.tsv" label="Genome Info, This describes many of the above metrics on a genome-by-genome level, rather than a scaffold-by-scaffold level.">
            <filter>(output['skip_genome_wide'] is False)</filter>
        </data>
        <!-- Every file found in inStrain.IS/figures becomes one PDF element; only
             created when the "Skip plot generation" option is left unchecked. -->
        <collection name="figures_pdfs" type="list" label="Figures">
            <discover_datasets pattern="(?P&lt;designation&gt;.+)" directory="inStrain.IS/figures/" format="pdf"/>
            <filter>(output['skip_plot_generation'] is False)</filter>
        </collection>
    </outputs>
    <tests>
        <!-- Test 1: genome-wide tables and plots are both skipped, so only the six
             unconditional outputs are expected. -->
        <test expect_num_outputs="6">
            <param name="mapping_input" value="SmallScaffold.fa.sorted.bam"/>
            <param name="sequence_input" value="SmallScaffold.fa"/>
            <param name="use_full_fasta_header" value="false"/>
            <param name="mm_level" value="false"/>
            <param name="skip_mm_profiling" value="false"/>
            <section name="read_filtering">
                <param name="min_read_ani" value="0.95"/>
                <param name="min_mapq" value="-1"/>
                <param name="max_insert_relative" value="3"/>
                <param name="min_insert" value="50"/>
                <param name="pairing_filter" value="paired_only"/>
            </section>
            <section name="variant_calling">
                <param name="min_cov" value="5"/>
                <param name="min_freq" value="0.05"/>
                <param name="fdr" value="1e-06"/>
            </section>
            <section name="profile">
                <param name="database_mode" value="false"/>
                <param name="min_scaffold_reads" value="1"/>
                <param name="min_genome_coverage" value="0"/>
                <param name="min_snp" value="20"/>
                <param name="store_everything" value="false"/>
                <param name="rarefied_coverage" value="50"/>
                <param name="window_length" value="10000"/>
            </section>
            <section name="output">
                <param name="detailed_mapping_info" value="false"/>
                <param name="skip_genome_wide" value="true"/>
                <param name="skip_plot_generation" value="true"/>
            </section>
            <output name="inStrain_zip">
                <assert_contents>
                    <has_size value="21606" delta="1000"/>
                </assert_contents>
            </output>
            <output name="scaffold_info">
                <assert_contents>
                    <has_text text="length"/>
                    <has_n_lines n="2"/>
                    <has_n_columns n="21"/>
                </assert_contents>
            </output>
            <output name="mapping_info">
                <assert_contents>
                    <has_text text="scaffold"/>
                    <has_n_lines n="5"/>
                </assert_contents>
            </output>
            <output name="SNVs">
                <assert_contents>
                    <has_text text="position"/>
                    <has_n_lines n="5"/>
                    <has_n_columns n="16"/>
                </assert_contents>
            </output>
            <output name="linkage">
                <assert_contents>
                    <has_n_lines n="1"/>
                </assert_contents>
            </output>
            <output name="gene_info">
                <assert_contents>
                    <has_n_lines n="0"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 2: same inputs with genome-wide tables and plots enabled, which adds
             the genome_info table and the figures collection (eight outputs). -->
        <test expect_num_outputs="8">
            <param name="mapping_input" value="SmallScaffold.fa.sorted.bam"/>
            <param name="sequence_input" value="SmallScaffold.fa"/>
            <param name="use_full_fasta_header" value="false"/>
            <param name="mm_level" value="false"/>
            <param name="skip_mm_profiling" value="false"/>
            <section name="read_filtering">
                <param name="min_read_ani" value="0.95"/>
                <param name="min_mapq" value="-1"/>
                <param name="max_insert_relative" value="3"/>
                <param name="min_insert" value="50"/>
                <param name="pairing_filter" value="paired_only"/>
            </section>
            <section name="variant_calling">
                <param name="min_cov" value="5"/>
                <param name="min_freq" value="0.05"/>
                <param name="fdr" value="1e-06"/>
            </section>
            <section name="profile">
                <param name="database_mode" value="false"/>
                <param name="min_scaffold_reads" value="1"/>
                <param name="min_genome_coverage" value="0"/>
                <param name="min_snp" value="20"/>
                <param name="store_everything" value="false"/>
                <param name="rarefied_coverage" value="50"/>
                <param name="window_length" value="10000"/>
            </section>
            <section name="output">
                <param name="detailed_mapping_info" value="false"/>
                <param name="skip_genome_wide" value="false"/>
                <param name="skip_plot_generation" value="false"/>
            </section>
            <output name="inStrain_zip">
                <assert_contents>
                    <has_size value="1468006" delta="100000"/>
                </assert_contents>
            </output>
            <output name="scaffold_info">
                <assert_contents>
                    <has_text text="length"/>
                    <has_n_lines n="2"/>
                    <has_n_columns n="21"/>
                </assert_contents>
            </output>
            <output name="mapping_info">
                <assert_contents>
                    <has_text text="scaffold"/>
                    <has_n_lines n="5"/>
                </assert_contents>
            </output>
            <output name="SNVs">
                <assert_contents>
                    <has_text text="position"/>
                    <has_n_lines n="5"/>
                    <has_n_columns n="16"/>
                </assert_contents>
            </output>
            <output name="linkage">
                <assert_contents>
                    <has_n_lines n="1"/>
                </assert_contents>
            </output>
            <output name="gene_info">
                <assert_contents>
                    <has_n_lines n="0"/>
                </assert_contents>
            </output>
            <output name="genome_info">
                <assert_contents>
                    <has_text text="nucl_diversity"/>
                    <has_n_lines n="2"/>
                    <has_n_columns n="26"/>
                </assert_contents>
            </output>
            <!-- Element names must match the file names found in the figures directory exactly. -->
            <output_collection name="figures_pdfs" type="list">
                <element name="inStrain.IS_CoverageAndBreadth_vs_readMismatch.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="383078" delta="10000"/>
                    </assert_contents>
                </element>
                <element name="inStrain.IS_MajorAllele_frequency_plot.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="383590" delta="10000"/>
                    </assert_contents>
                </element>
                <element name="inStrain.IS_ReadFiltering_plot.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="383078" delta="10000"/>
                    </assert_contents>
                </element>
                <element name="inStrain.IS_ScaffoldInspection_plot.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="208" delta="10"/>
                    </assert_contents>
                </element>
                <element name="inStrain.IS_genomeWide_microdiveristy_metrics.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="208" delta="10"/>
                    </assert_contents>
                </element>
                <element name="inStrain.IS_readANI_distribution.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="382771" delta="10000"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
@HELP_HEADER@

Profile
=======

Profile is the heart of the inStrain tool.

The functionality of inStrain profile is broken into several steps:

First, all reads in the .bam file are filtered to only keep those that map with sufficient quality. All non-paired reads will be filtered out by default, and an additional set of filters are applied to each read pair (not the individual reads):

  - Pairs must be mapped in the proper orientation with an expected insert size. The minimum insert distance can be set with the tool's corresponding parameter. The maximum insert distance is a multiple of the median insert distance. So if pairs have a median insert size of 500bp, by default all pairs with insert sizes over 1500bp will be excluded. For the max insert cutoff, the median_insert for all scaffolds is used.
  - Pairs must have a minimum mapQ score. MapQ scores are confusing and how they’re calculated varies based on the mapping algorithm being used, but are meant to represent both the number of mismatches in the mapping and how unique that mapping is. With bowtie2, if the read maps equally well to two positions on the genome (multi-mapped read), its mapQ score will be set to 2. The read in the pair with the higher mapQ is used for the pair.
  - Pairs must be above some minimum nucleotide identity (ANI) value. For example if reads in a pair are 100bp each, and each read has a single mismatch, the ANI of that pair would be 0.99

Next, using only read pairs that pass filters, a number of microdiversity metrics are calculated on a scaffold-by-scaffold basis. This includes:

  - Calculate the coverage at each position along the scaffold
  - Calculate the nucleotide diversity at each position along the scaffold in which the coverage is greater than the min_cov argument.
  - Identify SNSs and SNVs. The criteria for being reported as a divergent site are 1) More than min_cov number of bases at that position, 2) More than min_freq percentage of reads that are a variant base, 3) The number of reads with the variant base is more than the null model for that coverage.
  - Calculate linkage between divergent sites on the same read pair. For each pair harboring a divergent site, calculate the linkage of that site with other divergent sites within that same pair. This is only done for pairs of divergent sites that are both on at least MIN_SNP reads
  - Calculate scaffold-level properties. These include things like the overall coverage, breadth of coverage, average nucleotide identity (ANI) between the reads and the reference genome, and the expected breadth of coverage based on that true coverage.

Finally, this information is stored as an IS_profile object. This includes the locations of divergent sites, the number of read pairs that passed filters (and other information) for each scaffold, the linkage between SNV pairs, etc.

Inputs
======

A fasta file containing the reference sequence and a bam/sam file of reads mapped to it.

Output
======

An IS_profile (zip file) containing:
 
1. scaffold_info.tsv

   This gives basic information about the scaffolds in your sample at the highest allowed level of read identity.

2. mapping_info.tsv

   This provides an overview of the number of reads that map to each scaffold, and some basic metrics about their quality.

3. SNVs.tsv

   This describes the SNVs and SNSs that are detected in this mapping. While we should refer to these mutations as divergent sites, sometimes SNV is used to refer to both SNVs and SNSs.

4. linkage.tsv

   This describes the linkage between pairs of SNPs in the mapping that are found on the same read pair at least min_snp times.

5. gene_info.tsv

   This describes some basic information about the genes being profiled.

6. genome_info.tsv

   Describes many of the above metrics on a genome-by-genome level, rather than a scaffold-by-scaffold level. (To output it, set --skip_genome_wide to false)

7. Figures/Plots (When --skip_plot_generation is set to false):

  - Coverage and breadth vs. read mismatches
  - Genome-wide microdiversity metrics
  - Read-level ANI distribution
  - Major allele frequencies
  - Linkage decay
  - Read filtering plots
  - Scaffold inspection plot (large)
  - Linkage with SNP type (GENES REQUIRED)
  - Gene histograms (GENES REQUIRED)

    ]]></help>    
    <citations>
        <!-- NOTE(review): the 10.1101 prefix suggests this is a preprint DOI; if a
             peer-reviewed inStrain article exists, consider citing it as well (to be confirmed). -->
        <citation type="doi">10.1101/2020.01.22.915579</citation>
    </citations>
</tool>