Mercurial > repos > nml > refseq_masher
diff contains.xml @ 0:26df66c32861 draft
planemo upload commit 80c22275be05e29208e991019309dfffa9704f39
author | nml |
---|---|
date | Thu, 15 Feb 2018 13:59:31 -0500 |
parents | |
children | 2c1cb37a3ffe |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contains.xml Thu Feb 15 13:59:31 2018 -0500 @@ -0,0 +1,325 @@ +<tool id="refseq_masher_contains" name="RefSeq Masher Contains" version="0.1.1"> + <description> + Find NCBI RefSeq Genomes contained in your sequences + </description> + <requirements> + <requirement type="package" version="0.1.1">refseq_masher</requirement> + </requirements> + <command detect_errors="exit_code"> +<![CDATA[ + +#import re + +#if $input.type == 'fasta' +#set $input_files = '"{}"'.format($input.fasta.name) + ln -s "$input.fasta" $input_files && +#elif $input.type == 'paired' +#set $_forward_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.forward.name) else '.fastq' +#set $_forward = '"{}_1{}"'.format($re.sub(r'_[12]\..+$', '', $input.forward.name), $_forward_ext) +#set $_reverse_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.reverse.name) else '.fastq' +#set $_reverse = '"{}_2{}"'.format($re.sub(r'_[12]\..+$', '', $input.reverse.name), $_reverse_ext) +#set $input_files = '{} {}'.format($_forward, $_reverse) + ln -s "$input.forward" $_forward && + ln -s "$input.reverse" $_reverse && +#elif $input.type == 'single' +#set $input_files = '"{}"'.format($input.single.name) + ln -s "$input.single" $input_files && +#elif $input.type == 'paired_collection' +## Detect gzip from the Galaxy datatype extension: str() of a dataset yields its on-disk path (normally dataset_N.dat), not its name. +#set $_forward_ext = '.fastq.gz' if $input.paired_collection.forward.ext.endswith('.gz') else '.fastq' +#set $_forward = '"{}_1{}"'.format($input.paired_collection.name, $_forward_ext) +#set $_reverse_ext = '.fastq.gz' if $input.paired_collection.reverse.ext.endswith('.gz') else '.fastq' +#set $_reverse = '"{}_2{}"'.format($input.paired_collection.name, $_reverse_ext) +#set $input_files = '{} {}'.format($_forward, $_reverse) + ln -s "$input.paired_collection.forward" $_forward && + ln -s "$input.paired_collection.reverse" $_reverse && +#end if + +refseq_masher + $adv.verbosity + contains + --output refseq_masher-contains.${adv.output_type} + --output-type $adv.output_type +
--top-n-results $adv.top_n_results + --parallelism "\${GALAXY_SLOTS:-1}" + --min-identity $adv.min_identity + --max-pvalue $adv.max_pvalue + $input_files + +]]> + </command> + <inputs> + <conditional name="input"> + <param name="type" type="select" label="Sequence input type"> + <option value="fasta">FASTA</option> + <option value="paired">Paired-end FASTQs</option> + <option value="single">Single-end FASTQ</option> + <option value="paired_collection">Paired-end FASTQ collection</option> + </param> + <when value="fasta"> + <param name="fasta" + type="data" format="fasta" + optional="false" + label="FASTA file" + /> + </when> + <when value="paired"> + <param name="forward" + type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" + optional="false" + label="Forward FASTQ file" + help="Must have ASCII encoded quality scores" + /> + <param name="reverse" + type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" + optional="false" + label="Reverse FASTQ file" + help="File format must match the Forward FASTQ file" + /> + </when> + <when value="single"> + <param name="single" + type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" + optional="false" + label="Single-end FASTQ file" + /> + </when> + <when value="paired_collection"> + <param name="paired_collection" + type="data_collection" format="fastq,fastqsanger,fastqillumina,fastqsolexa,fastq.gz,txt" + collection_type="paired" + optional="false" + label="Paired-end FASTQ collection" + help="" + /> + </when> + </conditional> + <section name="adv" title="Advanced Options" expanded="false"> + <!-- Numeric options are mandatory: each one is passed as the value of a command-line flag, so it must never be blank. --> + <param name="top_n_results" + type="integer" + label="Top N matches to report (0 to report all)" + min="0" + value="0" + optional="false" + /> + <param name="min_identity" + type="float" value="0.9" min="0.0" max="1.0" + label="Mash screen min. identity to report" + optional="false" + /> + <param name="max_pvalue" + type="float" value="0.01" min="0.0" max="1.0" + label="Mash screen max. 
p-value to report" + optional="false" + /> + <param name="output_type" + type="select" + label="Output type" + multiple="false"> + <option value="tab" selected="true">Tabular (tab-delimited values)</option> + <option value="csv">CSV (Comma Separated Values)</option> + </param> + <param name="verbosity" + type="select" + label="Logging verbosity"> + <option value="">Error messages only</option> + <option value="-v">Show warning messages</option> + <option value="-vv" selected="true">Show info messages</option> + <option value="-vvv">Show debug messages</option> + </param> + </section> + </inputs> + <outputs> + <data + name="output_path_csv" + format="csv" + label="RefSeq Masher contains table" + from_work_dir="refseq_masher-contains.csv"> + <filter>adv['output_type'] == 'csv'</filter> + </data> + <data + name="output_path_tab" + format="tabular" + label="RefSeq Masher contains table" + from_work_dir="refseq_masher-contains.tab"> + <filter>adv['output_type'] == 'tab'</filter> + </data> + </outputs> + <tests> + <test> + <conditional name="input"> + <param name="type" value="single"/> + <param name="single" value="SRR1203042_1-head4000.fastq"/> + </conditional> + <section name="adv"> + <param name="top_n_results" value="5"/> + <param name="output_type" value="tab"/> + <param name="min_identity" value="0.9"/> + <param name="max_pvalue" value="0.01"/> + </section> + <output name="output_path_tab" + value="SRR1203042_1-head4000-contains.tab" + ftype="tabular" + lines_diff="0"> + </output> + </test> + </tests> + <help> +<![CDATA[ +RefSeq Masher - Containment +=========================== + +Find what NCBI RefSeq genomes are contained within your sequence data using Mash_ with a Mash sketch database of 54,925 NCBI RefSeq Genomes. 
+ + +Source code available on GitHub at https://github.com/phac-nml/refseq_masher + + +`contains` - find what NCBI RefSeq Genomes are contained in your input sequences +-------------------------------------------------------------------------------- + +If you have a metagenomic sample or maybe a sample with some contamination, you may be interested in seeing what's in your sample. You can do this with `refseq_masher contains <INPUT>`:: + + Usage: refseq_masher contains [OPTIONS] INPUT... + + Find the NCBI RefSeq genomes contained in your sequence files using Mash + Screen + + Input is expected to be one or more FASTA/FASTQ files or one or more + directories containing FASTA/FASTQ files. Files can be Gzipped. + + Options: + --mash-bin TEXT Mash binary path (default="mash") + -o, --output PATH Output file path (default="-"/stdout) + --output-type [tab|csv] Output file type (tab|csv) + -n, --top-n-results INTEGER Output top N results sorted by identity in + ascending order (default=0/all) + -i, --min-identity FLOAT Mash screen min identity to report + (default=0.9) + -v, --max-pvalue FLOAT Mash screen max p-value to report + (default=0.01) + -p, --parallelism INTEGER Mash screen parallelism; number of threads to + spawn (default=1) + -h, --help Show this message and exit. + + +Example - a metagenomic sample SAMEA1877340_ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For this example, we're going to see what RefSeq genomes are contained within sample SAMEA1877340_ from BioProject PRJEB1775_. + + +Description from BioProject PRJEB1775_: + +.. epigraph:: + + Design, Setting and Patients Forty-five samples were selected from a set of fecal specimens obtained from patients with diarrhea during the 2011 outbreak of STEC O104:H4 in Germany. 
Samples were chosen to represent STEC-positive patients with a range of clinical conditions and colony counts together with a small number of patients with other infections (Campylobacter jejuni, Clostridium difficile and Salmonella enterica). Samples were subjected to high-throughput sequencing on the Illumina MiSeq and HiSeq 2500, followed by bioinformatics analysis. + + +We're going to download the FASTQ files for ERR260489_:: + + wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR260/ERR260489/ERR260489_1.fastq.gz + wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR260/ERR260489/ERR260489_2.fastq.gz + + +We're going to run `refseq_masher` against these FASTQ files:: + + refseq_masher -vv contains --top-n-results 50 -p 12 -o containment-ERR260489.tab ERR260489_1.fastq.gz ERR260489_2.fastq.gz + +**Log**:: + + 2018-01-29 10:59:25,849 INFO: Grouped 2 fastqs into 1 groups [in ...refseq_masher/refseq_masher/utils.py:174] + 2018-01-29 10:59:25,849 INFO: Collected 0 FASTA inputs and 1 read sets [in ...refseq_masher/refseq_masher/utils.py:185] + 2018-01-29 10:59:25,849 INFO: Running Mash Screen with NCBI RefSeq sketch database against sample "ERR260489" with inputs: ['../ERR260489_1.fastq.gz', '../ERR260489_2.fastq.gz'] [in ...refseq_masher/refseq_masher/mash/screen.py:44] + Loading ...refseq_masher/refseq_masher/data/RefSeqSketches.msh... + 4669418 distinct hashes. + Streaming from 2 inputs... + Estimated distinct k-mers in pool: 206836855 + Summing shared... + Computing coverage medians... + Writing output... + 2018-01-29 11:00:19,665 INFO: Ran Mash Screen on all input. Merging NCBI taxonomic information into results output. 
[in ...refseq_masher/refseq_masher/cli.py:134] + 2018-01-29 11:00:19,666 INFO: Fetching all taxonomy info for 23 unique NCBI Taxonomy UIDs [in ...refseq_masher/refseq_masher/taxonomy.py:35] + 2018-01-29 11:00:19,669 INFO: Dropping columns with all NA values (ncol=32) [in ...refseq_masher/refseq_masher/taxonomy.py:38] + 2018-01-29 11:00:19,671 INFO: Columns with all NA values dropped (ncol=12) [in ...refseq_masher/refseq_masher/taxonomy.py:40] + 2018-01-29 11:00:19,671 INFO: Merging Mash results with relevant taxonomic information [in ...refseq_masher/refseq_masher/taxonomy.py:41] + 2018-01-29 11:00:19,674 INFO: Merged Mash results with taxonomy info [in ...refseq_masher/refseq_masher/taxonomy.py:43] + 2018-01-29 11:00:19,674 INFO: Merged taxonomic information into results output [in ...refseq_masher/refseq_masher/cli.py:136] + 2018-01-29 11:00:19,674 INFO: Reordering output columns [in ...refseq_masher/refseq_masher/cli.py:137] + 2018-01-29 11:00:19,677 INFO: Wrote output to "containment-ERR260489.tab" [in ...refseq_masher/refseq_masher/writers.py:20] + + + +**Output** + ++-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ +| sample | top_taxonomy_name | identity | shared_hashes | median_multiplicity | pvalue | full_taxonomy | taxonomic_subspecies | taxonomic_species | taxonomic_genus | taxonomic_family | taxonomic_order | taxonomic_class | taxonomic_phylum 
| taxonomic_superkingdom | subspecies | serovar | plasmid | bioproject | biosample | taxid | assembly_accession | match_id | taxonomic_species group | match_comment | ++===========+======================================+==========+================+======================+========+==================================================================================================================================================+=======================+==============================+==================+====================+==================+=====================+===================+=========================+============+=========+==========+============+===========+=========+=====================+==============================================================================================+==========================+================+ +| ERR260489 | Bacteroides fragilis | 1.0 | 400/400 | 786 | 0.0 | Bacteria; FCB group; Bacteroidetes/Chlorobi group; Bacteroidetes; Bacteroidia; Bacteroidales; Bacteroidaceae; Bacteroides; fragilis | | Bacteroides fragilis | Bacteroides | Bacteroidaceae | Bacteroidales | Bacteroidia | Bacteroidetes | Bacteria | | | pLV22a | | | 817 | | ./rcn/refseq-NG-817-.-.-.-pLV22a-Bacteroides_fragilis.fna | | | ++-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ +| [1 row] | | | | | | | | | | | | | | | | | | | | | | | | | 
++-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ +| ERR260489 | Escherichia coli O104:H4 str. E92/11 | 1.0 | 400/400 | 48 | 0.0 | Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae; Escherichia; coli; O104:H4; str. E92/11 | | Escherichia coli | Escherichia | Enterobacteriaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria | Bacteria | | | pE9211p3 | | | 1090927 | NZ_AHAU | ./rcn/refseq-NZ-1090927-.-.-NZ_AHAU-pE9211p3-Escherichia_coli_O104_H4_str._E92_11.fna | | | ++-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ +| [3 rows] | | | | | | | | | | | | | | | | | | | | | | | | | 
++-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ +| ERR260489 | Kingella kingae KKC2005004457 | 1.0 | 400/400 | 5 | 0.0 | Bacteria; Proteobacteria; Betaproteobacteria; Neisseriales; Neisseriaceae; Kingella; kingae; KKC2005004457 | | Kingella kingae | Kingella | Neisseriaceae | Neisseriales | Betaproteobacteria | Proteobacteria | Bacteria | | | unnamed | | | 1229911 | | ./rcn/refseq-NG-1229911-.-.-.-unnamed-Kingella_kingae_KKC2005004457.fna | | | ++-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ +| ERR260489 | Bacteroides cellulosilyticus WH2 | 0.99984 | 399/400 | 772 | 0.0 | Bacteria; FCB group; Bacteroidetes/Chlorobi group; Bacteroidetes; Bacteroidia; Bacteroidales; Bacteroidaceae; Bacteroides; cellulosilyticus; WH2 | | Bacteroides 
cellulosilyticus | Bacteroides | Bacteroidaceae | Bacteroidales | Bacteroidia | Bacteroidetes | Bacteria | | | pBWH2B | | | 1268240 | NZ_ATFI | ./rcn/refseq-NZ-1268240-.-.-NZ_ATFI-pBWH2B-Bacteroides_cellulosilyticus_WH2.fna | | | ++-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ +| [1 row] | | | | | | | | | | | | | | | | | | | | | | | | | ++-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ +| ERR260489 | Klebsiella pneumoniae | 0.99984 | 399/400 | 4 | 0.0 | Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae; Klebsiella; pneumoniae | | Klebsiella pneumoniae | Klebsiella | Enterobacteriaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria | Bacteria | | | pMRC151 | | | 573 | | 
./rcn/refseq-NG-573-.-.-.-pMRC151-Klebsiella_pneumoniae.fna | | | ++-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ +| [37 rows] | | | | | | | | | | | | | | | | | | | | | | | | | ++-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ + +Some of the top genomes contained in this sample, sorted by identity and median multiplicity, are: + +- *Bacteroides fragilis* - fully contained (400/400) and high multiplicity (786) +- *Escherichia coli* O104:H4 - fully contained (400/400) and median multiplicity of 48 +- *Kingella kingae* - fully contained (400/400) and median multiplicity of 5 +- *Klebsiella pneumoniae* - 399/400 sketches contained with median multiplicity of 4 + +So with Mash we are able to find that the sample contained the expected genomic 
data (especially *E. coli* O104:H4). + + + +Legal +----- + +Copyright Government of Canada 2017 + +Written by: National Microbiology Laboratory, Public Health Agency of Canada + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this work except in compliance with the License. You may obtain a copy of the +License at: + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. + +Contact +------- + +**Gary van Domselaar**: gary.vandomselaar@phac-aspc.gc.ca + + + +.. _Mash: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x +.. _SAMEA1877340: https://www.ebi.ac.uk/ena/data/view/SAMEA1877340 +.. _PRJEB1775: https://www.ebi.ac.uk/ena/data/view/PRJEB1775 +.. _ERR260489: https://www.ebi.ac.uk/ena/data/view/ERR260489&display=html + +]]> + </help> + <citations> + <!-- Citation for Mash paper --> + <citation type="doi">10.1186/s13059-016-0997-x</citation> + </citations> +</tool>