annotate vsnp_determine_ref_from_data.xml @ 8:ce9f22394382 draft

Uploaded
author greg
date Mon, 02 Aug 2021 17:00:31 +0000
parents d5e66f9fe086
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
7
d5e66f9fe086 Uploaded
greg
parents: 4
diff changeset
1 <tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="@WRAPPER_VERSION@.1+galaxy0" profile="@PROFILE@">
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
2 <description>from input data</description>
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
3 <macros>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
4 <import>macros.xml</import>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
5 </macros>
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
6 <requirements>
8
ce9f22394382 Uploaded
greg
parents: 7
diff changeset
7 <expand macro="biopython_requirement"/>
ce9f22394382 Uploaded
greg
parents: 7
diff changeset
8 <expand macro="pyyaml_requirement"/>
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
9 </requirements>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
10 <command detect_errors="exit_code"><![CDATA[
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
11 #import re
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
12 #set gzipped = 'false'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
13 #set input_type = $input_type_cond.input_type
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
14
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
15 #if $input_type in ["single", "pair"]:
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
16 #set read1 = $input_type_cond.read1
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
17 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
18 ln -s '${read1}' '${read1_identifier}' &&
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
19 #if $input_type == "pair":
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
20 #set read2 = $input_type_cond.read2
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
21 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
22 ln -s '${read2}' '${read2_identifier}' &&
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
23 #else:
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
24 #set read2 = None
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
25 #end if
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
26 #else:
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
27 #set read1 = $input_type_cond.reads_collection['forward']
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
28 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.name))
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
29 ln -s '${read1}' '${read1_identifier}' &&
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
30 #set read2 = $input_type_cond.reads_collection['reverse']
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
31 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.name))
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
32 ln -s '${read2}' '${read2_identifier}' &&
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
33 #end if
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
34
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
35 python '$__tool_directory__/vsnp_determine_ref_from_data.py'
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
36 --read1 '${read1_identifier}'
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
37 #if $read2 is not None
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
38 --read2 '${read2_identifier}'
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
39 #end if
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
40 --output_dbkey '$output_dbkey'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
41 --output_metrics '$output_metrics'
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
42 #if $read1.is_of_type('fastqsanger.gz'):
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
43 --gzipped
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
44 #end if
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
45 #set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
46 #for $i in $dnaprint_fields:
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
47 --dnaprint_fields '${i[0]}' '${i[2]}'
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
48 #end for
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
49 ]]></command>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
50 <inputs>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
51 <conditional name="input_type_cond">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
52 <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
53 <option value="single" selected="true">Single files</option>
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
54 <option value="paired">Paired reads</option>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
55 <option value="pair">Paired reads in separate data sets</option>
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
56 </param>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
57 <when value="single">
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
58 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
59 </when>
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
60 <when value="paired">
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
61 <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
62 </when>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
63 <when value="pair">
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
64 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
65 <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
66 </when>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
67 </conditional>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
68 </inputs>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
69 <outputs>
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
70 <data name="output_dbkey" format="txt" label="${tool.name} on ${on_string} (dbkey)"/>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
71 <data name="output_metrics" format="txt" label="${tool.name} on ${on_string} (metrics)"/>
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
72 </outputs>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
73 <tests>
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
74 <!-- 1 single read -->
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
75 <test expect_num_outputs="2">
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
76 <param name="input_type" value="single"/>
1
bca267738b33 Uploaded
greg
parents: 0
diff changeset
77 <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz"/>
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
78 <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
79 <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
80 </test>
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
81 <!-- 1 set of paired reads -->
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
82 <test expect_num_outputs="2">
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
83 <param name="input_type" value="pair"/>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
84 <param name="read1" value="CMC_20E1_R1.fastq.gz" ftype="fastqsanger.gz"/>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
85 <param name="read2" value="CMC_20E1_R2.fastq.gz" ftype="fastqsanger.gz"/>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
86 <output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
87 <output name="output_metrics" file="paired_metrics.txt" ftype="txt"/>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
88 </test>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
89 <!-- A collection of paired reads -->
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
90 <test expect_num_outputs="2">
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
91 <param name="input_type" value="paired"/>
1
bca267738b33 Uploaded
greg
parents: 0
diff changeset
92 <param name="reads_collection">
bca267738b33 Uploaded
greg
parents: 0
diff changeset
93 <collection type="paired">
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
94 <element name="forward" value="CMC_20E1_R1.fastq.gz" ftype="fastqsanger.gz"/>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
95 <element name="reverse" value="CMC_20E1_R2.fastq.gz" ftype="fastqsanger.gz"/>
1
bca267738b33 Uploaded
greg
parents: 0
diff changeset
96 </collection>
bca267738b33 Uploaded
greg
parents: 0
diff changeset
97 </param>
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
98 <output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/>
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
99 <output name="output_metrics" file="paired_collection_metrics.txt" ftype="txt"/>
1
bca267738b33 Uploaded
greg
parents: 0
diff changeset
100 </test>
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
101 </tests>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
102 <help>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
103 **What it does**
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
104
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
105 Accepts a single fastqsanger read, a set of paired reads, or a collection of single or paired reads (bacterial samples) and
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
106 inspects the data to discover the best reference genome for aligning the reads.
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
107
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
108 The information needed to discover the best reference is maintained by the USDA in this repository_. References are currently
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
109
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
110 .. _repository: https://github.com/USDA-VS/vSNP_reference_options
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
111
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
112 limited to TB complex, paraTB, and Brucella, but information for additional references will be added. The information for each
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
113 reference is a string consisting of zeros and ones, compiled by USDA researchers, which we call a "DNA print". These strings
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
114 are maintained in yaml files for use in Galaxy, and are installed via the **vSNP DNAprints data manager** tool.
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
115
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
116 This tool creates an in-memory dictionary of these DNA print strings for matching with a string generated by inspecting the
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
117 input sample data. During inspection, this tool accrues sequence counts for supported species, ultimately generating a string
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
118 consisting of zeros and ones based on the counts (i.e., a DNA print). This string is then compared to the strings contained
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
119 in the in-memory dictionary of DNA prints to find a match.
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
120
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
121 The strings in the in-memory dictionary are each associated with a Galaxy "dbkey" (i.e., genome build), so when a match is found,
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
122 the associated "dbkey" is passed to a mapper (e.g., **Map with BWA-MEM**), typically within a workflow via an expression tool,
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
123 to align the reads to the associated reference.
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
124
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
125 This tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
126 about the sequence counts that were discovered in the input sample data that produced the "DNA print" string.
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
127
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
128 This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
129 and discovering the primary species is critical. DNA print matching is currently supported for the following genomes.
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
130
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
131 * Mycobacterium bovis AF2122/97
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
132 * Brucella abortus bv. 1 str. 9-941
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
133 * Brucella abortus strain BER
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
134 * Brucella canis ATCC 23365
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
135 * Brucella ceti TE10759-12
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
136 * Brucella melitensis bv. 1 str. 16M
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
137 * Brucella melitensis bv. 3 str. Ether
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
138 * Brucella melitensis BwIM_SOM_36b
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
139 * Brucella melitensis ATCC 23457
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
140 * Brucella ovis ATCC 25840
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
141 * Brucella suis 1330
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
142 * Mycobacterium tuberculosis H37Rv
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
143 * Mycobacterium avium subsp. paratuberculosis strain Telford
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
144 * Mycobacterium avium subsp. paratuberculosis K-10
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
145 * Brucella suis ATCC 23445
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
146 * Brucella suis bv. 3 str. 686
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
147
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
148 **Required Options**
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
149
1
bca267738b33 Uploaded
greg
parents: 0
diff changeset
150 * **Choose the category of the files to be analyzed** - select "Single files", "Paired reads" (a paired collection), or "Paired reads in separate data sets", then select the appropriate history items (a single fastqsanger read, a paired collection of fastqsanger reads, or two separate fastqsanger read files) based on the selected option.
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
151 </help>
4
36bdf8b439ed Uploaded
greg
parents: 3
diff changeset
152 <expand macro="citations"/>
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
153 </tool>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
154