0
|
1 <tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="1.0.0">
|
|
2 <description>from input data</description>
|
|
<!-- Packages, with pinned versions, that this tool declares as requirements. -->
3 <requirements>
|
|
4 <requirement type="package" version="1.76">biopython</requirement>
|
|
5 <requirement type="package" version="5.3">pyyaml</requirement>
|
|
6 </requirements>
|
|
<!--
  Command template (Cheetah). It creates the input_reads, output_dbkey and
  output_metrics directories, symlinks the selected reads under sanitized
  names (every character that is not whitespace, a word character or a
  hyphen becomes an underscore), and then runs
  vsnp_determine_ref_from_data.py.
  * Single-file input: the links use the sanitized name with no directory
    prefix, and the read1/read2 names plus both output dataset paths are
    passed as options.
  * Collection input: each element is linked into input_reads and no read
    or output path options are passed.
  * gzipped starts as 'false' and is set to 'true' by the fastqsanger.gz
    datatype checks on the inputs.
  * When in_test_mode is "false", elements 0 and 2 of each entry returned by
    get_fields() on the vsnp_dnaprints data table are passed as
    dnaprint_fields; otherwise only the in_test_mode flag is passed.
-->
7 <command detect_errors="exit_code"><![CDATA[
|
|
8 #import os
|
|
9 #import re
|
|
10 #set gzipped = 'false'
|
|
11 #set input_type = $input_type_cond.input_type
|
|
12 #set input_reads_dir = 'input_reads'
|
|
13 #set output_dbkey_dir = 'output_dbkey'
|
|
14 #set output_metrics_dir = 'output_metrics'
|
|
15 mkdir -p $input_reads_dir &&
|
|
16 mkdir -p $output_dbkey_dir &&
|
|
17 mkdir -p $output_metrics_dir &&
|
|
18 #if str($input_type) == "single":
|
|
19 #set read_type_cond = $input_type_cond.read_type_cond
|
|
20 #set read1 = $read_type_cond.read1
|
|
21 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
|
|
22 #if str($read_type_cond.read_type) == "single":
|
|
23 ln -s '${read1}' '${read1_identifier}' &&
|
|
24 #if $read1.is_of_type('fastqsanger.gz'):
|
|
25 #set gzipped = 'true'
|
|
26 #end if
|
|
27 #else:
|
|
28 #set read2 = $read_type_cond.read2
|
|
29 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
|
|
30 ln -s '${read1}' '${read1_identifier}' &&
|
|
31 ln -s '${read2}' '${read2_identifier}' &&
|
|
32 #if $read1.is_of_type('fastqsanger.gz') and $read2.is_of_type('fastqsanger.gz'):
|
|
33 #set gzipped = 'true'
|
|
34 #end if
|
|
35 #end if
|
|
36 #else:
|
3
|
37 #set collection_type = $input_type_cond.collection_type_cond.collection_type
|
1
|
38 #for $i in $input_type_cond.collection_type_cond.reads_collection:
|
0
|
39 #if $i.is_of_type('fastqsanger.gz'):
|
|
40 #set gzipped = 'true'
|
|
41 #end if
|
|
42 #set filename = $i.file_name
|
3
|
43 #if str($collection_type) == 'single_reads':
|
|
44 #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
|
|
45 #else:
|
|
46 ## Galaxy builds lists of pairs as nested lists with elements
|
|
47 ## named forward and reverse. When flattened, these lists
|
|
48 ## will work as inputs to the Parse parameter value expression
|
|
49 ## tool in workflows. However, the output list created by the
|
|
50 ## expression tool will not function correctly with the bwa_mem
|
|
51 ## mapper. Naming the identifier as follows is a solution.
|
|
52 #set identifier = re.sub('[^\s\w\-]', '_', str($i.name))
|
|
53 #end if
|
1
|
54 ln -s '$filename' '$input_reads_dir/$identifier' &&
|
0
|
55 #end for
|
|
56 #end if
|
|
57 python '$__tool_directory__/vsnp_determine_ref_from_data.py'
|
|
58 #if str($input_type) == "single":
|
|
59 #if str($read_type_cond.read_type) == "single":
|
|
60 --read1 '${read1_identifier}'
|
|
61 #else:
|
|
62 --read1 '${read1_identifier}'
|
|
63 --read2 '${read2_identifier}'
|
|
64 #end if
|
|
65 --output_dbkey '$output_dbkey'
|
|
66 --output_metrics '$output_metrics'
|
|
67 #end if
|
|
68 --gzipped $gzipped
|
|
69 --processes $processes
|
1
|
70 #if str($in_test_mode) == "false":
|
|
71 #set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
|
|
72 #for $i in $dnaprint_fields:
|
|
73 --dnaprint_fields '${i[0]}' '${i[2]}'
|
|
74 #end for
|
|
75 #else:
|
|
76 --in_test_mode '$in_test_mode'
|
|
77 #end if
|
0
|
78 ]]></command>
|
|
<!--
  Tool parameters. input_type_cond selects between individual datasets
  ("single", the default) and a collection ("collection"). Each branch holds
  a nested conditional: read_type_cond (paired by default, or single) for
  individual datasets, and collection_type_cond (single_reads by default, or
  paired_reads) for collections. All data inputs accept fastqsanger and
  fastqsanger.gz.
-->
79 <inputs>
|
|
80 <conditional name="input_type_cond">
|
|
81 <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
|
|
82 <option value="single" selected="true">Single files</option>
|
1
|
83 <option value="collection">Collection of files</option>
|
0
|
84 </param>
|
|
85 <when value="single">
|
|
86 <conditional name="read_type_cond">
|
|
87 <param name="read_type" type="select" label="Choose the read type">
|
|
88 <option value="paired" selected="true">Paired</option>
|
|
89 <option value="single">Single</option>
|
|
90 </param>
|
|
91 <when value="paired">
|
|
92 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
|
|
93 <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
|
|
94 </when>
|
|
95 <when value="single">
|
|
96 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
|
|
97 </when>
|
|
98 </conditional>
|
|
99 </when>
|
|
100 <when value="collection">
|
1
|
101 <conditional name="collection_type_cond">
|
|
102 <param name="collection_type" type="select" label="Collection of single reads or paired reads?">
|
|
103 <option value="single_reads" selected="true">Single reads</option>
|
|
104 <option value="paired_reads">Paired reads</option>
|
|
105 </param>
|
|
106 <when value="single_reads">
|
|
107 <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/>
|
|
108 </when>
|
|
109 <when value="paired_reads">
|
|
110 <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
|
|
111 </when>
|
|
112 </conditional>
|
0
|
113 </when>
|
|
114 </conditional>
|
|
115 <param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/>
|
1
|
116 <!-- Functional testing: hidden flag, "false" by default. The test cases set it to "true", which makes the command template skip the vsnp_dnaprints data table lookup and pass the in_test_mode option instead. -->
|
|
117 <param name="in_test_mode" type="hidden" value="false"/>
|
0
|
118 </inputs>
|
|
<!--
  Outputs. The two plain txt datasets are kept only when input_type is
  'single'. The two list collections are kept only when input_type is
  'collection'; their members are discovered from the files found in the
  output_dbkey and output_metrics directories respectively.
-->
119 <outputs>
|
|
120 <data name="output_dbkey" format="txt" label="${tool.name} (dbkey) on ${on_string}">
|
|
121 <filter>input_type_cond['input_type'] == 'single'</filter>
|
|
122 </data>
|
|
123 <data name="output_metrics" format="txt" label="${tool.name} (metrics) on ${on_string}">
|
|
124 <filter>input_type_cond['input_type'] == 'single'</filter>
|
|
125 </data>
|
2
|
126 <collection name="output_dbkey_collection" type="list" label="${tool.name} (dbkey) on ${on_string}">
|
1
|
127 <discover_datasets pattern="__name__" directory="output_dbkey" format="txt"/>
|
0
|
128 <filter>input_type_cond['input_type'] == 'collection'</filter>
|
|
129 </collection>
|
2
|
130 <collection name="output_metrics_collection" type="list" label="${tool.name} (metrics) on ${on_string}">
|
1
|
131 <discover_datasets pattern="__name__" directory="output_metrics" format="txt"/>
|
0
|
132 <filter>input_type_cond['input_type'] == 'collection'</filter>
|
|
133 </collection>
|
|
134 </outputs>
|
|
<!--
  Functional tests; both set in_test_mode to "true".
  Test 1: individual dataset mode (input_type left at its default) with one
  unpaired gzipped read, compared against output_dbkey.txt and
  output_metrics.txt.
  Test 2: collection mode with a single paired collection of gzipped reads;
  each output list collection is expected to contain forward.txt and
  reverse.txt elements.
-->
135 <tests>
|
|
136 <test>
|
1
|
137 <param name="in_test_mode" value="true"/>
|
|
138 <param name="read_type" value="single"/>
|
|
139 <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz"/>
|
0
|
140 <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
|
|
141 <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
|
|
142 </test>
|
1
|
143 <test>
|
|
144 <param name="in_test_mode" value="true"/>
|
|
145 <param name="input_type" value="collection"/>
|
|
146 <param name="collection_type" value="paired_reads"/>
|
|
147 <param name="reads_collection">
|
|
148 <collection type="paired">
|
|
149 <element name="forward" value="forward.fastq.gz" ftype="fastqsanger.gz"/>
|
|
150 <element name="reverse" value="reverse.fastq.gz" ftype="fastqsanger.gz"/>
|
|
151 </collection>
|
|
152 </param>
|
|
153 <output_collection name="output_dbkey_collection" type="list">
|
|
154 <element name="forward.txt" file="forward_dbkey.txt" ftype="txt"/>
|
|
155 <element name="reverse.txt" file="reverse_dbkey.txt" ftype="txt"/>
|
|
156 </output_collection>
|
|
157 <output_collection name="output_metrics_collection" type="list">
|
|
158 <element name="forward.txt" file="forward_metrics.txt" ftype="txt"/>
|
|
159 <element name="reverse.txt" file="reverse_metrics.txt" ftype="txt"/>
|
|
160 </output_collection>
|
|
161 </test>
|
0
|
162 </tests>
|
|
163 <help>
|
|
164 **What it does**
|
|
165
|
1
|
166 Accepts a single fastqsanger read, a set of paired reads, or a collection of reads and inspects the data to discover the
|
0
|
167 best reference genome for aligning the reads. This tool is, in essence, a DNA sniffer, and is the first Galaxy tool to
|
|
168 perform this task. While inspecting the data, a string of 0's and 1's is compiled based on the data contents, and we call
|
|
169 the complete string a "DNA print". All of the "DNA prints" files installed by the complementary **vSNP DNAprints data
|
|
170 manager** tool are then inspected to find a match for the compiled "DNA print" string. These files are each associated
|
|
171 with a Galaxy "dbkey" (i.e., genome build), so when a match is found, the associated "dbkey" is passed to a mapper (e.g.,
|
|
172 **Map with BWA-MEM**) to align the reads to the associated reference.
|
|
173
|
|
174 The tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
|
|
175 used to compile the "DNA print" string.
|
|
176
|
|
177 This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
|
|
178 and discovering the primary species is critical. DNA print matching is currently supported for the following genomes.
|
|
179
|
|
180 * Mycobacterium bovis AF2122/97
|
|
181 * Brucella abortus bv. 1 str. 9-941
|
|
182 * Brucella abortus strain BER
|
|
183 * Brucella canis ATCC 23365
|
|
184 * Brucella ceti TE10759-12
|
|
185 * Brucella melitensis bv. 1 str. 16M
|
|
186 * Brucella melitensis bv. 3 str. Ether
|
|
187 * Brucella melitensis BwIM_SOM_36b
|
|
188 * Brucella melitensis ATCC 23457
|
|
189 * Brucella ovis ATCC 25840
|
|
190 * Brucella suis 1330
|
|
191 * Mycobacterium tuberculosis H37Rv
|
|
192 * Mycobacterium avium subsp. paratuberculosis strain Telford
|
|
193 * Mycobacterium avium subsp. paratuberculosis K-10
|
|
194 * Brucella suis ATCC 23445
|
|
195 * Brucella suis bv. 3 str. 686
|
|
196
|
|
197 **Required Options**
|
|
198
|
1
|
199 * **Choose the category of the files to be analyzed** - select "Single files" or "Collection of files", then select the appropriate history items (single or paired fastqsanger reads or a collection of fastqsanger reads) based on the selected option.
|
0
|
200 * **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time.
|
|
201 </help>
|
|
<!--
  Placeholder BibTeX entry: the title reads "Manuscript in preparation" and
  the key, journal and year fields are the literal text "None", so the only
  usable pointer is the project URL.
-->
202 <citations>
|
|
203 <citation type="bibtex">
|
|
204 @misc{None,
|
|
205 journal = {None},
|
|
206 author = {1. Stuber T},
|
|
207 title = {Manuscript in preparation},
|
|
208 year = {None},
|
|
209 url = {https://github.com/USDA-VS/vSNP},}
|
|
210 </citation>
|
|
211 </citations>
|
|
212 </tool>
|
|
213
|