comparison datasets_genome.xml @ 11:ac24fff14f23 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 4d7d3a56084e140f4fa63fb0e04a08b732f247f2
author iuc
date Fri, 02 Dec 2022 10:52:48 +0000
parents a3395b1d871b
children d78faac2c6ef
comparison
equal deleted inserted replaced
10:a3395b1d871b 11:ac24fff14f23
3 <macros> 3 <macros>
4 <import>macros.xml</import> 4 <import>macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="requirements"></expand> 6 <expand macro="requirements"></expand>
7 <command><![CDATA[ 7 <command><![CDATA[
8 #import re
8 @SETUP_CERTIFICATES@ 9 @SETUP_CERTIFICATES@
9 datasets download genome $query.subcommand.download_by 10 datasets download genome $query.subcommand.download_by
10 #if $query.subcommand.download_by == 'accession': 11 #if $query.subcommand.download_by == 'accession':
11 #if $query.subcommand.text_or_file.text_or_file == 'text': 12 #if $query.subcommand.text_or_file.text_or_file == 'text':
12 #echo " ".join(f"'{x}'" for x in $query.subcommand.text_or_file.accession.split(' ') if x) 13 #echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x)
13 #else 14 #else
14 --inputfile '$query.subcommand.text_or_file.inputfile' 15 --inputfile '$query.subcommand.text_or_file.inputfile'
15 #end if 16 #end if
16 #else: 17 #else:
17 '$query.subcommand.taxon' 18 '$query.subcommand.taxon_positional'
18 $query.subcommand.tax_exact_match 19 $query.subcommand.tax_exact_match
19 #end if 20 #end if
20 $filters.reference 21 $filters.reference
21 $filters.annotated 22 $filters.annotated
22 #if $filters.assembly_level: 23 #if $filters.assembly_level:
35 @RELEASED_AFTER@ 36 @RELEASED_AFTER@
36 #for search_term in $filters.search: 37 #for search_term in $filters.search:
37 --search '$filters.search_term' 38 --search '$filters.search_term'
38 #end for 39 #end for
39 --no-progressbar 40 --no-progressbar
40 #if $uncompressed 41 --dehydrated
41 && 7z x -y ncbi_dataset.zip 42
42 #else 43 ## produce TSV report file
43 && 7z l ncbi_dataset.zip > ncbi_dataset.txt 44 && dataformat tsv genome
45 --package ncbi_dataset.zip
46 --fields #echo ",".join($file_choices.report_columns)
47 > genome_data_report.tsv
48
49 ## unzip and rehydrate if any data is to be downloaded (include is not None)
50 #if $file_choices.include
51 ## unzip
52 && 7z x -y ncbi_dataset.zip > 7z.log
53
54 ## rehydrate
55 && datasets rehydrate
56 --directory ./
57 #if not $file_choices.decompress
58 --gzip
59 #end if
60 --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10}
61
62 ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery
63 && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \;
64
65 ## unzip all compressed (non-fasta) files (jsonl files are just named .gz)
66 && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \;
67 #if $file_choices.decompress
68 && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
69 #end if
70
71 #if "seq-report" in $file_choices.include
72 && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \;
73 #end if
74
75 && true ## because Galaxy removes trailing ; from command
44 #end if 76 #end if
45 ]]></command> 77 ]]></command>
46 <inputs> 78 <inputs>
47 <section name="query" title="Query" expanded="true"> 79 <section name="query" title="Query" expanded="true">
48 <conditional name="subcommand"> 80 <conditional name="subcommand">
49 <param name="download_by" type="select" label="Choose how to find genomes to download"> 81 <param name="download_by" type="select" label="Choose how to find genomes to download">
50 <option value="accession">Download by NCBI assembly or BioProject accession</option> 82 <option value="accession">By NCBI assembly or BioProject accession</option>
51 <option value="taxon">Download by taxon</option> 83 <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option>
52 </param> 84 </param>
53 <when value="accession"> 85 <when value="accession">
54 <expand macro="text_or_file"/> 86 <expand macro="text_or_file"/>
55 </when> 87 </when>
56 <when value="taxon"> 88 <when value="taxon">
57 <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."/> 89 <expand macro="taxon_positional"/>
58 <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/> 90 <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/>
59 </when> 91 </when>
60 </conditional> 92 </conditional>
61 </section> 93 </section>
62 <section name="filters" title="Filters and Limit"> 94 <section name="filters" title="Filters and Limit">
65 <expand macro="assembly_level"/> 97 <expand macro="assembly_level"/>
66 <param argument="--assembly-version" type="select" label="Assembly version(s)"> 98 <param argument="--assembly-version" type="select" label="Assembly version(s)">
67 <option value="latest">Latest</option> 99 <option value="latest">Latest</option>
68 <option value="all">All</option> 100 <option value="all">All</option>
69 </param> 101 </param>
70 <!-- TODO add test for assembly source: according to CLI doc args are RefSeq, GenBank, All and not refseq / genbank-->
71 <expand macro="assembly_source"/> 102 <expand macro="assembly_source"/>
72 <expand macro="chromosomes"/> 103 <expand macro="chromosomes"/>
73 <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/> 104 <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/>
74 <expand macro="released_options"/> 105 <expand macro="released_options"/>
75 <expand macro="released_options" before_or_after="after"/> 106 <expand macro="released_options" before_or_after="after"/>
76 107
77 <repeat name="search" title="Add search terms"> 108 <repeat name="search" title="Add search terms">
78 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> 109 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
79 </repeat> 110 </repeat>
80 </section> 111 </section>
81 <section name="file_choices" title="File Choices" expanded="true"> 112 <section name="file_choices" title="Output options" expanded="true">
82 <expand macro="include"/> 113 <expand macro="tsv_report_columns">
114 <option value="accession" selected="true">accession</option>
115 <option value="organism-name" selected="true">organism-name</option>
116 <option value="assminfo-submitter" selected="true">assminfo-submitter</option>
117 <option value="assminfo-name" selected="true">assminfo-name</option>
118 </expand>
119 <expand macro="include">
120 <expand macro="genome_includes"/>
121 </expand>
122 <param name="decompress" type="boolean" label="Decompress FASTA" help="By default FASTA files are provided zipped (fasta.gz) if this is checked the data will be decompressed"/>
83 </section> 123 </section>
84 <param name="uncompressed" type="boolean" label="Uncompress the dataset archive" checked="true"/>
85 </inputs> 124 </inputs>
86 <outputs> 125 <outputs>
87 <data name="compressed_archive" format="zip" label="Compressed Archive" from_work_dir="ncbi_dataset.zip"> 126 <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/>
88 <filter>not uncompressed</filter>
89 </data>
90 <data name="archive_contents" format="txt" label="Archive Contents" from_work_dir="ncbi_dataset.txt">
91 <filter>not uncompressed</filter>
92 </data>
93 <data name="genome_data_report" format="json" label="NCBI Genome Datasets: Data Report" from_work_dir="ncbi_dataset/data/assembly_data_report.jsonl">
94 <filter>uncompressed</filter>
95 </data>
96 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list"> 127 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
97 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.jsonl" ext="json" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 128 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
98 <filter>uncompressed and file_choices['include'] and "seq-report" in file_choices['include']</filter> 129 <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter>
99 </collection> 130 </collection>
100 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list"> 131 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list">
101 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 132 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.(?P&lt;ext&gt;fasta(\.gz)?)" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
102 <filter>uncompressed and file_choices['include'] and "genome" in file_choices['include']</filter> 133 <filter>file_choices['include'] and "genome" in file_choices['include']</filter>
103 </collection> 134 </collection>
104 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list"> 135 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
105 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 136 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
106 <filter>uncompressed and file_choices['include'] and "rna" in file_choices['include']</filter> 137 <filter>file_choices['include'] and "rna" in file_choices['include']</filter>
107 </collection> 138 </collection>
108 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list"> 139 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
109 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 140 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
110 <filter>uncompressed and file_choices['include'] and "protein" in file_choices['include']</filter> 141 <filter>file_choices['include'] and "protein" in file_choices['include']</filter>
111 </collection> 142 </collection>
112 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list"> 143 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
113 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 144 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
114 <filter>uncompressed and file_choices['include'] and "cds" in file_choices['include']</filter> 145 <filter>file_choices['include'] and "cds" in file_choices['include']</filter>
115 </collection> 146 </collection>
116 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list"> 147 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
117 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 148 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
118 <filter>uncompressed and file_choices['include'] and "gff3" in file_choices['include']</filter> 149 <filter>file_choices['include'] and "gff3" in file_choices['include']</filter>
119 </collection> 150 </collection>
120 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list"> 151 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
121 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 152 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
122 <filter>uncompressed and file_choices['include'] and "gtf" in file_choices['include']</filter> 153 <filter>file_choices['include'] and "gtf" in file_choices['include']</filter>
123 </collection> 154 </collection>
124 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list"> 155 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
125 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> 156 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
126 <filter>uncompressed and file_choices['include'] and "gbff" in file_choices['include']</filter> 157 <filter>file_choices['include'] and "gbff" in file_choices['include']</filter>
127 </collection> 158 </collection>
128 </outputs> 159 </outputs>
129 <tests> 160 <tests>
161 <!-- Note: All but one test use the non-default decompress="true"
162
163 this is because (as of 11/2022) Galaxy cannot apply text assertions to the content
164 of compressed files https://github.com/galaxyproject/galaxy/pull/15085
165
166 So with decompress="true" more powerful assertions are possible.
167 A single test checks the default, ie decompress="false".
168 -->
169 <test expect_num_outputs="1">
170 <conditional name="query|subcommand">
171 <param name="download_by" value="taxon"/>
172 <param name="taxon_positional" value="human"/>
173 </conditional>
174 <param name="chromosomes" value="21"/>
175 <param name="released_before" value="01/01/2018"/>
176 <section name="file_choices">
177 <param name="include" value=""/>
178 </section>
179 <output name="genome_data_report">
180 <assert_contents>
181 <has_text text="Assembly Accession&#009;Assembly Name&#009;Assembly Submitter&#009;Organism Name"/>
182 <has_n_lines n="144"/>
183 <has_n_columns n="4"/>
184 </assert_contents>
185 </output>
186 </test>
130 <test expect_num_outputs="2"> 187 <test expect_num_outputs="2">
131 <conditional name="query|subcommand"> 188 <conditional name="query|subcommand">
132 <param name="download_by" value="taxon"/> 189 <param name="download_by" value="taxon"/>
133 <param name="text_or_file" value="text"/> 190 <param name="taxon_positional" value="human"/>
134 <param name="taxon" value="human"/>
135 </conditional> 191 </conditional>
136 <param name="chromosomes" value="21"/> 192 <param name="chromosomes" value="21"/>
137 <param name="include" value=""/>
138 <param name="uncompressed" value="false"/>
139 <param name="released_before" value="01/01/2018"/>
140 <output name="archive_contents">
141 <assert_contents>
142 <has_text text="ncbi_dataset/data/dataset_catalog.json"/>
143 </assert_contents>
144 </output>
145 </test>
146 <test expect_num_outputs="2">
147 <conditional name="query|subcommand">
148 <param name="download_by" value="taxon"/>
149 <param name="text_or_file" value="text"/>
150 <param name="taxon" value="human"/>
151 </conditional>
152 <param name="chromosomes" value="21"/>
153 <param name="include" value="genome"/>
154 <param name="uncompressed" value="true"/>
155 <param name="assembly_level" value="chromosome,complete"/> 193 <param name="assembly_level" value="chromosome,complete"/>
156 <param name="released_before" value="01/01/2018"/> 194 <param name="released_before" value="01/01/2018"/>
195 <section name="file_choices">
196 <param name="include" value="genome"/>
197 <param name="decompress" value="true"/>
198 </section>
157 <output_collection name="genome_fasta" type="list:list" count="14"> 199 <output_collection name="genome_fasta" type="list:list" count="14">
158 <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/> 200 <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/>
159 <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/> 201 <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/>
160 <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/> 202 <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/>
161 <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/> 203 <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/>
172 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> 214 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
173 </output_collection> 215 </output_collection>
174 <output name="genome_data_report"> 216 <output name="genome_data_report">
175 <assert_contents> 217 <assert_contents>
176 <has_text text="Homo sapiens"/> 218 <has_text text="Homo sapiens"/>
177 </assert_contents> 219 <has_n_columns n="4"/>
178 </output> 220 </assert_contents>
179 </test> 221 </output>
180 <!-- same as precious test but assembly_source (refseq which removes some of the genomes) --> 222 </test>
223 <!-- same as previous test but assembly_source (refseq which removes some of the genomes) -->
181 <test expect_num_outputs="2"> 224 <test expect_num_outputs="2">
182 <conditional name="query|subcommand"> 225 <conditional name="query|subcommand">
183 <param name="download_by" value="taxon"/> 226 <param name="download_by" value="taxon"/>
184 <param name="text_or_file" value="text"/> 227 <param name="taxon_positional" value="human"/>
185 <param name="taxon" value="human"/>
186 </conditional> 228 </conditional>
187 <param name="chromosomes" value="21"/> 229 <param name="chromosomes" value="21"/>
188 <param name="include" value="genome"/>
189 <param name="uncompressed" value="true"/>
190 <param name="assembly_level" value="chromosome,complete"/> 230 <param name="assembly_level" value="chromosome,complete"/>
191 <param name="assembly_source" value="refseq"/> 231 <param name="assembly_source" value="refseq"/>
192 <param name="released_before" value="01/01/2018"/> 232 <param name="released_before" value="01/01/2018"/>
233 <section name="file_choices">
234 <param name="include" value="genome"/>
235 <param name="decompress" value="true"/>
236 </section>
193 <output_collection name="genome_fasta" type="list:list" count="2"> 237 <output_collection name="genome_fasta" type="list:list" count="2">
194 <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/> 238 <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
195 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> 239 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
196 </output_collection> 240 </output_collection>
197 <output name="genome_data_report"> 241 <output name="genome_data_report">
198 <assert_contents> 242 <assert_contents>
199 <has_text text="Homo sapiens"/> 243 <has_text text="Homo sapiens"/>
244 <has_n_lines n="5"/>
245 <has_n_columns n="4"/>
200 </assert_contents> 246 </assert_contents>
201 </output> 247 </output>
202 </test> 248 </test>
203 <test expect_num_outputs="4"> 249 <test expect_num_outputs="4">
204 <conditional name="query|subcommand"> 250 <conditional name="query|subcommand">
206 <conditional name="text_or_file"> 252 <conditional name="text_or_file">
207 <param name="text_or_file" value="text"/> 253 <param name="text_or_file" value="text"/>
208 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/> 254 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/>
209 </conditional> 255 </conditional>
210 </conditional> 256 </conditional>
211 <param name="include" value="seq-report,gtf,cds"/>
212 <param name="uncompressed" value="true"/>
213 <param name="released_before" value="01/01/2007"/> 257 <param name="released_before" value="01/01/2007"/>
258 <section name="file_choices">
259 <param name="include" value="seq-report,gtf,cds"/>
260 <param name="decompress" value="true"/>
261 </section>
214 <output name="genome_data_report"> 262 <output name="genome_data_report">
215 <assert_contents> 263 <assert_contents>
216 <has_text text="GCF_000013305.1"/> 264 <has_text text="GCF_000013305.1"/>
217 </assert_contents> 265 <has_n_lines n="3"/>
218 </output> 266 <has_n_columns n="4"/>
267 </assert_contents>
268 </output>
269 <output_collection name="sequence_report" type="list" count="2" >
270 <element name="GCF_000007445.1">
271 <assert_contents>
272 <has_text text="GCF_000007445.1"/>
273 <has_n_lines n="2"/>
274 <has_n_columns n="14"/>
275 </assert_contents>
276 </element>
277 <element name="GCF_000013305.1">
278 <assert_contents>
279 <has_text text="GCF_000013305.1"/>
280 <has_n_lines n="2"/>
281 <has_n_columns n="14"/>
282 </assert_contents>
283 </element>
284 </output_collection>
219 <output_collection name="genomic_gtf" type="list"> 285 <output_collection name="genomic_gtf" type="list">
220 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> 286 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
221 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/> 287 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
222 </output_collection> 288 </output_collection>
223 <output_collection name="genomic_cds" type="list"> 289 <output_collection name="genomic_cds" type="list">
224 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/> 290 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains" decompress="true"/>
225 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/> 291 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains" decompress="true"/>
226 </output_collection> 292 </output_collection>
227 </test> 293 </test>
228 <test expect_num_outputs="4"> 294 <test expect_num_outputs="4">
229 <conditional name="query|subcommand"> 295 <conditional name="query|subcommand">
230 <param name="download_by" value="accession"/> 296 <param name="download_by" value="accession"/>
231 <conditional name="text_or_file"> 297 <conditional name="text_or_file">
232 <param name="text_or_file" value="file"/> 298 <param name="text_or_file" value="file"/>
233 <param name="inputfile" value="accessions.txt"/> 299 <param name="inputfile" value="accessions.txt"/>
234 </conditional> 300 </conditional>
235 </conditional> 301 </conditional>
236 <param name="include" value="seq-report,gbff,gff3"/>
237 <param name="uncompressed" value="true"/>
238 <param name="released_before" value="01/01/2007"/> 302 <param name="released_before" value="01/01/2007"/>
239 <output name="genome_data_report"> 303 <section name="file_choices">
240 <assert_contents> 304 <param name="include" value="seq-report,gff3,gbff"/>
241 <has_text text="SAMN02604181"/> 305 <param name="decompress" value="true"/>
306 </section>
307 <output name="genome_data_report">
308 <assert_contents>
309 <has_text text="GCF_000013305.1"/>
310 <has_text text="GCF_000007445.1"/>
311 <has_n_lines n="3"/>
312 <has_n_columns n="4"/>
242 </assert_contents> 313 </assert_contents>
243 </output> 314 </output>
244 <output_collection name="genomic_gff" type="list"> 315 <output_collection name="genomic_gff" type="list">
245 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/> 316 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/>
246 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/> 317 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/>
248 <output_collection name="genomic_gbff" type="list"> 319 <output_collection name="genomic_gbff" type="list">
249 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/> 320 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/>
250 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/> 321 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
251 </output_collection> 322 </output_collection>
252 </test> 323 </test>
253 <test expect_num_outputs="2"> 324
325 <!-- this test should not fail, see https://github.com/ncbi/datasets/issues/194 (hence expect_failure="true" below) -->
326 <test expect_num_outputs="2" expect_failure="true">
254 <conditional name="query|subcommand"> 327 <conditional name="query|subcommand">
255 <param name="download_by" value="accession"/> 328 <param name="download_by" value="accession"/>
256 <conditional name="text_or_file"> 329 <conditional name="text_or_file">
257 <param name="text_or_file" value="text"/> 330 <param name="text_or_file" value="text"/>
258 <param name="accession" value="GCF_000001405"/> 331 <param name="accession" value="GCF_000001405"/>
259 </conditional> 332 </conditional>
260 </conditional> 333 </conditional>
261 <param name="include" value="seq-report"/>
262 <param name="uncompressed" value="true"/>
263 <param name="released_before" value="01/01/2015"/> 334 <param name="released_before" value="01/01/2015"/>
264 <param name="assembly_version" value="all"/> 335 <param name="assembly_version" value="all"/>
265 <output_collection name="sequence_report" count="4"> 336 <section name="file_choices">
266 <element name="GCF_000001405.25"> 337 <param name="include" value="seq-report"/>
267 <assert_contents> 338 </section>
268 <has_text text="assignedMoleculeLocationType"/> 339 <!--
269 </assert_contents> 340 <output_collection name="sequence_report" type="list" count="4" >
270 </element> 341 -->
271 <element name="GCF_000001405.26">
272 <assert_contents>
273 <has_text text="assignedMoleculeLocationType"/>
274 </assert_contents>
275 </element>
276 <element name="GCF_000001405.27">
277 <assert_contents>
278 <has_text text="assignedMoleculeLocationType"/>
279 </assert_contents>
280 </element>
281 <element name="GCF_000001405.28">
282 <assert_contents>
283 <has_text text="assignedMoleculeLocationType"/>
284 </assert_contents>
285 </element>
286 </output_collection>
287 </test> 342 </test>
288 <test expect_num_outputs="5"> 343 <test expect_num_outputs="5">
289 <conditional name="query|subcommand"> 344 <conditional name="query|subcommand">
290 <param name="download_by" value="accession"/> 345 <param name="download_by" value="accession"/>
291 <conditional name="text_or_file"> 346 <conditional name="text_or_file">
292 <param name="text_or_file" value="text"/> 347 <param name="text_or_file" value="text"/>
293 <param name="accession" value="GCF_000146045.2"/> 348 <param name="accession" value="GCF_000146045.2"/>
294 </conditional> 349 </conditional>
295 </conditional> 350 </conditional>
296 <param name="include" value="seq-report,genome,rna,cds"/> 351 <section name="file_choices">
297 <param name="uncompressed" value="true"/> 352 <param name="include" value="genome,protein,rna,cds"/>
353 <param name="decompress" value="true"/>
354 </section>
298 <output_collection name="genome_fasta" type="list:list" count="1"> 355 <output_collection name="genome_fasta" type="list:list" count="1">
299 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> 356 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
300 </output_collection> 357 </output_collection>
358 <output_collection name="protein_fasta" type="list" count="1">
359 <element name="GCF_000146045.2" decompress="true">
360 <assert_contents>
361 <has_text text=">"/>
362 </assert_contents>
363 </element>
364 </output_collection>
365 <output_collection name="rna_fasta" type="list" count="1">
366 <element name="GCF_000146045.2" decompress="true">
367 <assert_contents>
368 <has_text text=">"/>
369 </assert_contents>
370 </element>
371 </output_collection>
372 </test>
373 <!-- same as the previous test, but use the default value for decompress,
374 see comment at the beginning of the tests -->
375 <test expect_num_outputs="5">
376 <conditional name="query|subcommand">
377 <param name="download_by" value="accession"/>
378 <conditional name="text_or_file">
379 <param name="text_or_file" value="text"/>
380 <param name="accession" value="GCF_000146045.2"/>
381 </conditional>
382 </conditional>
383 <section name="file_choices">
384 <param name="include" value="genome,protein,rna,cds"/>
385 </section>
386 <output_collection name="genome_fasta" type="list:list" count="1">
387 <element name="GCF_000146045.2">
388 <element name="GCF_000146045.2_R64" ftype="fasta.gz">
389 <assert_contents>
390 <has_size value="3843460"/>
391 </assert_contents>
392 </element>
393 </element>
394 </output_collection>
395 <output_collection name="protein_fasta" type="list" count="1">
396 <element name="GCF_000146045.2" ftype="fasta.gz">
397 <assert_contents>
398 <has_size value="1844838"/>
399 </assert_contents>
400 </element>
401 </output_collection>
402 <output_collection name="rna_fasta" type="list" count="1">
403 <element name="GCF_000146045.2" ftype="fasta.gz">
404 <assert_contents>
405 <has_size value="2784534"/>
406 </assert_contents>
407 </element>
408 </output_collection>
301 </test> 409 </test>
302 <test expect_num_outputs="3"> 410 <test expect_num_outputs="3">
303 <conditional name="query|subcommand"> 411 <conditional name="query|subcommand">
304 <param name="download_by" value="accession"/> 412 <param name="download_by" value="accession"/>
305 <conditional name="text_or_file"> 413 <conditional name="text_or_file">
306 <param name="text_or_file" value="text"/> 414 <param name="text_or_file" value="text"/>
307 <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/> 415 <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/>
308 </conditional> 416 </conditional>
309 </conditional> 417 </conditional>
310 <param name="include" value="seq-report,genome"/> 418 <section name="file_choices">
311 <param name="uncompressed" value="true"/> 419 <param name="include" value="seq-report,genome"/>
420 <param name="decompress" value="true"/>
421 </section>
422 <output_collection name="sequence_report" type="list" count="2"/>
312 <output_collection name="genome_fasta" type="list:list" count="2"> 423 <output_collection name="genome_fasta" type="list:list" count="2">
313 <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/> 424 <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/>
314 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> 425 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
315 </output_collection> 426 </output_collection>
316 </test> 427 </test>
318 https://github.com/ncbi/datasets/issues/187 429 https://github.com/ncbi/datasets/issues/187
319 hence we set expect_test_failure="true"--> 430 hence we set expect_test_failure="true"-->
320 <test expect_num_outputs="1" expect_test_failure="true"> 431 <test expect_num_outputs="1" expect_test_failure="true">
321 <conditional name="query|subcommand"> 432 <conditional name="query|subcommand">
322 <param name="download_by" value="taxon"/> 433 <param name="download_by" value="taxon"/>
323 <param name="text_or_file" value="text"/> 434 <param name="taxon_positional" value="4932"/>
324 <param name="taxon" value="4932"/>
325 <param name="tax_exact_match" value="true"/> 435 <param name="tax_exact_match" value="true"/>
326 </conditional> 436 </conditional>
327 <param name="include" value=""/> 437 <param name="include" value=""/>
328 <param name="uncompressed" value="true"/>
329 <output name="genome_data_report"> 438 <output name="genome_data_report">
330 <assert_contents> 439 <assert_contents>
331 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> 440 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
332 </assert_contents> 441 </assert_contents>
333 </output> 442 </output>
336 <help> 445 <help>
337 <![CDATA[ 446 <![CDATA[
338 **Download Genome Datasets from NCBI** 447 **Download Genome Datasets from NCBI**
339 448
340 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. 449 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report.
341 Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file. 450 Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon.
342 451
343 Tthe default genome dataset includes the following files (if available): 452 The download is a three-step process:
344 * data_report.jsonl (genome assembly and annotation metadata, not always available) 453
345 * genomic.fna (genomic sequences) 454 1. A "dehydrated" zip file is downloaded (which includes the metadata and the download URL)
346 * rna.fna (transcript sequences) 455 2. The metadata is transformed into a tabular (TSV) file
347 * protein.faa (protein sequences) 456 3. The data is hydrated (the actual data is downloaded)
348 * genomic.gff (genome annotation in gff3 format) 457
349 * dataset_catalog.json (a list of files and file types included in the dataset) 458 The 3rd step can be skipped by unselecting all output types in the `Include` parameter.
459 This makes it possible to inspect the metadata prior to the actual data download. It also
460 allows the tool to be used for querying data sets (and their accessions) of interest, which
461 can then be downloaded in a second call using the accessions.
350 ]]> 462 ]]>
351 </help> 463 </help>
352 464
353 </tool> 465 </tool>