Mercurial > repos > iuc > ncbi_datasets
comparison datasets_genome.xml @ 11:ac24fff14f23 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 4d7d3a56084e140f4fa63fb0e04a08b732f247f2
author | iuc |
---|---|
date | Fri, 02 Dec 2022 10:52:48 +0000 |
parents | a3395b1d871b |
children | d78faac2c6ef |
comparison
equal
deleted
inserted
replaced
10:a3395b1d871b | 11:ac24fff14f23 |
---|---|
3 <macros> | 3 <macros> |
4 <import>macros.xml</import> | 4 <import>macros.xml</import> |
5 </macros> | 5 </macros> |
6 <expand macro="requirements"></expand> | 6 <expand macro="requirements"></expand> |
7 <command><![CDATA[ | 7 <command><![CDATA[ |
8 #import re | |
8 @SETUP_CERTIFICATES@ | 9 @SETUP_CERTIFICATES@ |
9 datasets download genome $query.subcommand.download_by | 10 datasets download genome $query.subcommand.download_by |
10 #if $query.subcommand.download_by == 'accession': | 11 #if $query.subcommand.download_by == 'accession': |
11 #if $query.subcommand.text_or_file.text_or_file == 'text': | 12 #if $query.subcommand.text_or_file.text_or_file == 'text': |
12 #echo " ".join(f"'{x}'" for x in $query.subcommand.text_or_file.accession.split(' ') if x) | 13 #echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x) |
13 #else | 14 #else |
14 --inputfile '$query.subcommand.text_or_file.inputfile' | 15 --inputfile '$query.subcommand.text_or_file.inputfile' |
15 #end if | 16 #end if |
16 #else: | 17 #else: |
17 '$query.subcommand.taxon' | 18 '$query.subcommand.taxon_positional' |
18 $query.subcommand.tax_exact_match | 19 $query.subcommand.tax_exact_match |
19 #end if | 20 #end if |
20 $filters.reference | 21 $filters.reference |
21 $filters.annotated | 22 $filters.annotated |
22 #if $filters.assembly_level: | 23 #if $filters.assembly_level: |
35 @RELEASED_AFTER@ | 36 @RELEASED_AFTER@ |
36 #for search_term in $filters.search: | 37 #for search_term in $filters.search: |
37 --search '$filters.search_term' | 38 --search '$filters.search_term' |
38 #end for | 39 #end for |
39 --no-progressbar | 40 --no-progressbar |
40 #if $uncompressed | 41 --dehydrated |
41 && 7z x -y ncbi_dataset.zip | 42 |
42 #else | 43 ## produce TSV report file |
43 && 7z l ncbi_dataset.zip > ncbi_dataset.txt | 44 && dataformat tsv genome |
45 --package ncbi_dataset.zip | |
46 --fields #echo ",".join($file_choices.report_columns) | |
47 > genome_data_report.tsv | |
48 | |
49 ## unzip and rehydrate if any data is to be downloaded (include is not None) | |
50 #if $file_choices.include | |
51 ## unzip | |
52 && 7z x -y ncbi_dataset.zip > 7z.log | |
53 | |
54 ## rehydrate | |
55 && datasets rehydrate | |
56 --directory ./ | |
57 #if not $file_choices.decompress | |
58 --gzip | |
59 #end if | |
60 --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10} | |
61 | |
62 ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery | |
63 && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \; | |
64 | |
65 ## unzip all compressed (non-fasta) files (jsonl files are just named .gz) | |
66 && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \; | |
67 #if $file_choices.decompress | |
68 && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \; | |
69 #end if | |
70 | |
71 #if "seq-report" in $file_choices.include | |
72 && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \; | |
73 #end if | |
74 | |
75 && true ## because Galaxy removes trailing ; from command | |
44 #end if | 76 #end if |
45 ]]></command> | 77 ]]></command> |
46 <inputs> | 78 <inputs> |
47 <section name="query" title="Query" expanded="true"> | 79 <section name="query" title="Query" expanded="true"> |
48 <conditional name="subcommand"> | 80 <conditional name="subcommand"> |
49 <param name="download_by" type="select" label="Choose how to find genomes to download"> | 81 <param name="download_by" type="select" label="Choose how to find genomes to download"> |
50 <option value="accession">Download by NCBI assembly or BioProject accession</option> | 82 <option value="accession">By NCBI assembly or BioProject accession</option> |
51 <option value="taxon">Download by taxon</option> | 83 <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option> |
52 </param> | 84 </param> |
53 <when value="accession"> | 85 <when value="accession"> |
54 <expand macro="text_or_file"/> | 86 <expand macro="text_or_file"/> |
55 </when> | 87 </when> |
56 <when value="taxon"> | 88 <when value="taxon"> |
57 <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."/> | 89 <expand macro="taxon_positional"/> |
58 <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/> | 90 <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/> |
59 </when> | 91 </when> |
60 </conditional> | 92 </conditional> |
61 </section> | 93 </section> |
62 <section name="filters" title="Filters and Limit"> | 94 <section name="filters" title="Filters and Limit"> |
65 <expand macro="assembly_level"/> | 97 <expand macro="assembly_level"/> |
66 <param argument="--assembly-version" type="select" label="Assembly version(s)"> | 98 <param argument="--assembly-version" type="select" label="Assembly version(s)"> |
67 <option value="latest">Latest</option> | 99 <option value="latest">Latest</option> |
68 <option value="all">All</option> | 100 <option value="all">All</option> |
69 </param> | 101 </param> |
70 <!-- TODO add test for assembly source: according to CLI doc args are RefSeq, GenBank, All and not refseq / genbank--> | |
71 <expand macro="assembly_source"/> | 102 <expand macro="assembly_source"/> |
72 <expand macro="chromosomes"/> | 103 <expand macro="chromosomes"/> |
73 <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/> | 104 <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/> |
74 <expand macro="released_options"/> | 105 <expand macro="released_options"/> |
75 <expand macro="released_options" before_or_after="after"/> | 106 <expand macro="released_options" before_or_after="after"/> |
76 | 107 |
77 <repeat name="search" title="Add search terms"> | 108 <repeat name="search" title="Add search terms"> |
78 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> | 109 <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/> |
79 </repeat> | 110 </repeat> |
80 </section> | 111 </section> |
81 <section name="file_choices" title="File Choices" expanded="true"> | 112 <section name="file_choices" title="Output options" expanded="true"> |
82 <expand macro="include"/> | 113 <expand macro="tsv_report_columns"> |
114 <option value="accession" selected="true">accession</option> | |
115 <option value="organism-name" selected="true">organism-name</option> | |
116 <option value="assminfo-submitter" selected="true">assminfo-submitter</option> | |
117 <option value="assminfo-name" selected="true">assminfo-name</option> | |
118 </expand> | |
119 <expand macro="include"> | |
120 <expand macro="genome_includes"/> | |
121 </expand> | |
122 <param name="decompress" type="boolean" label="Decompress FASTA" help="By default FASTA files are provided zipped (fasta.gz) if this is checked the data will be decompressed"/> | |
83 </section> | 123 </section> |
84 <param name="uncompressed" type="boolean" label="Uncompress the dataset archive" checked="true"/> | |
85 </inputs> | 124 </inputs> |
86 <outputs> | 125 <outputs> |
87 <data name="compressed_archive" format="zip" label="Compressed Archive" from_work_dir="ncbi_dataset.zip"> | 126 <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/> |
88 <filter>not uncompressed</filter> | |
89 </data> | |
90 <data name="archive_contents" format="txt" label="Archive Contents" from_work_dir="ncbi_dataset.txt"> | |
91 <filter>not uncompressed</filter> | |
92 </data> | |
93 <data name="genome_data_report" format="json" label="NCBI Genome Datasets: Data Report" from_work_dir="ncbi_dataset/data/assembly_data_report.jsonl"> | |
94 <filter>uncompressed</filter> | |
95 </data> | |
96 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list"> | 127 <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list"> |
97 <discover_datasets pattern="(?P<identifier_0>.*?)\/sequence_report.jsonl" ext="json" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 128 <discover_datasets pattern="(?P<identifier_0>.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
98 <filter>uncompressed and file_choices['include'] and "seq-report" in file_choices['include']</filter> | 129 <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter> |
99 </collection> | 130 </collection> |
100 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list"> | 131 <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list"> |
101 <discover_datasets pattern="(?P<identifier_0>.*?)/(?!rna|cds_from)(?P<identifier_1>.*?)(_genomic)?.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 132 <discover_datasets pattern="(?P<identifier_0>.*?)/(?!rna|cds_from)(?P<identifier_1>.*?)(_genomic)?\.(?P<ext>fasta(\.gz)?)" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
102 <filter>uncompressed and file_choices['include'] and "genome" in file_choices['include']</filter> | 133 <filter>file_choices['include'] and "genome" in file_choices['include']</filter> |
103 </collection> | 134 </collection> |
104 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list"> | 135 <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list"> |
105 <discover_datasets pattern="(?P<identifier_0>.*?)\/rna\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 136 <discover_datasets pattern="(?P<identifier_0>.*?)\/rna\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
106 <filter>uncompressed and file_choices['include'] and "rna" in file_choices['include']</filter> | 137 <filter>file_choices['include'] and "rna" in file_choices['include']</filter> |
107 </collection> | 138 </collection> |
108 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list"> | 139 <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list"> |
109 <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.faa" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 140 <discover_datasets pattern="(?P<identifier_0>.*?)\/protein\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
110 <filter>uncompressed and file_choices['include'] and "protein" in file_choices['include']</filter> | 141 <filter>file_choices['include'] and "protein" in file_choices['include']</filter> |
111 </collection> | 142 </collection> |
112 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list"> | 143 <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list"> |
113 <discover_datasets pattern="(?P<identifier_0>.*?)\/cds_from_genomic\.fna" ext="fasta" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 144 <discover_datasets pattern="(?P<identifier_0>.*?)\/cds_from_genomic\.(?P<ext>fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
114 <filter>uncompressed and file_choices['include'] and "cds" in file_choices['include']</filter> | 145 <filter>file_choices['include'] and "cds" in file_choices['include']</filter> |
115 </collection> | 146 </collection> |
116 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list"> | 147 <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list"> |
117 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 148 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
118 <filter>uncompressed and file_choices['include'] and "gff3" in file_choices['include']</filter> | 149 <filter>file_choices['include'] and "gff3" in file_choices['include']</filter> |
119 </collection> | 150 </collection> |
120 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list"> | 151 <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list"> |
121 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 152 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
122 <filter>uncompressed and file_choices['include'] and "gtf" in file_choices['include']</filter> | 153 <filter>file_choices['include'] and "gtf" in file_choices['include']</filter> |
123 </collection> | 154 </collection> |
124 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list"> | 155 <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list"> |
125 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> | 156 <discover_datasets pattern="(?P<identifier_0>.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/> |
126 <filter>uncompressed and file_choices['include'] and "gbff" in file_choices['include']</filter> | 157 <filter>file_choices['include'] and "gbff" in file_choices['include']</filter> |
127 </collection> | 158 </collection> |
128 </outputs> | 159 </outputs> |
129 <tests> | 160 <tests> |
161 <!-- Note: All but one test use the non-default decompress="true" | |
162 | |
163 this is because (as of 11/22) Galaxy cannot apply text assertions on the content | |
164 of compressed files https://github.com/galaxyproject/galaxy/pull/15085 | |
165 | |
166 So with decompress="true" more powerful assertions are possible. | |
167 A single test checks the default, ie decompress="false". | |
168 --> | |
169 <test expect_num_outputs="1"> | |
170 <conditional name="query|subcommand"> | |
171 <param name="download_by" value="taxon"/> | |
172 <param name="taxon_positional" value="human"/> | |
173 </conditional> | |
174 <param name="chromosomes" value="21"/> | |
175 <param name="released_before" value="01/01/2018"/> | |
176 <section name="file_choices"> | |
177 <param name="include" value=""/> | |
178 </section> | |
179 <output name="genome_data_report"> | |
180 <assert_contents> | |
181 <has_text text="Assembly Accession	Assembly Name	Assembly Submitter	Organism Name"/> | |
182 <has_n_lines n="144"/> | |
183 <has_n_columns n="4"/> | |
184 </assert_contents> | |
185 </output> | |
186 </test> | |
130 <test expect_num_outputs="2"> | 187 <test expect_num_outputs="2"> |
131 <conditional name="query|subcommand"> | 188 <conditional name="query|subcommand"> |
132 <param name="download_by" value="taxon"/> | 189 <param name="download_by" value="taxon"/> |
133 <param name="text_or_file" value="text"/> | 190 <param name="taxon_positional" value="human"/> |
134 <param name="taxon" value="human"/> | |
135 </conditional> | 191 </conditional> |
136 <param name="chromosomes" value="21"/> | 192 <param name="chromosomes" value="21"/> |
137 <param name="include" value=""/> | |
138 <param name="uncompressed" value="false"/> | |
139 <param name="released_before" value="01/01/2018"/> | |
140 <output name="archive_contents"> | |
141 <assert_contents> | |
142 <has_text text="ncbi_dataset/data/dataset_catalog.json"/> | |
143 </assert_contents> | |
144 </output> | |
145 </test> | |
146 <test expect_num_outputs="2"> | |
147 <conditional name="query|subcommand"> | |
148 <param name="download_by" value="taxon"/> | |
149 <param name="text_or_file" value="text"/> | |
150 <param name="taxon" value="human"/> | |
151 </conditional> | |
152 <param name="chromosomes" value="21"/> | |
153 <param name="include" value="genome"/> | |
154 <param name="uncompressed" value="true"/> | |
155 <param name="assembly_level" value="chromosome,complete"/> | 193 <param name="assembly_level" value="chromosome,complete"/> |
156 <param name="released_before" value="01/01/2018"/> | 194 <param name="released_before" value="01/01/2018"/> |
195 <section name="file_choices"> | |
196 <param name="include" value="genome"/> | |
197 <param name="decompress" value="true"/> | |
198 </section> | |
157 <output_collection name="genome_fasta" type="list:list" count="14"> | 199 <output_collection name="genome_fasta" type="list:list" count="14"> |
158 <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/> | 200 <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/> |
159 <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/> | 201 <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/> |
160 <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/> | 202 <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/> |
161 <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/> | 203 <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/> |
172 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> | 214 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> |
173 </output_collection> | 215 </output_collection> |
174 <output name="genome_data_report"> | 216 <output name="genome_data_report"> |
175 <assert_contents> | 217 <assert_contents> |
176 <has_text text="Homo sapiens"/> | 218 <has_text text="Homo sapiens"/> |
177 </assert_contents> | 219 <has_n_columns n="4"/> |
178 </output> | 220 </assert_contents> |
179 </test> | 221 </output> |
180 <!-- same as precious test but assembly_source (refseq which removes some of the genomes) --> | 222 </test> |
223 <!-- same as previous test but assembly_source (refseq which removes some of the genomes) --> | |
181 <test expect_num_outputs="2"> | 224 <test expect_num_outputs="2"> |
182 <conditional name="query|subcommand"> | 225 <conditional name="query|subcommand"> |
183 <param name="download_by" value="taxon"/> | 226 <param name="download_by" value="taxon"/> |
184 <param name="text_or_file" value="text"/> | 227 <param name="taxon_positional" value="human"/> |
185 <param name="taxon" value="human"/> | |
186 </conditional> | 228 </conditional> |
187 <param name="chromosomes" value="21"/> | 229 <param name="chromosomes" value="21"/> |
188 <param name="include" value="genome"/> | |
189 <param name="uncompressed" value="true"/> | |
190 <param name="assembly_level" value="chromosome,complete"/> | 230 <param name="assembly_level" value="chromosome,complete"/> |
191 <param name="assembly_source" value="refseq"/> | 231 <param name="assembly_source" value="refseq"/> |
192 <param name="released_before" value="01/01/2018"/> | 232 <param name="released_before" value="01/01/2018"/> |
233 <section name="file_choices"> | |
234 <param name="include" value="genome"/> | |
235 <param name="decompress" value="true"/> | |
236 </section> | |
193 <output_collection name="genome_fasta" type="list:list" count="2"> | 237 <output_collection name="genome_fasta" type="list:list" count="2"> |
194 <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/> | 238 <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/> |
195 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> | 239 <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/> |
196 </output_collection> | 240 </output_collection> |
197 <output name="genome_data_report"> | 241 <output name="genome_data_report"> |
198 <assert_contents> | 242 <assert_contents> |
199 <has_text text="Homo sapiens"/> | 243 <has_text text="Homo sapiens"/> |
244 <has_n_lines n="5"/> | |
245 <has_n_columns n="4"/> | |
200 </assert_contents> | 246 </assert_contents> |
201 </output> | 247 </output> |
202 </test> | 248 </test> |
203 <test expect_num_outputs="4"> | 249 <test expect_num_outputs="4"> |
204 <conditional name="query|subcommand"> | 250 <conditional name="query|subcommand"> |
206 <conditional name="text_or_file"> | 252 <conditional name="text_or_file"> |
207 <param name="text_or_file" value="text"/> | 253 <param name="text_or_file" value="text"/> |
208 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/> | 254 <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/> |
209 </conditional> | 255 </conditional> |
210 </conditional> | 256 </conditional> |
211 <param name="include" value="seq-report,gtf,cds"/> | |
212 <param name="uncompressed" value="true"/> | |
213 <param name="released_before" value="01/01/2007"/> | 257 <param name="released_before" value="01/01/2007"/> |
258 <section name="file_choices"> | |
259 <param name="include" value="seq-report,gtf,cds"/> | |
260 <param name="decompress" value="true"/> | |
261 </section> | |
214 <output name="genome_data_report"> | 262 <output name="genome_data_report"> |
215 <assert_contents> | 263 <assert_contents> |
216 <has_text text="GCF_000013305.1"/> | 264 <has_text text="GCF_000013305.1"/> |
217 </assert_contents> | 265 <has_n_lines n="3"/> |
218 </output> | 266 <has_n_columns n="4"/> |
267 </assert_contents> | |
268 </output> | |
269 <output_collection name="sequence_report" type="list" count="2" > | |
270 <element name="GCF_000007445.1"> | |
271 <assert_contents> | |
272 <has_text text="GCF_000007445.1"/> | |
273 <has_n_lines n="2"/> | |
274 <has_n_columns n="14"/> | |
275 </assert_contents> | |
276 </element> | |
277 <element name="GCF_000013305.1"> | |
278 <assert_contents> | |
279 <has_text text="GCF_000013305.1"/> | |
280 <has_n_lines n="2"/> | |
281 <has_n_columns n="14"/> | |
282 </assert_contents> | |
283 </element> | |
284 </output_collection> | |
219 <output_collection name="genomic_gtf" type="list"> | 285 <output_collection name="genomic_gtf" type="list"> |
220 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> | 286 <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/> |
221 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/> | 287 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/> |
222 </output_collection> | 288 </output_collection> |
223 <output_collection name="genomic_cds" type="list"> | 289 <output_collection name="genomic_cds" type="list"> |
224 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/> | 290 <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains" decompress="true"/> |
225 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/> | 291 <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains" decompress="true"/> |
226 </output_collection> | 292 </output_collection> |
227 </test> | 293 </test> |
228 <test expect_num_outputs="4"> | 294 <test expect_num_outputs="4"> |
229 <conditional name="query|subcommand"> | 295 <conditional name="query|subcommand"> |
230 <param name="download_by" value="accession"/> | 296 <param name="download_by" value="accession"/> |
231 <conditional name="text_or_file"> | 297 <conditional name="text_or_file"> |
232 <param name="text_or_file" value="file"/> | 298 <param name="text_or_file" value="file"/> |
233 <param name="inputfile" value="accessions.txt"/> | 299 <param name="inputfile" value="accessions.txt"/> |
234 </conditional> | 300 </conditional> |
235 </conditional> | 301 </conditional> |
236 <param name="include" value="seq-report,gbff,gff3"/> | |
237 <param name="uncompressed" value="true"/> | |
238 <param name="released_before" value="01/01/2007"/> | 302 <param name="released_before" value="01/01/2007"/> |
239 <output name="genome_data_report"> | 303 <section name="file_choices"> |
240 <assert_contents> | 304 <param name="include" value="seq-report,gff3,gbff"/> |
241 <has_text text="SAMN02604181"/> | 305 <param name="decompress" value="true"/> |
306 </section> | |
307 <output name="genome_data_report"> | |
308 <assert_contents> | |
309 <has_text text="GCF_000013305.1"/> | |
310 <has_text text="GCF_000007445.1"/> | |
311 <has_n_lines n="3"/> | |
312 <has_n_columns n="4"/> | |
242 </assert_contents> | 313 </assert_contents> |
243 </output> | 314 </output> |
244 <output_collection name="genomic_gff" type="list"> | 315 <output_collection name="genomic_gff" type="list"> |
245 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/> | 316 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/> |
246 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/> | 317 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/> |
248 <output_collection name="genomic_gbff" type="list"> | 319 <output_collection name="genomic_gbff" type="list"> |
249 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/> | 320 <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/> |
250 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/> | 321 <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/> |
251 </output_collection> | 322 </output_collection> |
252 </test> | 323 </test> |
253 <test expect_num_outputs="2"> | 324 |
325 <!-- should not fail https://github.com/ncbi/datasets/issues/194 --> | |
326 <test expect_num_outputs="2" expect_failure="true"> | |
254 <conditional name="query|subcommand"> | 327 <conditional name="query|subcommand"> |
255 <param name="download_by" value="accession"/> | 328 <param name="download_by" value="accession"/> |
256 <conditional name="text_or_file"> | 329 <conditional name="text_or_file"> |
257 <param name="text_or_file" value="text"/> | 330 <param name="text_or_file" value="text"/> |
258 <param name="accession" value="GCF_000001405"/> | 331 <param name="accession" value="GCF_000001405"/> |
259 </conditional> | 332 </conditional> |
260 </conditional> | 333 </conditional> |
261 <param name="include" value="seq-report"/> | |
262 <param name="uncompressed" value="true"/> | |
263 <param name="released_before" value="01/01/2015"/> | 334 <param name="released_before" value="01/01/2015"/> |
264 <param name="assembly_version" value="all"/> | 335 <param name="assembly_version" value="all"/> |
265 <output_collection name="sequence_report" count="4"> | 336 <section name="file_choices"> |
266 <element name="GCF_000001405.25"> | 337 <param name="include" value="seq-report"/> |
267 <assert_contents> | 338 </section> |
268 <has_text text="assignedMoleculeLocationType"/> | 339 <!-- |
269 </assert_contents> | 340 <output_collection name="sequence_report" type="list" count="4" > |
270 </element> | 341 --> |
271 <element name="GCF_000001405.26"> | |
272 <assert_contents> | |
273 <has_text text="assignedMoleculeLocationType"/> | |
274 </assert_contents> | |
275 </element> | |
276 <element name="GCF_000001405.27"> | |
277 <assert_contents> | |
278 <has_text text="assignedMoleculeLocationType"/> | |
279 </assert_contents> | |
280 </element> | |
281 <element name="GCF_000001405.28"> | |
282 <assert_contents> | |
283 <has_text text="assignedMoleculeLocationType"/> | |
284 </assert_contents> | |
285 </element> | |
286 </output_collection> | |
287 </test> | 342 </test> |
288 <test expect_num_outputs="5"> | 343 <test expect_num_outputs="5"> |
289 <conditional name="query|subcommand"> | 344 <conditional name="query|subcommand"> |
290 <param name="download_by" value="accession"/> | 345 <param name="download_by" value="accession"/> |
291 <conditional name="text_or_file"> | 346 <conditional name="text_or_file"> |
292 <param name="text_or_file" value="text"/> | 347 <param name="text_or_file" value="text"/> |
293 <param name="accession" value="GCF_000146045.2"/> | 348 <param name="accession" value="GCF_000146045.2"/> |
294 </conditional> | 349 </conditional> |
295 </conditional> | 350 </conditional> |
296 <param name="include" value="seq-report,genome,rna,cds"/> | 351 <section name="file_choices"> |
297 <param name="uncompressed" value="true"/> | 352 <param name="include" value="genome,protein,rna,cds"/> |
353 <param name="decompress" value="true"/> | |
354 </section> | |
298 <output_collection name="genome_fasta" type="list:list" count="1"> | 355 <output_collection name="genome_fasta" type="list:list" count="1"> |
299 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> | 356 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> |
300 </output_collection> | 357 </output_collection> |
358 <output_collection name="protein_fasta" type="list" count="1"> | |
359 <element name="GCF_000146045.2" decompress="true"> | |
360 <assert_contents> | |
361 <has_text text=">"/> | |
362 </assert_contents> | |
363 </element> | |
364 </output_collection> | |
365 <output_collection name="rna_fasta" type="list" count="1"> | |
366 <element name="GCF_000146045.2" decompress="true"> | |
367 <assert_contents> | |
368 <has_text text=">"/> | |
369 </assert_contents> | |
370 </element> | |
371 </output_collection> | |
372 </test> | |
373 <!-- same as the previous test, but use the default value for decompress, | |
374 see comment at the beginning of the tests --> | |
375 <test expect_num_outputs="5"> | |
376 <conditional name="query|subcommand"> | |
377 <param name="download_by" value="accession"/> | |
378 <conditional name="text_or_file"> | |
379 <param name="text_or_file" value="text"/> | |
380 <param name="accession" value="GCF_000146045.2"/> | |
381 </conditional> | |
382 </conditional> | |
383 <section name="file_choices"> | |
384 <param name="include" value="genome,protein,rna,cds"/> | |
385 </section> | |
386 <output_collection name="genome_fasta" type="list:list" count="1"> | |
387 <element name="GCF_000146045.2"> | |
388 <element name="GCF_000146045.2_R64" ftype="fasta.gz"> | |
389 <assert_contents> | |
390 <has_size value="3843460"/> | |
391 </assert_contents> | |
392 </element> | |
393 </element> | |
394 </output_collection> | |
395 <output_collection name="protein_fasta" type="list" count="1"> | |
396 <element name="GCF_000146045.2" ftype="fasta.gz"> | |
397 <assert_contents> | |
398 <has_size value="1844838"/> | |
399 </assert_contents> | |
400 </element> | |
401 </output_collection> | |
402 <output_collection name="rna_fasta" type="list" count="1"> | |
403 <element name="GCF_000146045.2" ftype="fasta.gz"> | |
404 <assert_contents> | |
405 <has_size value="2784534"/> | |
406 </assert_contents> | |
407 </element> | |
408 </output_collection> | |
301 </test> | 409 </test> |
302 <test expect_num_outputs="3"> | 410 <test expect_num_outputs="3"> |
303 <conditional name="query|subcommand"> | 411 <conditional name="query|subcommand"> |
304 <param name="download_by" value="accession"/> | 412 <param name="download_by" value="accession"/> |
305 <conditional name="text_or_file"> | 413 <conditional name="text_or_file"> |
306 <param name="text_or_file" value="text"/> | 414 <param name="text_or_file" value="text"/> |
307 <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/> | 415 <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/> |
308 </conditional> | 416 </conditional> |
309 </conditional> | 417 </conditional> |
310 <param name="include" value="seq-report,genome"/> | 418 <section name="file_choices"> |
311 <param name="uncompressed" value="true"/> | 419 <param name="include" value="seq-report,genome"/> |
420 <param name="decompress" value="true"/> | |
421 </section> | |
422 <output_collection name="sequence_report" type="list" count="2"/> | |
312 <output_collection name="genome_fasta" type="list:list" count="2"> | 423 <output_collection name="genome_fasta" type="list:list" count="2"> |
313 <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/> | 424 <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/> |
314 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> | 425 <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/> |
315 </output_collection> | 426 </output_collection> |
316 </test> | 427 </test> |
318 https://github.com/ncbi/datasets/issues/187 | 429 https://github.com/ncbi/datasets/issues/187 |
319 hence we set expect_test_failure="true"--> | 430 hence we set expect_test_failure="true"--> |
320 <test expect_num_outputs="1" expect_test_failure="true"> | 431 <test expect_num_outputs="1" expect_test_failure="true"> |
321 <conditional name="query|subcommand"> | 432 <conditional name="query|subcommand"> |
322 <param name="download_by" value="taxon"/> | 433 <param name="download_by" value="taxon"/> |
323 <param name="text_or_file" value="text"/> | 434 <param name="taxon_positional" value="4932"/> |
324 <param name="taxon" value="4932"/> | |
325 <param name="tax_exact_match" value="true"/> | 435 <param name="tax_exact_match" value="true"/> |
326 </conditional> | 436 </conditional> |
327 <param name="include" value=""/> | 437 <param name="include" value=""/> |
328 <param name="uncompressed" value="true"/> | |
329 <output name="genome_data_report"> | 438 <output name="genome_data_report"> |
330 <assert_contents> | 439 <assert_contents> |
331 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> | 440 <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/> |
332 </assert_contents> | 441 </assert_contents> |
333 </output> | 442 </output> |
336 <help> | 445 <help> |
337 <![CDATA[ | 446 <![CDATA[ |
338 **Download Genome Datasets from NCBI** | 447 **Download Genome Datasets from NCBI** |
339 | 448 |
340 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. | 449 Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. |
341 Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file. | 450 Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon. |
342 | 451 |
343 Tthe default genome dataset includes the following files (if available): | 452 The download is a three step process: |
344 * data_report.jsonl (genome assembly and annotation metadata, not always available) | 453 |
345 * genomic.fna (genomic sequences) | 454 1. A "dehydrated" zip file is downloaded (which includes the metadata and the download URL) |
346 * rna.fna (transcript sequences) | 455 2. The metadata is transformed into a tabular (TSV) file |
347 * protein.faa (protein sequences) | 456 3. The data is hydrated (the actual data is downloaded) |
348 * genomic.gff (genome annotation in gff3 format) | 457 |
349 * dataset_catalog.json (a list of files and file types included in the dataset) | 458 The 3rd step can be skipped by unselecting all output types in the `Include` parameter. |
459 This makes it possible to inspect the metadata prior to the actual data download. It also | |
460 allows the tool to be used for querying data sets (and their accessions) of interest, which | |
461 can then be downloaded in a second call using the accessions. | |
350 ]]> | 462 ]]> |
351 </help> | 463 </help> |
352 | 464 |
353 </tool> | 465 </tool> |