Mercurial > repos > iuc > ncbi_acc_download
view ncbi_acc_download.xml @ 2:e063168e0a81 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_acc_download commit 7077bbc8c1e416f717b545c5ac18b0c8b5a066b2
author | iuc |
---|---|
date | Fri, 18 Nov 2022 18:21:57 +0000 |
parents | 1c58de56d587 |
children |
line wrap: on
line source
<!--
    Galaxy wrapper around the ncbi-acc-download command line program.
    Reads accession numbers from a dataset or a text box, downloads each one
    in turn into outdir/, and exposes the downloaded files as a list
    collection together with a log and (optionally) a list of failures.
-->
<tool id="ncbi_acc_download" name="NCBI Accession Download" version="@TOOL_VERSION@+galaxy0">
    <description>Download sequences from GenBank/RefSeq by accession through the NCBI ENTREZ API</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">ncbi-acc-download</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
## Step 1: write the non-blank accession lines to ./accessions.
## [[:space:]] is used because "\t" inside a bracket expression is not a tab.
#if $query_source.select == "accession_file":
    { grep -v '^[[:space:]]*$' '$query_source.accession_file' > accessions || { echo "No accession numbers in input. Aborting." 1>&2; exit 1; } } &&
#else if $query_source.select == "accession_list":
    echo '$query_source.accession_list' | sed -r 's/(\,|__cn__)/\n/g' | grep -v '^[[:space:]]*$' > accessions &&
#end if

## Step 2: download every accession into outdir/, one request at a time.
mkdir outdir &&
cd outdir &&
ignore_errors=$ignore_failed &&
while read accession; do
    echo "Downloading accession number: " \$accession " ..." >> ../error.log &&
    ncbi-acc-download
        --molecule '${molecule.select}'
        --format '${molecule.format}'
        #if $molecule.format != 'featuretable' and $molecule.format != 'gff3':
            --extended-validation all
        #end if
        #if $range != ""
            --range '$range'
        #end if
        \${accession};
    failure=\$?;
    if [ \$failure -ne 0 ]; then
        echo " failed." >> ../error.log;
        ## Either record the failed accession and carry on, or abort the job.
        if [ \$ignore_errors -ne 0 ]; then
            echo \$accession >> ../failed.txt;
        else
            exit 1;
        fi;
    else
        echo " done." >> ../error.log;
    fi;
    ## Pause between requests.
    sleep 2;
done < ../accessions 2> >(tee -a ../error.log >&2);
    ]]></command>
    <inputs>
        <!-- Where the accession numbers come from: a dataset or a text box. -->
        <conditional name="query_source">
            <param name="select" type="select" label="Select source for IDs">
                <option value="accession_file">File containing Accessions (one per line)</option>
                <option value="accession_list">Direct Entry</option>
            </param>
            <when value="accession_file">
                <param label="Accession File" name="accession_file" type="data" format="txt,tabular"/>
            </when>
            <when value="accession_list">
                <param label="ID List" name="accession_list" type="text" area="true" help="Newline/Comma separated list of IDs">
                    <validator type="expression" message="ID list cannot be empty">value.strip()</validator>
                </param>
            </when>
        </conditional>
        <!-- Molecule type decides which output formats are offered. -->
        <conditional name="molecule">
            <param name="select" type="select" label="Molecule Type">
                <option value="nucleotide" selected="true">Nucleotide</option>
                <option value="protein">Protein</option>
            </param>
            <when value="nucleotide">
                <param name="format" type="select" label="File Format">
                    <option value="fasta" selected="true">FASTA</option>
                    <option value="genbank">GenBank</option>
                    <option value="featuretable">Feature Table</option>
                    <option value="gff3">GFF3</option>
                </param>
            </when>
            <when value="protein">
                <param name="format" type="select" label="File Format">
                    <option value="fasta" selected="true">FASTA</option>
                </param>
            </when>
        </conditional>
        <!-- Only digits, ':' and '.' survive sanitizing; anything else is dropped. -->
        <param argument="--range" type="text" label="Range" help="Region to subset accession. Start and end position separated by ':', '..', or '.'. Only for single accession">
            <sanitizer invalid_char="">
                <valid initial="string.digits">
                    <add value=":" />
                    <add value=".." />
                    <add value="." />
                </valid>
            </sanitizer>
        </param>
        <param name="ignore_failed" type="select" display="radio" label="How to handle download failures">
            <option value="0">Abort with error on first failure</option>
            <option value="1">Add accession to failed list and continue</option>
        </param>
    </inputs>
    <outputs>
        <!-- The file extension found in outdir/ selects the Galaxy datatype. -->
        <collection name="output" type="list" label="${tool.name} on ${on_string}: Downloaded Files">
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.fa$" directory="outdir" format="fasta"/>
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.gbk$" directory="outdir" format="genbank"/>
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.gff$" directory="outdir" format="gff"/>
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.ft$" directory="outdir" format="txt"/>
        </collection>
        <data name="error_log" from_work_dir="error.log" label="${tool.name} on ${on_string}: Log" format="txt"/>
        <!-- Only offered when the user chose to continue after failures. -->
        <data name="failed_accessions" from_work_dir="failed.txt" label="${tool.name} on ${on_string}: Failed accessions" format="txt">
            <filter>str(ignore_failed)=='1'</filter>
        </data>
    </outputs>
    <tests>
        <!-- Accession file, FASTA -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="fasta"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_file" />
                <param name="accession_file" value="accessions_1.tsv"/>
            </conditional>
            <output_collection name="output" type="list">
                <element name="CP011064" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
                    </assert_contents>
                </element>
                <element name="CP021680" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Accession file, GenBank. The DEFINITION keyword is padded to 12 columns in GenBank flat files, hence the two spaces. -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="genbank"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_file" />
                <param name="accession_file" value="accessions_1.tsv"/>
            </conditional>
            <output_collection name="output" type="list">
                <element name="CP011064" ftype="genbank">
                    <assert_contents>
                        <has_line line="DEFINITION  Escherichia coli str. Sanji plasmid pSJ_94, complete sequence." />
                    </assert_contents>
                </element>
                <element name="CP021680" ftype="genbank">
                    <assert_contents>
                        <has_line line="DEFINITION  Escherichia coli strain AR_0162 plasmid tig00002623, complete" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Accession file, GFF3 -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="gff3"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_file" />
                <param name="accession_file" value="accessions_1.tsv"/>
            </conditional>
            <output_collection name="output" type="list">
                <element name="CP011064" ftype="gff">
                    <assert_contents>
                        <has_line line="##sequence-region CP011064.1 1 94712" />
                    </assert_contents>
                </element>
                <element name="CP021680" ftype="gff">
                    <assert_contents>
                        <has_line line="##sequence-region CP021680.1 1 23332" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Accession file, feature table -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="featuretable"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_file" />
                <param name="accession_file" value="accessions_1.tsv"/>
            </conditional>
            <output_collection name="output" type="list">
                <element name="CP011064" ftype="txt">
                    <assert_contents>
                        <has_line line=">Feature gb|CP011064.1|" />
                    </assert_contents>
                </element>
                <element name="CP021680" ftype="txt">
                    <assert_contents>
                        <has_line line=">Feature gb|CP021680.1|" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Direct entry, comma separated -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="fasta"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_list" />
                <param name="accession_list" value="CP011064,CP021680"/>
            </conditional>
            <output_collection name="output" type="list">
                <element name="CP011064" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
                    </assert_contents>
                </element>
                <element name="CP021680" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- One bad accession among good ones, failures tolerated -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="fasta"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_list" />
                <param name="accession_list" value="CP011064,CP0XXXXX,CP021680"/>
            </conditional>
            <param name="ignore_failed" value="1" />
            <output_collection name="output" type="list">
                <element name="CP011064" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
                    </assert_contents>
                </element>
                <element name="CP021680" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
                    </assert_contents>
                </element>
            </output_collection>
            <output name="failed_accessions">
                <assert_contents>
                    <has_line line="CP0XXXXX" />
                </assert_contents>
            </output>
        </test>
        <!-- Only a bad accession, failures tolerated -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="fasta"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_list" />
                <param name="accession_list" value="CP0XXXXX"/>
            </conditional>
            <param name="ignore_failed" value="1" />
            <output name="failed_accessions">
                <assert_contents>
                    <has_line line="CP0XXXXX" />
                </assert_contents>
            </output>
        </test>
        <!-- One bad accession, abort on first failure -->
        <test expect_failure="true">
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="fasta"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_list" />
                <param name="accession_list" value="CP011064,CP0XXXXX,CP021680"/>
            </conditional>
            <param name="ignore_failed" value="0" />
        </test>
        <!-- Direct entry, two accessions without a comma -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="fasta"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_list" />
                <param name="accession_list" value="CP011064 CP021680"/>
            </conditional>
            <output_collection name="output" type="list">
                <element name="CP011064" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
                    </assert_contents>
                </element>
                <element name="CP021680" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Protein download -->
        <test>
            <conditional name="molecule">
                <param name="select" value="protein"/>
                <param name="format" value="fasta"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_list" />
                <param name="accession_list" value="NP_003192"/>
            </conditional>
            <output_collection name="output" type="list">
                <element name="NP_003192" ftype="fasta">
                    <assert_contents>
                        <has_line line=">NP_003192.1 transcription factor A, mitochondrial isoform 1 precursor [Homo sapiens]" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Sub-range of a single accession -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="genbank"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_list" />
                <param name="accession_list" value="NC_006666"/>
            </conditional>
            <param name="range" value="1697:2764"/>
            <output_collection name="output" type="list">
                <element name="NC_006666" ftype="genbank">
                    <assert_contents>
                        <has_text text="ND2" />
                        <not_has_text text="tRNA-Ile" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

Given a file containing a list of NCBI accession numbers or a direct entry of accession numbers in the tool text input box, this tool will download the corresponding sequence records via the NCBI API.

**Limitations**

- For protein sequence downloads, only fasta format is supported
- To avoid rate-limits imposed by the NCBI API, records are downloaded sequentially with a delay between requests. This may make it impractical to use this tool to download many (>100) records.

**Output**

A collection of sequence records in the desired format.
    ]]></help>
    <citations>
    </citations>
</tool>