Mercurial > repos > artbio > fetch_fasta_from_ncbi
changeset 3:8be88084f89c draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit ab45487db8cc69750f92a40d763d51ffac940e25
author | artbio |
---|---|
date | Wed, 08 Nov 2017 13:00:26 -0500 |
parents | 50f5ef3313bb |
children | c667d0ee39f5 |
files | fetch_fasta_from_NCBI.py fetch_fasta_from_NCBI.xml test-data/dry_run.log |
diffstat | 3 files changed, 59 insertions(+), 101 deletions(-) [+] |
line wrap: on
line diff
--- a/fetch_fasta_from_NCBI.py Sun Oct 15 20:13:13 2017 -0400 +++ b/fetch_fasta_from_NCBI.py Wed Nov 08 13:00:26 2017 -0500 @@ -21,9 +21,9 @@ queries are 1 sec delayed, to satisfy NCBI guidelines (more than what they request) """ +import argparse import httplib import logging -import optparse import re import sys import time @@ -55,26 +55,15 @@ self.count = 0 self.webenv = "" self.query_key = "" - self.datetype = options.datetype - if options.reldate: - self.reldate = options.reldate - else: - self.reldate = '' - if options.mindate: - self.mindate = options.mindate - else: - self.mindate = '' - if options.maxdate: - self.maxdate = options.maxdate - else: - self.maxdate = '' + + def dry_run(self): + self.get_count_value() def retrieve(self): """ Retrieve the fasta sequences corresponding to the query """ self.get_count_value() - # If no UIDs are found exit script if self.count > 0: self.get_uids_list() @@ -101,8 +90,7 @@ self.logger.info("for Query: %s and database: %s" % (self.query_string, self.dbname)) querylog = self.esearch(self.dbname, self.query_string, '', '', - "count", self.datetype, self.reldate, - self.mindate, self.maxdate) + "count") self.logger.debug("Query response:") for line in querylog: self.logger.debug(line.rstrip()) @@ -127,8 +115,7 @@ num_batches) for n in range(num_batches): querylog = self.esearch(self.dbname, self.query_string, n*retmax, - retmax, '', self.datetype, self.reldate, - self.mindate, self.maxdate) + retmax, '') for line in querylog: if '<Id>' in line and '</Id>' in line: uid = (line[line.find('<Id>')+len('<Id>'): @@ -136,19 +123,14 @@ self.ids.append(uid) self.logger.info("Retrieved %d UIDs" % len(self.ids)) - def esearch(self, db, term, retstart, retmax, rettype, datetype, reldate, - mindate, maxdate): + def esearch(self, db, term, retstart, retmax, rettype): url = self.base + "esearch.fcgi" self.logger.debug("url: %s" % url) values = {'db': db, 'term': term, 'rettype': rettype, 'retstart': retstart, - 'retmax': retmax, - 'datetype': datetype, - 'reldate': reldate, - 'mindate': mindate, - 'maxdate': maxdate} + 'retmax': retmax} data = urllib.urlencode(values) self.logger.debug("data: %s" % str(data)) req = urllib2.Request(url, data) @@ -225,9 +207,14 @@ counter += 1 self.logger.info("Server Transaction Trial: %s" % (counter)) try: + self.logger.debug("Going to open") response = urllib2.urlopen(req) + self.logger.debug("Going to get code") response_code = response.getcode() + self.logger.debug("Going to read, de code was : %s", + str(response_code)) fasta = response.read() + self.logger.debug("Did all that") response.close() if((response_code != 200) or ("Resource temporarily unavailable" in fasta) or @@ -348,63 +335,46 @@ LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] +def command_parse(): + parser = argparse.ArgumentParser(description='Retrieve data from NCBI') + parser.add_argument('-i', dest='query_string', help='NCBI Query String', + required=True) + parser.add_argument('-o', dest='outname', help='output file name') + parser.add_argument('-d', dest='dbname', help='database type') + parser.add_argument('--count', '-c', dest='count_ids', + action='store_true', default=False, + help='dry run ouputing only the number of sequences\ + found') + parser.add_argument('-l', '--logfile', help='log file (default=stderr)') + parser.add_argument('--loglevel', choices=LOG_LEVELS, default='INFO', + help='logging level (default: INFO)') + args = parser.parse_args() + return args + + def __main__(): """ main function """ - parser = optparse.OptionParser(description='Retrieve data from NCBI') - parser.add_option('-i', dest='query_string', help='NCBI Query String') - parser.add_option('-o', dest='outname', help='output file name') - parser.add_option('-d', dest='dbname', help='database type') - parser.add_option('-l', '--logfile', help='log file (default=stderr)') - parser.add_option('--datetype', dest='datetype', - choices=['mdat', 'pdat'], - help='Type of date used to limit a search.\ - [ mdat(modification date), pdat(publication date)]\ - (default=pdat)', default='pdat') - parser.add_option('--reldate', dest='reldate', - help='When reldate is set to an integer n, the search\ - returns only those items that have a date\ - specified by datetype within the last n days.') - parser.add_option('--maxdate', dest='maxdate', - help='Date range used to limit a search result by the\ - date specified by datetype. These two parameters\ - (mindate, maxdate) must be used together to\ - specify an arbitrary date range. The general date\ - format is YYYY/MM/DD, and these variants are also\ - allowed: YYYY, YYYY/MM.') - parser.add_option('--mindate', dest='mindate', - help='Date range used to limit a search result by the\ - date specified by datetype. These two parameters\ - (mindate, maxdate) must be used together to\ - specify an arbitrary date range. The general date\ - format is YYYY/MM/DD, and these variants are also\ - allowed: YYYY, YYYY/MM.') - parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO', - help='logging level (default: INFO)') - (options, args) = parser.parse_args() - if len(args) > 0: - parser.error('Wrong number of arguments') - if((options.reldate and options.maxdate) or - (options.reldate and options.mindate)): - parser.error("You can't mix 'reldate' and 'maxdate', 'mindate'\ - parameters") - if((options.mindate and not options.maxdate) or - (options.maxdate and not options.mindate)): - parser.error("mindate and maxdate must be used together") - - log_level = getattr(logging, options.loglevel) + args = command_parse() + log_level = getattr(logging, args.loglevel) kwargs = {'format': LOG_FORMAT, 'datefmt': LOG_DATEFMT, 'level': log_level} - if options.logfile: - kwargs['filename'] = options.logfile + if args.logfile: + kwargs['filename'] = args.logfile logging.basicConfig(**kwargs) logger = logging.getLogger('data_from_NCBI') - E = Eutils(options, logger) - try: - E.retrieve() - except Exception: - sys.exit(1) + E = Eutils(args, logger) + if args.count_ids: + try: + E.dry_run() + except Exception: + sys.exit(1) + else: + try: + E.retrieve() + except Exception: + sys.exit(1) if __name__ == "__main__":
--- a/fetch_fasta_from_NCBI.xml Sun Oct 15 20:13:13 2017 -0400 +++ b/fetch_fasta_from_NCBI.xml Wed Nov 08 13:00:26 2017 -0500 @@ -1,16 +1,12 @@ -<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.1.1"> +<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.2.1"> <description></description> <command><![CDATA[ python '$__tool_directory__'/fetch_fasta_from_NCBI.py -i "$queryString" -d $dbname - -o '$outfilename' -l '$logfile' - #if $date_condition.date_filter == "YES": - --datetype $date_condition.datetype - --mindate $date_condition.mindate - --maxdate $date_condition.maxdate - #end if + $dry_run + -o '$outfile' ]]></command> <inputs> @@ -30,21 +26,12 @@ <option value="nuccore">Nucleotide</option> <option value="protein">Protein</option> </param> - <conditional name="date_condition"> - <param name="date_filter" type="boolean" label="Filter the sequences by date?" truevalue="YES" falsevalue="NO" checked="false"/> - <when value="YES"> - <param name="datetype" type="select"> - <option value="pdat">Publication date</option> - <option value="mdat">Modification date</option> - </param> - <param name="mindate" type="text" help="Date must follow one of the following formats: YYYY/MM/DD, YYYY/MM or YYYY"/> - <param name="maxdate" type="text" help="Date must follow one of the following formats: YYYY/MM/DD, YYYY/MM or YYYY"/> - </when> - <when value="NO"/> - </conditional> + <param name="dry_run" type="boolean" label="Dry run to get the number of sequences?" truevalue="--count" falsevalue="" checked="false"/> </inputs> <outputs> - <data name="outfilename" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" /> + <data name="outfile" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" > + <filter> dry_run == False</filter> + </data> <data format="txt" name="logfile" label="${tool.name}: log"/> </outputs> <tests> @@ -52,16 +39,13 @@ <param name="queryString" value="9629650[gi]" /> <param name="dbname" value="nuccore" /> <output name="outfilename" ftype="fasta" file="output.fa" /> - <!-- <output name="logfile" ftype="txt" file="log.txt" /> log.txt changes with timestamp. removed to pass the test --> </test> <test> <param name="queryString" value="CU929326[Accession]" /> <param name="dbname" value="nuccore" /> - <param name="date_filter" value="YES"/> - <param name="datetype" value="pdat"/> - <param name="mindate" value="2008/09"/> - <param name="maxdate" value="2008/09"/> - <output name="outfilename" ftype="fasta" file="zebra_output.fa" /> + <param name="date_filter" value="1"/> + <param name="dry_run" value="True"/> + <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/> </test> </tests> <help>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/dry_run.log Wed Nov 08 13:00:26 2017 -0500 @@ -0,0 +1,4 @@ +2017-11-08 16:27:09|INFO |retrieving data from https://eutils.ncbi.nlm.nih.gov/entrez/eutils/ +2017-11-08 16:27:09|INFO |for Query: Drosophila melanogaster[Organism] AND Gcn5[Title] and database: nuccore +2017-11-08 16:27:10|INFO |Found 4 UIDs +