# HG changeset patch
# User artbio
# Date 1510164026 18000
# Node ID 8be88084f89cc6b30ebd8f7172822800b44e6ff0
# Parent  50f5ef3313bb811358f443de4b02c4139415e952
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit ab45487db8cc69750f92a40d763d51ffac940e25

diff -r 50f5ef3313bb -r 8be88084f89c fetch_fasta_from_NCBI.py
--- a/fetch_fasta_from_NCBI.py	Sun Oct 15 20:13:13 2017 -0400
+++ b/fetch_fasta_from_NCBI.py	Wed Nov 08 13:00:26 2017 -0500
@@ -21,9 +21,9 @@
 queries are 1 sec delayed, to satisfy NCBI guidelines
 (more than what they request)
 """
+import argparse
 import httplib
 import logging
-import optparse
 import re
 import sys
 import time
@@ -55,26 +55,15 @@
         self.count = 0
         self.webenv = ""
         self.query_key = ""
-        self.datetype = options.datetype
-        if options.reldate:
-            self.reldate = options.reldate
-        else:
-            self.reldate = ''
-        if options.mindate:
-            self.mindate = options.mindate
-        else:
-            self.mindate = ''
-        if options.maxdate:
-            self.maxdate = options.maxdate
-        else:
-            self.maxdate = ''
+
+    def dry_run(self):
+        self.get_count_value()
 
     def retrieve(self):
         """
         Retrieve the fasta sequences corresponding to the query
         """
         self.get_count_value()
-
         # If no UIDs are found exit script
         if self.count > 0:
             self.get_uids_list()
@@ -101,8 +90,7 @@
         self.logger.info("for Query: %s and database: %s" %
                          (self.query_string, self.dbname))
         querylog = self.esearch(self.dbname, self.query_string, '', '',
-                                "count", self.datetype, self.reldate,
-                                self.mindate, self.maxdate)
+                                "count")
         self.logger.debug("Query response:")
         for line in querylog:
             self.logger.debug(line.rstrip())
@@ -127,8 +115,7 @@
                          num_batches)
         for n in range(num_batches):
             querylog = self.esearch(self.dbname, self.query_string, n*retmax,
-                                    retmax, '', self.datetype, self.reldate,
-                                    self.mindate, self.maxdate)
+                                    retmax, '')
             for line in querylog:
                 if '<Id>' in line and '</Id>' in line:
                     uid = (line[line.find('<Id>')+len('<Id>'):
@@ -136,19 +123,14 @@
                     self.ids.append(uid)
         self.logger.info("Retrieved %d UIDs" % len(self.ids))
 
-    def esearch(self, db, term, retstart, retmax, rettype, datetype, reldate,
-                mindate, maxdate):
+    def esearch(self, db, term, retstart, retmax, rettype):
         url = self.base + "esearch.fcgi"
         self.logger.debug("url: %s" % url)
         values = {'db': db,
                   'term': term,
                   'rettype': rettype,
                   'retstart': retstart,
-                  'retmax': retmax,
-                  'datetype': datetype,
-                  'reldate': reldate,
-                  'mindate': mindate,
-                  'maxdate': maxdate}
+                  'retmax': retmax}
         data = urllib.urlencode(values)
         self.logger.debug("data: %s" % str(data))
         req = urllib2.Request(url, data)
@@ -225,9 +207,14 @@
             counter += 1
             self.logger.info("Server Transaction Trial: %s" % (counter))
             try:
+                self.logger.debug("Going to open")
                 response = urllib2.urlopen(req)
+                self.logger.debug("Going to get code")
                 response_code = response.getcode()
+                self.logger.debug("Going to read, the code was: %s",
+                                  str(response_code))
                 fasta = response.read()
+                self.logger.debug("Did all that")
                 response.close()
                 if((response_code != 200) or
                    ("Resource temporarily unavailable" in fasta) or
@@ -348,63 +335,46 @@
 LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
 
 
+def command_parse():
+    parser = argparse.ArgumentParser(description='Retrieve data from NCBI')
+    parser.add_argument('-i', dest='query_string', help='NCBI Query String',
+                        required=True)
+    parser.add_argument('-o', dest='outname', help='output file name')
+    parser.add_argument('-d', dest='dbname', help='database type')
+    parser.add_argument('--count', '-c', dest='count_ids',
+                        action='store_true', default=False,
+                        help='dry run outputting only the number of sequences\
+                              found')
+    parser.add_argument('-l', '--logfile', help='log file (default=stderr)')
+    parser.add_argument('--loglevel', choices=LOG_LEVELS, default='INFO',
+                        help='logging level (default: INFO)')
+    args = parser.parse_args()
+    return args
+
+
 def __main__():
     """ main function """
-    parser = optparse.OptionParser(description='Retrieve data from NCBI')
-    parser.add_option('-i', dest='query_string', help='NCBI Query String')
-    parser.add_option('-o', dest='outname', help='output file name')
-    parser.add_option('-d', dest='dbname', help='database type')
-    parser.add_option('-l', '--logfile', help='log file (default=stderr)')
-    parser.add_option('--datetype', dest='datetype',
-                      choices=['mdat', 'pdat'],
-                      help='Type of date used to limit a search.\
-                            [ mdat(modification date), pdat(publication date)]\
-                            (default=pdat)', default='pdat')
-    parser.add_option('--reldate', dest='reldate',
-                      help='When reldate is set to an integer n, the search\
-                            returns only those items that have a date\
-                            specified by datetype within the last n days.')
-    parser.add_option('--maxdate', dest='maxdate',
-                      help='Date range used to limit a search result by the\
-                            date specified by datetype. These two parameters\
-                            (mindate, maxdate) must be used together to\
-                            specify an arbitrary date range. The general date\
-                            format is YYYY/MM/DD, and these variants are also\
-                            allowed: YYYY, YYYY/MM.')
-    parser.add_option('--mindate', dest='mindate',
-                      help='Date range used to limit a search result by the\
-                            date specified by datetype. These two parameters\
-                            (mindate, maxdate) must be used together to\
-                            specify an arbitrary date range. The general date\
-                            format is YYYY/MM/DD, and these variants are also\
-                            allowed: YYYY, YYYY/MM.')
-    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO',
-                      help='logging level (default: INFO)')
-    (options, args) = parser.parse_args()
-    if len(args) > 0:
-        parser.error('Wrong number of arguments')
-    if((options.reldate and options.maxdate) or
-       (options.reldate and options.mindate)):
-        parser.error("You can't mix 'reldate' and 'maxdate', 'mindate'\
-                      parameters")
-    if((options.mindate and not options.maxdate) or
-       (options.maxdate and not options.mindate)):
-        parser.error("mindate and maxdate must be used together")
-
-    log_level = getattr(logging, options.loglevel)
+    args = command_parse()
+    log_level = getattr(logging, args.loglevel)
     kwargs = {'format': LOG_FORMAT,
               'datefmt': LOG_DATEFMT,
               'level': log_level}
-    if options.logfile:
-        kwargs['filename'] = options.logfile
+    if args.logfile:
+        kwargs['filename'] = args.logfile
     logging.basicConfig(**kwargs)
     logger = logging.getLogger('data_from_NCBI')
 
-    E = Eutils(options, logger)
-    try:
-        E.retrieve()
-    except Exception:
-        sys.exit(1)
+    E = Eutils(args, logger)
+    if args.count_ids:
+        try:
+            E.dry_run()
+        except Exception:
+            sys.exit(1)
+    else:
+        try:
+            E.retrieve()
+        except Exception:
+            sys.exit(1)
 
 
 if __name__ == "__main__":
diff -r 50f5ef3313bb -r 8be88084f89c fetch_fasta_from_NCBI.xml
--- a/fetch_fasta_from_NCBI.xml	Sun Oct 15 20:13:13 2017 -0400
+++ b/fetch_fasta_from_NCBI.xml	Wed Nov 08 13:00:26 2017 -0500
@@ -1,16 +1,12 @@
-
+
@@ -30,21 +26,12 @@
-
-
-
-
-
-
-
-
-
-
-
-
+
-
+
+            <filter>dry_run == False</filter>
+
@@ -52,16 +39,13 @@
-
-
-
-
-
-
+
+
+
diff -r 50f5ef3313bb -r 8be88084f89c test-data/dry_run.log
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/dry_run.log	Wed Nov 08 13:00:26 2017 -0500
@@ -0,0 +1,4 @@
+2017-11-08 16:27:09|INFO |retrieving data from https://eutils.ncbi.nlm.nih.gov/entrez/eutils/
+2017-11-08 16:27:09|INFO |for Query: Drosophila melanogaster[Organism] AND Gcn5[Title] and database: nuccore
+2017-11-08 16:27:10|INFO |Found 4 UIDs
+
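
Usage sketch for the reworked command line (assuming the script is still run
under Python 2, since it imports httplib and urllib2): the date-filtering
options (--datetype, --reldate, --mindate, --maxdate) are removed, and
--count/-c adds a dry run that only logs the number of matching UIDs. The
query string and database below are taken from test-data/dry_run.log; the
log and output file names are illustrative only.

    # dry run: count matching UIDs, logging to a file as in the test
    python fetch_fasta_from_NCBI.py \
        -i "Drosophila melanogaster[Organism] AND Gcn5[Title]" \
        -d nuccore --count -l dry_run.log

    # full run: retrieve the matching sequences into a FASTA file
    python fetch_fasta_from_NCBI.py \
        -i "Drosophila melanogaster[Organism] AND Gcn5[Title]" \
        -d nuccore -o gcn5.fasta --loglevel INFO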