diff fetch_fasta_from_NCBI.py @ 1:7e41bbb94159 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit 6008aafac37eec1916d6b72c05d9cfcb002b8095
author:   artbio
date:     Sun, 15 Oct 2017 13:56:26 -0400
parents:
children: 50f5ef3313bb
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fetch_fasta_from_NCBI.py	Sun Oct 15 13:56:26 2017 -0400
@@ -0,0 +1,411 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+From a taxonomy ID retrieves all the nucleotide sequences
+It returns a multiFASTA nuc/prot file
+
+Entrez Database   UID common name   E-utility Database Name
+Nucleotide        GI number         nuccore
+Protein           GI number         protein
+
+Retrieve strategy:
+
+esearch to get total number of UIDs (count)
+esearch to get UIDs in batches
+loop until end of UIDs list:
+    epost to put a batch of UIDs in the history server
+    efetch to retrieve info from previous post
+
+retmax of efetch is 1/10 of declared value from NCBI
+
+queries are 1 sec delayed, to satisfy NCBI guidelines
+(more than what they request)
+"""
+import httplib
+import logging
+import optparse
+import re
+import sys
+import time
+import urllib
+import urllib2
+
+
+class QueryException(Exception):
+    pass
+
+
+class Eutils:
+
+    def __init__(self, options, logger):
+        """
+        Initialize retrieval parameters
+        """
+        self.logger = logger
+        self.base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+        self.query_string = options.query_string
+        self.dbname = options.dbname
+        if options.outname:
+            self.outname = options.outname
+        else:
+            self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta'
+        self.ids = []
+        self.retmax_esearch = 100000
+        self.retmax_efetch = 500
+        self.count = 0
+        self.webenv = ""
+        self.query_key = ""
+        self.datetype = options.datetype
+        if options.reldate:
+            self.reldate = options.reldate
+        else:
+            self.reldate = ''
+        if options.mindate:
+            self.mindate = options.mindate
+        else:
+            self.mindate = ''
+        if options.maxdate:
+            self.maxdate = options.maxdate
+        else:
+            self.maxdate = ''
+
+    def retrieve(self):
+        """
+        Retrieve the fasta sequences corresponding to the query
+        """
+        self.get_count_value()
+
+        # If no UIDs are found exit script
+        if self.count > 0:
+            self.get_uids_list()
+            try:
+                self.get_sequences()
+            except QueryException as e:
+                self.logger.error("Exiting script.")
+                raise e
+        else:
+            self.logger.error("No UIDs were found. Exiting script.")
+            raise Exception("")
+
+    def get_count_value(self):
+        """
+        just to retrieve Count (number of UIDs)
+        Total number of UIDs from the retrieved set to be shown in the XML
+        output (default=20). By default, ESearch only includes the first 20
+        UIDs retrieved in the XML output. If usehistory is set to 'y',
+        the remainder of the retrieved set will be stored on the History server
+
+        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
+        """
+        self.logger.info("retrieving data from %s" % self.base)
+        self.logger.info("for Query: %s and database: %s" %
+                         (self.query_string, self.dbname))
+        querylog = self.esearch(self.dbname, self.query_string, '', '',
+                                "count", self.datetype, self.reldate,
+                                self.mindate, self.maxdate)
+        self.logger.debug("Query response:")
+        for line in querylog:
+            self.logger.debug(line.rstrip())
+            if '</Count>' in line:
+                self.count = int(line[line.find('<Count>')+len('<Count>'):
+                                      line.find('</Count>')])
+        self.logger.info("Found %d UIDs" % self.count)
+
+    def get_uids_list(self):
+        """
+        Increasing retmax allows more of the retrieved UIDs to be included in
+        the XML output, up to a maximum of 100,000 records.
+        from http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
+        """
+        retmax = self.retmax_esearch
+        if (self.count > retmax):
+            num_batches = (self.count / retmax) + 1
+        else:
+            num_batches = 1
+        self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
+        self.logger.info("Number of batches for esearch action: %d " %
+                         num_batches)
+        for n in range(num_batches):
+            querylog = self.esearch(self.dbname, self.query_string, n*retmax,
+                                    retmax, '', self.datetype, self.reldate,
+                                    self.mindate, self.maxdate)
+            for line in querylog:
+                if '<Id>' in line and '</Id>' in line:
+                    uid = (line[line.find('<Id>')+len('<Id>'):
+                                line.find('</Id>')])
+                    self.ids.append(uid)
+            self.logger.info("Retrieved %d UIDs" % len(self.ids))
+
+    def esearch(self, db, term, retstart, retmax, rettype, datetype, reldate,
+                mindate, maxdate):
+        url = self.base + "esearch.fcgi"
+        self.logger.debug("url: %s" % url)
+        values = {'db': db,
+                  'term': term,
+                  'rettype': rettype,
+                  'retstart': retstart,
+                  'retmax': retmax,
+                  'datetype': datetype,
+                  'reldate': reldate,
+                  'mindate': mindate,
+                  'maxdate': maxdate}
+        data = urllib.urlencode(values)
+        self.logger.debug("data: %s" % str(data))
+        req = urllib2.Request(url, data)
+        response = urllib2.urlopen(req)
+        querylog = response.readlines()
+        response.close()
+        time.sleep(1)
+        return querylog
+
+    def epost(self, db, ids):
+        url = self.base + "epost.fcgi"
+        self.logger.debug("url_epost: %s" % url)
+        values = {'db': db,
+                  'id': ids}
+        data = urllib.urlencode(values)
+        req = urllib2.Request(url, data)
+        serverResponse = False
+        nb_trials = 0
+        while not serverResponse:
+            nb_trials += 1
+            try:
+                self.logger.debug("Try number %s for opening and reading URL %s"
+                                  % (nb_trials, url+data))
+                response = urllib2.urlopen(req)
+                querylog = response.readlines()
+                response.close()
+                serverResponse = True
+            except urllib2.HTTPError as e:
+                self.logger.info("urlopen error:%s, %s" % (e.code, e.read()))
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
+            except urllib2.URLError as e:
+                self.logger.info("urlopen error: Failed to reach a server")
+                self.logger.info("Reason :%s" % (e.reason))
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
+            except httplib.IncompleteRead as e:
+                self.logger.info("IncompleteRead error: %s" % (e.partial))
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
+        self.logger.debug("query response:")
+        for line in querylog:
+            self.logger.debug(line.rstrip())
+            if '</QueryKey>' in line:
+                self.query_key = str(line[line.find('<QueryKey>') +
+                                     len('<QueryKey>'):line.find('</QueryKey>')
+                                     ])
+            if '</WebEnv>' in line:
+                self.webenv = str(line[line.find('<WebEnv>')+len('<WebEnv>'):
+                                  line.find('</WebEnv>')])
+        self.logger.debug("*** epost action ***")
+        self.logger.debug("query_key: %s" % self.query_key)
+        self.logger.debug("webenv: %s" % self.webenv)
+        time.sleep(1)
+
+    def efetch(self, db, query_key, webenv):
+        url = self.base + "efetch.fcgi"
+        self.logger.debug("url_efetch: %s" % url)
+        values = {'db': db,
+                  'query_key': query_key,
+                  'webenv': webenv,
+                  'rettype': "fasta",
+                  'retmode': "text"}
+        data = urllib.urlencode(values)
+        req = urllib2.Request(url, data)
+        self.logger.debug("data: %s" % str(data))
+        serverTransaction = False
+        counter = 0
+        response_code = 0
+        while not serverTransaction:
+            counter += 1
+            self.logger.info("Server Transaction Trial: %s" % (counter))
+            try:
+                response = urllib2.urlopen(req)
+                response_code = response.getcode()
+                fasta = response.read()
+                response.close()
+                if((response_code != 200) or
+                   ("Resource temporarily unavailable" in fasta) or
+                   ("Error" in fasta) or (not fasta.startswith(">"))):
+                    serverTransaction = False
+                    if (response_code != 200):
+                        self.logger.info("urlopen error: Response code is not\
+                                         200")
+                    elif ("Resource temporarily unavailable" in fasta):
+                        self.logger.info("Resource temporarily unavailable")
+                    elif ("Error" in fasta):
+                        self.logger.info("Error in fasta")
+                    else:
+                        self.logger.info("Fasta doesn't start with '>'")
+                else:
+                    serverTransaction = True
+            except urllib2.HTTPError as e:
+                serverTransaction = False
+                self.logger.info("urlopen error:%s, %s" % (e.code, e.read()))
+            except urllib2.URLError as e:
+                serverTransaction = False
+                self.logger.info("urlopen error: Failed to reach a server")
+                self.logger.info("Reason :%s" % (e.reason))
+            except httplib.IncompleteRead as e:
+                serverTransaction = False
+                self.logger.info("IncompleteRead error: %s" % (e.partial))
+            if (counter > 500):
+                serverTransaction = True
+        if (counter > 500):
+            raise QueryException({"message":
+                                  "500 Server Transaction Trials attempted for\
+                                  this batch. Aborting."})
+        fasta = self.sanitiser(self.dbname, fasta)
+        time.sleep(0.1)
+        return fasta
+
+    def sanitiser(self, db, fastaseq):
+        if(db not in "nuccore protein"):
+            return fastaseq
+        regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")
+        sane_seqlist = []
+        seqlist = fastaseq.split("\n\n")
+        for seq in seqlist[:-1]:
+            fastalines = seq.split("\n")
+            if len(fastalines) < 2:
+                self.logger.info("Empty sequence for %s" %
+                                 ("|".join(fastalines[0].split("|")[:4])))
+                self.logger.info("%s download is skipped" %
+                                 ("|".join(fastalines[0].split("|")[:4])))
+                continue
+            if db == "nuccore":
+                badnuc = 0
+                for nucleotide in fastalines[1]:
+                    if nucleotide not in "ATGC":
+                        badnuc += 1
+                if float(badnuc)/len(fastalines[1]) > 0.4:
+                    self.logger.info("%s ambiguous nucleotides in %s\
+                                     or download interrupted at this offset\
+                                     | %s" % (float(badnuc)/len(fastalines[1]),
+                                              "|".join(fastalines[0].split("|")
+                                                       [:4]),
+                                              fastalines[1]))
+                    self.logger.info("%s download is skipped" %
+                                     (fastalines[0].split("|")[:4]))
+                    continue
+                """ remove spaces and trim the header to 100 chars """
+                fastalines[0] = fastalines[0].replace(" ", "_")[:100]
+                cleanseq = "\n".join(fastalines)
+                sane_seqlist.append(cleanseq)
+            elif db == "protein":
+                fastalines[0] = fastalines[0][0:100]
+                fastalines[0] = fastalines[0].replace(" ", "_")
+                fastalines[0] = fastalines[0].replace("[", "_")
+                fastalines[0] = fastalines[0].replace("]", "_")
+                fastalines[0] = fastalines[0].replace("=", "_")
+                """ because blast makedb doesn't like it """
+                fastalines[0] = fastalines[0].rstrip("_")
+                fastalines[0] = re.sub(regex, "_", fastalines[0])
+                cleanseq = "\n".join(fastalines)
+                sane_seqlist.append(cleanseq)
+        self.logger.info("clean sequences appended: %d" % (len(sane_seqlist)))
+        return "\n".join(sane_seqlist)
+
+    def get_sequences(self):
+        """
+        Total number of records from the input set to be retrieved,
+        up to a maximum of 10,000. Optionally, for a large set the value of
+        retstart can be iterated while holding retmax constant, thereby
+        downloading the entire set in batches of size retmax.
+        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
+        """
+        batch_size = self.retmax_efetch
+        count = self.count
+        uids_list = self.ids
+        self.logger.info("Batch size for efetch action: %d" % batch_size)
+        self.logger.info("Number of batches for efetch action: %d" %
+                         ((count / batch_size) + 1))
+        with open(self.outname, 'w') as out:
+            for start in range(0, count, batch_size):
+                end = min(count, start+batch_size)
+                batch = uids_list[start:end]
+                if self.epost(self.dbname, ",".join(batch)) != -1:
+                    mfasta = ''
+                    while not mfasta:
+                        self.logger.info("retrieving batch %d" %
+                                         ((start / batch_size) + 1))
+                        try:
+                            mfasta = self.efetch(self.dbname, self.query_key,
+                                                 self.webenv)
+                            out.write(mfasta + '\n')
+                        except QueryException as e:
+                            self.logger.error("%s" % e.message)
+                            raise e
+
+
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+
+def __main__():
+    """ main function """
+    parser = optparse.OptionParser(description='Retrieve data from NCBI')
+    parser.add_option('-i', dest='query_string', help='NCBI Query String')
+    parser.add_option('-o', dest='outname', help='output file name')
+    parser.add_option('-d', dest='dbname', help='database type')
+    parser.add_option('-l', '--logfile', help='log file (default=stderr)')
+    parser.add_option('--datetype', dest='datetype',
+                      choices=['mdat', 'pdat'],
+                      help='Type of date used to limit a search.\
+                      [ mdat(modification date), pdat(publication date)]\
+                      (default=pdat)', default='pdat')
+    parser.add_option('--reldate', dest='reldate',
+                      help='When reldate is set to an integer n, the search\
+                      returns only those items that have a date\
+                      specified by datetype within the last n days.')
+    parser.add_option('--maxdate', dest='maxdate',
+                      help='Date range used to limit a search result by the\
+                      date specified by datetype. These two parameters\
+                      (mindate, maxdate) must be used together to\
+                      specify an arbitrary date range. The general date\
+                      format is YYYY/MM/DD, and these variants are also\
+                      allowed: YYYY, YYYY/MM.')
+    parser.add_option('--mindate', dest='mindate',
+                      help='Date range used to limit a search result by the\
+                      date specified by datetype. These two parameters\
+                      (mindate, maxdate) must be used together to\
+                      specify an arbitrary date range. The general date\
+                      format is YYYY/MM/DD, and these variants are also\
+                      allowed: YYYY, YYYY/MM.')
+    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO',
+                      help='logging level (default: INFO)')
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        parser.error('Wrong number of arguments')
+    if((options.reldate and options.maxdate) or
+       (options.reldate and options.mindate)):
+        parser.error("You can't mix 'reldate' and 'maxdate', 'mindate'\
+                     parameters")
+    if((options.mindate and not options.maxdate) or
+       (options.maxdate and not options.mindate)):
+        parser.error("mindate and maxdate must be used together")
+
+    log_level = getattr(logging, options.loglevel)
+    kwargs = {'format': LOG_FORMAT,
+              'datefmt': LOG_DATEFMT,
+              'level': log_level}
+    if options.logfile:
+        kwargs['filename'] = options.logfile
+    logging.basicConfig(**kwargs)
+    logger = logging.getLogger('data_from_NCBI')
+
+    E = Eutils(options, logger)
+    try:
+        E.retrieve()
+    except Exception as e:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    __main__()
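
For reference, the Eutils class added in this changeset can also be driven directly from a Python 2 interpreter, outside of Galaxy. The sketch below is hypothetical: the Entrez query, database and output file name are illustrative placeholders, and optparse.Values is used only to mimic the options object that the script's own parser builds. An equivalent command line would be:
python fetch_fasta_from_NCBI.py -i "txid10239[orgn]" -d nuccore -o out.fasta --loglevel INFO

# Hypothetical driver for fetch_fasta_from_NCBI.py (Python 2, like the script).
# All query and output values below are placeholders, not values from this repository.
import logging
import optparse

from fetch_fasta_from_NCBI import Eutils

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('data_from_NCBI')

# optparse.Values turns this dict into the attribute-style options object
# expected by Eutils.__init__ (same fields as the command-line parser above).
options = optparse.Values({'query_string': 'txid10239[orgn]',  # placeholder Entrez query
                           'dbname': 'nuccore',
                           'outname': 'out.fasta',             # placeholder output file
                           'datetype': 'pdat',
                           'reldate': '',
                           'mindate': '',
                           'maxdate': ''})

Eutils(options, logger).retrieve()  # runs esearch -> epost -> efetch and writes out.fasta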