get_fasta_from_taxon.py @ 0:7ce8da107910 (draft, default, tip)
Uploaded
author | crs4 |
---|---|
date | Wed, 11 Sep 2013 06:07:50 -0400 |
parents | |
children | |
# -*- coding: utf-8 -*-
"""
From a taxonomy ID, retrieves all the nucleotide sequences.
It returns a multiFASTA nuc/prot file.

Entrez Database | UID common name | E-utility Database Name
--------------- | --------------- | -----------------------
Nucleotide      | GI number       | nuccore
Protein         | GI number       | protein

Retrieve strategy:

esearch to get the total number of UIDs (count)
esearch to get the UIDs in batches
loop until the end of the UIDs list:
    epost to put a batch of UIDs in the history server
    efetch to retrieve info from the previous post

retmax of efetch is 1/10 of the value declared by NCBI.
Queries are delayed by 1 sec, to satisfy NCBI guidelines
(more than what they request).

python get_fasta_from_taxon.py -i 1638 -o test.out -d protein
python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore  # 556468 UIDs
"""
import logging
import optparse
import time
import urllib
import urllib2


class Eutils:

    def __init__(self, options, logger):
        self.logger = logger
        self.base = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.taxid = options.taxid
        self.dbname = options.dbname
        if options.outname:
            self.outname = options.outname
        else:
            self.outname = 'taxid' + self.taxid + '.' + self.dbname + '.fasta'
        self.ids = []
        self.retmax_esearch = 100000
        self.retmax_efetch = 1000
        self.count = 0
        self.webenv = ""
        self.query_key = ""

    def retrieve(self):
        """Run the full esearch -> epost -> efetch pipeline."""
        self.get_count_value()
        self.get_uids_list()
        self.get_sequences()

    def get_count_value(self):
        """
        Just to retrieve Count (the number of UIDs).

        "Total number of UIDs from the retrieved set to be shown in the XML
        output (default=20). By default, ESearch only includes the first 20
        UIDs retrieved in the XML output. If usehistory is set to 'y', the
        remainder of the retrieved set will be stored on the History server"

        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
        """
        self.logger.info("retrieving data from %s" % self.base)
        self.logger.info("for TaxID: %s and database: %s" %
                         (self.taxid, self.dbname))
        querylog = self.esearch(self.dbname,
                                "txid%s[Organism:exp]" % self.taxid,
                                '', '', "count")
        self.logger.debug("taxonomy response:")
        for line in querylog:
            self.logger.debug(line.rstrip())
            if '</Count>' in line:
                self.count = int(line[line.find('<Count>') + len('<Count>'):
                                      line.find('</Count>')])
        self.logger.info("Found %d UIDs" % self.count)
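    # Sketch, not part of the original script: the <Count> value above is
    # extracted with string slicing; an equivalent, somewhat more robust
    # approach using only the standard library would parse the ESearch XML
    # instead, e.g.:
    #
    #   import xml.etree.ElementTree as ET
    #   count = int(ET.fromstring("".join(querylog)).findtext("Count"))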
    def get_uids_list(self):
        """
        "Increasing retmax allows more of the retrieved UIDs to be included
        in the XML output, up to a maximum of 100,000 records."

        from http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        """
        retmax = self.retmax_esearch
        if self.count > retmax:
            num_batches = (self.count / retmax) + 1
        else:
            num_batches = 1
        self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
        self.logger.info("Number of batches for esearch action: %d" %
                         num_batches)
        for n in range(num_batches):
            querylog = self.esearch(self.dbname,
                                    "txid%s[Organism:exp]" % self.taxid,
                                    n * retmax, retmax, '')
            for line in querylog:
                if '<Id>' in line and '</Id>' in line:
                    uid = line[line.find('<Id>') + len('<Id>'):
                               line.find('</Id>')]
                    self.ids.append(uid)
        self.logger.info("Retrieved %d UIDs" % len(self.ids))

    def esearch(self, db, term, retstart, retmax, rettype):
        url = self.base + "esearch.fcgi"
        self.logger.debug("url: %s" % url)
        values = {'db': db,
                  'term': term,
                  'rettype': rettype,
                  'retstart': retstart,
                  'retmax': retmax}
        data = urllib.urlencode(values)
        self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        querylog = response.readlines()
        time.sleep(1)
        return querylog

    def epost(self, db, ids):
        url = self.base + "epost.fcgi"
        self.logger.debug("url_epost: %s" % url)
        values = {'db': db,
                  'id': ids}
        data = urllib.urlencode(values)
        # self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        querylog = response.readlines()
        self.logger.debug("taxonomy response:")
        for line in querylog:
            self.logger.debug(line.rstrip())
            if '</QueryKey>' in line:
                self.query_key = str(line[line.find('<QueryKey>') +
                                          len('<QueryKey>'):
                                          line.find('</QueryKey>')])
            if '</WebEnv>' in line:
                self.webenv = str(line[line.find('<WebEnv>') +
                                       len('<WebEnv>'):
                                       line.find('</WebEnv>')])
        self.logger.debug("*** epost action ***")
        self.logger.debug("query_key: %s" % self.query_key)
        self.logger.debug("webenv: %s" % self.webenv)
        time.sleep(1)

    def efetch(self, db, query_key, webenv):
        url = self.base + "efetch.fcgi"
        self.logger.debug("url_efetch: %s" % url)
        values = {'db': db,
                  'query_key': query_key,
                  'webenv': webenv,
                  'rettype': "fasta",
                  'retmode': "text"}
        data = urllib.urlencode(values)
        self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        fasta = response.read()
        assert fasta.startswith(">"), fasta
        time.sleep(1)
        return fasta
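    # Sketch, not part of the original script: the per-batch epost round
    # trips could be avoided by calling esearch once with usehistory='y',
    # which stores the whole matching UID set on the History server and
    # returns <WebEnv> and <QueryKey> directly in the ESearch output, e.g.:
    #
    #   values = {'db': db, 'term': term, 'usehistory': 'y'}
    #   # ...then parse <WebEnv>/<QueryKey> as in epost() above and page
    #   # through the stored set with efetch(retstart=..., retmax=...)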
    def get_sequences(self):
        """
        "Total number of records from the input set to be retrieved, up to a
        maximum of 10,000. Optionally, for a large set the value of retstart
        can be iterated while holding retmax constant, thereby downloading
        the entire set in batches of size retmax."

        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
        """
        batch_size = self.retmax_efetch
        count = self.count
        uids_list = self.ids
        self.logger.info("Batch size for efetch action: %d" % batch_size)
        self.logger.info("Number of batches for efetch action: %d" %
                         ((count / batch_size) + 1))
        with open(self.outname, 'w') as out:
            for start in range(0, count, batch_size):
                end = min(count, start + batch_size)
                batch = uids_list[start:end]
                self.epost(self.dbname, ",".join(batch))
                self.logger.info("retrieving batch %d" %
                                 ((start / batch_size) + 1))
                mfasta = self.efetch(self.dbname, self.query_key, self.webenv)
                out.write(mfasta + '\n')


LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']


def __main__():
    """ main function """
    parser = optparse.OptionParser(description='Retrieve data from NCBI')
    parser.add_option('-i', dest='taxid', help='taxonomy ID (mandatory)')
    parser.add_option('-o', dest='outname', help='output file name')
    parser.add_option('-l', '--logfile', help='log file (default=stderr)')
    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO',
                      help='logging level (default: INFO)')
    parser.add_option('-d', dest='dbname', help='database type')
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error('Wrong number of arguments')

    log_level = getattr(logging, options.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if options.logfile:
        kwargs['filename'] = options.logfile
    logging.basicConfig(**kwargs)

    logger = logging.getLogger('data_from_NCBI')

    E = Eutils(options, logger)
    E.retrieve()


if __name__ == "__main__":
    __main__()
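For comparison, the same search / history-server / batched-fetch flow can be written against Biopython's `Bio.Entrez` wrapper around these E-utility endpoints. The snippet below is a minimal sketch, not part of the uploaded tool: it assumes Biopython is installed, reuses the taxid/database pair from the docstring's own usage examples, and the contact address is a placeholder that NCBI requires you to replace with your own.

```python
# A sketch of the script's esearch -> history server -> batched efetch flow
# using Biopython's Bio.Entrez (assumes Biopython is installed; taxid and
# database taken from the script's usage examples).
from Bio import Entrez

Entrez.email = "you@example.org"  # placeholder; NCBI requires a real address

# One esearch with usehistory='y' stores every matching UID on the History
# server and returns WebEnv/QueryKey for the later efetch calls.
search = Entrez.read(Entrez.esearch(db="protein",
                                    term="txid1638[Organism:exp]",
                                    usehistory="y"))
count = int(search["Count"])

batch_size = 1000
with open("taxid1638.protein.fasta", "w") as out:
    for start in range(0, count, batch_size):
        # Page through the stored result set, batch_size records at a time.
        handle = Entrez.efetch(db="protein", rettype="fasta", retmode="text",
                               retstart=start, retmax=batch_size,
                               webenv=search["WebEnv"],
                               query_key=search["QueryKey"])
        out.write(handle.read())
        handle.close()
```

Note that `Bio.Entrez` enforces NCBI's rate limits on its own, so the explicit `time.sleep(1)` calls the script uses are not needed in this variant.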