Mercurial > repos > crs4 > get_fasta_from_taxon
changeset 0:7ce8da107910 draft default tip
Uploaded
author | crs4 |
---|---|
date | Wed, 11 Sep 2013 06:07:50 -0400 |
parents | |
children | |
files | COPYING get_fasta_from_taxon.py get_fasta_from_taxon.xml |
diffstat | 3 files changed, 272 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/COPYING Wed Sep 11 06:07:50 2013 -0400 @@ -0,0 +1,24 @@ +Copyright © 2012-2013 CRS4 Srl. http://www.crs4.it/ +Created by: +Massimiliano Orsini <massimiliano.orsini@crs4.it> +Gianmauro Cuccuru <gianmauro.cuccuru@crs4.it> +Nicola Soranzo <nicola.soranzo@crs4.it> + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_fasta_from_taxon.py Wed Sep 11 06:07:50 2013 -0400 @@ -0,0 +1,212 @@ +# -*- coding: utf-8 -*- +""" +From a taxonomy ID retrieves all the nucleotide sequences +It returns a multiFASTA nuc/prot file + +Entrez Database UID common name E-utility Database Name +Nucleotide GI number nuccore +Protein GI number protein + +Retrieve strategy: + +esearch to get total number of UIDs (count) +esearch to get UIDs in batches +loop untile end of UIDs list: + epost to put a batch of UIDs in the history server + efetch to retrieve info from previous post + +retmax of efetch is 1/10 of declared value from NCBI + +queries are 1 sec delayed, to satisfy NCBI guidelines (more than what they request) + + +python get_fasta_from_taxon.py -i 1638 -o test.out -d protein +python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs +""" + +import logging +import optparse +import time +import urllib +import urllib2 + +class Eutils: + + def __init__(self, options, logger): + self.logger = logger + self.base = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/" + self.taxid = options.taxid + self.dbname = options.dbname + if options.outname: + self.outname = options.outname + else: + self.outname = 'taxid' + self.taxid + '.' + self.dbname + '.fasta' + self.ids = [] + self.retmax_esearch = 100000 + self.retmax_efetch = 1000 + self.count = 0 + self.webenv = "" + self.query_key = "" + + def retrieve(self): + """ """ + self.get_count_value() + self.get_uids_list() + self.get_sequences() + + def get_count_value(self): + """ + just to retrieve Count (number of UIDs) + Total number of UIDs from the retrieved set to be shown in the XML + output (default=20). By default, ESearch only includes the first 20 + UIDs retrieved in the XML output. If usehistory is set to 'y', + the remainder of the retrieved set will be stored on the History server; + + http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch + """ + self.logger.info("retrieving data from %s" % self.base) + self.logger.info("for TaxID: %s and database: %s" % + (self.taxid, self.dbname)) + querylog = self.esearch(self.dbname, "txid%s[Organism:exp]" % self.taxid, '', '', "count") + self.logger.debug("taxonomy response:") + for line in querylog: + self.logger.debug(line.rstrip()) + if '</Count>' in line: + self.count = int(line[line.find('<Count>')+len('<Count>') : line.find('</Count>')]) + self.logger.info("Founded %d UIDs" % self.count) + + def get_uids_list(self): + """ + Increasing retmax allows more of the retrieved UIDs to be included in the XML output, + up to a maximum of 100,000 records. + from http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch + """ + retmax = self.retmax_esearch + if (self.count > retmax): + num_batches = (self.count / retmax) + 1 + else: + num_batches = 1 + self.logger.info("Batch size for esearch action: %d UIDs" % retmax) + self.logger.info("Number of batches for esearch action: %d " % num_batches) + for n in range(num_batches): + querylog = self.esearch(self.dbname, "txid%s[Organism:exp]" % self.taxid, n*retmax, retmax, '') + for line in querylog: + if '<Id>' in line and '</Id>' in line: + uid = (line[line.find('<Id>')+len('<Id>') : line.find('</Id>')]) + self.ids.append(uid) + self.logger.info("Retrieved %d UIDs" % len(self.ids)) + + def esearch(self, db, term, retstart, retmax, rettype): + url = self.base + "esearch.fcgi" + self.logger.debug("url: %s" % url) + values = {'db': db, + 'term': term, + 'rettype': rettype, + 'retstart': retstart, + 'retmax': retmax} + data = urllib.urlencode(values) + self.logger.debug("data: %s" % str(data)) + req = urllib2.Request(url, data) + response = urllib2.urlopen(req) + querylog = response.readlines() + time.sleep(1) + return querylog + + def epost(self, db, ids): + url = self.base + "epost.fcgi" + self.logger.debug("url_epost: %s" % url) + values = {'db': db, + 'id': ids} + data = urllib.urlencode(values) + req = urllib2.Request(url, data) + #self.logger.debug("data: %s" % str(data)) + req = urllib2.Request(url, data) + response = urllib2.urlopen(req) + querylog = response.readlines() + self.logger.debug("taxonomy response:") + for line in querylog: + self.logger.debug(line.rstrip()) + if '</QueryKey>' in line: + self.query_key = str(line[line.find('<QueryKey>')+len('<QueryKey>'):line.find('</QueryKey>')]) + if '</WebEnv>' in line: + self.webenv = str(line[line.find('<WebEnv>')+len('<WebEnv>'):line.find('</WebEnv>')]) + self.logger.debug("*** epost action ***") + self.logger.debug("query_key: %s" % self.query_key) + self.logger.debug("webenv: %s" % self.webenv) + time.sleep(1) + + def efetch(self, db, query_key, webenv): + url = self.base + "efetch.fcgi" + self.logger.debug("url_efetch: %s" % url) + values = {'db': db, + 'query_key': query_key, + 'webenv': webenv, + 'rettype': "fasta", + 'retmode': "text"} + data = urllib.urlencode(values) + req = urllib2.Request(url, data) + self.logger.debug("data: %s" % str(data)) + req = urllib2.Request(url, data) + response = urllib2.urlopen(req) + fasta = response.read() + assert fasta.startswith(">"), fasta + time.sleep(1) + return fasta + + def get_sequences(self): + """ + Total number of records from the input set to be retrieved, up to a maximum + of 10,000. Optionally, for a large set the value of retstart can be iterated + while holding retmax constant, thereby downloading the entire set in batches + of size retmax. + + http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch + + """ + batch_size = self.retmax_efetch + count = self.count + uids_list = self.ids + self.logger.info("Batch size for efetch action: %d" % batch_size) + self.logger.info("Number of batches for efetch action: %d" % ((count / batch_size) + 1)) + with open(self.outname, 'w') as out: + for start in range(0, count, batch_size): + end = min(count, start+batch_size) + batch = uids_list[start:end] + self.epost(self.dbname, ",".join(batch)) + self.logger.info("retrieving batch %d" % ((start / batch_size) + 1)) + mfasta = self.efetch(self.dbname, self.query_key, self.webenv) + out.write(mfasta + '\n') + + +LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s' +LOG_DATEFMT = '%Y-%m-%d %H:%M:%S' +LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] + + +def __main__(): + """ main function """ + parser = optparse.OptionParser(description='Retrieve data from NCBI') + parser.add_option('-i', dest='taxid', help='taxonomy ID mandatory') + parser.add_option('-o', dest='outname', help='output file name') + parser.add_option('-l', '--logfile', help='log file (default=stderr)') + parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO', help='logging level (default: INFO)') + parser.add_option('-d', dest='dbname', help='database type') + (options, args) = parser.parse_args() + if len(args) > 0: + parser.error('Wrong number of arguments') + + log_level = getattr(logging, options.loglevel) + kwargs = {'format': LOG_FORMAT, + 'datefmt': LOG_DATEFMT, + 'level': log_level} + if options.logfile: + kwargs['filename'] = options.logfile + logging.basicConfig(**kwargs) + logger = logging.getLogger('data_from_NCBI') + + E = Eutils(options, logger) + E.retrieve() + + +if __name__ == "__main__": + __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_fasta_from_taxon.xml Wed Sep 11 06:07:50 2013 -0400 @@ -0,0 +1,36 @@ +<tool id="get_fasta_from_taxon" name="Get FASTA from taxon" version="1.0.1"> + <description></description> + <command interpreter="python">get_fasta_from_taxon.py -i $taxid -d $dbname -o $outfilename -l $logfile </command> + + <inputs> + <param name="taxid" type="integer" value="" label="NCBI taxonomy ID" /> + <param name="dbname" type="select" label="NCBI database"> + <option value="nuccore">Nucleotide</option> + <option value="protein">Protein</option> + </param> + </inputs> + <outputs> + <data name="outfilename" format="fasta" label="${tool.name} on ${on_string}: taxid${taxid.value}.${dbname.value_label}.fasta" /> + <data format="txt" name="logfile" label="${tool.name} on ${on_string}: log"/> + </outputs> + <tests> + + </tests> + <help> +**What it does** + +This tool retrieves all nucleotidic/peptidic sequences from the corresponding NCBI database for a given taxonomy ID. + +**License and citation** + +This Galaxy tool is Copyright © 2012-2013 `CRS4 Srl.`_ and is released under the `MIT license`_. + +.. _CRS4 Srl.: http://www.crs4.it/ +.. _MIT license: http://opensource.org/licenses/MIT + +If you use this tool in Galaxy, please cite |Cuccuru2013|_. + +.. |Cuccuru2013| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2013) Orione, a web-based framework for NGS analysis in microbiology. *Submitted* +.. _Cuccuru2013: http://orione.crs4.it/ + </help> +</tool>