Mercurial > repos > crs4 > get_fasta_from_taxon

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/COPYING	Wed Sep 11 06:07:50 2013 -0400
@@ -0,0 +1,24 @@
+Copyright © 2012-2013 CRS4 Srl. http://www.crs4.it/
+Created by:
+Massimiliano Orsini <massimiliano.orsini@crs4.it>
+Gianmauro Cuccuru <gianmauro.cuccuru@crs4.it>
+Nicola Soranzo <nicola.soranzo@crs4.it>
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_fasta_from_taxon.py	Wed Sep 11 06:07:50 2013 -0400
@@ -0,0 +1,212 @@
+# -*- coding: utf-8 -*-
+"""
+From a taxonomy ID retrieves all the nucleotide sequences
+It returns a multiFASTA nuc/prot file
+
+Entrez Database  UID common name  E-utility Database Name
+Nucleotide       GI number        nuccore
+Protein          GI number        protein
+
+Retrieve strategy:
+
+esearch to get total number of UIDs (count)
+esearch to get UIDs in batches
+loop untile end of UIDs list:
+  epost to put a batch of UIDs in the history server
+  efetch to retrieve info from previous post
+
+retmax of efetch is 1/10 of declared value from NCBI
+
+queries are 1 sec delayed, to satisfy NCBI guidelines (more than what they request)
+
+
+python get_fasta_from_taxon.py -i 1638 -o test.out -d protein
+python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs
+"""
+
+import logging
+import optparse
+import time
+import urllib
+import urllib2
+
+class Eutils:
+
+    def __init__(self, options, logger):
+        self.logger = logger
+        self.base = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+        self.taxid = options.taxid
+        self.dbname = options.dbname
+        if options.outname:
+            self.outname = options.outname
+        else:
+            self.outname = 'taxid' + self.taxid + '.' + self.dbname + '.fasta'
+        self.ids = []
+        self.retmax_esearch = 100000
+        self.retmax_efetch = 1000
+        self.count = 0
+        self.webenv = ""
+        self.query_key = ""
+
+    def retrieve(self):
+        """ """
+        self.get_count_value()
+        self.get_uids_list()
+        self.get_sequences()
+
+    def get_count_value(self):
+        """
+        just to retrieve Count (number of UIDs)
+        Total number of UIDs from the retrieved set to be shown in the XML
+        output (default=20). By default, ESearch only includes the first 20
+        UIDs retrieved in the XML output. If usehistory is set to 'y',
+        the remainder of the retrieved set will be stored on the History server;
+
+        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
+        """
+        self.logger.info("retrieving data from %s" % self.base)
+        self.logger.info("for TaxID: %s and database: %s" %
+                         (self.taxid, self.dbname))
+        querylog = self.esearch(self.dbname, "txid%s[Organism:exp]" % self.taxid, '', '', "count")
+        self.logger.debug("taxonomy response:")
+        for line in querylog:
+            self.logger.debug(line.rstrip())
+            if '</Count>' in line:
+                self.count = int(line[line.find('<Count>')+len('<Count>') : line.find('</Count>')])
+        self.logger.info("Founded %d UIDs" % self.count)
+
+    def get_uids_list(self):
+        """
+        Increasing retmax allows more of the retrieved UIDs to be included in the XML output,
+        up to a maximum of 100,000 records.
+        from http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
+        """
+        retmax = self.retmax_esearch
+        if (self.count > retmax):
+            num_batches = (self.count / retmax) + 1
+        else:
+            num_batches = 1
+        self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
+        self.logger.info("Number of batches for esearch action: %d " % num_batches)
+        for n in range(num_batches):
+            querylog = self.esearch(self.dbname, "txid%s[Organism:exp]" % self.taxid, n*retmax, retmax, '')
+            for line in querylog:
+                if '<Id>' in line and '</Id>' in line:
+                    uid = (line[line.find('<Id>')+len('<Id>') : line.find('</Id>')])
+                    self.ids.append(uid)
+            self.logger.info("Retrieved %d UIDs" % len(self.ids))
+
+    def esearch(self, db, term, retstart, retmax, rettype):
+        url = self.base + "esearch.fcgi"
+        self.logger.debug("url: %s" % url)
+        values = {'db': db,
+                  'term': term,
+                  'rettype': rettype,
+                  'retstart': retstart,
+                  'retmax': retmax}
+        data = urllib.urlencode(values)
+        self.logger.debug("data: %s" % str(data))
+        req = urllib2.Request(url, data)
+        response = urllib2.urlopen(req)
+        querylog = response.readlines()
+        time.sleep(1)
+        return querylog
+
+    def epost(self, db, ids):
+        url = self.base + "epost.fcgi"
+        self.logger.debug("url_epost: %s" % url)
+        values = {'db': db,
+                  'id': ids}
+        data = urllib.urlencode(values)
+        req = urllib2.Request(url, data)
+        #self.logger.debug("data: %s" % str(data))
+        req = urllib2.Request(url, data)
+        response = urllib2.urlopen(req)
+        querylog = response.readlines()
+        self.logger.debug("taxonomy response:")
+        for line in querylog:
+            self.logger.debug(line.rstrip())
+            if '</QueryKey>' in line:
+                self.query_key = str(line[line.find('<QueryKey>')+len('<QueryKey>'):line.find('</QueryKey>')])
+            if '</WebEnv>' in line:
+                self.webenv = str(line[line.find('<WebEnv>')+len('<WebEnv>'):line.find('</WebEnv>')])
+            self.logger.debug("*** epost action ***")
+            self.logger.debug("query_key: %s" % self.query_key)
+            self.logger.debug("webenv: %s" % self.webenv)
+        time.sleep(1)
+
+    def efetch(self, db, query_key, webenv):
+        url = self.base + "efetch.fcgi"
+        self.logger.debug("url_efetch: %s" % url)
+        values = {'db': db,
+                  'query_key': query_key,
+                  'webenv': webenv,
+                  'rettype': "fasta",
+                  'retmode': "text"}
+        data = urllib.urlencode(values)
+        req = urllib2.Request(url, data)
+        self.logger.debug("data: %s" % str(data))
+        req = urllib2.Request(url, data)
+        response = urllib2.urlopen(req)
+        fasta = response.read()
+        assert fasta.startswith(">"), fasta
+        time.sleep(1)
+        return fasta
+
+    def get_sequences(self):
+        """
+        Total number of records from the input set to be retrieved, up to a maximum
+        of 10,000. Optionally, for a large set the value of retstart can be iterated
+        while holding retmax constant, thereby downloading the entire set in batches
+        of size retmax.
+
+        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
+
+        """
+        batch_size = self.retmax_efetch
+        count = self.count
+        uids_list = self.ids
+        self.logger.info("Batch size for efetch action: %d" % batch_size)
+        self.logger.info("Number of batches for efetch action: %d" % ((count / batch_size) + 1))
+        with open(self.outname, 'w') as out:
+            for start in range(0, count, batch_size):
+                end = min(count, start+batch_size)
+                batch = uids_list[start:end]
+                self.epost(self.dbname, ",".join(batch))
+                self.logger.info("retrieving batch %d" % ((start / batch_size) + 1))
+                mfasta = self.efetch(self.dbname, self.query_key, self.webenv)
+                out.write(mfasta + '\n')
+
+
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+
+def __main__():
+    """ main function """
+    parser = optparse.OptionParser(description='Retrieve data from NCBI')
+    parser.add_option('-i', dest='taxid', help='taxonomy ID mandatory')
+    parser.add_option('-o', dest='outname', help='output file name')
+    parser.add_option('-l', '--logfile', help='log file (default=stderr)')
+    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO', help='logging level (default: INFO)')
+    parser.add_option('-d', dest='dbname', help='database type')
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        parser.error('Wrong number of arguments')
+
+    log_level = getattr(logging, options.loglevel)
+    kwargs = {'format': LOG_FORMAT,
+              'datefmt': LOG_DATEFMT,
+              'level': log_level}
+    if options.logfile:
+        kwargs['filename'] = options.logfile
+    logging.basicConfig(**kwargs)
+    logger = logging.getLogger('data_from_NCBI')
+
+    E = Eutils(options, logger)
+    E.retrieve()
+
+
+if __name__ == "__main__":
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_fasta_from_taxon.xml	Wed Sep 11 06:07:50 2013 -0400
@@ -0,0 +1,36 @@
+<tool id="get_fasta_from_taxon" name="Get FASTA from taxon" version="1.0.1">
+  <description></description>
+  <command interpreter="python">get_fasta_from_taxon.py -i $taxid -d $dbname -o $outfilename -l $logfile </command>
+
+  <inputs>
+    <param name="taxid" type="integer" value="" label="NCBI taxonomy ID" />
+    <param name="dbname" type="select" label="NCBI database">
+      <option value="nuccore">Nucleotide</option>
+      <option value="protein">Protein</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data name="outfilename" format="fasta" label="${tool.name} on ${on_string}: taxid${taxid.value}.${dbname.value_label}.fasta" />
+    <data format="txt" name="logfile" label="${tool.name} on ${on_string}: log"/>
+  </outputs>
+  <tests>
+
+  </tests>
+  <help>
+**What it does**
+
+This tool retrieves all nucleotidic/peptidic sequences from the corresponding NCBI database for a given taxonomy ID.
+
+**License and citation**
+
+This Galaxy tool is Copyright © 2012-2013 `CRS4 Srl.`_ and is released under the `MIT license`_.
+
+.. _CRS4 Srl.: http://www.crs4.it/
+.. _MIT license: http://opensource.org/licenses/MIT
+
+If you use this tool in Galaxy, please cite |Cuccuru2013|_.
+
+.. |Cuccuru2013| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2013) Orione, a web-based framework for NGS analysis in microbiology. *Submitted*
+.. _Cuccuru2013: http://orione.crs4.it/
+  </help>
+</tool>