changeset 4:64f45c5e94a0 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit ca132a4b5d5d7175e6e8bd62cc518397d14dad17
author drosofff
date Mon, 15 May 2017 03:10:11 -0400
parents a9d8f69d59fb
children c6de5c7b4ae3
files retrieve_fasta_from_NCBI.py retrieve_fasta_from_NCBI.xml test-data/output.fa
diffstat 3 files changed, 56 insertions(+), 27 deletions(-) [+]
line wrap: on
line diff
--- a/retrieve_fasta_from_NCBI.py	Wed Nov 09 11:27:31 2016 -0500
+++ b/retrieve_fasta_from_NCBI.py	Mon May 15 03:10:11 2017 -0400
@@ -21,8 +21,6 @@
 queries are 1 sec delayed, to satisfy NCBI guidelines (more than what they request)
 
 
-python get_fasta_from_taxon.py -i 1638 -o test.out -d protein
-python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs
 """
 import sys
 import logging
@@ -37,6 +35,9 @@
 class Eutils:
 
     def __init__(self, options, logger):
+        """
+        Initialize retrieval parameters
+        """
         self.logger = logger
         self.base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
         self.query_string = options.query_string
@@ -47,16 +48,23 @@
             self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta'
         self.ids = []
         self.retmax_esearch = 100000
-        self.retmax_efetch = 1000
+        self.retmax_efetch = 500
         self.count = 0
         self.webenv = ""
         self.query_key = ""
 
     def retrieve(self):
-        """ """
+        """
+        Retrieve the fasta sequences corresponding to the query
+        """
         self.get_count_value()
-        self.get_uids_list()
-        self.get_sequences()
+
+        # If no UIDs are found, exit the script
+        if self.count > 0:
+            self.get_uids_list()
+            self.get_sequences()
+        else:
+            self.logger.info("No UIDs were found. Exiting script.")
 
     def get_count_value(self):
         """
@@ -77,7 +85,7 @@
             self.logger.debug(line.rstrip())
             if '</Count>' in line:
                 self.count = int(line[line.find('<Count>')+len('<Count>') : line.find('</Count>')])
-        self.logger.info("Founded %d UIDs" % self.count)
+        self.logger.info("Found %d UIDs" % self.count)
 
     def get_uids_list(self):
         """
@@ -113,6 +121,7 @@
         req = urllib2.Request(url, data)
         response = urllib2.urlopen(req)
         querylog = response.readlines()
+        response.close()
         time.sleep(1)
         return querylog
 
@@ -123,19 +132,32 @@
                   'id': ids}
         data = urllib.urlencode(values)
         req = urllib2.Request(url, data)
-        #self.logger.debug("data: %s" % str(data))
-        req = urllib2.Request(url, data)
         serverResponse = False
+        nb_trials = 0
         while not serverResponse:
+            nb_trials += 1
             try:
+                self.logger.debug("Try number %s for opening and readin URL %s" % ( nb_trials, url+data ))
                 response = urllib2.urlopen(req)
+                querylog = response.readlines()
+                response.close()
                 serverResponse = True
-            except: # catch *all* exceptions
-                e = sys.exc_info()[0]
-                self.logger.info( "Catched Error: %s" % e )
-                self.logger.info( "Retrying in 10 sec")
-                time.sleep(10)
-        querylog = response.readlines()
+            except urllib2.HTTPError as e:
+                self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) )
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
+            except urllib2.URLError as e:
+                self.logger.info("urlopen error: Failed to reach a server")
+                self.logger.info("Reason :%s" % ( e.reason ) )
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
+            except httplib.IncompleteRead as e:
+                self.logger.info("IncompleteRead error:  %s" % ( e.partial ) )
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
         self.logger.debug("query response:")
         for line in querylog:
             self.logger.debug(line.rstrip())
@@ -159,27 +181,34 @@
         data = urllib.urlencode(values)
         req = urllib2.Request(url, data)
         self.logger.debug("data: %s" % str(data))
-        req = urllib2.Request(url, data)
         serverTransaction = False
         counter = 0
+        response_code = 0
         while not serverTransaction:
             counter += 1
             self.logger.info("Server Transaction Trial:  %s" % ( counter ) )
             try:
                 response = urllib2.urlopen(req)
+                response_code = response.getcode()
                 fasta = response.read()
-                if ("Resource temporarily unavailable" in fasta) or (not fasta.startswith(">") ):
+                response.close()
+                if ( (response_code != 200) or ("Resource temporarily unavailable" in fasta)
+                    or ("Error" in fasta) or (not fasta.startswith(">") ) ):
                     serverTransaction = False
                 else:
                     serverTransaction = True
             except urllib2.HTTPError as e:
                 serverTransaction = False
                 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) )
+            except urllib2.URLError as e:
+                serverTransaction = False
+                self.logger.info("urlopen error: Failed to reach a server")
+                self.logger.info("Reason :%s" % ( e.reason ) )
             except httplib.IncompleteRead as e:
                 serverTransaction = False
                 self.logger.info("IncompleteRead error:  %s" % ( e.partial ) )
-        fasta = self.sanitiser(self.dbname, fasta) #
-        time.sleep(1)
+        fasta = self.sanitiser(self.dbname, fasta) 
+        time.sleep(0.1)
         return fasta
         
     def sanitiser(self, db, fastaseq):
@@ -237,12 +266,12 @@
             for start in range(0, count, batch_size):
                 end = min(count, start+batch_size)
                 batch = uids_list[start:end]
-                self.epost(self.dbname, ",".join(batch))
-                mfasta = ''
-                while not mfasta:
-                    self.logger.info("retrieving batch %d" % ((start / batch_size) + 1))
-                    mfasta = self.efetch(self.dbname, self.query_key, self.webenv)
-                out.write(mfasta + '\n')
+                if self.epost(self.dbname, ",".join(batch)) != -1:
+                    mfasta = ''
+                    while not mfasta:
+                        self.logger.info("retrieving batch %d" % ((start / batch_size) + 1))
+                        mfasta = self.efetch(self.dbname, self.query_key, self.webenv)
+                    out.write(mfasta + '\n')
 
 
 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
--- a/retrieve_fasta_from_NCBI.xml	Wed Nov 09 11:27:31 2016 -0500
+++ b/retrieve_fasta_from_NCBI.xml	Mon May 15 03:10:11 2017 -0400
@@ -1,4 +1,4 @@
-<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="0.9.4">
+<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="1.0.0">
   <description></description>
   <command><![CDATA[
       python '$__tool_directory__'/retrieve_fasta_from_NCBI.py
--- a/test-data/output.fa	Wed Nov 09 11:27:31 2016 -0500
+++ b/test-data/output.fa	Mon May 15 03:10:11 2017 -0400
@@ -1,4 +1,4 @@
->NC_001834.1_Drosophila_C_virus,_complete_genome
+>NC_001834.1_Drosophila_C_virus_strain_EB,_complete_genome
 TTTATATCGTGTGTACATATAAATATGTACACACGGCTTTTAGGTAGAATATTGTTTTCAATGTTGATTT
 TAAAGGTAACTTTGGTTATTATGCTTTACGGTTTTCATTGTTGATGGTATTTGTGGCCTGCGGTCCCTAA
 TTGTTGAATTATTTATTCTGATACGTTGTTTTCATTGTTGATGGTAAGGATTCTTATTTTGAAGTGGTTT