Mercurial > repos > artbio > fetch_fasta_from_ncbi

--- a/fetch_fasta_from_NCBI.py	Sun Oct 15 20:13:13 2017 -0400
+++ b/fetch_fasta_from_NCBI.py	Wed Nov 08 13:00:26 2017 -0500
@@ -21,9 +21,9 @@
 queries are 1 sec delayed, to satisfy NCBI guidelines
 (more than what they request)
 """
+import argparse
 import httplib
 import logging
-import optparse
 import re
 import sys
 import time
@@ -55,26 +55,15 @@
         self.count = 0
         self.webenv = ""
         self.query_key = ""
-        self.datetype = options.datetype
-        if options.reldate:
-            self.reldate = options.reldate
-        else:
-            self.reldate = ''
-        if options.mindate:
-            self.mindate = options.mindate
-        else:
-            self.mindate = ''
-        if options.maxdate:
-            self.maxdate = options.maxdate
-        else:
-            self.maxdate = ''
+
+    def dry_run(self):  # runs only the first step of retrieve(): the count
+        self.get_count_value()

     def retrieve(self):
         """
         Retrieve the fasta sequences corresponding to the query
         """
         self.get_count_value()
-
         # If no UIDs are found exit script
         if self.count > 0:
             self.get_uids_list()
@@ -101,8 +90,7 @@
         self.logger.info("for Query: %s and database: %s" %
                          (self.query_string, self.dbname))
         querylog = self.esearch(self.dbname, self.query_string, '', '',
-                                "count", self.datetype, self.reldate,
-                                self.mindate, self.maxdate)
+                                "count")
         self.logger.debug("Query response:")
         for line in querylog:
             self.logger.debug(line.rstrip())
@@ -127,8 +115,7 @@
                          num_batches)
         for n in range(num_batches):
             querylog = self.esearch(self.dbname, self.query_string, n*retmax,
-                                    retmax, '', self.datetype, self.reldate,
-                                    self.mindate, self.maxdate)
+                                    retmax, '')
             for line in querylog:
                 if '<Id>' in line and '</Id>' in line:
                     uid = (line[line.find('<Id>')+len('<Id>'):
@@ -136,19 +123,14 @@
                     self.ids.append(uid)
             self.logger.info("Retrieved %d UIDs" % len(self.ids))

-    def esearch(self, db, term, retstart, retmax, rettype, datetype, reldate,
-                mindate, maxdate):
+    def esearch(self, db, term, retstart, retmax, rettype):
         url = self.base + "esearch.fcgi"
         self.logger.debug("url: %s" % url)
         values = {'db': db,
                   'term': term,
                   'rettype': rettype,
                   'retstart': retstart,
-                  'retmax': retmax,
-                  'datetype': datetype,
-                  'reldate': reldate,
-                  'mindate': mindate,
-                  'maxdate': maxdate}
+                  'retmax': retmax}
         data = urllib.urlencode(values)
         self.logger.debug("data: %s" % str(data))
         req = urllib2.Request(url, data)
@@ -225,9 +207,14 @@
             counter += 1
             self.logger.info("Server Transaction Trial:  %s" % (counter))
             try:
+                self.logger.debug("Going to open")
                 response = urllib2.urlopen(req)
+                self.logger.debug("Going to get code")
                 response_code = response.getcode()
+                self.logger.debug("Going to read, de code was : %s",
+                                  str(response_code))
                 fasta = response.read()
+                self.logger.debug("Did all that")
                 response.close()
                 if((response_code != 200) or
                    ("Resource temporarily unavailable" in fasta) or
@@ -348,63 +335,46 @@
 LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']


+def command_parse():
+    """Parse sys.argv and return the resulting argparse Namespace."""
+    parser = argparse.ArgumentParser(description='Retrieve data from NCBI')
+    parser.add_argument('-i', dest='query_string', help='NCBI Query String',
+                        required=True)
+    parser.add_argument('-o', dest='outname', help='output file name')
+    parser.add_argument('-d', dest='dbname', help='database type')
+    parser.add_argument('--count', '-c', dest='count_ids',
+                        action='store_true', default=False,
+                        help='dry run outputting only the number of '
+                             'sequences found')
+    parser.add_argument('-l', '--logfile', help='log file (default=stderr)')
+    parser.add_argument('--loglevel', choices=LOG_LEVELS, default='INFO',
+                        help='logging level (default: INFO)')
+    return parser.parse_args()
+
+
 def __main__():
     """ main function """
-    parser = optparse.OptionParser(description='Retrieve data from NCBI')
-    parser.add_option('-i', dest='query_string', help='NCBI Query String')
-    parser.add_option('-o', dest='outname', help='output file name')
-    parser.add_option('-d', dest='dbname', help='database type')
-    parser.add_option('-l', '--logfile', help='log file (default=stderr)')
-    parser.add_option('--datetype', dest='datetype',
-                      choices=['mdat', 'pdat'],
-                      help='Type of date used to limit a search.\
-                            [ mdat(modification date), pdat(publication date)]\
-                            (default=pdat)', default='pdat')
-    parser.add_option('--reldate', dest='reldate',
-                      help='When reldate is set to an integer n, the search\
-                            returns only those items that have a date\
-                            specified by datetype within the last n days.')
-    parser.add_option('--maxdate', dest='maxdate',
-                      help='Date range used to limit a search result by the\
-                            date specified by datetype. These two parameters\
-                            (mindate, maxdate) must be used together to\
-                            specify an arbitrary date range. The general date\
-                            format is YYYY/MM/DD, and these variants are also\
-                            allowed: YYYY, YYYY/MM.')
-    parser.add_option('--mindate', dest='mindate',
-                      help='Date range used to limit a search result by the\
-                            date specified by datetype. These two parameters\
-                            (mindate, maxdate) must be used together to\
-                            specify an arbitrary date range. The general date\
-                            format is YYYY/MM/DD, and these variants are also\
-                            allowed: YYYY, YYYY/MM.')
-    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO',
-                      help='logging level (default: INFO)')
-    (options, args) = parser.parse_args()
-    if len(args) > 0:
-        parser.error('Wrong number of arguments')
-    if((options.reldate and options.maxdate) or
-       (options.reldate and options.mindate)):
-        parser.error("You can't mix 'reldate' and 'maxdate', 'mindate'\
-                     parameters")
-    if((options.mindate and not options.maxdate) or
-       (options.maxdate and not options.mindate)):
-        parser.error("mindate and maxdate must be used together")
-
-    log_level = getattr(logging, options.loglevel)
+    args = command_parse()
+    log_level = getattr(logging, args.loglevel)
     kwargs = {'format': LOG_FORMAT,
               'datefmt': LOG_DATEFMT,
               'level': log_level}
-    if options.logfile:
-        kwargs['filename'] = options.logfile
+    if args.logfile:
+        kwargs['filename'] = args.logfile
     logging.basicConfig(**kwargs)
     logger = logging.getLogger('data_from_NCBI')

-    E = Eutils(options, logger)
-    try:
-        E.retrieve()
-    except Exception:
-        sys.exit(1)
+    E = Eutils(args, logger)
+    # --count selects the dry run (count step only); otherwise the full
+    # retrieval is performed. Both share one failure path below.
+    action = E.dry_run if args.count_ids else E.retrieve
+    try:
+        action()
+    except Exception:
+        # Record the traceback in the log before signalling failure,
+        # rather than exiting without any diagnostic.
+        logger.exception("NCBI query failed")
+        sys.exit(1)


 if __name__ == "__main__":
--- a/fetch_fasta_from_NCBI.xml	Sun Oct 15 20:13:13 2017 -0400
+++ b/fetch_fasta_from_NCBI.xml	Wed Nov 08 13:00:26 2017 -0500
@@ -1,16 +1,12 @@
-<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.1.1">
+<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.2.1">
   <description></description>
   <command><![CDATA[
       python '$__tool_directory__'/fetch_fasta_from_NCBI.py
       -i "$queryString"
       -d $dbname
-      -o '$outfilename'
       -l '$logfile'
-      #if $date_condition.date_filter == "YES":
-          --datetype $date_condition.datetype
-          --mindate $date_condition.mindate
-          --maxdate $date_condition.maxdate
-      #end if
+      $dry_run
+      -o '$outfile'
   ]]></command>

   <inputs>
@@ -30,21 +26,12 @@
       <option value="nuccore">Nucleotide</option>
       <option value="protein">Protein</option>
     </param>
-    <conditional name="date_condition">
-        <param name="date_filter" type="boolean" label="Filter the sequences by date?" truevalue="YES" falsevalue="NO" checked="false"/>
-        <when value="YES">
-            <param name="datetype" type="select">
-                <option value="pdat">Publication date</option>
-                <option value="mdat">Modification date</option>
-            </param>
-            <param name="mindate" type="text" help="Date must follow one of the following formats: YYYY/MM/DD, YYYY/MM or YYYY"/>
-            <param name="maxdate" type="text" help="Date must follow one of the following formats: YYYY/MM/DD, YYYY/MM or YYYY"/>
-        </when>
-        <when value="NO"/>
-    </conditional>
+    <param name="dry_run" type="boolean" label="Dry run to get the number of sequences?" truevalue="--count" falsevalue="" checked="false"/>
   </inputs>
   <outputs>
-    <data name="outfilename" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" />
+    <data name="outfile" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" >
+      <filter> dry_run == False</filter>
+    </data>
     <data format="txt" name="logfile" label="${tool.name}: log"/>
   </outputs>
   <tests>
@@ -52,16 +39,13 @@
         <param name="queryString" value="9629650[gi]" />
         <param name="dbname" value="nuccore" />
         <output name="outfilename" ftype="fasta" file="output.fa" />
-        <!--  <output name="logfile" ftype="txt" file="log.txt" />  log.txt changes with timestamp. removed to pass the  test -->
     </test>
     <test>
         <param name="queryString" value="CU929326[Accession]" />
         <param name="dbname" value="nuccore" />
-        <param name="date_filter" value="YES"/>
-        <param name="datetype" value="pdat"/>
-        <param name="mindate" value="2008/09"/>
-        <param name="maxdate" value="2008/09"/>
-        <output name="outfilename" ftype="fasta" file="zebra_output.fa" />
+        <!-- dry run: the fasta output is filtered out, so only the log is compared -->
+        <param name="dry_run" value="True"/>
+        <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/>
     </test>
   </tests>
   <help>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/dry_run.log	Wed Nov 08 13:00:26 2017 -0500
@@ -0,0 +1,4 @@
+2017-11-08 16:27:09|INFO    |retrieving data from https://eutils.ncbi.nlm.nih.gov/entrez/eutils/
+2017-11-08 16:27:09|INFO    |for Query: Drosophila melanogaster[Organism] AND Gcn5[Title] and database: nuccore
+2017-11-08 16:27:10|INFO    |Found 4 UIDs
+