Mercurial > repos > artbio > fetch_fasta_from_ncbi

--- a/fetch_fasta_from_NCBI.py	Wed Nov 08 13:00:26 2017 -0500
+++ b/fetch_fasta_from_NCBI.py	Wed Nov 29 17:38:52 2017 -0500
@@ -55,6 +55,13 @@
         self.count = 0
         self.webenv = ""
         self.query_key = ""
+        if options.get_uids:
+            self.get_uids = True
+        else:
+            self.get_uids = False
+        if options.iuds_file:
+            with open(options.iuds_file, 'r') as f:
+                self.ids.extend(f.readline().split(' '))

     def dry_run(self):
         self.get_count_value()
@@ -63,15 +70,23 @@
         """
         Retrieve the fasta sequences corresponding to the query
         """
-        self.get_count_value()
+        if len(self.ids) == 0:
+            self.get_count_value()
+        else:
+            self.count = len(self.ids)
         # If no UIDs are found exit script
         if self.count > 0:
-            self.get_uids_list()
-            try:
-                self.get_sequences()
-            except QueryException as e:
-                self.logger.error("Exiting script.")
-                raise e
+            if len(self.ids) == 0:
+                self.get_uids_list()
+            if not self.get_uids:
+                try:
+                    self.get_sequences()
+                except QueryException as e:
+                    self.logger.error("Exiting script.")
+                    raise e
+            else:
+                with open(self.outname, 'w') as f:
+                    f.write('\t'.join(self.ids)+'\n')
         else:
             self.logger.error("No UIDs were found. Exiting script.")
             raise Exception("")
@@ -140,61 +155,58 @@
         time.sleep(1)
         return querylog

-    def epost(self, db, ids):
-        url = self.base + "epost.fcgi"
-        self.logger.debug("url_epost: %s" % url)
-        values = {'db': db,
-                  'id': ids}
-        data = urllib.urlencode(values)
-        req = urllib2.Request(url, data)
-        serverResponse = False
-        nb_trials = 0
-        while not serverResponse:
-            nb_trials += 1
-            try:
-                self.logger.debug("Try number %s for opening and readin URL %s"
-                                  % (nb_trials, url+data))
-                response = urllib2.urlopen(req)
-                querylog = response.readlines()
-                response.close()
-                serverResponse = True
-            except urllib2.HTTPError as e:
-                self.logger.info("urlopen error:%s, %s" % (e.code, e.read()))
-                self.logger.info("Retrying in 1 sec")
-                serverResponse = False
-                time.sleep(1)
-            except urllib2.URLError as e:
-                self.logger.info("urlopen error: Failed to reach a server")
-                self.logger.info("Reason :%s" % (e.reason))
-                self.logger.info("Retrying in 1 sec")
-                serverResponse = False
-                time.sleep(1)
-            except httplib.IncompleteRead as e:
-                self.logger.info("IncompleteRead error:  %s" % (e.partial))
-                self.logger.info("Retrying in 1 sec")
-                serverResponse = False
-                time.sleep(1)
-        self.logger.debug("query response:")
-        for line in querylog:
-            self.logger.debug(line.rstrip())
-            if '</QueryKey>' in line:
-                self.query_key = str(line[line.find('<QueryKey>') +
-                                     len('<QueryKey>'):line.find('</QueryKey>')
-                                     ])
-            if '</WebEnv>' in line:
-                self.webenv = str(line[line.find('<WebEnv>')+len('<WebEnv>'):
-                                  line.find('</WebEnv>')])
-            self.logger.debug("*** epost action ***")
-            self.logger.debug("query_key: %s" % self.query_key)
-            self.logger.debug("webenv: %s" % self.webenv)
-        time.sleep(1)
+    def sanitiser(self, db, fastaseq):
+        if(db not in "nuccore protein"):
+            return fastaseq
+        regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")
+        sane_seqlist = []
+        seqlist = fastaseq.split("\n\n")
+        for seq in seqlist[:-1]:
+            fastalines = seq.split("\n")
+            if len(fastalines) < 2:
+                self.logger.info("Empty sequence for %s" %
+                                 ("|".join(fastalines[0].split("|")[:4])))
+                self.logger.info("%s download is skipped" %
+                                 ("|".join(fastalines[0].split("|")[:4])))
+                continue
+            if db == "nuccore":
+                badnuc = 0
+                for nucleotide in fastalines[1]:
+                    if nucleotide not in "ATGC":
+                        badnuc += 1
+                if float(badnuc)/len(fastalines[1]) > 0.4:
+                    self.logger.info("%s ambiguous nucleotides in %s\
+                                     or download interrupted at this offset\
+                                     | %s" % (float(badnuc)/len(fastalines[1]),
+                                              "|".join(fastalines[0].split("|")
+                                                       [:4]),
+                                              fastalines[1]))
+                    self.logger.info("%s download is skipped" %
+                                     (fastalines[0].split("|")[:4]))
+                    continue
+                """ remove spaces and trim the header to 100 chars """
+                fastalines[0] = fastalines[0].replace(" ", "_")[:100]
+                cleanseq = "\n".join(fastalines)
+                sane_seqlist.append(cleanseq)
+            elif db == "protein":
+                fastalines[0] = fastalines[0][0:100]
+                fastalines[0] = fastalines[0].replace(" ", "_")
+                fastalines[0] = fastalines[0].replace("[", "_")
+                fastalines[0] = fastalines[0].replace("]", "_")
+                fastalines[0] = fastalines[0].replace("=", "_")
+                """ because blast makedb doesn't like it """
+                fastalines[0] = fastalines[0].rstrip("_")
+                fastalines[0] = re.sub(regex, "_", fastalines[0])
+                cleanseq = "\n".join(fastalines)
+                sane_seqlist.append(cleanseq)
+        self.logger.info("clean sequences appended: %d" % (len(sane_seqlist)))
+        return "\n".join(sane_seqlist)

-    def efetch(self, db, query_key, webenv):
+    def efetch(self, db, uid_list):
         url = self.base + "efetch.fcgi"
         self.logger.debug("url_efetch: %s" % url)
         values = {'db': db,
-                  'query_key': query_key,
-                  'webenv': webenv,
+                  'id': uid_list,
                   'rettype': "fasta",
                   'retmode': "text"}
         data = urllib.urlencode(values)
@@ -251,62 +263,8 @@
         time.sleep(0.1)
         return fasta

-    def sanitiser(self, db, fastaseq):
-        if(db not in "nuccore protein"):
-            return fastaseq
-        regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")
-        sane_seqlist = []
-        seqlist = fastaseq.split("\n\n")
-        for seq in seqlist[:-1]:
-            fastalines = seq.split("\n")
-            if len(fastalines) < 2:
-                self.logger.info("Empty sequence for %s" %
-                                 ("|".join(fastalines[0].split("|")[:4])))
-                self.logger.info("%s download is skipped" %
-                                 ("|".join(fastalines[0].split("|")[:4])))
-                continue
-            if db == "nuccore":
-                badnuc = 0
-                for nucleotide in fastalines[1]:
-                    if nucleotide not in "ATGC":
-                        badnuc += 1
-                if float(badnuc)/len(fastalines[1]) > 0.4:
-                    self.logger.info("%s ambiguous nucleotides in %s\
-                                     or download interrupted at this offset\
-                                     | %s" % (float(badnuc)/len(fastalines[1]),
-                                              "|".join(fastalines[0].split("|")
-                                                       [:4]),
-                                              fastalines[1]))
-                    self.logger.info("%s download is skipped" %
-                                     (fastalines[0].split("|")[:4]))
-                    continue
-                """ remove spaces and trim the header to 100 chars """
-                fastalines[0] = fastalines[0].replace(" ", "_")[:100]
-                cleanseq = "\n".join(fastalines)
-                sane_seqlist.append(cleanseq)
-            elif db == "protein":
-                fastalines[0] = fastalines[0][0:100]
-                fastalines[0] = fastalines[0].replace(" ", "_")
-                fastalines[0] = fastalines[0].replace("[", "_")
-                fastalines[0] = fastalines[0].replace("]", "_")
-                fastalines[0] = fastalines[0].replace("=", "_")
-                """ because blast makedb doesn't like it """
-                fastalines[0] = fastalines[0].rstrip("_")
-                fastalines[0] = re.sub(regex, "_", fastalines[0])
-                cleanseq = "\n".join(fastalines)
-                sane_seqlist.append(cleanseq)
-        self.logger.info("clean sequences appended: %d" % (len(sane_seqlist)))
-        return "\n".join(sane_seqlist)
-
     def get_sequences(self):
-        """
-        Total number of records from the input set to be retrieved,
-        up to a maximum of 10,000. Optionally, for a large set the value of
-        retstart can be iterated while holding retmax constant, thereby
-        downloading the entire set in batches of size retmax.
-        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
-        """
-        batch_size = self.retmax_efetch
+        batch_size = 200
         count = self.count
         uids_list = self.ids
         self.logger.info("Batch size for efetch action: %d" % batch_size)
@@ -316,18 +274,15 @@
             for start in range(0, count, batch_size):
                 end = min(count, start+batch_size)
                 batch = uids_list[start:end]
-                if self.epost(self.dbname, ",".join(batch)) != -1:
-                    mfasta = ''
-                    while not mfasta:
-                        self.logger.info("retrieving batch %d" %
-                                         ((start / batch_size) + 1))
-                        try:
-                            mfasta = self.efetch(self.dbname, self.query_key,
-                                                 self.webenv)
-                            out.write(mfasta + '\n')
-                        except QueryException as e:
-                            self.logger.error("%s" % e.message)
-                            raise e
+                self.logger.info("retrieving batch %d" %
+                                 ((start / batch_size) + 1))
+                try:
+                    mfasta = self.efetch(self.dbname, ','.join(batch))
+                    out.write(mfasta + '\n')
+                except QueryException as e:
+                    self.logger.error("%s" % e.message)
+                    raise e
+        urllib.urlcleanup()


 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
@@ -337,18 +292,25 @@

 def command_parse():
     parser = argparse.ArgumentParser(description='Retrieve data from NCBI')
-    parser.add_argument('-i', dest='query_string', help='NCBI Query String',
-                        required=True)
+    parser.add_argument('-i', dest='query_string', help='NCBI Query String')
+    parser.add_argument('--UID_list', dest='iuds_file',
+                        help='file containing a list of iuds to be fetched')
     parser.add_argument('-o', dest='outname', help='output file name')
     parser.add_argument('-d', dest='dbname', help='database type')
     parser.add_argument('--count', '-c', dest='count_ids',
                         action='store_true', default=False,
                         help='dry run ouputing only the number of sequences\
                         found')
+    parser.add_argument('--get_uids', '-u', dest='get_uids', default=False,
+                        action='store_true', help='prints to the output a list\
+                        of UIDs')
     parser.add_argument('-l', '--logfile', help='log file (default=stderr)')
     parser.add_argument('--loglevel', choices=LOG_LEVELS, default='INFO',
                         help='logging level (default: INFO)')
     args = parser.parse_args()
+    if args.query_string is not None and args.iuds_file is not None:
+        parser.error('Please choose either fetching the -i query or the -u\
+                     list.')
     return args


@@ -369,12 +331,12 @@
         try:
             E.dry_run()
         except Exception:
-            sys.exit(1)
+            sys.exit(-1)
     else:
         try:
             E.retrieve()
         except Exception:
-            sys.exit(1)
+            sys.exit(-1)


 if __name__ == "__main__":
--- a/fetch_fasta_from_NCBI.xml	Wed Nov 08 13:00:26 2017 -0500
+++ b/fetch_fasta_from_NCBI.xml	Wed Nov 29 17:38:52 2017 -0500
@@ -1,16 +1,71 @@
-<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.2.1">
+<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.3.0">
   <description></description>
   <command><![CDATA[
-      python '$__tool_directory__'/fetch_fasta_from_NCBI.py
-      -i "$queryString"
-      -d $dbname
-      -l '$logfile'
-      $dry_run
-      -o '$outfile'
+    python '$__tool_directory__'/fetch_fasta_from_NCBI.py
+        -i "$queryString"
+        -d $dbname
+        -l '$logfile'
+        -c
+        -o '$outfile';
+    #if $dry_run == ""
+        number_UIDs=\$(tail -n 2 $logfile | perl -ne '/Found (\d+) UID/ && print \$1');
+        python '$__tool_directory__'/fetch_fasta_from_NCBI.py
+            -i "$queryString"
+            -d $dbname
+            -u
+            -l '$logfile'
+            -o 'uid_outfile';
+        UID_array=( \$(head uid_outfile) );
+        array_len=\${#UID_array[@]};
+        counter=0;
+        number_of_groups=\$((array_len / 200000));
+        modulo=\$((array_len % 200000));
+        if [ "\$modulo" -gt 0 ];then
+            number_of_groups=\$((number_of_groups + 1));
+        fi;
+        group_number=1;
+        echo "----- Number of groups of batches: \$number_of_groups -----" >> $logfile;
+        for ((i=0; i+200000<array_len;i+=200000)); do
+            echo "----- Group number: \$group_number -----" >> $logfile;
+            echo "\${UID_array[@]:\$i:99999}" > uid_list_1.txt;
+            echo "\${UID_array[@]:\$((i+100000)):99999}" > uid_list_2.txt;
+            python '$__tool_directory__'/fetch_fasta_from_NCBI.py
+                -d $dbname
+                -l '$logfile'
+                -o 'tmp1_outfile'
+                --UID_list uid_list_1.txt&
+            python '$__tool_directory__'/fetch_fasta_from_NCBI.py
+                -d $dbname
+                -l 'tmp1_logfile'
+                -o 'tmp2_outfile'
+                --UID_list uid_list_2.txt&
+            wait;
+            cat tmp1_outfile tmp2_outfile>> $outfile;
+            rm tmp1_outfile tmp2_outfile;
+            cat tmp1_logfile >> $logfile;
+            rm tmp1_logfile;
+            rm uid_list_1.txt uid_list_2.txt;
+            group_number=\$((group_number + 1));
+            counter=\$(( counter + 200000 ));
+        done;
+        echo "----- Group number: \$group_number -----" >> $logfile;
+        echo "----- Last group -----" >> $logfile;
+        if [ "\$counter" -lt "\$array_len" ]; then
+            echo "\${UID_array[@]:\$counter:\$((array_len - counter + 1))}" > uid_list.txt;
+            python '$__tool_directory__'/fetch_fasta_from_NCBI.py
+                -d $dbname
+                -l '$logfile'
+                -o 'tmp_outfile'
+                --UID_list uid_list.txt;
+            rm uid_list.txt;
+            cat tmp_outfile >> $outfile;
+            rm tmp_outfile;
+        fi;
+    #end if
   ]]></command>

   <inputs>
-    <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="exemple:'Drosophila melanogaster[Organism] AND Gcn5[Title]">
+    <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="exemple: Drosophila melanogaster[Organism] AND Gcn5[Title]">
       <sanitizer>
         <valid initial="string.printable">
           <remove value="&quot;"/>
@@ -26,7 +81,7 @@
       <option value="nuccore">Nucleotide</option>
       <option value="protein">Protein</option>
     </param>
-    <param name="dry_run" type="boolean" label="Dry run to get the number of sequences?" truevalue="--count" falsevalue="" checked="false"/>
+    <param name="dry_run" type="boolean" label="Get only the number of sequences" truevalue="--count" falsevalue="" checked="false"/>
   </inputs>
   <outputs>
     <data name="outfile" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" >
@@ -35,23 +90,29 @@
     <data format="txt" name="logfile" label="${tool.name}: log"/>
   </outputs>
   <tests>
-    <test>
-        <param name="queryString" value="9629650[gi]" />
-        <param name="dbname" value="nuccore" />
-        <output name="outfilename" ftype="fasta" file="output.fa" />
-    </test>
-    <test>
-        <param name="queryString" value="CU929326[Accession]" />
-        <param name="dbname" value="nuccore" />
-        <param name="date_filter" value="1"/>
-        <param name="dry_run" value="True"/>
-        <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/>
-    </test>
+      <test>
+          <param name="queryString" value="9629650[gi]" />
+          <param name="dbname" value="nuccore" />
+          <output name="outfilename" ftype="fasta" file="output.fa" />
+      </test>
+      <test>
+          <param name="queryString" value="CU929326[Accession]" />
+          <param name="dbname" value="nuccore" />
+          <param name="date_filter" value="1"/>
+          <param name="dry_run" value="True"/>
+          <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/>
+      </test>
+      <test>
+          <param name="queryString" value="Drosophila[Organism] AND 2014[PDAT] AND virus" />
+          <output name="outfilename" ftype="fasta" >
+              <metadata name="sequences" value="13" />
+          </output>
+      </test>
   </tests>
   <help>
 **What it does**

-This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database for a given entrez query.
+This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database (nuccore or protein) for a given entrez query.

 The tool is preset with "txid10239[orgn] NOT txid131567[orgn] AND complete NOT partial[title] NOT phage[title]" for metaVisitor use purpose

@@ -59,8 +120,12 @@

 Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields.

+By checking the checkbox you can also run your query without sequence retrieval and get the number of sequences your query will fetch.
+
 Note that the tool may fail in case of interrupted connexion with the NCBI database (see the log dataset)

+Retrieval progress is reported in the log dataset.
+
 **Acknowledgments**

 This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_.