changeset 1:7e41bbb94159 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit 6008aafac37eec1916d6b72c05d9cfcb002b8095
author | artbio
date | Sun, 15 Oct 2017 13:56:26 -0400
parents | c877548ebde1
children | 50f5ef3313bb
files | fetch_fasta_from_NCBI.py fetch_fasta_from_NCBI.xml retrieve_fasta_from_NCBI.py retrieve_fasta_from_NCBI.xml
diffstat | 4 files changed, 506 insertions(+), 507 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fetch_fasta_from_NCBI.py	Sun Oct 15 13:56:26 2017 -0400
@@ -0,0 +1,411 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+From a taxonomy ID, retrieves all the corresponding nucleotide sequences.
+It returns a multiFASTA nuc/prot file.
+
+Entrez Database   UID common name   E-utility Database Name
+Nucleotide        GI number         nuccore
+Protein           GI number         protein
+
+Retrieve strategy:
+
+esearch to get the total number of UIDs (count)
+esearch to get the UIDs in batches
+loop until the end of the UIDs list:
+    epost to put a batch of UIDs in the history server
+    efetch to retrieve info from the previous post
+
+The retmax used for efetch is 1/10 of the value declared by NCBI.
+
+Queries are delayed by 1 sec to satisfy NCBI guidelines
+(more than what they request).
+"""
+import httplib
+import logging
+import optparse
+import re
+import sys
+import time
+import urllib
+import urllib2
+
+
+class QueryException(Exception):
+    pass
+
+
+class Eutils:
+
+    def __init__(self, options, logger):
+        """
+        Initialize retrieval parameters
+        """
+        self.logger = logger
+        self.base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+        self.query_string = options.query_string
+        self.dbname = options.dbname
+        if options.outname:
+            self.outname = options.outname
+        else:
+            self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta'
+        self.ids = []
+        self.retmax_esearch = 100000
+        self.retmax_efetch = 500
+        self.count = 0
+        self.webenv = ""
+        self.query_key = ""
+        self.datetype = options.datetype
+        if options.reldate:
+            self.reldate = options.reldate
+        else:
+            self.reldate = ''
+        if options.mindate:
+            self.mindate = options.mindate
+        else:
+            self.mindate = ''
+        if options.maxdate:
+            self.maxdate = options.maxdate
+        else:
+            self.maxdate = ''
+
+    def retrieve(self):
+        """
+        Retrieve the fasta sequences corresponding to the query
+        """
+        self.get_count_value()
+
+        # If no UIDs are found, exit the script
+        if self.count > 0:
+            self.get_uids_list()
+            try:
+                self.get_sequences()
+            except QueryException as e:
+                self.logger.error("Exiting script.")
+                raise e
+        else:
+            self.logger.error("No UIDs were found. Exiting script.")
+            raise Exception("")
+
+    def get_count_value(self):
+        """
+        just to retrieve Count (number of UIDs)
+        Total number of UIDs from the retrieved set to be shown in the XML
+        output (default=20). By default, ESearch only includes the first 20
+        UIDs retrieved in the XML output. If usehistory is set to 'y',
+        the remainder of the retrieved set will be stored on the History
+        server.
+
+        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
+        """
+        self.logger.info("retrieving data from %s" % self.base)
+        self.logger.info("for Query: %s and database: %s" %
+                         (self.query_string, self.dbname))
+        querylog = self.esearch(self.dbname, self.query_string, '', '',
+                                "count", self.datetype, self.reldate,
+                                self.mindate, self.maxdate)
+        self.logger.debug("Query response:")
+        for line in querylog:
+            self.logger.debug(line.rstrip())
+            if '</Count>' in line:
+                self.count = int(line[line.find('<Count>')+len('<Count>'):
+                                      line.find('</Count>')])
+        self.logger.info("Found %d UIDs" % self.count)
+
+    def get_uids_list(self):
+        """
+        Increasing retmax allows more of the retrieved UIDs to be included
+        in the XML output, up to a maximum of 100,000 records.
+        from http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
+        """
+        retmax = self.retmax_esearch
+        if (self.count > retmax):
+            num_batches = (self.count / retmax) + 1
+        else:
+            num_batches = 1
+        self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
+        self.logger.info("Number of batches for esearch action: %d " %
+                         num_batches)
+        for n in range(num_batches):
+            querylog = self.esearch(self.dbname, self.query_string, n*retmax,
+                                    retmax, '', self.datetype, self.reldate,
+                                    self.mindate, self.maxdate)
+            for line in querylog:
+                if '<Id>' in line and '</Id>' in line:
+                    uid = (line[line.find('<Id>')+len('<Id>'):
+                           line.find('</Id>')])
+                    self.ids.append(uid)
+        self.logger.info("Retrieved %d UIDs" % len(self.ids))
+
+    def esearch(self, db, term, retstart, retmax, rettype, datetype, reldate,
+                mindate, maxdate):
+        url = self.base + "esearch.fcgi"
+        self.logger.debug("url: %s" % url)
+        values = {'db': db,
+                  'term': term,
+                  'rettype': rettype,
+                  'retstart': retstart,
+                  'retmax': retmax,
+                  'datetype': datetype,
+                  'reldate': reldate,
+                  'mindate': mindate,
+                  'maxdate': maxdate}
+        data = urllib.urlencode(values)
+        self.logger.debug("data: %s" % str(data))
+        req = urllib2.Request(url, data)
+        response = urllib2.urlopen(req)
+        querylog = response.readlines()
+        response.close()
+        time.sleep(1)
+        return querylog
+
+    def epost(self, db, ids):
+        url = self.base + "epost.fcgi"
+        self.logger.debug("url_epost: %s" % url)
+        values = {'db': db,
+                  'id': ids}
+        data = urllib.urlencode(values)
+        req = urllib2.Request(url, data)
+        serverResponse = False
+        nb_trials = 0
+        while not serverResponse:
+            nb_trials += 1
+            try:
+                self.logger.debug("Try number %s for opening and reading URL"
+                                  " %s" % (nb_trials, url+data))
+                response = urllib2.urlopen(req)
+                querylog = response.readlines()
+                response.close()
+                serverResponse = True
+            except urllib2.HTTPError as e:
+                self.logger.info("urlopen error: %s, %s" % (e.code, e.read()))
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
+            except urllib2.URLError as e:
+                self.logger.info("urlopen error: Failed to reach a server")
+                self.logger.info("Reason: %s" % (e.reason))
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
+            except httplib.IncompleteRead as e:
+                self.logger.info("IncompleteRead error: %s" % (e.partial))
+                self.logger.info("Retrying in 1 sec")
+                serverResponse = False
+                time.sleep(1)
+        self.logger.debug("query response:")
+        for line in querylog:
+            self.logger.debug(line.rstrip())
+            if '</QueryKey>' in line:
+                self.query_key = str(line[line.find('<QueryKey>') +
                                     len('<QueryKey>'):
+                                     line.find('</QueryKey>')])
+            if '</WebEnv>' in line:
+                self.webenv = str(line[line.find('<WebEnv>')+len('<WebEnv>'):
+                                  line.find('</WebEnv>')])
+        self.logger.debug("*** epost action ***")
+        self.logger.debug("query_key: %s" % self.query_key)
+        self.logger.debug("webenv: %s" % self.webenv)
+        time.sleep(1)
+
+    def efetch(self, db, query_key, webenv):
+        url = self.base + "efetch.fcgi"
+        self.logger.debug("url_efetch: %s" % url)
+        values = {'db': db,
+                  'query_key': query_key,
+                  'webenv': webenv,
+                  'rettype': "fasta",
+                  'retmode': "text"}
+        data = urllib.urlencode(values)
+        req = urllib2.Request(url, data)
+        self.logger.debug("data: %s" % str(data))
+        serverTransaction = False
+        counter = 0
+        response_code = 0
+        while not serverTransaction:
+            counter += 1
+            self.logger.info("Server Transaction Trial: %s" % (counter))
+            try:
+                response = urllib2.urlopen(req)
+                response_code = response.getcode()
+                fasta = response.read()
+                response.close()
+                if((response_code != 200) or
+                   ("Resource temporarily unavailable" in fasta) or
+                   ("Error" in fasta) or (not fasta.startswith(">"))):
+                    serverTransaction = False
+                    if (response_code != 200):
+                        self.logger.info("urlopen error: Response code is not\
+                                         200")
+                    elif ("Resource temporarily unavailable" in fasta):
+                        self.logger.info("Resource temporarily unavailable")
+                    elif ("Error" in fasta):
+                        self.logger.info("Error in fasta")
+                    else:
+                        self.logger.info("Fasta doesn't start with '>'")
+                else:
+                    serverTransaction = True
+            except urllib2.HTTPError as e:
+                serverTransaction = False
+                self.logger.info("urlopen error: %s, %s" % (e.code, e.read()))
+            except urllib2.URLError as e:
+                serverTransaction = False
+                self.logger.info("urlopen error: Failed to reach a server")
+                self.logger.info("Reason: %s" % (e.reason))
+            except httplib.IncompleteRead as e:
+                serverTransaction = False
+                self.logger.info("IncompleteRead error: %s" % (e.partial))
+            if (counter > 500):
+                serverTransaction = True
+        if (counter > 500):
+            raise QueryException({"message":
+                                  "500 Server Transaction Trials attempted\
+                                  for this batch. Aborting."})
+        fasta = self.sanitiser(self.dbname, fasta)
+        time.sleep(0.1)
+        return fasta
+
+    def sanitiser(self, db, fastaseq):
+        if db not in ("nuccore", "protein"):
+            return fastaseq
+        regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")
+        sane_seqlist = []
+        seqlist = fastaseq.split("\n\n")
+        for seq in seqlist[:-1]:
+            fastalines = seq.split("\n")
+            if len(fastalines) < 2:
+                self.logger.info("Empty sequence for %s" %
+                                 ("|".join(fastalines[0].split("|")[:4])))
+                self.logger.info("%s download is skipped" %
+                                 ("|".join(fastalines[0].split("|")[:4])))
+                continue
+            if db == "nuccore":
+                badnuc = 0
+                for nucleotide in fastalines[1]:
+                    if nucleotide not in "ATGC":
+                        badnuc += 1
+                if float(badnuc)/len(fastalines[1]) > 0.4:
+                    self.logger.info("%s ambiguous nucleotides in %s\
+                                     or download interrupted at this offset\
+                                     | %s" % (float(badnuc)/len(fastalines[1]),
+                                              "|".join(fastalines[0].split("|")
+                                                       [:4]),
+                                              fastalines[1]))
+                    self.logger.info("%s download is skipped" %
+                                     ("|".join(fastalines[0].split("|")[:4])))
+                    continue
+                # remove spaces and trim the header to 100 chars
+                fastalines[0] = fastalines[0].replace(" ", "_")[:100]
+                cleanseq = "\n".join(fastalines)
+                sane_seqlist.append(cleanseq)
+            elif db == "protein":
+                fastalines[0] = fastalines[0][0:100]
+                fastalines[0] = fastalines[0].replace(" ", "_")
+                fastalines[0] = fastalines[0].replace("[", "_")
+                fastalines[0] = fastalines[0].replace("]", "_")
+                fastalines[0] = fastalines[0].replace("=", "_")
+                # because blast makedb doesn't like it
+                fastalines[0] = fastalines[0].rstrip("_")
+                fastalines[0] = re.sub(regex, "_", fastalines[0])
+                cleanseq = "\n".join(fastalines)
+                sane_seqlist.append(cleanseq)
+        self.logger.info("clean sequences appended: %d" % (len(sane_seqlist)))
+        return "\n".join(sane_seqlist)
+
+    def get_sequences(self):
+        """
+        Total number of records from the input set to be retrieved,
+        up to a maximum of 10,000. Optionally, for a large set the value of
+        retstart can be iterated while holding retmax constant, thereby
+        downloading the entire set in batches of size retmax.
+
+        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
+        """
+        batch_size = self.retmax_efetch
+        count = self.count
+        uids_list = self.ids
+        self.logger.info("Batch size for efetch action: %d" % batch_size)
+        self.logger.info("Number of batches for efetch action: %d" %
+                         ((count / batch_size) + 1))
+        with open(self.outname, 'w') as out:
+            for start in range(0, count, batch_size):
+                end = min(count, start+batch_size)
+                batch = uids_list[start:end]
+                if self.epost(self.dbname, ",".join(batch)) != -1:
+                    mfasta = ''
+                    while not mfasta:
+                        self.logger.info("retrieving batch %d" %
+                                         ((start / batch_size) + 1))
+                        try:
+                            mfasta = self.efetch(self.dbname, self.query_key,
+                                                 self.webenv)
+                            out.write(mfasta + '\n')
+                        except QueryException as e:
+                            self.logger.error("%s" % e.message)
+                            raise e
+
+
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+
+def __main__():
+    """ main function """
+    parser = optparse.OptionParser(description='Retrieve data from NCBI')
+    parser.add_option('-i', dest='query_string', help='NCBI Query String')
+    parser.add_option('-o', dest='outname', help='output file name')
+    parser.add_option('-d', dest='dbname', help='database type')
+    parser.add_option('-l', '--logfile', help='log file (default=stderr)')
+    parser.add_option('--datetype', dest='datetype',
+                      choices=['mdat', 'pdat'],
+                      help='Type of date used to limit a search.\
+                      [ mdat(modification date), pdat(publication date)]\
+                      (default=pdat)', default='pdat')
+    parser.add_option('--reldate', dest='reldate',
+                      help='When reldate is set to an integer n, the search\
+                      returns only those items that have a date\
+                      specified by datetype within the last n days.')
+    parser.add_option('--maxdate', dest='maxdate',
+                      help='Date range used to limit a search result by the\
+                      date specified by datetype. These two parameters\
+                      (mindate, maxdate) must be used together to\
+                      specify an arbitrary date range. The general date\
+                      format is YYYY/MM/DD, and these variants are also\
+                      allowed: YYYY, YYYY/MM.')
+    parser.add_option('--mindate', dest='mindate',
+                      help='Date range used to limit a search result by the\
+                      date specified by datetype. These two parameters\
+                      (mindate, maxdate) must be used together to\
+                      specify an arbitrary date range. The general date\
+                      format is YYYY/MM/DD, and these variants are also\
+                      allowed: YYYY, YYYY/MM.')
+    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO',
+                      help='logging level (default: INFO)')
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        parser.error('Wrong number of arguments')
+    if((options.reldate and options.maxdate) or
+       (options.reldate and options.mindate)):
+        parser.error("You can't mix 'reldate' and 'maxdate', 'mindate'\
+                     parameters")
+    if((options.mindate and not options.maxdate) or
+       (options.maxdate and not options.mindate)):
+        parser.error("mindate and maxdate must be used together")
+
+    log_level = getattr(logging, options.loglevel)
+    kwargs = {'format': LOG_FORMAT,
+              'datefmt': LOG_DATEFMT,
+              'level': log_level}
+    if options.logfile:
+        kwargs['filename'] = options.logfile
+    logging.basicConfig(**kwargs)
+    logger = logging.getLogger('data_from_NCBI')
+
+    E = Eutils(options, logger)
+    try:
+        E.retrieve()
+    except Exception:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    __main__()
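[Editor's note] The script above targets Python 2 (urllib2, httplib, optparse). For comparison, the same four-step E-utilities flow — esearch for the count, esearch for the UIDs, epost of each batch to the History server, efetch of the posted batch as FASTA — can be sketched with the Python 3 standard library. This is a minimal illustration under stated assumptions, not the tool itself: the retry loops, FASTA sanitising and logging are omitted, and call, fetch_fasta and the example query are names invented here.

    # Minimal Python 3 sketch of the esearch -> epost -> efetch flow.
    # Illustrative only; no retries or sanitising.
    import time
    import urllib.parse
    import urllib.request
    import xml.etree.ElementTree as ET

    BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    def call(endpoint, **params):
        """POST one E-utility request and return the response body as text."""
        data = urllib.parse.urlencode(params).encode()
        with urllib.request.urlopen(BASE + endpoint, data) as response:
            body = response.read().decode()
        time.sleep(1)  # stay under NCBI's request-rate guidelines
        return body

    def fetch_fasta(query, db="nuccore", batch_size=500):
        # 1) esearch with rettype=count to learn how many UIDs match
        count = int(ET.fromstring(call("esearch.fcgi", db=db, term=query,
                                       rettype="count")).findtext("Count"))
        # 2) esearch again to collect the UIDs (a single page here; the
        #    tool pages with retstart for counts above 100,000)
        root = ET.fromstring(call("esearch.fcgi", db=db, term=query,
                                  retstart=0, retmax=count))
        uids = [e.text for e in root.iter("Id")]
        # 3) epost each batch to the History server, then
        # 4) efetch the posted batch as FASTA via WebEnv/query_key
        chunks = []
        for start in range(0, len(uids), batch_size):
            batch = uids[start:start + batch_size]
            posted = ET.fromstring(call("epost.fcgi", db=db,
                                        id=",".join(batch)))
            chunks.append(call("efetch.fcgi", db=db,
                               query_key=posted.findtext("QueryKey"),
                               WebEnv=posted.findtext("WebEnv"),
                               rettype="fasta", retmode="text"))
        return "".join(chunks)

    if __name__ == "__main__":
        print(fetch_fasta("CU929326[Accession]")[:200])

One reason to POST UID batches with epost and fetch them back through the History server, rather than passing every UID to efetch directly, is that each efetch request stays small and repeatable; the tool's batch size of 500 UIDs reflects the same constraint.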
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fetch_fasta_from_NCBI.xml	Sun Oct 15 13:56:26 2017 -0400
@@ -0,0 +1,95 @@
+<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.1.0">
+    <description></description>
+    <command><![CDATA[
+        python '$__tool_directory__'/fetch_fasta_from_NCBI.py
+        -i "$queryString"
+        -d $dbname
+        -o '$outfilename'
+        -l '$logfile'
+        #if $date_condition.date_filter == "YES":
+            --datetype $date_condition.datetype
+            --mindate $date_condition.mindate
+            --maxdate $date_condition.maxdate
+        #end if
+    ]]></command>
+    <inputs>
+        <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="example: 'Drosophila melanogaster[Organism] AND Gcn5[Title]'">
+            <sanitizer>
+                <valid initial="string.printable">
+                    <remove value="&quot;"/>
+                    <remove value="\"/>
+                </valid>
+                <mapping initial="none">
+                    <add source="&quot;" target="\&quot;"/>
+                    <add source="\" target="\\"/>
+                </mapping>
+            </sanitizer>
+        </param>
+        <param name="dbname" type="select" label="NCBI database">
+            <option value="nuccore">Nucleotide</option>
+            <option value="protein">Protein</option>
+        </param>
+        <conditional name="date_condition">
+            <param name="date_filter" type="boolean" label="Filter the sequences by date?" truevalue="YES" falsevalue="NO" checked="false"/>
+            <when value="YES">
+                <param name="datetype" type="select">
+                    <option value="pdat">Publication date</option>
+                    <option value="mdat">Modification date</option>
+                </param>
+                <param name="mindate" type="text" help="Date must follow one of the following formats: YYYY/MM/DD, YYYY/MM or YYYY"/>
+                <param name="maxdate" type="text" help="Date must follow one of the following formats: YYYY/MM/DD, YYYY/MM or YYYY"/>
+            </when>
+            <when value="NO"/>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="outfilename" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" />
+        <data format="txt" name="logfile" label="${tool.name}: log"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="queryString" value="9629650[gi]" />
+            <param name="dbname" value="nuccore" />
+            <output name="outfilename" ftype="fasta" file="output.fa" />
+            <!-- <output name="logfile" ftype="txt" file="log.txt" /> log.txt changes with the timestamp; removed to pass the test -->
+        </test>
+        <test>
+            <param name="queryString" value="CU929326[Accession]" />
+            <param name="dbname" value="nuccore" />
+            <param name="date_filter" value="YES"/>
+            <param name="datetype" value="pdat"/>
+            <param name="mindate" value="2008/09"/>
+            <param name="maxdate" value="2008/09"/>
+            <output name="outfilename" ftype="fasta" file="zebra_output.fa" />
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database for a given entrez query.
+
+The tool is preset with "txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" for Metavisitor use.
+
+See `Entrez help`_ for an explanation of query formats.
+
+Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields.
+
+Note that the tool may fail if the connection with the NCBI database is interrupted (see the log dataset).
+
+**Acknowledgments**
+
+This Galaxy tool has been adapted from the Galaxy tool `get_fasta_from_taxon`_.
+
+It is Copyright © 2014-2015 `CNRS and University Pierre et Marie Curie`_ and is released under the `MIT license`_.
+
+.. _Entrez help: https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Entrez_Searching_Options
+.. _get_fasta_from_taxon: https://toolshed.g2.bx.psu.edu/view/crs4/get_fasta_from_taxon
+.. _CNRS and University Pierre et Marie Curie: http://www.ibps.upmc.fr/en
+.. _MIT license: http://opensource.org/licenses/MIT
+    </help>
+    <citations>
+        <citation type="doi">10.1186/1471-2105-14-73</citation>
+    </citations>
+</tool>
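[Editor's note] The Galaxy <command> block above expands to an ordinary command line, so the wrapped script can also be run outside Galaxy. A hypothetical invocation mirroring the second test case (the output and log file names are placeholders):

    python fetch_fasta_from_NCBI.py \
        -i "CU929326[Accession]" \
        -d nuccore \
        -o zebra_output.fa \
        -l fetch.log \
        --datetype pdat \
        --mindate 2008/09 \
        --maxdate 2008/09

As in the tool XML, --mindate and --maxdate must be given together, and the script refuses to combine --reldate with either of them.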
--- a/retrieve_fasta_from_NCBI.py	Thu Aug 24 08:18:34 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,411 +0,0 @@
[Removed file: retrieve_fasta_from_NCBI.py, identical to the new fetch_fasta_from_NCBI.py above except for the import order and an additional 'edat' (Entrez date) choice for --datetype, which the new script drops.]
--- a/retrieve_fasta_from_NCBI.xml	Thu Aug 24 08:18:34 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,96 +0,0 @@
[Removed file: retrieve_fasta_from_NCBI.xml (tool version 2.0.0), identical to the new fetch_fasta_from_NCBI.xml above except that its command ran retrieve_fasta_from_NCBI.py and its date filter offered an additional 'Entrez date' (edat) option.]