Mercurial > repos > artbio > fetch_fasta_from_ncbi
changeset 4:c667d0ee39f5 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit ca3070e85c370b914ffa0562afe12b363e05aea4
author | artbio |
---|---|
date | Wed, 29 Nov 2017 17:38:52 -0500 |
parents | 8be88084f89c |
children | 706fe8139955 |
files | fetch_fasta_from_NCBI.py fetch_fasta_from_NCBI.xml |
diffstat | 2 files changed, 178 insertions(+), 151 deletions(-) [+] |
line wrap: on
line diff
--- a/fetch_fasta_from_NCBI.py Wed Nov 08 13:00:26 2017 -0500 +++ b/fetch_fasta_from_NCBI.py Wed Nov 29 17:38:52 2017 -0500 @@ -55,6 +55,13 @@ self.count = 0 self.webenv = "" self.query_key = "" + if options.get_uids: + self.get_uids = True + else: + self.get_uids = False + if options.iuds_file: + with open(options.iuds_file, 'r') as f: + self.ids.extend(f.readline().split(' ')) def dry_run(self): self.get_count_value() @@ -63,15 +70,23 @@ """ Retrieve the fasta sequences corresponding to the query """ - self.get_count_value() + if len(self.ids) == 0: + self.get_count_value() + else: + self.count = len(self.ids) # If no UIDs are found exit script if self.count > 0: - self.get_uids_list() - try: - self.get_sequences() - except QueryException as e: - self.logger.error("Exiting script.") - raise e + if len(self.ids) == 0: + self.get_uids_list() + if not self.get_uids: + try: + self.get_sequences() + except QueryException as e: + self.logger.error("Exiting script.") + raise e + else: + with open(self.outname, 'w') as f: + f.write('\t'.join(self.ids)+'\n') else: self.logger.error("No UIDs were found. Exiting script.") raise Exception("") @@ -140,61 +155,58 @@ time.sleep(1) return querylog - def epost(self, db, ids): - url = self.base + "epost.fcgi" - self.logger.debug("url_epost: %s" % url) - values = {'db': db, - 'id': ids} - data = urllib.urlencode(values) - req = urllib2.Request(url, data) - serverResponse = False - nb_trials = 0 - while not serverResponse: - nb_trials += 1 - try: - self.logger.debug("Try number %s for opening and readin URL %s" - % (nb_trials, url+data)) - response = urllib2.urlopen(req) - querylog = response.readlines() - response.close() - serverResponse = True - except urllib2.HTTPError as e: - self.logger.info("urlopen error:%s, %s" % (e.code, e.read())) - self.logger.info("Retrying in 1 sec") - serverResponse = False - time.sleep(1) - except urllib2.URLError as e: - self.logger.info("urlopen error: Failed to reach a server") - self.logger.info("Reason :%s" % (e.reason)) - self.logger.info("Retrying in 1 sec") - serverResponse = False - time.sleep(1) - except httplib.IncompleteRead as e: - self.logger.info("IncompleteRead error: %s" % (e.partial)) - self.logger.info("Retrying in 1 sec") - serverResponse = False - time.sleep(1) - self.logger.debug("query response:") - for line in querylog: - self.logger.debug(line.rstrip()) - if '</QueryKey>' in line: - self.query_key = str(line[line.find('<QueryKey>') + - len('<QueryKey>'):line.find('</QueryKey>') - ]) - if '</WebEnv>' in line: - self.webenv = str(line[line.find('<WebEnv>')+len('<WebEnv>'): - line.find('</WebEnv>')]) - self.logger.debug("*** epost action ***") - self.logger.debug("query_key: %s" % self.query_key) - self.logger.debug("webenv: %s" % self.webenv) - time.sleep(1) + def sanitiser(self, db, fastaseq): + if(db not in "nuccore protein"): + return fastaseq + regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}") + sane_seqlist = [] + seqlist = fastaseq.split("\n\n") + for seq in seqlist[:-1]: + fastalines = seq.split("\n") + if len(fastalines) < 2: + self.logger.info("Empty sequence for %s" % + ("|".join(fastalines[0].split("|")[:4]))) + self.logger.info("%s download is skipped" % + ("|".join(fastalines[0].split("|")[:4]))) + continue + if db == "nuccore": + badnuc = 0 + for nucleotide in fastalines[1]: + if nucleotide not in "ATGC": + badnuc += 1 + if float(badnuc)/len(fastalines[1]) > 0.4: + self.logger.info("%s ambiguous nucleotides in %s\ + or download interrupted at this offset\ + | %s" % (float(badnuc)/len(fastalines[1]), + "|".join(fastalines[0].split("|") + [:4]), + fastalines[1])) + self.logger.info("%s download is skipped" % + (fastalines[0].split("|")[:4])) + continue + """ remove spaces and trim the header to 100 chars """ + fastalines[0] = fastalines[0].replace(" ", "_")[:100] + cleanseq = "\n".join(fastalines) + sane_seqlist.append(cleanseq) + elif db == "protein": + fastalines[0] = fastalines[0][0:100] + fastalines[0] = fastalines[0].replace(" ", "_") + fastalines[0] = fastalines[0].replace("[", "_") + fastalines[0] = fastalines[0].replace("]", "_") + fastalines[0] = fastalines[0].replace("=", "_") + """ because blast makedb doesn't like it """ + fastalines[0] = fastalines[0].rstrip("_") + fastalines[0] = re.sub(regex, "_", fastalines[0]) + cleanseq = "\n".join(fastalines) + sane_seqlist.append(cleanseq) + self.logger.info("clean sequences appended: %d" % (len(sane_seqlist))) + return "\n".join(sane_seqlist) - def efetch(self, db, query_key, webenv): + def efetch(self, db, uid_list): url = self.base + "efetch.fcgi" self.logger.debug("url_efetch: %s" % url) values = {'db': db, - 'query_key': query_key, - 'webenv': webenv, + 'id': uid_list, 'rettype': "fasta", 'retmode': "text"} data = urllib.urlencode(values) @@ -251,62 +263,8 @@ time.sleep(0.1) return fasta - def sanitiser(self, db, fastaseq): - if(db not in "nuccore protein"): - return fastaseq - regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}") - sane_seqlist = [] - seqlist = fastaseq.split("\n\n") - for seq in seqlist[:-1]: - fastalines = seq.split("\n") - if len(fastalines) < 2: - self.logger.info("Empty sequence for %s" % - ("|".join(fastalines[0].split("|")[:4]))) - self.logger.info("%s download is skipped" % - ("|".join(fastalines[0].split("|")[:4]))) - continue - if db == "nuccore": - badnuc = 0 - for nucleotide in fastalines[1]: - if nucleotide not in "ATGC": - badnuc += 1 - if float(badnuc)/len(fastalines[1]) > 0.4: - self.logger.info("%s ambiguous nucleotides in %s\ - or download interrupted at this offset\ - | %s" % (float(badnuc)/len(fastalines[1]), - "|".join(fastalines[0].split("|") - [:4]), - fastalines[1])) - self.logger.info("%s download is skipped" % - (fastalines[0].split("|")[:4])) - continue - """ remove spaces and trim the header to 100 chars """ - fastalines[0] = fastalines[0].replace(" ", "_")[:100] - cleanseq = "\n".join(fastalines) - sane_seqlist.append(cleanseq) - elif db == "protein": - fastalines[0] = fastalines[0][0:100] - fastalines[0] = fastalines[0].replace(" ", "_") - fastalines[0] = fastalines[0].replace("[", "_") - fastalines[0] = fastalines[0].replace("]", "_") - fastalines[0] = fastalines[0].replace("=", "_") - """ because blast makedb doesn't like it """ - fastalines[0] = fastalines[0].rstrip("_") - fastalines[0] = re.sub(regex, "_", fastalines[0]) - cleanseq = "\n".join(fastalines) - sane_seqlist.append(cleanseq) - self.logger.info("clean sequences appended: %d" % (len(sane_seqlist))) - return "\n".join(sane_seqlist) - def get_sequences(self): - """ - Total number of records from the input set to be retrieved, - up to a maximum of 10,000. Optionally, for a large set the value of - retstart can be iterated while holding retmax constant, thereby - downloading the entire set in batches of size retmax. - http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch - """ - batch_size = self.retmax_efetch + batch_size = 200 count = self.count uids_list = self.ids self.logger.info("Batch size for efetch action: %d" % batch_size) @@ -316,18 +274,15 @@ for start in range(0, count, batch_size): end = min(count, start+batch_size) batch = uids_list[start:end] - if self.epost(self.dbname, ",".join(batch)) != -1: - mfasta = '' - while not mfasta: - self.logger.info("retrieving batch %d" % - ((start / batch_size) + 1)) - try: - mfasta = self.efetch(self.dbname, self.query_key, - self.webenv) - out.write(mfasta + '\n') - except QueryException as e: - self.logger.error("%s" % e.message) - raise e + self.logger.info("retrieving batch %d" % + ((start / batch_size) + 1)) + try: + mfasta = self.efetch(self.dbname, ','.join(batch)) + out.write(mfasta + '\n') + except QueryException as e: + self.logger.error("%s" % e.message) + raise e + urllib.urlcleanup() LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s' @@ -337,18 +292,25 @@ def command_parse(): parser = argparse.ArgumentParser(description='Retrieve data from NCBI') - parser.add_argument('-i', dest='query_string', help='NCBI Query String', - required=True) + parser.add_argument('-i', dest='query_string', help='NCBI Query String') + parser.add_argument('--UID_list', dest='iuds_file', + help='file containing a list of iuds to be fetched') parser.add_argument('-o', dest='outname', help='output file name') parser.add_argument('-d', dest='dbname', help='database type') parser.add_argument('--count', '-c', dest='count_ids', action='store_true', default=False, help='dry run ouputing only the number of sequences\ found') + parser.add_argument('--get_uids', '-u', dest='get_uids', default=False, + action='store_true', help='prints to the output a list\ + of UIDs') parser.add_argument('-l', '--logfile', help='log file (default=stderr)') parser.add_argument('--loglevel', choices=LOG_LEVELS, default='INFO', help='logging level (default: INFO)') args = parser.parse_args() + if args.query_string is not None and args.iuds_file is not None: + parser.error('Please choose either fetching the -i query or the -u\ + list.') return args @@ -369,12 +331,12 @@ try: E.dry_run() except Exception: - sys.exit(1) + sys.exit(-1) else: try: E.retrieve() except Exception: - sys.exit(1) + sys.exit(-1) if __name__ == "__main__":
--- a/fetch_fasta_from_NCBI.xml Wed Nov 08 13:00:26 2017 -0500 +++ b/fetch_fasta_from_NCBI.xml Wed Nov 29 17:38:52 2017 -0500 @@ -1,16 +1,71 @@ -<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.2.1"> +<tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.3.0"> <description></description> <command><![CDATA[ - python '$__tool_directory__'/fetch_fasta_from_NCBI.py - -i "$queryString" - -d $dbname - -l '$logfile' - $dry_run - -o '$outfile' + python '$__tool_directory__'/fetch_fasta_from_NCBI.py + -i "$queryString" + -d $dbname + -l '$logfile' + -c + -o '$outfile'; + #if $dry_run == "" + number_UIDs=\$(tail -n 2 $logfile | perl -ne '/Found (\d+) UID/ && print \$1'); + python '$__tool_directory__'/fetch_fasta_from_NCBI.py + -i "$queryString" + -d $dbname + -u + -l '$logfile' + -o 'uid_outfile'; + UID_array=( \$(head uid_outfile) ); + array_len=\${#UID_array[@]}; + counter=0; + number_of_groups=\$((array_len / 200000)); + modulo=\$((array_len % 200000)); + if [ "\$modulo" -gt 0 ];then + number_of_groups=\$((number_of_groups + 1)); + fi; + group_number=1; + echo "----- Number of groups of batches: \$number_of_groups -----" >> $logfile; + for ((i=0; i+200000<array_len;i+=200000)); do + echo "----- Group number: \$group_number -----" >> $logfile; + echo "\${UID_array[@]:\$i:99999}" > uid_list_1.txt; + echo "\${UID_array[@]:\$((i+100000)):99999}" > uid_list_2.txt; + python '$__tool_directory__'/fetch_fasta_from_NCBI.py + -d $dbname + -l '$logfile' + -o 'tmp1_outfile' + --UID_list uid_list_1.txt& + python '$__tool_directory__'/fetch_fasta_from_NCBI.py + -d $dbname + -l 'tmp1_logfile' + -o 'tmp2_outfile' + --UID_list uid_list_2.txt& + wait; + cat tmp1_outfile tmp2_outfile>> $outfile; + rm tmp1_outfile tmp2_outfile; + cat tmp1_logfile >> $logfile; + rm tmp1_logfile; + rm uid_list_1.txt uid_list_2.txt; + group_number=\$((group_number + 1)); + counter=\$(( counter + 200000 )); + done; + echo "----- Group number: \$group_number -----" >> $logfile; + echo "----- Last group -----" >> $logfile; + if [ "\$counter" -lt "\$array_len" ]; then + echo "\${UID_array[@]:\$counter:\$((array_len - counter + 1))}" > uid_list.txt; + python '$__tool_directory__'/fetch_fasta_from_NCBI.py + -d $dbname + -l '$logfile' + -o 'tmp_outfile' + --UID_list uid_list.txt; + rm uid_list.txt; + cat tmp_outfile >> $outfile; + rm tmp_outfile; + fi; + #end if ]]></command> <inputs> - <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="exemple:'Drosophila melanogaster[Organism] AND Gcn5[Title]"> + <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="exemple: Drosophila melanogaster[Organism] AND Gcn5[Title]"> <sanitizer> <valid initial="string.printable"> <remove value="""/> @@ -26,7 +81,7 @@ <option value="nuccore">Nucleotide</option> <option value="protein">Protein</option> </param> - <param name="dry_run" type="boolean" label="Dry run to get the number of sequences?" truevalue="--count" falsevalue="" checked="false"/> + <param name="dry_run" type="boolean" label="Get only the number of sequences" truevalue="--count" falsevalue="" checked="false"/> </inputs> <outputs> <data name="outfile" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" > @@ -35,23 +90,29 @@ <data format="txt" name="logfile" label="${tool.name}: log"/> </outputs> <tests> - <test> - <param name="queryString" value="9629650[gi]" /> - <param name="dbname" value="nuccore" /> - <output name="outfilename" ftype="fasta" file="output.fa" /> - </test> - <test> - <param name="queryString" value="CU929326[Accession]" /> - <param name="dbname" value="nuccore" /> - <param name="date_filter" value="1"/> - <param name="dry_run" value="True"/> - <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/> - </test> + <test> + <param name="queryString" value="9629650[gi]" /> + <param name="dbname" value="nuccore" /> + <output name="outfilename" ftype="fasta" file="output.fa" /> + </test> + <test> + <param name="queryString" value="CU929326[Accession]" /> + <param name="dbname" value="nuccore" /> + <param name="date_filter" value="1"/> + <param name="dry_run" value="True"/> + <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/> + </test> + <test> + <param name="queryString" value="Drosophila[Organism] AND 2014[PDAT] AND virus" /> + <output name="outfilename" ftype="fasta" > + <metadata name="sequences" value="13" /> + </output> + </test> </tests> <help> **What it does** -This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database for a given entrez query. +This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database (nuccore or protein) for a given entrez query. The tool is preset with "txid10239[orgn] NOT txid131567[orgn] AND complete NOT partial[title] NOT phage[title]" for metaVisitor use purpose @@ -59,8 +120,12 @@ Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields. +By checking the checkbox you can also run your query without sequence retrieval and get the number of sequences your query will fetch. + Note that the tool may fail in case of interrupted connexion with the NCBI database (see the log dataset) +Retrieval progress is reported in the log dataset. + **Acknowledgments** This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_.