virannot_blast2tsv: comparison rps2tsv.py @ 0:e889010415a1 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 3a3b40c15ae5e82334f016e88b1f3c5bbbb3b2cd
| field | value |
|---|---|
| author | iuc |
| date | Mon, 04 Mar 2024 19:55:52 +0000 |
| parents | |
| children | 77c3ef9b0ed7 |
| from | to |
|---|---|
| -1:000000000000 | 0:e889010415a1 |
rps2tsv.py (new file):

```python
#!/usr/bin/env python3


# Name: rps2tsv
# Author: Marie Lefebvre - INRAE
# Aims: Convert rpsblast XML output to TSV and add taxonomy


import argparse
import json
import logging as log
from urllib import request
from urllib.error import HTTPError, URLError

from Bio.Blast import NCBIXML
from ete3 import NCBITaxa

ncbi = NCBITaxa()


def main():
    options = _set_options()
    _set_log_level(options.verbosity)
    hits = _read_xml(options)
    _write_tsv(options, hits)


def _read_xml(options):
    """
    Parse XML RPS-BLAST results file
    """
    log.info("Read XML file " + options.xml_file)
    xml = open(options.xml_file, 'r')
    records = NCBIXML.parse(xml)
    xml_results = {}
    for blast_record in records:
        for aln in blast_record.alignments:
            for hit in aln.hsps:
                hsp = {}
                hit_evalue = hit.expect  # evalue
                if hit_evalue > options.max_evalue:
                    continue
                hit_frame = hit.frame[0]  # frame
                hit_startQ = hit.query_start
                hit_endQ = hit.query_end
                hsp["frame"] = hit_frame
                hsp["evalue"] = hit_evalue
                hsp["startQ"] = hit_startQ
                hsp["endQ"] = hit_endQ
                hsp["query_id"] = blast_record.query_id
                hsp["cdd_id"] = aln.hit_def.split(",")[0]
                hsp["hit_id"] = aln.hit_id
                hsp["query_length"] = blast_record.query_length  # length of the query
                hsp["description"] = aln.hit_def
                hsp["accession"] = aln.accession
                hsp["pfam_id"] = hsp["description"].split(",")[0].replace("pfam", "PF")
                hsp["taxonomy"] = ""  # default so the output step never misses the key if the request fails
                log.info("Requesting InterPro for " + hsp["pfam_id"])
                url = "https://www.ebi.ac.uk/interpro/api/entry/pfam/" + hsp["pfam_id"] + "/taxonomy/uniprot/"
                req = request.Request(url)
                try:
                    response = request.urlopen(req)
                except HTTPError as e:
                    log.debug('HTTP error for InterPro: %s', e.code)
                except URLError as e:
                    log.debug('URL error for InterPro: %s', e.reason)
                else:
                    encoded_response = response.read()
                    decoded_response = encoded_response.decode()
                    payload = json.loads(decoded_response)
                    kingdoms = []
                    for item in payload["taxonomy_subset"]:
                        lineage_string = item["lineage"]
                        lineage = [int(i) for i in lineage_string]
                        translation = ncbi.get_taxid_translator(lineage)
                        names = list(translation.values())
                        taxonomy = names[1:]  # remove 'root' at the beginning
                        kingdoms.append(taxonomy[0])
                    frequency = {kingdom: kingdoms.count(kingdom) for kingdom in kingdoms}  # e.g. {'Pseudomonadota': 9, 'cellular organisms': 4}
                    sorted_freq = dict(sorted(frequency.items(), key=lambda x: x[1], reverse=True))
                    concat_freq = ";".join("{}({})".format(k, v) for k, v in sorted_freq.items())
                    hsp["taxonomy"] = concat_freq
                xml_results[hsp["query_id"]] = hsp
    xml.close()
    return xml_results


def _write_tsv(options, hits):
    """
    Write output
    """
    log.info("Write output file " + options.output)
    headers = "#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\n"
    f = open(options.output, "w+")
    f.write(headers)
    for h in hits:
        f.write(h + "\t" + str(hits[h]["query_length"]) + "\t")
        f.write(hits[h]["cdd_id"] + "\t" + hits[h]["hit_id"] + "\t" + str(hits[h]["evalue"]) + "\t")
        f.write(str(hits[h]["startQ"]) + "\t" + str(hits[h]["endQ"]) + "\t" + str(hits[h]["frame"]) + "\t")
        f.write(hits[h]["description"] + "\t" + hits[h]["taxonomy"])
        f.write("\n")
    f.close()


def _set_options():
    parser = argparse.ArgumentParser()
    parser.add_argument('-x', '--xml', help='XML file with BLAST results', action='store', required=True, dest='xml_file')
    parser.add_argument('-e', '--max_evalue', help='Max e-value', action='store', type=float, default=0.0001, dest='max_evalue')
    parser.add_argument('-o', '--out', help='The output file (.tab).', action='store', type=str, default='./rps2tsv_output.tab', dest='output')
    parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1)
    args = parser.parse_args()
    return args


def _set_log_level(verbosity):
    # Only levels 1 (INFO) and 3 (DEBUG) configure logging; other values
    # keep the logging module's default WARNING level.
    if verbosity == 1:
        log_format = '%(asctime)s %(levelname)-8s %(message)s'
        log.basicConfig(level=log.INFO, format=log_format)
    elif verbosity == 3:
        log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s'
        log.basicConfig(level=log.DEBUG, format=log_format)


if __name__ == "__main__":
    main()
```
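The value written to the `superkingdom` column is a frequency string built in `_read_xml` from the first post-root name of each lineage returned by InterPro. A minimal sketch of that aggregation step, using made-up lineage names instead of a live InterPro/ete3 lookup:

```python
# Sketch of the taxonomy-summary step in _read_xml; the names below are
# illustrative stand-ins for the ete3 lineage translation.
kingdoms = ["Pseudomonadota", "Pseudomonadota", "cellular organisms", "Pseudomonadota"]

# Count occurrences of each name, sort by frequency, join as name(count).
frequency = {kingdom: kingdoms.count(kingdom) for kingdom in kingdoms}
sorted_freq = dict(sorted(frequency.items(), key=lambda x: x[1], reverse=True))
concat_freq = ";".join("{}({})".format(k, v) for k, v in sorted_freq.items())

print(concat_freq)  # Pseudomonadota(3);cellular organisms(1)
```

This is the string stored in `hsp["taxonomy"]` and written as the last column of the TSV.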

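For reference, a hedged sketch of how the tool is invoked and how its output could be read back. The input file name and the invocation are hypothetical examples; the option names and the default output name come from `_set_options`, and the column names match the header written by `_write_tsv`.

```python
# Hypothetical invocation (option names from _set_options):
#   python rps2tsv.py -x rpsblast_results.xml -e 0.0001 -o rps2tsv_output.tab -v 1
#
# Reading the resulting tab-delimited file back into per-hit dictionaries.
import csv

with open("rps2tsv_output.tab") as handle:
    reader = csv.reader(handle, delimiter="\t")
    header = next(reader)  # ['#query_id', 'query_length', 'cdd_id', ...]
    for row in reader:
        record = dict(zip(header, row))
        print(record["#query_id"], record["superkingdom"])
```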