comparison rps2tsv.py @ 0:c9dac9b2e01c draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 3a3b40c15ae5e82334f016e88b1f3c5bbbb3b2cd
author iuc
date Mon, 04 Mar 2024 19:56:40 +0000
parents
children 735a21808348
comparison
equal deleted inserted replaced
-1:000000000000 0:c9dac9b2e01c
1 #!/usr/bin/env python3
2
3
4 # Name: rps2ecsv
5 # Author: Marie Lefebvre - INRAE
6 # Aims: Convert rpsblast xml output to csv and add taxonomy
7
8
9 import argparse
10 import json
11 import logging as log
12 from urllib import request
13 from urllib.error import HTTPError, URLError
14
15 from Bio.Blast import NCBIXML
16 from ete3 import NCBITaxa
17
18 ncbi = NCBITaxa()
19
20
21 def main():
22 options = _set_options()
23 _set_log_level(options.verbosity)
24 hits = _read_xml(options)
25 _write_tsv(options, hits)
26
27
28 def _read_xml(options):
29 """
30 Parse XML RPSblast results file
31 """
32 log.info("Read XML file " + options.xml_file)
33 xml = open(options.xml_file, 'r')
34 records = NCBIXML.parse(xml)
35 xml_results = {}
36 for blast_record in records:
37 for aln in blast_record.alignments:
38 for hit in aln.hsps:
39 hsp = {}
40 hit_evalue = hit.expect
41 if hit_evalue > options.max_evalue:
42 continue
43 hit_frame = hit.frame[0] # frame
44 hit_evalue = hit.expect # evalue
45 hit_startQ = hit.query_start
46 hit_endQ = hit.query_end
47 hsp["frame"] = hit_frame
48 hsp["evalue"] = hit_evalue
49 hsp["startQ"] = hit_startQ
50 hsp["endQ"] = hit_endQ
51 hsp["query_id"] = blast_record.query_id
52 hsp["cdd_id"] = aln.hit_def.split(",")[0]
53 hsp["hit_id"] = aln.hit_id
54 hsp["query_length"] = blast_record.query_length # length of the query
55 hsp["description"] = aln.hit_def
56 hsp["accession"] = aln.accession
57 hsp["pfam_id"] = hsp["description"].split(",")[0].replace("pfam", "PF")
58 log.info("Requeting Interpro for " + hsp["pfam_id"])
59 url = "https://www.ebi.ac.uk/interpro/api/entry/pfam/" + hsp["pfam_id"] + "/taxonomy/uniprot/"
60 req = request.Request(url)
61 try:
62 response = request.urlopen(req)
63 except HTTPError as e:
64 log.debug('Http error for interpro: ', e.code)
65 except URLError as e:
66 log.debug('Url error for interpro: ', e.reason)
67 else:
68 encoded_response = response.read()
69 decoded_response = encoded_response.decode()
70 payload = json.loads(decoded_response)
71 kingdoms = []
72 for item in payload["taxonomy_subset"]:
73 lineage_string = item["lineage"]
74 lineage = [int(i) for i in lineage_string]
75 translation = ncbi.get_taxid_translator(lineage)
76 names = list(translation.values())
77 taxonomy = names[1:] # remove 'root' at the begining
78 kingdoms.append(taxonomy[0])
79 frequency = {kingdom: kingdoms.count(kingdom) for kingdom in kingdoms} # {'Pseudomonadota': 9, 'cellular organisms': 4}
80 sorted_freq = dict(sorted(frequency.items(), key=lambda x: x[1], reverse=True))
81 concat_freq = ";".join("{}({})".format(k, v) for k, v in sorted_freq.items())
82 hsp["taxonomy"] = concat_freq
83 xml_results[hsp["query_id"]] = hsp
84 return xml_results
85
86
87 def _write_tsv(options, hits):
88 """
89 Write output
90 """
91 log.info("Write output file " + options.output)
92 headers = "#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\n"
93 f = open(options.output, "w+")
94 f.write(headers)
95 for h in hits:
96 f.write(h + "\t" + str(hits[h]["query_length"]) + "\t")
97 f.write(hits[h]["cdd_id"] + "\t" + hits[h]["hit_id"] + "\t" + str(hits[h]["evalue"]) + "\t")
98 f.write(str(hits[h]["startQ"]) + "\t" + str(hits[h]["endQ"]) + "\t" + str(hits[h]["frame"]) + "\t")
99 f.write(hits[h]["description"] + "\t" + hits[h]["taxonomy"])
100 f.write("\n")
101 f.close()
102
103
104 def _set_options():
105 parser = argparse.ArgumentParser()
106 parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', required=True, dest='xml_file')
107 parser.add_argument('-e', '--max_evalue', help='Max evalue', action='store', type=float, default=0.0001, dest='max_evalue')
108 parser.add_argument('-o', '--out', help='The output file (.tab).', action='store', type=str, default='./rps2tsv_output.tab', dest='output')
109 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1)
110 args = parser.parse_args()
111 return args
112
113
114 def _set_log_level(verbosity):
115 if verbosity == 1:
116 log_format = '%(asctime)s %(levelname)-8s %(message)s'
117 log.basicConfig(level=log.INFO, format=log_format)
118 elif verbosity == 3:
119 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s'
120 log.basicConfig(level=log.DEBUG, format=log_format)
121
122
123 if __name__ == "__main__":
124 main()