Mercurial > repos > iuc > virannot_otu
comparison rps2tsv.py @ 0:c9dac9b2e01c draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 3a3b40c15ae5e82334f016e88b1f3c5bbbb3b2cd
author | iuc |
---|---|
date | Mon, 04 Mar 2024 19:56:40 +0000 |
parents | |
children | 735a21808348 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c9dac9b2e01c |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 | |
4 # Name: rps2ecsv | |
5 # Author: Marie Lefebvre - INRAE | |
6 # Aims: Convert rpsblast xml output to csv and add taxonomy | |
7 | |
8 | |
9 import argparse | |
10 import json | |
11 import logging as log | |
12 from urllib import request | |
13 from urllib.error import HTTPError, URLError | |
14 | |
15 from Bio.Blast import NCBIXML | |
16 from ete3 import NCBITaxa | |
17 | |
18 ncbi = NCBITaxa() | |
19 | |
20 | |
21 def main(): | |
22 options = _set_options() | |
23 _set_log_level(options.verbosity) | |
24 hits = _read_xml(options) | |
25 _write_tsv(options, hits) | |
26 | |
27 | |
28 def _read_xml(options): | |
29 """ | |
30 Parse XML RPSblast results file | |
31 """ | |
32 log.info("Read XML file " + options.xml_file) | |
33 xml = open(options.xml_file, 'r') | |
34 records = NCBIXML.parse(xml) | |
35 xml_results = {} | |
36 for blast_record in records: | |
37 for aln in blast_record.alignments: | |
38 for hit in aln.hsps: | |
39 hsp = {} | |
40 hit_evalue = hit.expect | |
41 if hit_evalue > options.max_evalue: | |
42 continue | |
43 hit_frame = hit.frame[0] # frame | |
44 hit_evalue = hit.expect # evalue | |
45 hit_startQ = hit.query_start | |
46 hit_endQ = hit.query_end | |
47 hsp["frame"] = hit_frame | |
48 hsp["evalue"] = hit_evalue | |
49 hsp["startQ"] = hit_startQ | |
50 hsp["endQ"] = hit_endQ | |
51 hsp["query_id"] = blast_record.query_id | |
52 hsp["cdd_id"] = aln.hit_def.split(",")[0] | |
53 hsp["hit_id"] = aln.hit_id | |
54 hsp["query_length"] = blast_record.query_length # length of the query | |
55 hsp["description"] = aln.hit_def | |
56 hsp["accession"] = aln.accession | |
57 hsp["pfam_id"] = hsp["description"].split(",")[0].replace("pfam", "PF") | |
58 log.info("Requeting Interpro for " + hsp["pfam_id"]) | |
59 url = "https://www.ebi.ac.uk/interpro/api/entry/pfam/" + hsp["pfam_id"] + "/taxonomy/uniprot/" | |
60 req = request.Request(url) | |
61 try: | |
62 response = request.urlopen(req) | |
63 except HTTPError as e: | |
64 log.debug('Http error for interpro: ', e.code) | |
65 except URLError as e: | |
66 log.debug('Url error for interpro: ', e.reason) | |
67 else: | |
68 encoded_response = response.read() | |
69 decoded_response = encoded_response.decode() | |
70 payload = json.loads(decoded_response) | |
71 kingdoms = [] | |
72 for item in payload["taxonomy_subset"]: | |
73 lineage_string = item["lineage"] | |
74 lineage = [int(i) for i in lineage_string] | |
75 translation = ncbi.get_taxid_translator(lineage) | |
76 names = list(translation.values()) | |
77 taxonomy = names[1:] # remove 'root' at the begining | |
78 kingdoms.append(taxonomy[0]) | |
79 frequency = {kingdom: kingdoms.count(kingdom) for kingdom in kingdoms} # {'Pseudomonadota': 9, 'cellular organisms': 4} | |
80 sorted_freq = dict(sorted(frequency.items(), key=lambda x: x[1], reverse=True)) | |
81 concat_freq = ";".join("{}({})".format(k, v) for k, v in sorted_freq.items()) | |
82 hsp["taxonomy"] = concat_freq | |
83 xml_results[hsp["query_id"]] = hsp | |
84 return xml_results | |
85 | |
86 | |
87 def _write_tsv(options, hits): | |
88 """ | |
89 Write output | |
90 """ | |
91 log.info("Write output file " + options.output) | |
92 headers = "#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\n" | |
93 f = open(options.output, "w+") | |
94 f.write(headers) | |
95 for h in hits: | |
96 f.write(h + "\t" + str(hits[h]["query_length"]) + "\t") | |
97 f.write(hits[h]["cdd_id"] + "\t" + hits[h]["hit_id"] + "\t" + str(hits[h]["evalue"]) + "\t") | |
98 f.write(str(hits[h]["startQ"]) + "\t" + str(hits[h]["endQ"]) + "\t" + str(hits[h]["frame"]) + "\t") | |
99 f.write(hits[h]["description"] + "\t" + hits[h]["taxonomy"]) | |
100 f.write("\n") | |
101 f.close() | |
102 | |
103 | |
104 def _set_options(): | |
105 parser = argparse.ArgumentParser() | |
106 parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', required=True, dest='xml_file') | |
107 parser.add_argument('-e', '--max_evalue', help='Max evalue', action='store', type=float, default=0.0001, dest='max_evalue') | |
108 parser.add_argument('-o', '--out', help='The output file (.tab).', action='store', type=str, default='./rps2tsv_output.tab', dest='output') | |
109 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1) | |
110 args = parser.parse_args() | |
111 return args | |
112 | |
113 | |
114 def _set_log_level(verbosity): | |
115 if verbosity == 1: | |
116 log_format = '%(asctime)s %(levelname)-8s %(message)s' | |
117 log.basicConfig(level=log.INFO, format=log_format) | |
118 elif verbosity == 3: | |
119 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s' | |
120 log.basicConfig(level=log.DEBUG, format=log_format) | |
121 | |
122 | |
123 if __name__ == "__main__": | |
124 main() |