comparison rps2tsv.py @ 4:adcf06db3030 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 7036ce0e06b6dc64332b1a5642fc58928523c5c6
author iuc
date Tue, 13 May 2025 11:52:27 +0000
parents 40fb54cc6628
children
comparison
equal deleted inserted replaced
3:40fb54cc6628 4:adcf06db3030
3 3
4 # Name: rps2ecsv 4 # Name: rps2ecsv
5 # Author: Marie Lefebvre - INRAE 5 # Author: Marie Lefebvre - INRAE
6 # Aims: Convert rpsblast xml output to csv and add taxonomy 6 # Aims: Convert rpsblast xml output to csv and add taxonomy
7 7
8 """Module which converts rpsblast xml output to tsv and add taxonomy"""
8 9
9 import argparse 10 import argparse
10 import json 11 import json
11 import logging as log 12 import logging as log
12 from urllib import request 13 from urllib import request
17 18
18 ncbi = NCBITaxa() 19 ncbi = NCBITaxa()
19 20
20 21
21 def main(): 22 def main():
23 """
24 Main function
25 """
22 options = _set_options() 26 options = _set_options()
23 _set_log_level(options.verbosity) 27 _set_log_level(options.verbosity)
24 hits = _read_xml(options) 28 hits = _read_xml(options)
25 _write_tsv(options, hits) 29 _write_tsv(options, hits)
26 30
42 continue 46 continue
43 hit_frame = hit.frame[0] # frame 47 hit_frame = hit.frame[0] # frame
44 hit_evalue = hit.expect # evalue 48 hit_evalue = hit.expect # evalue
45 hit_startQ = hit.query_start 49 hit_startQ = hit.query_start
46 hit_endQ = hit.query_end 50 hit_endQ = hit.query_end
51 hit_identity = hit.identities
52 hit_aln_length = hit.align_length
53 pident = "%0.3f" % (100 * float(hit_identity) / float(hit_aln_length))
54 if float(pident) < 0.1:
55 continue
56 hsp["pident"] = pident
47 hsp["frame"] = hit_frame 57 hsp["frame"] = hit_frame
48 hsp["evalue"] = hit_evalue 58 hsp["evalue"] = hit_evalue
49 hsp["startQ"] = hit_startQ 59 hsp["startQ"] = hit_startQ
50 hsp["endQ"] = hit_endQ 60 hsp["endQ"] = hit_endQ
51 hsp["query_id"] = blast_record.query 61 hsp["query_id"] = blast_record.query
81 taxonomy = names 91 taxonomy = names
82 else: 92 else:
83 taxonomy = names 93 taxonomy = names
84 if len(taxonomy) != 0: 94 if len(taxonomy) != 0:
85 kingdoms.append(taxonomy[0]) 95 kingdoms.append(taxonomy[0])
86 frequency = {kingdom: kingdoms.count(kingdom) for kingdom in kingdoms} # {'Pseudomonadota': 9, 'cellular organisms': 4} 96 # {'Pseudomonadota': 9, 'cellular organisms': 4}
97 frequency = {kingdom: kingdoms.count(kingdom) for kingdom in kingdoms}
87 sorted_freq = dict(sorted(frequency.items(), key=lambda x: x[1], reverse=True)) 98 sorted_freq = dict(sorted(frequency.items(), key=lambda x: x[1], reverse=True))
88 concat_freq = ";".join("{}({})".format(k, v) for k, v in sorted_freq.items()) 99 concat_freq = ";".join("{}({})".format(k, v) for k, v in sorted_freq.items())
89 hsp["taxonomy"] = concat_freq 100 hsp["taxonomy"] = concat_freq
90 xml_results[hsp["query_id"]] = hsp 101 xml_results[hsp["query_id"]] = hsp
91 return xml_results 102 return xml_results
94 def _write_tsv(options, hits): 105 def _write_tsv(options, hits):
95 """ 106 """
96 Write output 107 Write output
97 """ 108 """
98 log.info("Write output file " + options.output) 109 log.info("Write output file " + options.output)
99 headers = "#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\n" 110 headers = "#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\tpident\n"
100 f = open(options.output, "w+") 111 f = open(options.output, "w+")
101 f.write(headers) 112 f.write(headers)
102 for h in hits: 113 for h in hits:
103 f.write(h + "\t" + str(hits[h]["query_length"]) + "\t") 114 f.write(h + "\t" + str(hits[h]["query_length"]) + "\t")
104 f.write(hits[h]["cdd_id"] + "\t" + hits[h]["hit_id"] + "\t" + str(hits[h]["evalue"]) + "\t") 115 f.write(hits[h]["cdd_id"] + "\t" + hits[h]["hit_id"] + "\t" + str(hits[h]["evalue"]) + "\t")
105 f.write(str(hits[h]["startQ"]) + "\t" + str(hits[h]["endQ"]) + "\t" + str(hits[h]["frame"]) + "\t") 116 f.write(str(hits[h]["startQ"]) + "\t" + str(hits[h]["endQ"]) + "\t"
106 f.write(hits[h]["description"] + "\t" + hits[h]["taxonomy"]) 117 + str(hits[h]["frame"]) + "\t")
118 f.write(hits[h]["description"] + "\t" + hits[h]["taxonomy"] + "\t" + hits[h]["pident"])
107 f.write("\n") 119 f.write("\n")
108 f.close() 120 f.close()
109 121
110 122
111 def _set_options(): 123 def _set_options():
124 """
125 Script parameters
126 """
112 parser = argparse.ArgumentParser() 127 parser = argparse.ArgumentParser()
113 parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', required=True, dest='xml_file') 128 parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store',
114 parser.add_argument('-e', '--max_evalue', help='Max evalue', action='store', type=float, default=0.0001, dest='max_evalue') 129 required=True, dest='xml_file')
115 parser.add_argument('-o', '--out', help='The output file (.tab).', action='store', type=str, default='./rps2tsv_output.tab', dest='output') 130 parser.add_argument('-e', '--max_evalue', help='Max evalue', action='store',
116 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1) 131 type=float, default=0.0001, dest='max_evalue')
132 parser.add_argument('-o', '--out', help='The output file (.tab).', action='store',
133 type=str, default='./rps2tsv_output.tab', dest='output')
134 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store',
135 type=int, choices=[1, 2, 3, 4], default=1)
117 args = parser.parse_args() 136 args = parser.parse_args()
118 return args 137 return args
119 138
120 139
121 def _set_log_level(verbosity): 140 def _set_log_level(verbosity):
141 """
142 Debbug
143 """
122 if verbosity == 1: 144 if verbosity == 1:
123 log_format = '%(asctime)s %(levelname)-8s %(message)s' 145 log_format = '%(asctime)s %(levelname)-8s %(message)s'
124 log.basicConfig(level=log.INFO, format=log_format) 146 log.basicConfig(level=log.INFO, format=log_format)
125 elif verbosity == 3: 147 elif verbosity == 3:
126 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s' 148 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s'