comparison otu.py @ 0:e889010415a1 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 3a3b40c15ae5e82334f016e88b1f3c5bbbb3b2cd
author iuc
date Mon, 04 Mar 2024 19:55:52 +0000
parents
children f8ebd1e802d7
comparison
equal deleted inserted replaced
-1:000000000000 0:e889010415a1
1 #!/usr/bin/env python3
2
3
4 # Name: virAnnot_otu
5 # Author: Marie Lefebvre - INRAE
6 # Requirements: Ete3 toolkit and external apps
7 # Aims: Create viral OTUs based on RPS and Blast annotations
8
9
10 import argparse
11 import csv
12 import logging as log
13 import os
14 import random
15 import re
16
17 import pandas as pd
18 import xlsxwriter
19 from Bio import SeqIO
20 from Bio.Align.Applications import ClustalOmegaCommandline
21 from ete3 import NodeStyle, SeqGroup, SeqMotifFace, Tree, TreeStyle
22
23
def main():
    """
    Pipeline entry point.

    1 - retrieve info (sequence, query_id, taxo) from RPS file
    2 - align protein sequences of the same domain, calculate
        matrix of distances, generate trees
    3 - get statistics (read number) per otu
    4 - create HTML report
    """
    opts = _set_options()
    _set_log_level(opts.verbosity)
    hits = _cut_sequence(opts)
    _align_sequences(opts, hits)
    _get_stats(opts, hits)
    _create_html(opts, hits)
39
def _cut_sequence(options):
    """
    Retrieve viral hits and their trimmed sequences from RPS files.

    Returns a dict keyed by cdd_id; each value maps query_id to a dict
    holding the cut nucleotide sequence, its translation, the domain
    description and, when blast results are given, read count ("nb")
    and taxonomy.
    """
    log.info("Cut sequences")
    collection = {}
    # Sort every file list ONCE so rps/fasta/blast entries pair up by index.
    # (The original re-sorted options.fasta and options.blast inside the row
    # loop for every viral hit — wasteful and identical in effect.)
    options.rps.sort()
    options.fasta.sort()
    if options.blast is not None:
        options.blast.sort()
    # enumerate() replaces the manual `i` counter pairing each rps file with
    # the fasta/blast file of the same rank
    for i, rps_file in enumerate(options.rps):
        log.debug("Reading rps file " + str(rps_file))
        with open(rps_file[0], 'r') as rps_current_file:
            rps_reader = csv.reader(rps_current_file, delimiter='\t')
            next(rps_reader, None)  # skip the header row
            for row in rps_reader:
                if row[1] == "no_hit":
                    continue
                query_id = row[0]
                cdd_id = row[2]
                startQ = int(row[5])
                endQ = int(row[6])
                frame = float(row[7])
                description = row[8]
                superkingdom = row[9]
                # only contigs annotated as viral are kept
                if not re.search("Viruses", superkingdom):
                    continue
                seq = _retrieve_fasta_seq(options.fasta[i][0], query_id)
                seq_length = len(seq)
                # cut the matched part of the contig (1-based inclusive coords)
                if endQ < seq_length:
                    seq = seq[startQ - 1:endQ]
                else:
                    seq = seq[startQ - 1:seq_length]
                if frame < 0:
                    seq = seq.reverse_complement()
                prot = seq.translate()
                if len(prot) < options.min_protein_length:
                    continue
                log.debug("Add " + query_id + " to collection")
                collection.setdefault(cdd_id, {})
                entry = collection[cdd_id][query_id] = {}
                # key typo fixed ("nuccleotide" -> "nucleotide"); the key is
                # never read elsewhere in this script
                entry["nucleotide"] = seq
                entry["protein"] = prot
                entry["full_description"] = description
                if options.blast is not None:
                    # last matching blast row wins, as in the original loop
                    with open(options.blast[i][0], 'r') as blast_current_file:
                        blast_reader = csv.reader(blast_current_file, delimiter='\t')
                        for b_query in blast_reader:
                            if b_query[1] == query_id:
                                entry["nb"] = b_query[2]
                                entry["taxonomy"] = b_query[14] if len(b_query) > 10 else "Unknown"
                    # defaults for unmatched contigs; also covers an empty
                    # blast file (the original left the keys unset then)
                    entry.setdefault("nb", 0)
                    entry.setdefault("taxonomy", "Unknown")
                else:
                    log.info("No blast file")
                    entry["taxonomy"] = "Unknown"
                    entry["nb"] = 0
                # keep pfamXXX and e.g. "RdRp 1" as the short label
                collection[cdd_id]["short_description"] = description.split(",")[0] + description.split(",")[1]
                collection[cdd_id]["full_description"] = description
    return collection
115
def _retrieve_fasta_seq(fasta_file, query_id):
    """
    From a fasta file, retrieve the sequence with the given id.

    Returns the Bio.Seq sequence, or None (after printing a message)
    when query_id is absent from the file.
    """
    # NOTE(review): the whole fasta is re-parsed on every call; _cut_sequence
    # calls this once per viral hit, so caching the dict per file would help
    # on large inputs — confirm before changing.
    with open(fasta_file) as handle:  # close the handle (original leaked it)
        contigs_list = SeqIO.to_dict(SeqIO.parse(handle, 'fasta'))
    try:
        return contigs_list[query_id].seq
    except KeyError:
        print("KeyError for " + query_id + " file " + fasta_file)
        return None  # explicit: original fell off the end and returned None
128
def _create_tree(tree, fasta, out, color):
    """
    Render a phylogenetic tree image from a newick file and its aligned
    fasta, colouring each leaf's background by sample.

    Silently returns when the newick file does not exist (clustalo only
    produces it for >= 2 sequences).
    """
    try:
        f = open(tree, 'r')
    except IOError:
        log.info("Unknown file: " + tree + ". You may have less than 2 sequences to align.")
        return
    # Fix: the original read the whole file into an unused `line` buffer;
    # Tree() re-reads the path itself, so just close the probe handle.
    f.close()

    seqs = SeqGroup(fasta, format="fasta")
    t = Tree(tree)
    ts = TreeStyle()
    ts.show_branch_length = True
    colors = _parse_color_file(color)
    for name in t.get_leaf_names():
        seq = seqs.get_seq(name)
        seq_face = SeqMotifFace(seq, seq_format="()")
        for node in t.get_leaves_by_name(name):
            if name in colors:
                ns = NodeStyle()
                ns['bgcolor'] = colors[name]
                node.set_style(ns)
            node.add_face(seq_face, 0, 'aligned')

    t.render(out, tree_style=ts)
163
def _parse_color_file(file):
    """
    Read a two-column (id<TAB>#RRGGBB) file and return {id: color}.
    """
    # `with` closes the handle (the original leaked it); a dict comprehension
    # replaces the index loop over a materialized list
    with open(file) as fh:
        reader = csv.reader(fh, delimiter="\t")
        return {line[0]: line[1] for line in reader}
174
def _align_sequences(options, hits_collection):
    """
    Align hit sequences with pfam reference.

    For each cdd domain: writes the protein sequences and a per-sample
    color file, runs clustal omega (>= 2 contigs) to align them and build
    a guide tree, renders the tree, computes the identity matrix and calls
    seek_otu.R to cluster contigs into OTUs. With a single contig the
    sequence is copied as-is and a one-line cluster file is written.
    """
    log.info("Align sequences")
    if not os.path.exists(options.output):
        os.mkdir(options.output)
    color_by_sample = {}
    for cdd_id in hits_collection:
        # one sub-directory per domain, named after the short description
        cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
        if not os.path.exists(cdd_output):
            os.mkdir(cdd_output)
        # remove any previous input since the file is opened in append mode below
        if os.path.exists(cdd_output + "/seq_to_align.fasta"):
            os.remove(cdd_output + "/seq_to_align.fasta")
        file_seq_to_align = cdd_output + "/seq_to_align.fasta"
        file_color_config = cdd_output + "/color_config.txt"
        f = open(file_seq_to_align, "a")
        f_c = open(file_color_config, "w+")
        log.info("Writing to " + file_seq_to_align)
        count = 0  # count number of contig per domain
        for query_id in hits_collection[cdd_id]:
            # skip the two metadata keys stored alongside the query ids
            if query_id not in ["short_description", "full_description"]:
                sample = query_id.split("_")[0]  # get sample from SAMPLE_IdCONTIG
                # random 6-hex-digit color; only kept the first time a sample is seen
                sample_color = "#" + ''.join([random.choice('ABCDEF0123456789') for i in range(6)])
                # same color for each contig of the same sample
                if sample not in color_by_sample.keys():
                    color_by_sample[sample] = sample_color
                f.write(">" + query_id + "\n")
                f.write(str(hits_collection[cdd_id][query_id]["protein"]) + "\n")
                f_c.write(query_id + '\t' + color_by_sample[sample] + '\n')
                count += 1
        f.close()
        f_c.close()
        file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa'
        tree_file = cdd_output + '/tree.dnd'
        file_cluster = cdd_output + '/otu_cluster.csv'
        # create alignment for domain with more than 1 contigs
        if count > 1:
            log.info("Run clustal omega...")
            # external clustalo binary must be on PATH; force=True overwrites outputs
            clustalo_cmd = ClustalOmegaCommandline("clustalo", infile=file_seq_to_align, outfile=file_seq_aligned,
                                                   guidetree_out=tree_file, seqtype="protein", force=True)
            log.debug(clustalo_cmd)
            stdout, stderr = clustalo_cmd()
            log.debug(stdout + stderr)

            # create tree plot with colors
            file_matrix = cdd_output + "/identity_matrix.csv"
            log.info("Create tree...")
            _create_tree(tree_file, file_seq_aligned, tree_file + '.png', file_color_config)
            _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id)
            log.info("Retrieve OTUs...")
            # if os.path.exists(file_cluster):
            #     os.remove(file_cluster)
            # external R script clusters contigs at options.perc identity
            otu_cmd = os.path.join(options.tool_path, 'seek_otu.R') + ' ' + file_matrix + ' ' + file_cluster + ' ' + str(options.perc)
            log.debug(otu_cmd)
            os.system(otu_cmd)
        # only one contig
        else:
            mv_cmd = 'cp ' + file_seq_to_align + ' ' + file_seq_aligned
            log.debug(mv_cmd)
            os.system(mv_cmd)

            # NOTE(review): keys()[0] relies on dict insertion order — query ids
            # are inserted before short_description/full_description in
            # _cut_sequence, so the first key is the single contig; confirm if
            # that builder ever changes.
            f = open(file_cluster, "w+")
            f.write('OTU_1,1,' + list(hits_collection[cdd_id].keys())[0] + ',')
            f.close()
241
def _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id):
    """
    Calculate pairwise percent-identity between aligned protein sequences
    of a cdd_id and write them to file_matrix as CSV.

    Positions with 'X' in either sequence or a gap in either sequence are
    skipped; pairs overlapping on fewer than 20 compared positions score 0.
    `options` is unused but kept for interface compatibility with callers.
    """
    log.info("Compute pairwise distance of " + cdd_id)
    # Parse the alignment ONCE. The original re-ran SeqIO.parse for every
    # outer sequence, re-reading the file O(n^2) times.
    records = list(SeqIO.parse(file_seq_aligned, "fasta"))
    matrix = {}
    for k1 in records:
        s1 = str(k1.seq)
        row = []
        for k2 in records:
            s2 = str(k2.seq)
            identic = 0
            compared = 0
            # aligned sequences have equal length; zip walks both in lockstep
            for base, base2 in zip(s1, s2):
                # mutation ('X') or gap in either sequence: position skipped
                if base == 'X' or base2 == 'X':
                    continue
                if base == '-' or base2 == '-':
                    continue
                if base == base2:
                    identic += 1
                compared += 1
            # minimum overlap of 20 compared positions (covers compared == 0)
            if compared < 20:
                percentIdentity = 0
            else:
                percentIdentity = (identic / compared) * 100
            row.append(percentIdentity)
        matrix[k1.id] = row
    log.debug("Write " + file_matrix)
    with open(file_matrix, "w+") as f:
        for seq_id in matrix:
            f.write(seq_id + ',' + ', '.join(map(str, matrix[seq_id])) + "\n")
287
def _get_stats(options, hits_collection):
    """
    Retrieve annotation and number of reads for each OTU.

    Builds otu_stats.xlsx with one worksheet per domain (rows = OTUs,
    columns = per-sample read totals, taxonomy, contig list), then dumps
    each sheet back out as a tab-separated cluster_nb_reads_files.tab in
    the matching domain directory.
    """
    file_xlsx = options.output + '/otu_stats.xlsx'  # Create a workbook
    workbook = xlsxwriter.Workbook(file_xlsx)
    log.info("Writing stats to " + file_xlsx)
    for cdd_id in hits_collection:
        otu_collection = {}
        cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
        worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"])  # add a worksheet
        # cluster file written earlier by seek_otu.R (or _align_sequences)
        file_cluster = cdd_output + '/otu_cluster.csv'
        with open(file_cluster, 'r') as clust:
            otu_reader = csv.reader(clust, delimiter=',')
            samples_list = []
            for row in otu_reader:
                # row layout: OTU_name, cluster_id, contig, contig, ..., ''
                contigs_list = row[2:len(row) - 1]  # remove last empty column
                otu_collection[row[0]] = {}  # key -> otu number
                otu_collection[row[0]]['contigs_list'] = contigs_list
                for contig in contigs_list:
                    # sample id is the prefix of SAMPLE_IdCONTIG
                    sample = contig.split('_')[0]
                    # appends only unseen samples (expression form of a guarded append)
                    samples_list.append(sample) if sample not in samples_list else samples_list
                    # NOTE(review): both branches below are identical except for
                    # creating the per-sample dict — candidates for merging.
                    if sample not in otu_collection[row[0]]:
                        otu_collection[row[0]][sample] = {}
                        otu_collection[row[0]][sample][contig] = {}
                        # add read number of the contig and annotation
                        if 'nb' in hits_collection[cdd_id][contig]:
                            otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"]
                        else:
                            otu_collection[row[0]][sample][contig]['nb'] = 0
                        if 'taxonomy' in hits_collection[cdd_id][contig]:
                            otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
                        else:
                            otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown'
                    else:
                        otu_collection[row[0]][sample][contig] = {}
                        # add read number of the contig and annotation
                        if 'nb' in hits_collection[cdd_id][contig]:
                            otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"]
                        else:
                            otu_collection[row[0]][sample][contig]['nb'] = 0
                        if 'taxonomy' in hits_collection[cdd_id][contig]:
                            otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
                        else:
                            otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown'
                    # OTU-level taxonomy: last contig of the OTU wins
                    if 'taxonomy' in hits_collection[cdd_id][contig]:
                        otu_collection[row[0]]['global_taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
                    else:
                        otu_collection[row[0]]['global_taxonomy'] = 'unknown'

        # calculate total number of reads for each sample of each OTU
        for otu in otu_collection:
            for sample in otu_collection[otu]:
                if sample not in ['contigs_list', 'global_taxonomy']:
                    total_nb_read = 0
                    for contig in otu_collection[otu][sample]:
                        total_nb_read += int(otu_collection[otu][sample][contig]['nb'])
                    otu_collection[otu][sample]['total_nb_read'] = total_nb_read
        # worksheet header row; `row`/`column` here are spreadsheet coordinates
        # (note: `row` shadows the csv row variable above)
        row = 0
        column = 0
        item = '#OTU_name'
        worksheet.write(row, column, item)
        for samp in samples_list:
            column += 1
            worksheet.write(row, column, samp)
        worksheet.write(row, column + 1, 'taxonomy')
        worksheet.write(row, column + 2, 'contigs_list')
        row = 1
        # column = 0
        for otu in otu_collection:
            if isinstance(otu_collection[otu], dict):
                column = 0
                worksheet.write(row, column, otu)
                # prepare table with 0 in each cells
                for sample in otu_collection[otu]:
                    column = 1
                    for samp in samples_list:
                        worksheet.write(row, column, 0)
                        column += 1
                # fill in table with nb of read for each sample and each OTU
                for sample in otu_collection[otu]:
                    column = 1
                    for samp in samples_list:
                        if samp == sample:
                            worksheet.write(row, column, otu_collection[otu][sample]['total_nb_read'])
                        column += 1
                worksheet.write(row, len(samples_list) + 1, otu_collection[otu]['global_taxonomy'].replace(';', ' '))
                worksheet.write(row, len(samples_list) + 2, ",".join(otu_collection[otu]['contigs_list']))
                row += 1
    workbook.close()
    # re-read the workbook and export every sheet as a TSV next to its domain files
    read_file = pd.ExcelFile(file_xlsx)
    for sheet in read_file.sheet_names:
        cluster_nb_reads_file = options.output + "/" + sheet.replace(" ", "_") + "/cluster_nb_reads_files.tab"
        data_xls = pd.read_excel(file_xlsx, sheet, dtype=str, index_col=None)
        data_xls.to_csv(cluster_nb_reads_file, encoding='utf-8', index=False, sep='\t')
385
def _create_html(options, hits_collection):
    """
    Create HTML file with all results
    """
    # mapping file gathers, per domain, every file path the report needs
    map_file_path = options.output + "/map.txt"
    if os.path.exists(map_file_path):
        os.remove(map_file_path)

    headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description\n']
    map_file = open(map_file_path, "w+")
    map_file.write("\t".join(headers))
    for cdd_id in hits_collection:
        # domain sub-directory name doubles as the short description column
        short_description = hits_collection[cdd_id]["short_description"].replace(" ", "_")
        fields = [
            cdd_id,
            short_description + '/seq_aligned.final_tree.fa',
            short_description + '/tree.dnd.png',
            short_description + '/otu_cluster.csv',
            short_description + "/cluster_nb_reads_files.tab",
            short_description + "/identity_matrix.csv",
            short_description,
            hits_collection[cdd_id]["full_description"],
        ]
        map_file.write("\t".join(fields) + "\n")
    map_file.close()
    log.info("Writing HTML report")
    html_cmd = os.path.join(options.tool_path, 'rps2tree_html.py') + ' -m ' + map_file_path + ' -o ' + options.output
    log.debug(html_cmd)
    os.system(html_cmd)
415
def _set_options():
    """Declare and parse the command-line arguments."""
    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add('-b', '--blast', help='TAB blast file from blast2ecsv module.',
        action='append', required=False, dest='blast', nargs='+')
    add('-r', '--rps', help='TAB rpsblast file from rps2ecsv module.',
        action='append', required=True, dest='rps', nargs='+')
    add('-f', '--fasta', help='FASTA file with contigs',
        action='append', required=True, dest='fasta', nargs='+')
    add('-p', '--percentage', help='Percentage similarity threshold for OTUs cutoff.',
        action='store', type=int, default=90, dest='perc')
    add('-vp', '--viral_portion', help='Minimun portion of viral sequences in RPS domain to be included.',
        action='store', type=float, default=0.3, dest='viral_portion')
    add('-mpl', '--min_protein_length', help='Minimum query protein length.',
        action='store', type=int, default=100, dest='min_protein_length')
    add('-tp', '--tool_path', help='Path to otu_seek.R',
        action='store', type=str, default='./', dest='tool_path')
    add('-o', '--out', help='The output directory',
        action='store', type=str, default='./Rps2tree_OTU', dest='output')
    add('-rgb', '--rgb-conf', help='Color palette for contigs coloration',
        action='store', type=str, default='rgb.txt', dest='file_rgb')
    add('-v', '--verbosity', help='Verbose level',
        action='store', type=int, choices=[1, 2, 3, 4], default=1)
    return parser.parse_args()
431
def _set_log_level(verbosity):
    """
    Configure root logging from the -v flag.

    1-2 -> INFO, 3-4 -> DEBUG (with file/line in the format). The original
    only handled 1 and 3, silently leaving 2 and 4 — both accepted by
    argparse choices — at the default WARNING level.
    """
    if verbosity >= 3:
        log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s'
        log.basicConfig(level=log.DEBUG, format=log_format)
    else:
        log_format = '%(asctime)s %(levelname)-8s %(message)s'
        log.basicConfig(level=log.INFO, format=log_format)
440
# script entry point: run the full OTU pipeline when executed directly
if __name__ == "__main__":
    main()