comparison otu.py @ 4:bb29ae8708b5 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 7036ce0e06b6dc64332b1a5642fc58928523c5c6
author iuc
date Tue, 13 May 2025 11:52:17 +0000
parents f8ebd1e802d7
children
comparison
equal deleted inserted replaced
3:f8ebd1e802d7 4:bb29ae8708b5
2 2
3 3
4 # Name: virAnnot_otu 4 # Name: virAnnot_otu
5 # Author: Marie Lefebvre - INRAE 5 # Author: Marie Lefebvre - INRAE
6 # Requirements: Ete3 toolkit and external apps 6 # Requirements: Ete3 toolkit and external apps
7 # Aims: Create viral OTUs based on RPS and Blast annotations 7
8 8 """Create viral OTUs based on RPS and Blast annotations"""
9 9
10 import argparse 10 import argparse
11 import csv 11 import csv
12 import logging as log 12 import logging as log
13 import os 13 import os
63 startQ = int(row[5]) 63 startQ = int(row[5])
64 endQ = int(row[6]) 64 endQ = int(row[6])
65 frame = float(row[7]) 65 frame = float(row[7])
66 description = row[8] 66 description = row[8]
67 superkingdom = row[9] 67 superkingdom = row[9]
68 try:
69 pident = row[10]
70 except IndexError:
71 log.info(rps_file[0])
72 log.info(row)
68 match = re.search("Viruses", superkingdom) 73 match = re.search("Viruses", superkingdom)
69 # if contig is viral then retrieve sequence 74 # if contig is viral then retrieve sequence
70 if match: 75 if match and float(pident) >= options.viral_portion:
71 options.fasta.sort() 76 options.fasta.sort()
72 seq = _retrieve_fasta_seq(options.fasta[i][0], query_id) 77 seq = _retrieve_fasta_seq(options.fasta[i][0], query_id)
73 seq_length = len(seq) 78 seq_length = len(seq)
74 if endQ < seq_length: 79 if endQ < seq_length:
75 seq = seq[startQ - 1:endQ] 80 seq = seq[startQ - 1:endQ]
101 if "nb" not in collection[cdd_id][query_id]: 106 if "nb" not in collection[cdd_id][query_id]:
102 collection[cdd_id][query_id]["nb"] = 0 107 collection[cdd_id][query_id]["nb"] = 0
103 if "taxonomy" not in collection[cdd_id][query_id]: 108 if "taxonomy" not in collection[cdd_id][query_id]:
104 collection[cdd_id][query_id]["taxonomy"] = "Unknown" 109 collection[cdd_id][query_id]["taxonomy"] = "Unknown"
105 else: 110 else:
106 log.info("No blast file") 111 log.debug("No blast file")
107 collection[cdd_id][query_id]["taxonomy"] = "Unknown" 112 collection[cdd_id][query_id]["taxonomy"] = "Unknown"
108 collection[cdd_id][query_id]["nb"] = 0 113 collection[cdd_id][query_id]["nb"] = 0
109 114 # keep pfamXXX and RdRp 1
110 collection[cdd_id]["short_description"] = description.split(",")[0] + description.split(",")[1] # keep pfamXXX and RdRp 1 115 collection[cdd_id]["short_description"] = description.split(",")[0] + description.split(",")[1]
111 collection[cdd_id]["full_description"] = description 116 collection[cdd_id]["full_description"] = description
112 i += 1 117 i += 1
118 if options.merge_rdrp == "yes":
119 rdrp_list = ["pfam00680", "pfam02123", "pfam00978", "pfam00998"]
120 collection["RdRp_merge"] = {}
121 for cdd_id in collection:
122 if cdd_id in rdrp_list and cdd_id != "RdRp_merge":
123 log.info("Add " + cdd_id + " in merge")
124 for query_id in collection[cdd_id]:
125 if query_id not in collection["RdRp_merge"]:
126 collection["RdRp_merge"][query_id] = {}
127 collection["RdRp_merge"][query_id] = collection[cdd_id][query_id]
113 return collection 128 return collection
114 129
115 130
116 def _retrieve_fasta_seq(fasta_file, query_id): 131 def _retrieve_fasta_seq(fasta_file, query_id):
117 """ 132 """
179 log.info("Align sequences") 194 log.info("Align sequences")
180 if not os.path.exists(options.output): 195 if not os.path.exists(options.output):
181 os.mkdir(options.output) 196 os.mkdir(options.output)
182 color_by_sample = {} 197 color_by_sample = {}
183 for cdd_id in hits_collection: 198 for cdd_id in hits_collection:
184 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") 199 log.info("align seq for " + cdd_id)
200 if cdd_id == "RdRp_merge":
201 cdd_output = options.output + "/" + cdd_id
202 else:
203 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
185 if not os.path.exists(cdd_output): 204 if not os.path.exists(cdd_output):
186 os.mkdir(cdd_output) 205 os.mkdir(cdd_output)
187 if os.path.exists(cdd_output + "/seq_to_align.fasta"): 206 if os.path.exists(cdd_output + "/seq_to_align.fasta"):
188 os.remove(cdd_output + "/seq_to_align.fasta") 207 os.remove(cdd_output + "/seq_to_align.fasta")
189 if os.path.exists(cdd_output + "/seq_nucc.fasta"): 208 if os.path.exists(cdd_output + "/seq_nucc.fasta"):
221 240
222 # create tree plot with colors 241 # create tree plot with colors
223 file_matrix = cdd_output + "/identity_matrix.csv" 242 file_matrix = cdd_output + "/identity_matrix.csv"
224 log.info("Create tree...") 243 log.info("Create tree...")
225 _create_tree(tree_file, file_seq_aligned, tree_file + '.png', file_color_config) 244 _create_tree(tree_file, file_seq_aligned, tree_file + '.png', file_color_config)
226 _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id) 245 _compute_pairwise_distance(file_seq_aligned, file_matrix, cdd_id)
227 log.info("Retrieve OTUs...") 246 log.info("Retrieve OTUs...")
228 # if os.path.exists(file_cluster): 247 # if os.path.exists(file_cluster):
229 # os.remove(file_cluster) 248 # os.remove(file_cluster)
230 otu_cmd = os.path.join(options.tool_path, 'seek_otu.R') + ' ' + file_matrix + ' ' + file_cluster + ' ' + str(options.perc) 249 otu_cmd = os.path.join(options.tool_path, 'seek_otu.R') + ' ' + file_matrix + ' ' + file_cluster + ' ' + str(options.perc)
231 log.debug(otu_cmd) 250 log.debug(otu_cmd)
239 f = open(file_cluster, "w+") 258 f = open(file_cluster, "w+")
240 f.write('OTU_1,1,' + list(hits_collection[cdd_id].keys())[0] + ',') 259 f.write('OTU_1,1,' + list(hits_collection[cdd_id].keys())[0] + ',')
241 f.close() 260 f.close()
242 261
243 262
244 def _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id): 263 def _compute_pairwise_distance(file_seq_aligned, file_matrix, cdd_id):
245 """ 264 """
246 Calculate pairwise distance between aligned protein sequences 265 Calculate pairwise distance between aligned protein sequences
247 from a cdd_id 266 from a cdd_id
248 """ 267 """
249 log.info("Compute pairwise distance of " + cdd_id) 268 log.info("Compute pairwise distance of " + cdd_id)
295 file_xlsx = options.output + '/otu_stats.xlsx' # Create a workbook 314 file_xlsx = options.output + '/otu_stats.xlsx' # Create a workbook
296 workbook = xlsxwriter.Workbook(file_xlsx) 315 workbook = xlsxwriter.Workbook(file_xlsx)
297 log.info("Writing stats to " + file_xlsx) 316 log.info("Writing stats to " + file_xlsx)
298 for cdd_id in hits_collection: 317 for cdd_id in hits_collection:
299 otu_collection = {} 318 otu_collection = {}
300 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") 319 if cdd_id == "RdRp_merge":
301 worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"]) # add a worksheet 320 cdd_output = options.output + "/" + cdd_id
321 worksheet = workbook.add_worksheet(cdd_id)
322 else:
323
324 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
325 worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"]) # add a worksheet
302 file_cluster = cdd_output + '/otu_cluster.csv' 326 file_cluster = cdd_output + '/otu_cluster.csv'
303 file_fasta_nucc = cdd_output + '/representative_nucc.fasta' 327 file_fasta_nucc = cdd_output + '/representative_nucc.fasta'
304 with open(file_cluster, 'r') as clust: 328 with open(file_cluster, 'r') as clust:
305 otu_reader = csv.reader(clust, delimiter=',') 329 otu_reader = csv.reader(clust, delimiter=',')
306 samples_list = [] 330 samples_list = []
313 samples_list.append(sample) if sample not in samples_list else samples_list 337 samples_list.append(sample) if sample not in samples_list else samples_list
314 if sample not in otu_collection[row[0]]: 338 if sample not in otu_collection[row[0]]:
315 otu_collection[row[0]][sample] = {} 339 otu_collection[row[0]][sample] = {}
316 otu_collection[row[0]][sample][contig] = {} 340 otu_collection[row[0]][sample][contig] = {}
317 # add read number of the contig and annotation 341 # add read number of the contig and annotation
318 if 'nb' in hits_collection[cdd_id][contig]: 342 if contig in hits_collection[cdd_id]:
319 otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"] 343 if 'nb' in hits_collection[cdd_id][contig]:
344 otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"]
345 else:
346 otu_collection[row[0]][sample][contig]['nb'] = 0
347 if 'taxonomy' in hits_collection[cdd_id][contig]:
348 otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
349 else:
350 otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown'
320 else: 351 else:
321 otu_collection[row[0]][sample][contig]['nb'] = 0 352 otu_collection[row[0]][sample][contig] = {'nb': 0, 'taxonomy': 'unknown'}
322 if 'taxonomy' in hits_collection[cdd_id][contig]:
323 otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
324 else:
325 otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown'
326 else: 353 else:
327 otu_collection[row[0]][sample][contig] = {} 354 otu_collection[row[0]][sample][contig] = {}
328 # add read number of the contig and annotation 355 # add read number of the contig and annotation
329 if 'nb' in hits_collection[cdd_id][contig]: 356 if contig in hits_collection[cdd_id]:
330 otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"] 357 if 'nb' in hits_collection[cdd_id][contig]:
358 otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"]
359 else:
360 otu_collection[row[0]][sample][contig]['nb'] = 0
361 if 'taxonomy' in hits_collection[cdd_id][contig]:
362 otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
363 else:
364 otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown'
331 else: 365 else:
332 otu_collection[row[0]][sample][contig]['nb'] = 0 366 otu_collection[row[0]][sample][contig] = {'nb': 0, 'taxonomy': 'unknown'}
333 if 'taxonomy' in hits_collection[cdd_id][contig]:
334 otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
335 else:
336 otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown'
337 if 'taxonomy' in hits_collection[cdd_id][contig]: 367 if 'taxonomy' in hits_collection[cdd_id][contig]:
338 otu_collection[row[0]]['global_taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] 368 otu_collection[row[0]]['global_taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
339 else: 369 else:
340 otu_collection[row[0]]['global_taxonomy'] = 'unknown' 370 otu_collection[row[0]]['global_taxonomy'] = 'unknown'
341 371
360 worksheet.write(row, column + 2, 'contigs_list') 390 worksheet.write(row, column + 2, 'contigs_list')
361 row = 1 391 row = 1
362 # column = 0 392 # column = 0
363 with open(file_fasta_nucc, "w+") as f_nucc: 393 with open(file_fasta_nucc, "w+") as f_nucc:
364 for otu in otu_collection: 394 for otu in otu_collection:
365 log.info(otu)
366 if isinstance(otu_collection[otu], dict): 395 if isinstance(otu_collection[otu], dict):
367 column = 0 396 column = 0
368 worksheet.write(row, column, otu) 397 worksheet.write(row, column, otu)
369 # prepare table with 0 in each cells 398 # prepare table with 0 in each cells
370 for sample in otu_collection[otu]: 399 for sample in otu_collection[otu]:
403 432
404 with open(map_file_path, "w+") as map_file: 433 with open(map_file_path, "w+") as map_file:
405 headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description\n'] 434 headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description\n']
406 map_file.write("\t".join(headers)) 435 map_file.write("\t".join(headers))
407 for cdd_id in hits_collection: 436 for cdd_id in hits_collection:
408 cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_") 437 if cdd_id == "RdRp_merge":
438 cdd_output = "RdRp_merge"
439 else:
440 cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_")
409 short_description = cdd_output 441 short_description = cdd_output
410 file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa' 442 file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa'
411 tree_file = cdd_output + '/tree.dnd.png' 443 tree_file = cdd_output + '/tree.dnd.png'
412 file_cluster = cdd_output + '/otu_cluster.csv' 444 file_cluster = cdd_output + '/otu_cluster.csv'
413 file_matrix = cdd_output + "/identity_matrix.csv" 445 file_matrix = cdd_output + "/identity_matrix.csv"
420 log.debug(html_cmd) 452 log.debug(html_cmd)
421 os.system(html_cmd) 453 os.system(html_cmd)
422 454
423 455
424 def _set_options(): 456 def _set_options():
457 """
458 Set parameters
459 """
425 parser = argparse.ArgumentParser() 460 parser = argparse.ArgumentParser()
426 parser.add_argument('-b', '--blast', help='TAB blast file from blast2ecsv module.', action='append', required=False, dest='blast', nargs='+') 461 parser.add_argument('-b', '--blast', help='TAB blast file from blast2ecsv module.', action='append', required=False, dest='blast', nargs='+')
427 parser.add_argument('-r', '--rps', help='TAB rpsblast file from rps2ecsv module.', action='append', required=True, dest='rps', nargs='+') 462 parser.add_argument('-r', '--rps', help='TAB rpsblast file from rps2ecsv module.', action='append', required=True, dest='rps', nargs='+')
428 parser.add_argument('-f', '--fasta', help='FASTA file with contigs', action='append', required=True, dest='fasta', nargs='+') 463 parser.add_argument('-f', '--fasta', help='FASTA file with contigs', action='append', required=True, dest='fasta', nargs='+')
429 parser.add_argument('-p', '--percentage', help='Percentage similarity threshold for OTUs cutoff.', action='store', type=int, default=90, dest='perc') 464 parser.add_argument('-p', '--percentage', help='Percentage similarity threshold for OTUs cutoff.', action='store', type=int, default=90, dest='perc')
430 parser.add_argument('-vp', '--viral_portion', help='Minimun portion of viral sequences in RPS domain to be included.', action='store', type=float, default=0.3, dest='viral_portion') 465 parser.add_argument('-vp', '--viral_portion', help='Minimun portion of viral sequences in RPS domain to be included.', action='store', type=float, default=0.3, dest='viral_portion')
431 parser.add_argument('-mpl', '--min_protein_length', help='Minimum query protein length.', action='store', type=int, default=100, dest='min_protein_length') 466 parser.add_argument('-mpl', '--min_protein_length', help='Minimum query protein length.', action='store', type=int, default=100, dest='min_protein_length')
467 parser.add_argument('-m', '--merge_rdrp', help='Merge RdRp1, 2, 3 and 4 to create otu on it.', action='store', type=str, default="no", dest='merge_rdrp')
432 parser.add_argument('-tp', '--tool_path', help='Path to otu_seek.R', action='store', type=str, default='./', dest='tool_path') 468 parser.add_argument('-tp', '--tool_path', help='Path to otu_seek.R', action='store', type=str, default='./', dest='tool_path')
433 parser.add_argument('-o', '--out', help='The output directory', action='store', type=str, default='./Rps2tree_OTU', dest='output') 469 parser.add_argument('-o', '--out', help='The output directory', action='store', type=str, default='./Rps2tree_OTU', dest='output')
434 parser.add_argument('-rgb', '--rgb-conf', help='Color palette for contigs coloration', action='store', type=str, default='rgb.txt', dest='file_rgb') 470 parser.add_argument('-rgb', '--rgb-conf', help='Color palette for contigs coloration', action='store', type=str, default='rgb.txt', dest='file_rgb')
435 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1) 471 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1)
436 args = parser.parse_args() 472 args = parser.parse_args()
437 return args 473 return args
438 474
439 475
440 def _set_log_level(verbosity): 476 def _set_log_level(verbosity):
477 """
478 Debug
479 """
441 if verbosity == 1: 480 if verbosity == 1:
442 log_format = '%(asctime)s %(levelname)-8s %(message)s' 481 log_format = '%(asctime)s %(levelname)-8s %(message)s'
443 log.basicConfig(level=log.INFO, format=log_format) 482 log.basicConfig(level=log.INFO, format=log_format)
444 elif verbosity == 3: 483 elif verbosity == 3:
445 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s' 484 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s'