Mercurial > repos > iuc > virannot_blast2tsv
comparison otu.py @ 4:bb29ae8708b5 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 7036ce0e06b6dc64332b1a5642fc58928523c5c6
author | iuc |
---|---|
date | Tue, 13 May 2025 11:52:17 +0000 |
parents | f8ebd1e802d7 |
children |
comparison
equal
deleted
inserted
replaced
3:f8ebd1e802d7 | 4:bb29ae8708b5 |
---|---|
2 | 2 |
3 | 3 |
4 # Name: virAnnot_otu | 4 # Name: virAnnot_otu |
5 # Author: Marie Lefebvre - INRAE | 5 # Author: Marie Lefebvre - INRAE |
6 # Requirements: Ete3 toolkit and external apps | 6 # Requirements: Ete3 toolkit and external apps |
7 # Aims: Create viral OTUs based on RPS and Blast annotations | 7 |
8 | 8 """Create viral OTUs based on RPS and Blast annotations""" |
9 | 9 |
10 import argparse | 10 import argparse |
11 import csv | 11 import csv |
12 import logging as log | 12 import logging as log |
13 import os | 13 import os |
63 startQ = int(row[5]) | 63 startQ = int(row[5]) |
64 endQ = int(row[6]) | 64 endQ = int(row[6]) |
65 frame = float(row[7]) | 65 frame = float(row[7]) |
66 description = row[8] | 66 description = row[8] |
67 superkingdom = row[9] | 67 superkingdom = row[9] |
68 try: | |
69 pident = row[10] | |
70 except IndexError: | |
71 log.info(rps_file[0]) | |
72 log.info(row) | |
68 match = re.search("Viruses", superkingdom) | 73 match = re.search("Viruses", superkingdom) |
69 # if contig is viral then retrieve sequence | 74 # if contig is viral then retrieve sequence |
70 if match: | 75 if match and float(pident) >= options.viral_portion: |
71 options.fasta.sort() | 76 options.fasta.sort() |
72 seq = _retrieve_fasta_seq(options.fasta[i][0], query_id) | 77 seq = _retrieve_fasta_seq(options.fasta[i][0], query_id) |
73 seq_length = len(seq) | 78 seq_length = len(seq) |
74 if endQ < seq_length: | 79 if endQ < seq_length: |
75 seq = seq[startQ - 1:endQ] | 80 seq = seq[startQ - 1:endQ] |
101 if "nb" not in collection[cdd_id][query_id]: | 106 if "nb" not in collection[cdd_id][query_id]: |
102 collection[cdd_id][query_id]["nb"] = 0 | 107 collection[cdd_id][query_id]["nb"] = 0 |
103 if "taxonomy" not in collection[cdd_id][query_id]: | 108 if "taxonomy" not in collection[cdd_id][query_id]: |
104 collection[cdd_id][query_id]["taxonomy"] = "Unknown" | 109 collection[cdd_id][query_id]["taxonomy"] = "Unknown" |
105 else: | 110 else: |
106 log.info("No blast file") | 111 log.debug("No blast file") |
107 collection[cdd_id][query_id]["taxonomy"] = "Unknown" | 112 collection[cdd_id][query_id]["taxonomy"] = "Unknown" |
108 collection[cdd_id][query_id]["nb"] = 0 | 113 collection[cdd_id][query_id]["nb"] = 0 |
109 | 114 # keep pfamXXX and RdRp 1 |
110 collection[cdd_id]["short_description"] = description.split(",")[0] + description.split(",")[1] # keep pfamXXX and RdRp 1 | 115 collection[cdd_id]["short_description"] = description.split(",")[0] + description.split(",")[1] |
111 collection[cdd_id]["full_description"] = description | 116 collection[cdd_id]["full_description"] = description |
112 i += 1 | 117 i += 1 |
118 if options.merge_rdrp == "yes": | |
119 rdrp_list = ["pfam00680", "pfam02123", "pfam00978", "pfam00998"] | |
120 collection["RdRp_merge"] = {} | |
121 for cdd_id in collection: | |
122 if cdd_id in rdrp_list and cdd_id != "RdRp_merge": | |
123 log.info("Add " + cdd_id + " in merge") | |
124 for query_id in collection[cdd_id]: | |
125 if query_id not in collection["RdRp_merge"]: | |
126 collection["RdRp_merge"][query_id] = {} | |
127 collection["RdRp_merge"][query_id] = collection[cdd_id][query_id] | |
113 return collection | 128 return collection |
114 | 129 |
115 | 130 |
116 def _retrieve_fasta_seq(fasta_file, query_id): | 131 def _retrieve_fasta_seq(fasta_file, query_id): |
117 """ | 132 """ |
179 log.info("Align sequences") | 194 log.info("Align sequences") |
180 if not os.path.exists(options.output): | 195 if not os.path.exists(options.output): |
181 os.mkdir(options.output) | 196 os.mkdir(options.output) |
182 color_by_sample = {} | 197 color_by_sample = {} |
183 for cdd_id in hits_collection: | 198 for cdd_id in hits_collection: |
184 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") | 199 log.info("align seq for " + cdd_id) |
200 if cdd_id == "RdRp_merge": | |
201 cdd_output = options.output + "/" + cdd_id | |
202 else: | |
203 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") | |
185 if not os.path.exists(cdd_output): | 204 if not os.path.exists(cdd_output): |
186 os.mkdir(cdd_output) | 205 os.mkdir(cdd_output) |
187 if os.path.exists(cdd_output + "/seq_to_align.fasta"): | 206 if os.path.exists(cdd_output + "/seq_to_align.fasta"): |
188 os.remove(cdd_output + "/seq_to_align.fasta") | 207 os.remove(cdd_output + "/seq_to_align.fasta") |
189 if os.path.exists(cdd_output + "/seq_nucc.fasta"): | 208 if os.path.exists(cdd_output + "/seq_nucc.fasta"): |
221 | 240 |
222 # create tree plot with colors | 241 # create tree plot with colors |
223 file_matrix = cdd_output + "/identity_matrix.csv" | 242 file_matrix = cdd_output + "/identity_matrix.csv" |
224 log.info("Create tree...") | 243 log.info("Create tree...") |
225 _create_tree(tree_file, file_seq_aligned, tree_file + '.png', file_color_config) | 244 _create_tree(tree_file, file_seq_aligned, tree_file + '.png', file_color_config) |
226 _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id) | 245 _compute_pairwise_distance(file_seq_aligned, file_matrix, cdd_id) |
227 log.info("Retrieve OTUs...") | 246 log.info("Retrieve OTUs...") |
228 # if os.path.exists(file_cluster): | 247 # if os.path.exists(file_cluster): |
229 # os.remove(file_cluster) | 248 # os.remove(file_cluster) |
230 otu_cmd = os.path.join(options.tool_path, 'seek_otu.R') + ' ' + file_matrix + ' ' + file_cluster + ' ' + str(options.perc) | 249 otu_cmd = os.path.join(options.tool_path, 'seek_otu.R') + ' ' + file_matrix + ' ' + file_cluster + ' ' + str(options.perc) |
231 log.debug(otu_cmd) | 250 log.debug(otu_cmd) |
239 f = open(file_cluster, "w+") | 258 f = open(file_cluster, "w+") |
240 f.write('OTU_1,1,' + list(hits_collection[cdd_id].keys())[0] + ',') | 259 f.write('OTU_1,1,' + list(hits_collection[cdd_id].keys())[0] + ',') |
241 f.close() | 260 f.close() |
242 | 261 |
243 | 262 |
244 def _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id): | 263 def _compute_pairwise_distance(file_seq_aligned, file_matrix, cdd_id): |
245 """ | 264 """ |
246 Calculate pairwise distance between aligned protein sequences | 265 Calculate pairwise distance between aligned protein sequences |
247 from a cdd_id | 266 from a cdd_id |
248 """ | 267 """ |
249 log.info("Compute pairwise distance of " + cdd_id) | 268 log.info("Compute pairwise distance of " + cdd_id) |
295 file_xlsx = options.output + '/otu_stats.xlsx' # Create a workbook | 314 file_xlsx = options.output + '/otu_stats.xlsx' # Create a workbook |
296 workbook = xlsxwriter.Workbook(file_xlsx) | 315 workbook = xlsxwriter.Workbook(file_xlsx) |
297 log.info("Writing stats to " + file_xlsx) | 316 log.info("Writing stats to " + file_xlsx) |
298 for cdd_id in hits_collection: | 317 for cdd_id in hits_collection: |
299 otu_collection = {} | 318 otu_collection = {} |
300 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") | 319 if cdd_id == "RdRp_merge": |
301 worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"]) # add a worksheet | 320 cdd_output = options.output + "/" + cdd_id |
321 worksheet = workbook.add_worksheet(cdd_id) | |
322 else: | |
323 | |
324 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") | |
325 worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"]) # add a worksheet | |
302 file_cluster = cdd_output + '/otu_cluster.csv' | 326 file_cluster = cdd_output + '/otu_cluster.csv' |
303 file_fasta_nucc = cdd_output + '/representative_nucc.fasta' | 327 file_fasta_nucc = cdd_output + '/representative_nucc.fasta' |
304 with open(file_cluster, 'r') as clust: | 328 with open(file_cluster, 'r') as clust: |
305 otu_reader = csv.reader(clust, delimiter=',') | 329 otu_reader = csv.reader(clust, delimiter=',') |
306 samples_list = [] | 330 samples_list = [] |
313 samples_list.append(sample) if sample not in samples_list else samples_list | 337 samples_list.append(sample) if sample not in samples_list else samples_list |
314 if sample not in otu_collection[row[0]]: | 338 if sample not in otu_collection[row[0]]: |
315 otu_collection[row[0]][sample] = {} | 339 otu_collection[row[0]][sample] = {} |
316 otu_collection[row[0]][sample][contig] = {} | 340 otu_collection[row[0]][sample][contig] = {} |
317 # add read number of the contig and annotation | 341 # add read number of the contig and annotation |
318 if 'nb' in hits_collection[cdd_id][contig]: | 342 if contig in hits_collection[cdd_id]: |
319 otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"] | 343 if 'nb' in hits_collection[cdd_id][contig]: |
344 otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"] | |
345 else: | |
346 otu_collection[row[0]][sample][contig]['nb'] = 0 | |
347 if 'taxonomy' in hits_collection[cdd_id][contig]: | |
348 otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] | |
349 else: | |
350 otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown' | |
320 else: | 351 else: |
321 otu_collection[row[0]][sample][contig]['nb'] = 0 | 352 otu_collection[row[0]][sample][contig] = {'nb': 0, 'taxonomy': 'unknown'} |
322 if 'taxonomy' in hits_collection[cdd_id][contig]: | |
323 otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] | |
324 else: | |
325 otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown' | |
326 else: | 353 else: |
327 otu_collection[row[0]][sample][contig] = {} | 354 otu_collection[row[0]][sample][contig] = {} |
328 # add read number of the contig and annotation | 355 # add read number of the contig and annotation |
329 if 'nb' in hits_collection[cdd_id][contig]: | 356 if contig in hits_collection[cdd_id]: |
330 otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"] | 357 if 'nb' in hits_collection[cdd_id][contig]: |
358 otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"] | |
359 else: | |
360 otu_collection[row[0]][sample][contig]['nb'] = 0 | |
361 if 'taxonomy' in hits_collection[cdd_id][contig]: | |
362 otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] | |
363 else: | |
364 otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown' | |
331 else: | 365 else: |
332 otu_collection[row[0]][sample][contig]['nb'] = 0 | 366 otu_collection[row[0]][sample][contig] = {'nb': 0, 'taxonomy': 'unknown'} |
333 if 'taxonomy' in hits_collection[cdd_id][contig]: | |
334 otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] | |
335 else: | |
336 otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown' | |
337 if 'taxonomy' in hits_collection[cdd_id][contig]: | 367 if 'taxonomy' in hits_collection[cdd_id][contig]: |
338 otu_collection[row[0]]['global_taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] | 368 otu_collection[row[0]]['global_taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] |
339 else: | 369 else: |
340 otu_collection[row[0]]['global_taxonomy'] = 'unknown' | 370 otu_collection[row[0]]['global_taxonomy'] = 'unknown' |
341 | 371 |
360 worksheet.write(row, column + 2, 'contigs_list') | 390 worksheet.write(row, column + 2, 'contigs_list') |
361 row = 1 | 391 row = 1 |
362 # column = 0 | 392 # column = 0 |
363 with open(file_fasta_nucc, "w+") as f_nucc: | 393 with open(file_fasta_nucc, "w+") as f_nucc: |
364 for otu in otu_collection: | 394 for otu in otu_collection: |
365 log.info(otu) | |
366 if isinstance(otu_collection[otu], dict): | 395 if isinstance(otu_collection[otu], dict): |
367 column = 0 | 396 column = 0 |
368 worksheet.write(row, column, otu) | 397 worksheet.write(row, column, otu) |
369 # prepare table with 0 in each cells | 398 # prepare table with 0 in each cells |
370 for sample in otu_collection[otu]: | 399 for sample in otu_collection[otu]: |
403 | 432 |
404 with open(map_file_path, "w+") as map_file: | 433 with open(map_file_path, "w+") as map_file: |
405 headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description\n'] | 434 headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description\n'] |
406 map_file.write("\t".join(headers)) | 435 map_file.write("\t".join(headers)) |
407 for cdd_id in hits_collection: | 436 for cdd_id in hits_collection: |
408 cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_") | 437 if cdd_id == "RdRp_merge": |
438 cdd_output = "RdRp_merge" | |
439 else: | |
440 cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_") | |
409 short_description = cdd_output | 441 short_description = cdd_output |
410 file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa' | 442 file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa' |
411 tree_file = cdd_output + '/tree.dnd.png' | 443 tree_file = cdd_output + '/tree.dnd.png' |
412 file_cluster = cdd_output + '/otu_cluster.csv' | 444 file_cluster = cdd_output + '/otu_cluster.csv' |
413 file_matrix = cdd_output + "/identity_matrix.csv" | 445 file_matrix = cdd_output + "/identity_matrix.csv" |
420 log.debug(html_cmd) | 452 log.debug(html_cmd) |
421 os.system(html_cmd) | 453 os.system(html_cmd) |
422 | 454 |
423 | 455 |
424 def _set_options(): | 456 def _set_options(): |
457 """ | |
458 Set parameters | |
459 """ | |
425 parser = argparse.ArgumentParser() | 460 parser = argparse.ArgumentParser() |
426 parser.add_argument('-b', '--blast', help='TAB blast file from blast2ecsv module.', action='append', required=False, dest='blast', nargs='+') | 461 parser.add_argument('-b', '--blast', help='TAB blast file from blast2ecsv module.', action='append', required=False, dest='blast', nargs='+') |
427 parser.add_argument('-r', '--rps', help='TAB rpsblast file from rps2ecsv module.', action='append', required=True, dest='rps', nargs='+') | 462 parser.add_argument('-r', '--rps', help='TAB rpsblast file from rps2ecsv module.', action='append', required=True, dest='rps', nargs='+') |
428 parser.add_argument('-f', '--fasta', help='FASTA file with contigs', action='append', required=True, dest='fasta', nargs='+') | 463 parser.add_argument('-f', '--fasta', help='FASTA file with contigs', action='append', required=True, dest='fasta', nargs='+') |
429 parser.add_argument('-p', '--percentage', help='Percentage similarity threshold for OTUs cutoff.', action='store', type=int, default=90, dest='perc') | 464 parser.add_argument('-p', '--percentage', help='Percentage similarity threshold for OTUs cutoff.', action='store', type=int, default=90, dest='perc') |
430 parser.add_argument('-vp', '--viral_portion', help='Minimum portion of viral sequences in RPS domain to be included.', action='store', type=float, default=0.3, dest='viral_portion') | 465 parser.add_argument('-vp', '--viral_portion', help='Minimum portion of viral sequences in RPS domain to be included.', action='store', type=float, default=0.3, dest='viral_portion') |
431 parser.add_argument('-mpl', '--min_protein_length', help='Minimum query protein length.', action='store', type=int, default=100, dest='min_protein_length') | 466 parser.add_argument('-mpl', '--min_protein_length', help='Minimum query protein length.', action='store', type=int, default=100, dest='min_protein_length') |
467 parser.add_argument('-m', '--merge_rdrp', help='Merge RdRp1, 2, 3 and 4 to create otu on it.', action='store', type=str, default="no", dest='merge_rdrp') | |
432 parser.add_argument('-tp', '--tool_path', help='Path to otu_seek.R', action='store', type=str, default='./', dest='tool_path') | 468 parser.add_argument('-tp', '--tool_path', help='Path to otu_seek.R', action='store', type=str, default='./', dest='tool_path') |
433 parser.add_argument('-o', '--out', help='The output directory', action='store', type=str, default='./Rps2tree_OTU', dest='output') | 469 parser.add_argument('-o', '--out', help='The output directory', action='store', type=str, default='./Rps2tree_OTU', dest='output') |
434 parser.add_argument('-rgb', '--rgb-conf', help='Color palette for contigs coloration', action='store', type=str, default='rgb.txt', dest='file_rgb') | 470 parser.add_argument('-rgb', '--rgb-conf', help='Color palette for contigs coloration', action='store', type=str, default='rgb.txt', dest='file_rgb') |
435 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1) | 471 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1) |
436 args = parser.parse_args() | 472 args = parser.parse_args() |
437 return args | 473 return args |
438 | 474 |
439 | 475 |
440 def _set_log_level(verbosity): | 476 def _set_log_level(verbosity): |
477 """ | |
478 Debug |
479 """ | |
441 if verbosity == 1: | 480 if verbosity == 1: |
442 log_format = '%(asctime)s %(levelname)-8s %(message)s' | 481 log_format = '%(asctime)s %(levelname)-8s %(message)s' |
443 log.basicConfig(level=log.INFO, format=log_format) | 482 log.basicConfig(level=log.INFO, format=log_format) |
444 elif verbosity == 3: | 483 elif verbosity == 3: |
445 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s' | 484 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s' |