comparison otu.py @ 3:f8ebd1e802d7 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 16701bfbffd605805e847897799251ab748f559f
author iuc
date Sun, 08 Sep 2024 14:09:19 +0000
parents e889010415a1
children bb29ae8708b5
comparison
equal deleted inserted replaced
2:77c3ef9b0ed7 3:f8ebd1e802d7
184 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") 184 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
185 if not os.path.exists(cdd_output): 185 if not os.path.exists(cdd_output):
186 os.mkdir(cdd_output) 186 os.mkdir(cdd_output)
187 if os.path.exists(cdd_output + "/seq_to_align.fasta"): 187 if os.path.exists(cdd_output + "/seq_to_align.fasta"):
188 os.remove(cdd_output + "/seq_to_align.fasta") 188 os.remove(cdd_output + "/seq_to_align.fasta")
189 if os.path.exists(cdd_output + "/seq_nucc.fasta"):
190 os.remove(cdd_output + "/seq_nucc.fasta")
189 file_seq_to_align = cdd_output + "/seq_to_align.fasta" 191 file_seq_to_align = cdd_output + "/seq_to_align.fasta"
190 file_color_config = cdd_output + "/color_config.txt" 192 file_color_config = cdd_output + "/color_config.txt"
191 f = open(file_seq_to_align, "a") 193 f = open(file_seq_to_align, "a")
192 f_c = open(file_color_config, "w+") 194 f_c = open(file_color_config, "w+")
193 log.info("Writing to " + file_seq_to_align) 195 log.info("Writing to " + file_seq_to_align)
296 for cdd_id in hits_collection: 298 for cdd_id in hits_collection:
297 otu_collection = {} 299 otu_collection = {}
298 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") 300 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
299 worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"]) # add a worksheet 301 worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"]) # add a worksheet
300 file_cluster = cdd_output + '/otu_cluster.csv' 302 file_cluster = cdd_output + '/otu_cluster.csv'
303 file_fasta_nucc = cdd_output + '/representative_nucc.fasta'
301 with open(file_cluster, 'r') as clust: 304 with open(file_cluster, 'r') as clust:
302 otu_reader = csv.reader(clust, delimiter=',') 305 otu_reader = csv.reader(clust, delimiter=',')
303 samples_list = [] 306 samples_list = []
304 for row in otu_reader: 307 for row in otu_reader:
305 contigs_list = row[2:len(row) - 1] # remove last empty column 308 contigs_list = row[2:len(row) - 1] # remove last empty column
340 for otu in otu_collection: 343 for otu in otu_collection:
341 for sample in otu_collection[otu]: 344 for sample in otu_collection[otu]:
342 if sample not in ['contigs_list', 'global_taxonomy']: 345 if sample not in ['contigs_list', 'global_taxonomy']:
343 total_nb_read = 0 346 total_nb_read = 0
344 for contig in otu_collection[otu][sample]: 347 for contig in otu_collection[otu][sample]:
348 if otu_collection[otu][sample][contig]['nb'] == '':
349 otu_collection[otu][sample][contig]['nb'] = 0
345 total_nb_read += int(otu_collection[otu][sample][contig]['nb']) 350 total_nb_read += int(otu_collection[otu][sample][contig]['nb'])
346 otu_collection[otu][sample]['total_nb_read'] = total_nb_read 351 otu_collection[otu][sample]['total_nb_read'] = total_nb_read
347 row = 0 352 row = 0
348 column = 0 353 column = 0
349 item = '#OTU_name' 354 item = '#OTU_name'
353 worksheet.write(row, column, samp) 358 worksheet.write(row, column, samp)
354 worksheet.write(row, column + 1, 'taxonomy') 359 worksheet.write(row, column + 1, 'taxonomy')
355 worksheet.write(row, column + 2, 'contigs_list') 360 worksheet.write(row, column + 2, 'contigs_list')
356 row = 1 361 row = 1
357 # column = 0 362 # column = 0
358 for otu in otu_collection: 363 with open(file_fasta_nucc, "w+") as f_nucc:
359 if isinstance(otu_collection[otu], dict): 364 for otu in otu_collection:
360 column = 0 365 log.info(otu)
361 worksheet.write(row, column, otu) 366 if isinstance(otu_collection[otu], dict):
362 # prepare table with 0 in each cells 367 column = 0
363 for sample in otu_collection[otu]: 368 worksheet.write(row, column, otu)
364 column = 1 369 # prepare table with 0 in each cells
365 for samp in samples_list: 370 for sample in otu_collection[otu]:
366 worksheet.write(row, column, 0) 371 column = 1
367 column += 1 372 for samp in samples_list:
368 # fill in table with nb of read for each sample and each OTU 373 worksheet.write(row, column, 0)
369 for sample in otu_collection[otu]: 374 column += 1
370 column = 1 375 # fill in table with nb of read for each sample and each OTU
371 for samp in samples_list: 376 for sample in otu_collection[otu]:
372 if samp == sample: 377 column = 1
373 worksheet.write(row, column, otu_collection[otu][sample]['total_nb_read']) 378 for samp in samples_list:
374 column += 1 379 if samp == sample:
375 worksheet.write(row, len(samples_list) + 1, otu_collection[otu]['global_taxonomy'].replace(';', ' ')) 380 worksheet.write(row, column, otu_collection[otu][sample]['total_nb_read'])
376 worksheet.write(row, len(samples_list) + 2, ",".join(otu_collection[otu]['contigs_list'])) 381 column += 1
377 row += 1 382 worksheet.write(row, len(samples_list) + 1, otu_collection[otu]['global_taxonomy'].replace(';', ' '))
383 worksheet.write(row, len(samples_list) + 2, ",".join(otu_collection[otu]['contigs_list']))
384 row += 1
385 f_nucc.write(">" + cdd_id + "_" + otu + "_" + otu_collection[otu]['contigs_list'][0] + "\n")
386 f_nucc.write(str(hits_collection[cdd_id][otu_collection[otu]['contigs_list'][0]]['nuccleotide']) + "\n")
378 workbook.close() 387 workbook.close()
379 read_file = pd.ExcelFile(file_xlsx) 388 read_file = pd.ExcelFile(file_xlsx)
380 for sheet in read_file.sheet_names: 389 for sheet in read_file.sheet_names:
381 cluster_nb_reads_file = options.output + "/" + sheet.replace(" ", "_") + "/cluster_nb_reads_files.tab" 390 cluster_nb_reads_file = options.output + "/" + sheet.replace(" ", "_") + "/cluster_nb_reads_files.tab"
382 data_xls = pd.read_excel(file_xlsx, sheet, dtype=str, index_col=None) 391 data_xls = pd.read_excel(file_xlsx, sheet, dtype=str, index_col=None)
390 # create mapping file with all informations to use to create HTML report 399 # create mapping file with all informations to use to create HTML report
391 map_file_path = options.output + "/map.txt" 400 map_file_path = options.output + "/map.txt"
392 if os.path.exists(map_file_path): 401 if os.path.exists(map_file_path):
393 os.remove(map_file_path) 402 os.remove(map_file_path)
394 403
395 map_file = open(map_file_path, "w+") 404 with open(map_file_path, "w+") as map_file:
396 headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description\n'] 405 headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description\n']
397 map_file.write("\t".join(headers)) 406 map_file.write("\t".join(headers))
398 for cdd_id in hits_collection: 407 for cdd_id in hits_collection:
399 cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_") 408 cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_")
400 short_description = cdd_output 409 short_description = cdd_output
401 file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa' 410 file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa'
402 tree_file = cdd_output + '/tree.dnd.png' 411 tree_file = cdd_output + '/tree.dnd.png'
403 file_cluster = cdd_output + '/otu_cluster.csv' 412 file_cluster = cdd_output + '/otu_cluster.csv'
404 file_matrix = cdd_output + "/identity_matrix.csv" 413 file_matrix = cdd_output + "/identity_matrix.csv"
405 cluster_nb_reads_files = cdd_output + "/cluster_nb_reads_files.tab" 414 cluster_nb_reads_files = cdd_output + "/cluster_nb_reads_files.tab"
406 map_file.write(cdd_id + "\t" + file_seq_aligned + "\t" + tree_file + "\t") 415 map_file.write(cdd_id + "\t" + file_seq_aligned + "\t" + tree_file + "\t")
407 map_file.write(file_cluster + "\t" + cluster_nb_reads_files + "\t" + file_matrix + "\t") 416 map_file.write(file_cluster + "\t" + cluster_nb_reads_files + "\t" + file_matrix + "\t")
408 map_file.write(short_description + "\t" + hits_collection[cdd_id]["full_description"] + "\n") 417 map_file.write(short_description + "\t" + hits_collection[cdd_id]["full_description"] + "\n")
409 map_file.close()
410 log.info("Writing HTML report") 418 log.info("Writing HTML report")
411 html_cmd = os.path.join(options.tool_path, 'rps2tree_html.py') + ' -m ' + map_file_path + ' -o ' + options.output 419 html_cmd = os.path.join(options.tool_path, 'rps2tree_html.py') + ' -m ' + map_file_path + ' -o ' + options.output
412 log.debug(html_cmd) 420 log.debug(html_cmd)
413 os.system(html_cmd) 421 os.system(html_cmd)
414 422