diff otu.py @ 3:40fb54cc6628 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 16701bfbffd605805e847897799251ab748f559f
author iuc
date Sun, 08 Sep 2024 14:09:31 +0000
parents c9dac9b2e01c
children adcf06db3030
--- a/otu.py	Wed Aug 21 13:13:50 2024 +0000
+++ b/otu.py	Sun Sep 08 14:09:31 2024 +0000
@@ -186,6 +186,8 @@
             os.mkdir(cdd_output)
         if os.path.exists(cdd_output + "/seq_to_align.fasta"):
             os.remove(cdd_output + "/seq_to_align.fasta")
+        if os.path.exists(cdd_output + "/seq_nucc.fasta"):
+            os.remove(cdd_output + "/seq_nucc.fasta")
         file_seq_to_align = cdd_output + "/seq_to_align.fasta"
         file_color_config = cdd_output + "/color_config.txt"
         f = open(file_seq_to_align, "a")
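Note on this hunk: seq_to_align.fasta is opened in append mode just below ("a"), so it is pre-deleted to keep reruns from appending onto stale output, and the new seq_nucc.fasta is given the same cleanup. A minimal sketch of the same idea factored into a helper (the helper name and the use of os.path.join are not in the commit; they are assumed for illustration):

    import os

    def reset_outputs(cdd_output, filenames=("seq_to_align.fasta", "seq_nucc.fasta")):
        # Remove leftovers from a previous run so append-mode writes start from scratch.
        for name in filenames:
            path = os.path.join(cdd_output, name)
            if os.path.exists(path):
                os.remove(path)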
@@ -298,6 +300,7 @@
         cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
         worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"])  # add a worksheet
         file_cluster = cdd_output + '/otu_cluster.csv'
+        file_fasta_nucc = cdd_output + '/representative_nucc.fasta'
         with open(file_cluster, 'r') as clust:
             otu_reader = csv.reader(clust, delimiter=',')
             samples_list = []
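The new representative_nucc.fasta sits next to otu_cluster.csv inside each per-CDD output directory. A small sketch of the same layout built with os.path.join instead of string concatenation (the directory and description values below are made up for illustration):

    import os

    output_dir = "rps2tree_out"                      # stands in for options.output
    short_description = "RdRp 1".replace(" ", "_")   # stands in for hits_collection[cdd_id]["short_description"]

    cdd_output = os.path.join(output_dir, short_description)
    file_cluster = os.path.join(cdd_output, "otu_cluster.csv")
    file_fasta_nucc = os.path.join(cdd_output, "representative_nucc.fasta")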
@@ -342,6 +345,8 @@
                 if sample not in ['contigs_list', 'global_taxonomy']:
                     total_nb_read = 0
                     for contig in otu_collection[otu][sample]:
+                        if otu_collection[otu][sample][contig]['nb'] == '':
+                            otu_collection[otu][sample][contig]['nb'] = 0
                         total_nb_read += int(otu_collection[otu][sample][contig]['nb'])
                     otu_collection[otu][sample]['total_nb_read'] = total_nb_read
         row = 0
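The two added lines guard against empty read counts in otu_cluster.csv: without them, int('') raises ValueError while total_nb_read is being summed. A hedged sketch of the same defence as a small helper (read_count is a hypothetical name, not part of the commit):

    def read_count(value):
        # Treat a blank or missing count as zero instead of letting int() raise ValueError.
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    assert read_count('') == 0 and read_count('12') == 12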
@@ -355,26 +360,30 @@
         worksheet.write(row, column + 2, 'contigs_list')
         row = 1
         # column = 0
-        for otu in otu_collection:
-            if isinstance(otu_collection[otu], dict):
-                column = 0
-                worksheet.write(row, column, otu)
-                # prepare table with 0 in each cells
-                for sample in otu_collection[otu]:
-                    column = 1
-                    for samp in samples_list:
-                        worksheet.write(row, column, 0)
-                        column += 1
-                # fill in table with nb of read for each sample and each OTU
-                for sample in otu_collection[otu]:
-                    column = 1
-                    for samp in samples_list:
-                        if samp == sample:
-                            worksheet.write(row, column, otu_collection[otu][sample]['total_nb_read'])
-                        column += 1
-                worksheet.write(row, len(samples_list) + 1, otu_collection[otu]['global_taxonomy'].replace(';', ' '))
-                worksheet.write(row, len(samples_list) + 2, ",".join(otu_collection[otu]['contigs_list']))
-                row += 1
+        with open(file_fasta_nucc, "w+") as f_nucc:
+            for otu in otu_collection:
+                log.info(otu)
+                if isinstance(otu_collection[otu], dict):
+                    column = 0
+                    worksheet.write(row, column, otu)
+                    # prepare table with 0 in each cells
+                    for sample in otu_collection[otu]:
+                        column = 1
+                        for samp in samples_list:
+                            worksheet.write(row, column, 0)
+                            column += 1
+                    # fill in table with nb of read for each sample and each OTU
+                    for sample in otu_collection[otu]:
+                        column = 1
+                        for samp in samples_list:
+                            if samp == sample:
+                                worksheet.write(row, column, otu_collection[otu][sample]['total_nb_read'])
+                            column += 1
+                    worksheet.write(row, len(samples_list) + 1, otu_collection[otu]['global_taxonomy'].replace(';', ' '))
+                    worksheet.write(row, len(samples_list) + 2, ",".join(otu_collection[otu]['contigs_list']))
+                    row += 1
+                    f_nucc.write(">" + cdd_id + "_" + otu + "_" + otu_collection[otu]['contigs_list'][0] + "\n")
+                    f_nucc.write(str(hits_collection[cdd_id][otu_collection[otu]['contigs_list'][0]]['nuccleotide']) + "\n")
     workbook.close()
     read_file = pd.ExcelFile(file_xlsx)
     for sheet in read_file.sheet_names:
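Apart from the extra log.info(otu) trace and one level of indentation, the loop body is unchanged; the new part is the enclosing with open(file_fasta_nucc, "w+") block and the two f_nucc.write calls, which emit one FASTA record per OTU using the first contig in contigs_list as the representative (the sequence comes from the 'nuccleotide' key, spelled as in the source). A minimal, self-contained sketch of the record format with made-up values:

    # Hypothetical values standing in for cdd_id, the OTU name, its first contig
    # and hits_collection[cdd_id][contig]['nuccleotide'].
    cdd_id = "pfam00680"
    otu = "OTU_1"
    representative_contig = "contig_42"
    sequence = "ATGGCGTTAACC"

    with open("representative_nucc.fasta", "w") as f_nucc:
        f_nucc.write(">" + cdd_id + "_" + otu + "_" + representative_contig + "\n")
        f_nucc.write(sequence + "\n")
    # representative_nucc.fasta now contains:
    #   >pfam00680_OTU_1_contig_42
    #   ATGGCGTTAACC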
@@ -392,21 +401,20 @@
     if os.path.exists(map_file_path):
         os.remove(map_file_path)
 
-    map_file = open(map_file_path, "w+")
-    headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description\n']
-    map_file.write("\t".join(headers))
-    for cdd_id in hits_collection:
-        cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_")
-        short_description = cdd_output
-        file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa'
-        tree_file = cdd_output + '/tree.dnd.png'
-        file_cluster = cdd_output + '/otu_cluster.csv'
-        file_matrix = cdd_output + "/identity_matrix.csv"
-        cluster_nb_reads_files = cdd_output + "/cluster_nb_reads_files.tab"
-        map_file.write(cdd_id + "\t" + file_seq_aligned + "\t" + tree_file + "\t")
-        map_file.write(file_cluster + "\t" + cluster_nb_reads_files + "\t" + file_matrix + "\t")
-        map_file.write(short_description + "\t" + hits_collection[cdd_id]["full_description"] + "\n")
-    map_file.close()
+    with open(map_file_path, "w+") as map_file:
+        headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description\n']
+        map_file.write("\t".join(headers))
+        for cdd_id in hits_collection:
+            cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_")
+            short_description = cdd_output
+            file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa'
+            tree_file = cdd_output + '/tree.dnd.png'
+            file_cluster = cdd_output + '/otu_cluster.csv'
+            file_matrix = cdd_output + "/identity_matrix.csv"
+            cluster_nb_reads_files = cdd_output + "/cluster_nb_reads_files.tab"
+            map_file.write(cdd_id + "\t" + file_seq_aligned + "\t" + tree_file + "\t")
+            map_file.write(file_cluster + "\t" + cluster_nb_reads_files + "\t" + file_matrix + "\t")
+            map_file.write(short_description + "\t" + hits_collection[cdd_id]["full_description"] + "\n")
     log.info("Writing HTML report")
     html_cmd = os.path.join(options.tool_path, 'rps2tree_html.py') + ' -m ' + map_file_path + ' -o ' + options.output
     log.debug(html_cmd)
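The map-file rewrite is behaviourally equivalent to the old open()/close() pair; wrapping it in a with statement guarantees the handle is flushed and closed before rps2tree_html.py reads the file, even if one of the writes raises. A minimal illustration of that pattern with hypothetical paths and a trimmed header:

    rows = [("pfam00680", "RdRp_1/seq_aligned.final_tree.fa", "RdRp_1/tree.dnd.png")]
    with open("map.txt", "w") as map_file:
        # The handle is closed (and flushed to disk) when this block exits, even on error.
        map_file.write("\t".join(["#cdd_id", "align_files", "tree_files"]) + "\n")
        for row in rows:
            map_file.write("\t".join(row) + "\n")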