virannot_rps2tsv: comparison of otu.py @ 0:bbaa89f070f4 (draft)
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 3a3b40c15ae5e82334f016e88b1f3c5bbbb3b2cd
| author | iuc |
|---|---|
| date | Mon, 04 Mar 2024 19:56:16 +0000 |
| parents | |
| children | d1fd5579469d |
```python
#!/usr/bin/env python3


# Name: virAnnot_otu
# Author: Marie Lefebvre - INRAE
# Requirements: Ete3 toolkit and external apps
# Aims: Create viral OTUs based on RPS and Blast annotations


import argparse
import csv
import logging as log
import os
import random
import re
import shutil

import pandas as pd
import xlsxwriter
from Bio import SeqIO
from Bio.Align.Applications import ClustalOmegaCommandline
from ete3 import NodeStyle, SeqGroup, SeqMotifFace, Tree, TreeStyle


def main():
    """
    1 - retrieve info (sequence, query_id, taxo) from RPS file
    2 - align protein sequences of the same domain, calculate
    matrix of distances, generate trees
    3 - get statistics (read number) per otu
    4 - create HTML report
    """
    options = _set_options()
    _set_log_level(options.verbosity)
    hits_collection = _cut_sequence(options)
    _align_sequences(options, hits_collection)
    _get_stats(options, hits_collection)
    _create_html(options, hits_collection)
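

# Layout of the rps2ecsv rows consumed by _cut_sequence() below, inferred
# from the column indices used there (an assumption, not a documented spec):
#   row[0] query id          row[1] hit id ("no_hit" when none)
#   row[2] CDD id            row[5]/row[6] query start/end
#   row[7] frame             row[8] description
#   row[9] superkingdom (matched against "Viruses")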
def _cut_sequence(options):
    """
    Retrieve viral hits and sequences from RPS files
    """
    log.info("Cut sequences")
    i = 0  # keep track of iterations over rps files to use the corresponding fasta file
    collection = {}
    options.rps.sort()
    for rps_file in options.rps:
        log.debug("Reading rps file " + str(rps_file))
        with open(rps_file[0], 'r') as rps_current_file:
            rps_reader = csv.reader(rps_current_file, delimiter='\t')
            headers = 0
            for row in rps_reader:
                if headers == 0:
                    # skip header line
                    headers += 1
                    continue
                if row[1] == "no_hit":
                    continue
                query_id = row[0]
                cdd_id = row[2]
                startQ = int(row[5])
                endQ = int(row[6])
                frame = float(row[7])
                description = row[8]
                superkingdom = row[9]
                match = re.search("Viruses", superkingdom)
                # if contig is viral then retrieve sequence
                if match:
                    options.fasta.sort()
                    seq = _retrieve_fasta_seq(options.fasta[i][0], query_id)
                    if seq is None:
                        # id missing from the fasta file, skip the hit
                        continue
                    seq_length = len(seq)
                    if endQ < seq_length:
                        seq = seq[startQ - 1:endQ]
                    else:
                        seq = seq[startQ - 1:seq_length]
                    if frame < 0:
                        seq = seq.reverse_complement()
                    prot = seq.translate()
                    if len(prot) >= options.min_protein_length:
                        log.debug("Add " + query_id + " to collection")
                        if cdd_id not in collection:
                            collection[cdd_id] = {}
                        collection[cdd_id][query_id] = {}
                        collection[cdd_id][query_id]["nucleotide"] = seq
                        collection[cdd_id][query_id]["protein"] = prot
                        collection[cdd_id][query_id]["full_description"] = description
                        if options.blast is not None:
                            options.blast.sort()
                            with open(options.blast[i][0], 'r') as blast_current_file:
                                blast_reader = csv.reader(blast_current_file, delimiter='\t')
                                for b_query in blast_reader:
                                    if b_query[1] == query_id:
                                        collection[cdd_id][query_id]["nb"] = b_query[2]
                                        # taxonomy is expected in column 14 when present
                                        if len(b_query) > 14:
                                            collection[cdd_id][query_id]["taxonomy"] = b_query[14]
                                        else:
                                            collection[cdd_id][query_id]["taxonomy"] = "Unknown"
                            if "nb" not in collection[cdd_id][query_id]:
                                collection[cdd_id][query_id]["nb"] = 0
                            if "taxonomy" not in collection[cdd_id][query_id]:
                                collection[cdd_id][query_id]["taxonomy"] = "Unknown"
                        else:
                            log.info("No blast file")
                            collection[cdd_id][query_id]["taxonomy"] = "Unknown"
                            collection[cdd_id][query_id]["nb"] = 0

                        # keep pfamXXX and RdRp 1; guard against descriptions without a comma
                        collection[cdd_id]["short_description"] = "".join(description.split(",")[:2])
                        collection[cdd_id]["full_description"] = description
        i += 1
    return collection
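
# Shape of the collection returned by _cut_sequence() above
# (hypothetical values, for illustration only):
# {"pfam00680": {
#     "short_description": "pfam00680 RdRp_1",
#     "full_description": "pfam00680, RdRp_1, RNA-dependent RNA polymerase",
#     "S1_Contig42": {"nucleotide": Seq("ATG..."), "protein": Seq("M..."),
#                     "full_description": "...", "nb": "123",
#                     "taxonomy": "Viruses;..."}}}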


def _retrieve_fasta_seq(fasta_file, query_id):
    """
    From fasta file retrieve specific sequence with id
    """
    contigs_list = SeqIO.to_dict(SeqIO.parse(fasta_file, 'fasta'))
    try:
        return contigs_list[query_id].seq
    except KeyError:
        log.warning("KeyError for " + query_id + " file " + fasta_file)
        return None


def _create_tree(tree, fasta, out, color):
    """
    Create phylogenetic tree from multiple alignments
    """
    if not os.path.isfile(tree):
        log.info("Unknown file: " + tree + ". You may have less than 2 sequences to align.")
        return

    seqs = SeqGroup(fasta, format="fasta")
    t = Tree(tree)
    ts = TreeStyle()
    ts.show_branch_length = True
    colors = _parse_color_file(color)
    node_names = t.get_leaf_names()
    for name in node_names:
        seq = seqs.get_seq(name)
        seqFace = SeqMotifFace(seq, seq_format="()")
        node = t.get_leaves_by_name(name)
        for i in range(0, len(node)):
            # color the node background by sample
            if name in colors:
                ns = NodeStyle()
                ns['bgcolor'] = colors[name]
                node[i].set_style(ns)
            node[i].add_face(seqFace, 0, 'aligned')

    t.render(out, tree_style=ts)


def _parse_color_file(file):
    colors = {}
    with open(file) as fh:
        reader = csv.reader(fh, delimiter="\t")
        for row in reader:
            colors[row[0]] = row[1]

    return colors


def _align_sequences(options, hits_collection):
    """
    Align hit sequences with pfam reference
    """
    log.info("Align sequences")
    if not os.path.exists(options.output):
        os.mkdir(options.output)
    color_by_sample = {}
    for cdd_id in hits_collection:
        cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
        if not os.path.exists(cdd_output):
            os.mkdir(cdd_output)
        if os.path.exists(cdd_output + "/seq_to_align.fasta"):
            os.remove(cdd_output + "/seq_to_align.fasta")
        file_seq_to_align = cdd_output + "/seq_to_align.fasta"
        file_color_config = cdd_output + "/color_config.txt"
        f = open(file_seq_to_align, "a")
        f_c = open(file_color_config, "w+")
        log.info("Writing to " + file_seq_to_align)
        count = 0  # number of contigs per domain
        for query_id in hits_collection[cdd_id]:
            if query_id not in ["short_description", "full_description"]:
                sample = query_id.split("_")[0]  # get sample from SAMPLE_IdCONTIG
                sample_color = "#" + ''.join([random.choice('ABCDEF0123456789') for i in range(6)])
                # same color for each contig of the same sample
                if sample not in color_by_sample:
                    color_by_sample[sample] = sample_color
                f.write(">" + query_id + "\n")
                f.write(str(hits_collection[cdd_id][query_id]["protein"]) + "\n")
                f_c.write(query_id + '\t' + color_by_sample[sample] + '\n')
                count += 1
        f.close()
        f_c.close()
        file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa'
        tree_file = cdd_output + '/tree.dnd'
        file_cluster = cdd_output + '/otu_cluster.csv'
        # create alignment for domains with more than one contig
        if count > 1:
            log.info("Run clustal omega...")
            clustalo_cmd = ClustalOmegaCommandline("clustalo", infile=file_seq_to_align, outfile=file_seq_aligned,
                                                   guidetree_out=tree_file, seqtype="protein", force=True)
            log.debug(clustalo_cmd)
            stdout, stderr = clustalo_cmd()
            log.debug(stdout + stderr)

            # create tree plot with colors
            file_matrix = cdd_output + "/identity_matrix.csv"
            log.info("Create tree...")
            _create_tree(tree_file, file_seq_aligned, tree_file + '.png', file_color_config)
            _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id)
            log.info("Retrieve OTUs...")
            otu_cmd = os.path.join(options.tool_path, 'seek_otu.R') + ' ' + file_matrix + ' ' + file_cluster + ' ' + str(options.perc)
            log.debug(otu_cmd)
            os.system(otu_cmd)
        # only one contig: no alignment needed, a single OTU
        else:
            log.debug("Copy " + file_seq_to_align + " to " + file_seq_aligned)
            shutil.copyfile(file_seq_to_align, file_seq_aligned)

            f = open(file_cluster, "w+")
            f.write('OTU_1,1,' + list(hits_collection[cdd_id].keys())[0] + ',')
            f.close()
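
# seek_otu.R (not shown here) is expected to write otu_cluster.csv rows such
# as the illustrative line below; the trailing comma yields an empty last
# field, which _get_stats() strips:
#   OTU_1,2,S1_Contig42,S2_Contig7,
# i.e. row[0] = OTU name, row[1] = contig count, row[2:] = contig ids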


def _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id):
    """
    Calculate pairwise distance between aligned protein sequences
    from a cdd_id
    """
    log.info("Compute pairwise distance of " + cdd_id)
    matrix = {}
    for k1 in SeqIO.parse(file_seq_aligned, "fasta"):
        row = []
        for k2 in SeqIO.parse(file_seq_aligned, "fasta"):
            identic = 0
            compared = 0
            for base, base2 in zip(k1, k2):
                # ambiguous residue (X) in either sequence, next
                if base == 'X' or base2 == 'X':
                    continue
                # gap in one or both sequences, next
                if base == '-' or base2 == '-':
                    continue
                # identity
                if base == base2:
                    identic += 1
                compared += 1
            # require a minimum overlap of 20 compared positions
            if compared < 20:
                percentIdentity = 0
            else:
                percentIdentity = (identic / compared) * 100
            row.append(percentIdentity)
        matrix[k1.id] = row
    log.debug("Write " + file_matrix)
    f = open(file_matrix, "w+")
    for row in matrix:
        f.write(row + ',' + ', '.join(map(str, matrix[row])) + "\n")
    f.close()
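
# Toy illustration of the identity measure above: for the aligned pair
#   k1 = M A R - X N D
#   k2 = M A R S X - D
# the gap and 'X' columns are skipped, 4 columns are compared and all 4 are
# identical, so the raw identity is 100.0; however, with fewer than 20
# compared columns the reported value falls back to 0.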


def _get_stats(options, hits_collection):
    """
    Retrieve annotation and read counts
    for each OTU
    """
    file_xlsx = options.output + '/otu_stats.xlsx'  # create a workbook
    workbook = xlsxwriter.Workbook(file_xlsx)
    log.info("Writing stats to " + file_xlsx)
    for cdd_id in hits_collection:
        otu_collection = {}
        cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
        worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"])  # add a worksheet
        file_cluster = cdd_output + '/otu_cluster.csv'
        with open(file_cluster, 'r') as clust:
            otu_reader = csv.reader(clust, delimiter=',')
            samples_list = []
            for row in otu_reader:
                contigs_list = row[2:len(row) - 1]  # remove last empty column
                otu_collection[row[0]] = {}  # key -> otu number
                otu_collection[row[0]]['contigs_list'] = contigs_list
                for contig in contigs_list:
                    sample = contig.split('_')[0]
                    if sample not in samples_list:
                        samples_list.append(sample)
                    if sample not in otu_collection[row[0]]:
                        otu_collection[row[0]][sample] = {}
                    # read number and annotation of the contig
                    otu_collection[row[0]][sample][contig] = {
                        'nb': hits_collection[cdd_id][contig].get('nb', 0),
                        'taxonomy': hits_collection[cdd_id][contig].get('taxonomy', 'unknown'),
                    }
                    otu_collection[row[0]]['global_taxonomy'] = hits_collection[cdd_id][contig].get('taxonomy', 'unknown')

        # calculate total number of reads for each sample of each OTU
        for otu in otu_collection:
            for sample in otu_collection[otu]:
                if sample not in ['contigs_list', 'global_taxonomy']:
                    total_nb_read = 0
                    for contig in otu_collection[otu][sample]:
                        total_nb_read += int(otu_collection[otu][sample][contig]['nb'])
                    otu_collection[otu][sample]['total_nb_read'] = total_nb_read
        row = 0
        column = 0
        worksheet.write(row, column, '#OTU_name')
        for samp in samples_list:
            column += 1
            worksheet.write(row, column, samp)
        worksheet.write(row, column + 1, 'taxonomy')
        worksheet.write(row, column + 2, 'contigs_list')
        row = 1
        for otu in otu_collection:
            if isinstance(otu_collection[otu], dict):
                column = 0
                worksheet.write(row, column, otu)
                # prepare the row with 0 in each sample cell
                column = 1
                for samp in samples_list:
                    worksheet.write(row, column, 0)
                    column += 1
                # fill in the row with the read count of each sample for this OTU
                for sample in otu_collection[otu]:
                    column = 1
                    for samp in samples_list:
                        if samp == sample:
                            worksheet.write(row, column, otu_collection[otu][sample]['total_nb_read'])
                        column += 1
                worksheet.write(row, len(samples_list) + 1, otu_collection[otu]['global_taxonomy'].replace(';', ' '))
                worksheet.write(row, len(samples_list) + 2, ",".join(otu_collection[otu]['contigs_list']))
                row += 1
    workbook.close()
    # dump each worksheet as a per-domain TSV used by the HTML report
    read_file = pd.ExcelFile(file_xlsx)
    for sheet in read_file.sheet_names:
        cluster_nb_reads_file = options.output + "/" + sheet.replace(" ", "_") + "/cluster_nb_reads_files.tab"
        data_xls = pd.read_excel(file_xlsx, sheet, dtype=str, index_col=None)
        data_xls.to_csv(cluster_nb_reads_file, encoding='utf-8', index=False, sep='\t')
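
# Illustrative layout of one worksheet produced by _get_stats() above
# (one sheet per domain, one row per OTU, one column per sample):
#   #OTU_name  S1   S2   taxonomy              contigs_list
#   OTU_1      123  0    Viruses Riboviria ... S1_Contig42,S1_Contig51
#   OTU_2      0    57   Viruses ...           S2_Contig7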


def _create_html(options, hits_collection):
    """
    Create HTML file with all results
    """
    # create mapping file with all information needed to build the HTML report
    map_file_path = options.output + "/map.txt"
    if os.path.exists(map_file_path):
        os.remove(map_file_path)

    map_file = open(map_file_path, "w+")
    headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description']
    map_file.write("\t".join(headers) + "\n")
    for cdd_id in hits_collection:
        cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_")
        short_description = cdd_output
        file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa'
        tree_file = cdd_output + '/tree.dnd.png'
        file_cluster = cdd_output + '/otu_cluster.csv'
        file_matrix = cdd_output + "/identity_matrix.csv"
        cluster_nb_reads_files = cdd_output + "/cluster_nb_reads_files.tab"
        map_file.write(cdd_id + "\t" + file_seq_aligned + "\t" + tree_file + "\t")
        map_file.write(file_cluster + "\t" + cluster_nb_reads_files + "\t" + file_matrix + "\t")
        map_file.write(short_description + "\t" + hits_collection[cdd_id]["full_description"] + "\n")
    map_file.close()
    log.info("Writing HTML report")
    html_cmd = os.path.join(options.tool_path, 'rps2tree_html.py') + ' -m ' + map_file_path + ' -o ' + options.output
    log.debug(html_cmd)
    os.system(html_cmd)


def _set_options():
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--blast', help='TAB blast file from blast2ecsv module.', action='append', required=False, dest='blast', nargs='+')
    parser.add_argument('-r', '--rps', help='TAB rpsblast file from rps2ecsv module.', action='append', required=True, dest='rps', nargs='+')
    parser.add_argument('-f', '--fasta', help='FASTA file with contigs', action='append', required=True, dest='fasta', nargs='+')
    parser.add_argument('-p', '--percentage', help='Percentage similarity threshold for OTUs cutoff.', action='store', type=int, default=90, dest='perc')
    parser.add_argument('-vp', '--viral_portion', help='Minimum portion of viral sequences in RPS domain to be included.', action='store', type=float, default=0.3, dest='viral_portion')
    parser.add_argument('-mpl', '--min_protein_length', help='Minimum query protein length.', action='store', type=int, default=100, dest='min_protein_length')
    parser.add_argument('-tp', '--tool_path', help='Path to seek_otu.R', action='store', type=str, default='./', dest='tool_path')
    parser.add_argument('-o', '--out', help='The output directory', action='store', type=str, default='./Rps2tree_OTU', dest='output')
    parser.add_argument('-rgb', '--rgb-conf', help='Color palette for contigs coloration', action='store', type=str, default='rgb.txt', dest='file_rgb')
    parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1)
    return parser.parse_args()


def _set_log_level(verbosity):
    # all four accepted values are handled: 1 and 2 -> INFO, 3 and 4 -> DEBUG
    if verbosity >= 3:
        log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s'
        log.basicConfig(level=log.DEBUG, format=log_format)
    else:
        log_format = '%(asctime)s %(levelname)-8s %(message)s'
        log.basicConfig(level=log.INFO, format=log_format)


if __name__ == "__main__":
    main()
```
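
A minimal invocation sketch based on the options defined in `_set_options()`. All file names and the tool path are hypothetical; `seek_otu.R` and `rps2tree_html.py` must be reachable under `--tool_path`, and contig ids are expected to follow the `SAMPLE_Contig` naming the script splits on:

```
python3 otu.py -r rps2ecsv_S1.tab -f S1_contigs.fasta -b blast2ecsv_S1.tab \
    -p 90 -mpl 100 -tp /path/to/virAnnot/tools -o Rps2tree_OTU -v 3
```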
