Mercurial repository: erasmus-medical-center / mycrobiota
File: mycrobiota.py @ changeset 0:607c5e7e0a64 (draft)
planemo upload for repository
https://github.com/ErasmusMC-Bioinformatics/galaxytools-emc/tree/master/tools/mycrobiota
commit 1c4c58018b64ff3531a719e789ce71cb0a1244c5
Author: erasmus-medical-center
Date: Wed, 13 Dec 2017 10:09:50 -0500
Parents: (none) | Children: (none)
Comparison: -1:000000000000 -> 0:607c5e7e0a64 (file added in this changeset)
1 import argparse | |
2 import csv | |
3 import math | |
4 import os | |
5 from subprocess import call | |
6 | |
7 | |
def main():
    """Command-line entry point for the MYcrobiota suite.

    Parses the command line and dispatches to one of the worker functions
    based on ``--command``.  Most file options use ``action='append'`` so
    they can be given several times; several workers only use the first
    element of those lists.
    """
    print("Welcome to the MYcrobiota suite")
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--command', required=True,
                        help="What action to perform")
    parser.add_argument('-ct', '--count_table', action='append',
                        help="mothur count table")
    parser.add_argument('-cp', '--copies', help="copies of NC for samples")
    parser.add_argument('-nccp', '--nc_copies', help="copies of NC for itself")
    parser.add_argument('-t', '--taxonomy', action='append',
                        help="mothur taxonomy file")
    parser.add_argument('-s', '--shared_file', action='append',
                        help="mothur shared file")
    parser.add_argument('-otu', '--otutable', action='append',
                        help="mothur OTU table")
    parser.add_argument('-f', '--fasta', action='append', help="fasta")
    parser.add_argument('-sl', '--summary_log', action='append',
                        help="mothur summary log file")
    parser.add_argument('-o', '--outfile', help="output file")
    parser.add_argument('-od', '--outdir', help="output directory", default="")
    parser.add_argument('-lv', '--level', help="taxonomy level")
    parser.add_argument('-nc', '--negative_control',
                        help="sample name of the negative control")
    parser.add_argument('-ncs', '--negative_control_species',
                        help="species name of the negative control",
                        default="Oscillatoria")
    parser.add_argument('-r', '--replicate_suffix',
                        help="suffix to identify replicates")
    parser.add_argument('-l', '--label', action='append',
                        help="label for count table")
    parser.add_argument('--with-otu', dest='with_otu', action='store_true',
                        default=False)
    args = parser.parse_args()

    # Create the output directory; silently ignore failure (directory
    # already exists, or outdir == "" meaning the current directory).
    try:
        os.mkdir(args.outdir)
    except OSError:
        pass

    print("Running command: "+args.command)

    # Dispatch to the requested sub-command.
    if args.command == 'counttable_totals':
        count_table_totals(args.count_table[0], args.outdir, args.outfile)

    elif args.command == 'qc_report':
        # input type is inferred from which option was supplied
        if args.count_table:
            qc_report(args.count_table, args.label, 'counttables', args.outdir)
        elif args.summary_log:
            qc_report(args.summary_log, args.label, 'summarylogs', args.outdir)

    elif args.command == 'create_krona_plot':
        create_krona_plot(args.taxonomy, args.outdir, args.with_otu)

    elif args.command == 'create_krona_plot_multisample':
        create_krona_plot_multisample(args.taxonomy, args.shared_file,
                                      args.level, args.outdir, args.with_otu)

    elif args.command == 'correct_replicates':
        correct_replicates(args.shared_file, args.taxonomy, args.outdir,
                           args.replicate_suffix, args.copies,
                           args.negative_control, args.nc_copies,
                           args.negative_control_species)

    elif args.command == 'make_multi_otutable':
        make_multi_otutable(args.taxonomy, args.shared_file, args.level,
                            args.outdir)

    elif args.command == 'otutable_add_blast_links':
        otutable_add_blast_links(args.otutable, args.fasta)

    elif args.command == 'split_multi_otutable':
        split_multi_otutable(args.otutable)

    else:
        print("unknown command. exiting")
83 | |
84 | |
def make_url(seq, baseurl):
    """Build an NCBI megablast submission (CMD=Put) URL for *seq*.

    The sequence is lower-cased and appended as the QUERY parameter.
    """
    params = ("?DATABASE=nr&PERC_IDENT=97&EXCLUDE_SEQ_UNCULT=on&"
              "HITLIST_SIZE=10&FILTER=L&FILTER=m&FILTER=R&EXPECT=10&"
              "FORMAT_TYPE=HTML&PROGRAM=blastn&CLIENT=web&"
              "SERVICE=megablast&PAGE=Nucleotides&CMD=Put&QUERY=")
    return "".join([baseurl, params, seq.lower()])
91 | |
92 | |
def make_RIDlink(RID, baseurl):
    """Return an HTML anchor that retrieves stored BLAST results for *RID*."""
    href = baseurl + "?CMD=Get&RID=" + RID
    return '<a target="_blank" href="' + href + '">view results</a>'
96 | |
97 | |
def make_rerun_link(seq, baseurl):
    """Return an HTML anchor that re-submits *seq* to NCBI megablast."""
    params = ("?DATABASE=nr&EXCLUDE_SEQ_UNCULT=yes&FILTER=L&FORMAT_TYPE=HTML"
              "&PROGRAM=blastn&CLIENT=web&SERVICE=megablast&PAGE=Nucleotides&"
              "CMD=Web&QUERY=")
    return ('<a target="_blank" href="' + baseurl + params + seq.lower()
            + '">send to BLAST</a>')
103 | |
104 | |
def otutable_add_blast_links(otutable, otureps):
    """Annotate an OTU table with links re-submitting each OTU's
    representative sequence to NCBI BLAST.

    :param otutable: list of paths; only the first is used (tab-separated
                     table whose first column holds the OTU name)
    :param otureps: list of paths; only the first is used (fasta file with
                    one representative sequence per OTU)

    Writes ``otutable_with_blast.tsv`` (the table plus a BLAST column) and
    ``filtered_otureps.fasta`` (representatives of the listed OTUs) to the
    current working directory.
    """
    baseurl = "http://www.ncbi.nlm.nih.gov/blast/Blast.cgi"

    # for each otu create blast search of corresponding representative
    # sequence; use context managers so the input handles are closed
    # (the original list-comprehension opens leaked them)
    with open(otureps[0], "r") as repsfile:
        reps = repsfile.readlines()

    # sequence lines with alignment gaps removed; header lines keep the name
    seqs = [r.rstrip('\n').replace('-', '') for r in reps if '>' not in r]
    seq_names = [r for r in reps if '>' in r]
    with open(otutable[0], "r") as tablefile:
        otulines = tablefile.readlines()

    # Add RID link and rerun link to table
    with open("otutable_with_blast.tsv", "w+") as outfile, \
            open("filtered_otureps.fasta", "w+") as repsout:
        outfile.write(otulines[0].rstrip() + "\tBLAST\n")

        for otuline in otulines[1:]:
            otu = otuline.split('\t')[0]
            # NOTE(review): substring match — 'Otu1' would also match
            # 'Otu10'; assumes mothur's fixed-width OTU labels. TODO confirm.
            for i, seq in enumerate(seq_names):
                if otu in seq:
                    outfile.write(otuline.rstrip() + "\t" +
                                  make_rerun_link(seqs[i], baseurl) + "\n")
            # output otureps for these otus: the header line plus the
            # sequence line that follows it in the fasta
            for i, seq in enumerate(reps):
                if otu in seq:
                    repsout.write(reps[i])
                    repsout.write(reps[i+1])
131 | |
132 | |
def summarylog_total(infile):
    """Extract the total sequence count from a mothur summary log.

    Returns the integer from the first line starting with '# of Seqs:'
    or 'total # of seqs:', or None when no such line exists.
    """
    markers = ('# of Seqs:', 'total # of seqs:')
    with open(infile) as handle:
        for line in handle:
            if line.startswith(markers):
                return int(line.split('\t')[-1])
    return None
141 | |
142 | |
def count_table_totals(infile, outdir='', outfile=''):
    """
    Given a Mothur counttable, calculate the total number of sequences for
    each sample. This can be appended as additional row in the count table
    by providing a file name.

    :param infile: Mothur count table.
    :param outfile: Optional. Write the count table with an additional row
                    with totals to this file
    :param outdir: Optional. Write output to this directory
    :return: A list with totals for all columns (samples) in the count table
    """
    # assume a single input file for now
    out_rows = []
    with open(infile) as f:
        count_table = csv.reader(f, delimiter='\t')

        header = next(count_table)
        totals = [0] * (len(header) - 1)

        out_rows.append(header)
        for row in count_table:
            # skip any pre-existing totals row so it is not double-counted
            if row[0] != 'total':
                out_rows.append(row)
                for i in range(1, len(row)):
                    totals[i-1] += int(row[i])

        # build the totals row explicitly as a list: the original
        # ``["total"] + map(str, totals)`` raises TypeError on Python 3,
        # where map() returns an iterator
        out_rows.append(["total"] + [str(t) for t in totals])

    # write count table with totals to file if requested
    if outfile:
        write_output(outdir, outfile, out_rows)

    return totals
177 | |
178 | |
def qc_report(infiles, label, inputtype, outdir=''):
    """
    Construct QC table from multiple count files
    - Report the number of sequences lost at each consecutive QC step
    - Create a multi-sample report and a separate report for each sample

    :param infiles: set of count tables or mothur summary logs
    :param label: labels for each step (one per input file)
    :param inputtype: 'summarylogs' or 'counttables' (anything else is
                      treated as count tables)
    :param outdir: directory to place output files. Default: cwd
    :return:
    """
    assert len(infiles) == len(label), \
        "number of files and labels unequal, stopping"

    print("qcreport")
    previous_totals = []
    outlines = []
    lasttotal = None

    for (i, lab) in zip(infiles, label):
        if inputtype == 'summarylogs':
            print("summarylogs")
            if not outlines:
                outlines = [['Step', 'Total', 'Removed', 'Percentage']]

            # get total count
            total = summarylog_total(i)

            # calculate difference with last step
            # NOTE(review): a previous total of 0 is treated like "no
            # previous total"; confirm 0-sequence steps cannot occur
            if not lasttotal:
                outlines.append([lab, total, None, None])
            else:
                diff = total - lasttotal
                perc = float(diff)/float(lasttotal)*100.0
                outlines.append([lab, total, diff, str("%.1f" % perc)+"%"])

            lasttotal = total

        else:
            # open in text mode: the original 'rb' hands bytes to
            # csv.reader, which fails on Python 3
            with open(i, 'r') as f:
                count_file = csv.reader(f, delimiter='\t')

                # add header line to output
                if not outlines:
                    outlines = [['step'] + next(count_file)[1:]]

            # calculate totals of each column in the count file
            totals = count_table_totals(i)

            # calculate difference with previous count file
            if not previous_totals:
                diffs = [""] * len(totals)
            else:
                diffs = [" (" + str(t1-t2)+"; "
                         + str("%.1f" % (float(t1-t2)/float(t2)*100.0)) +
                         "%)" for t1, t2 in zip(totals, previous_totals)]

            outlines.append([lab] +
                            [str(a)+b for a, b in zip(totals, diffs)])
            previous_totals = totals

    # write multi-sample output file
    write_output(outdir, 'all_qctable.tsv', outlines)

    # write per-sample output files (one column per sample, skipping the
    # step-name column and the first data column)
    for j in range(2, len(outlines[0])):
        sample = outlines[0][j]
        sample_outlines = [[outlines_line[0], outlines_line[j]]
                           for outlines_line in outlines]
        write_output(outdir, 'persample_qctable_'+sample+'.tsv',
                     sample_outlines)
250 | |
251 | |
def column(matrix, i):
    """Return column *i* of *matrix* (a list of rows) as a list."""
    values = []
    for row in matrix:
        values.append(row[i])
    return values
254 | |
255 | |
def mean(data):
    """Arithmetic mean of *data* as a float.

    Raises ZeroDivisionError on empty input, like the original.
    """
    total = 0.0
    for value in data:
        total += value
    return total / len(data)
258 | |
259 | |
def stdev(data):
    """Sample standard deviation (n-1 denominator) of *data*.

    Raises ZeroDivisionError when *data* has fewer than two elements.
    """
    n = len(data)
    avg = sum(data) / float(n)
    return math.sqrt(sum((x - avg) ** 2 for x in data) / (n - 1))
265 | |
266 | |
def write_output(outdir, filename, outlines):
    """Write *outlines* as a tab-separated file.

    :param outdir: target directory ('' means the current directory)
    :param filename: name of the file to create or overwrite
    :param outlines: iterable of rows, each an iterable of cell values
    """
    # text mode with newline='' replaces the original 'wb': Python 3's
    # csv module writes str, and newline='' leaves line-ending control
    # to the writer's explicit lineterminator
    with open(os.path.join(outdir, filename), 'w', newline='') as of:
        out_table = csv.writer(of, delimiter='\t', lineterminator='\n')
        for row in outlines:
            out_table.writerow(row)
272 | |
273 | |
def correct_replicates(shared, taxonomy, outdir, replicate_suffix,
                       sample_copies, negative_control='', nc_copies=-1,
                       negative_control_species='Oscillatoria'):
    """Correct OTU counts using a spiked-in negative-control species.

    Normalises every replicate by the ratio of observed control-species
    sequences to its known copy number, averages replicates, subtracts a
    background estimate (mean + 3*stdev of the negative control), removes
    the control OTU and writes three files to *outdir*:
    ``taxonomy_corrected.tsv``, ``shared_corrected.tsv`` and
    ``shared_averaged.tsv``.

    :param shared: list of mothur shared-file paths; only shared[0] is used
    :param taxonomy: list of mothur taxonomy paths; only taxonomy[0] is used
    :param outdir: directory for the three output files
    :param replicate_suffix: substring separating sample name from its
                             replicate suffix (used with str.split)
    :param sample_copies: known control copy number in regular samples
    :param negative_control: sample-name prefix of the negative control
    :param nc_copies: known control copy number in the negative control
    :param negative_control_species: substring searched for in the third
                                     taxonomy column to find the control OTU
    :return: 1 if the control species is absent from the taxonomy,
             otherwise None
    """
    # NOTE(review): 'rb' + csv.reader is a Python 2 idiom; under Python 3
    # the csv module requires text-mode files.
    with open(shared[0], 'rb') as f, open(taxonomy[0], 'rb') as f2:
        shared_file = csv.reader(f, delimiter='\t')
        taxonomy_file = csv.reader(f2, delimiter='\t')

        # determine which OTU number is the control, Oscillatoria by default
        # (Bacteria;Cyanobacteria;Cyanobacteria;..;Oscillatoria)
        try:
            line = next(taxonomy_file)
            while negative_control_species not in line[2]:
                line = next(taxonomy_file)
            otu = line[0]
        except StopIteration:
            print("negative control species not found in taxonomy, Exiting")
            return 1

        ''' Calculate Copies '''
        # per replicate of sample and NC, determine correction factor,
        # (number Oscillatoria seqs/known copies of it)
        # correct all sequence counts with that
        myshared = [row for row in shared_file if row]
        # four parallel tables, all starting with the shared header row:
        # newshared  = copy-corrected counts per replicate
        # newshared2 = copy counts averaged across replicates
        # newshared3 = background-corrected counts (final shared output)
        # newshared4 = raw sequence counts averaged across replicates
        newshared = [myshared[0]]
        newshared2 = [myshared[0]]
        newshared3 = [myshared[0]]
        newshared4 = [myshared[0]]
        oscil_column = myshared[0].index(otu)

        for row in myshared[1:]:
            # the negative control has its own known copy number
            if row[1].startswith(negative_control):
                copies = nc_copies
            else:
                copies = sample_copies

            correction_factor = float(row[oscil_column]) / float(copies)

            # columns 0-2 are label, sample name and OTU count; the
            # remaining columns are per-OTU sequence counts
            new_row = row[0:3]
            for count in row[3:]:
                try:
                    newval = float(count) / correction_factor
                except ZeroDivisionError:
                    # no control sequences observed: leave count unscaled
                    newval = float(count)
                new_row.append(newval)
            newshared.append(new_row)

        ''' Average copy counts across replicates '''
        levels = set([row[0] for row in newshared[1:]])
        samples = set([row[1].split(replicate_suffix)[0]
                       for row in newshared[1:]])

        for level in levels:
            for sample in samples:
                neg = True if sample.startswith(negative_control) else False
                replicates = [row for row in newshared if row[0] == level
                              and row[1].startswith(sample)]
                num_otus = int(replicates[0][2])+3
                total = replicates[0][2]
                avg = [level, sample, total]

                for i in range(3, num_otus):
                    counts = column(replicates, i)
                    # an OTU absent from any replicate is zeroed out,
                    # except in the negative control itself
                    avg.append(mean(counts)) if 0 not in counts or neg \
                        else avg.append(0)

                newshared2.append(avg)

        ''' Average *sequence* counts across replicates '''
        levels = set([row[0] for row in myshared[1:]])
        samples = set([row[1].split(replicate_suffix)[0]
                       for row in myshared[1:]])

        for level in levels:
            for sample in samples:
                replicates = [row for row in myshared if row[0] == level
                              and row[1].startswith(sample)]
                num_otus = int(replicates[0][2]) + 3
                total = replicates[0][2]
                avg = [level, sample, total]

                for i in range(3, num_otus):
                    # NOTE(review): map() returns an iterator on Python 3,
                    # so len() inside mean() would fail there
                    counts = map(int, column(replicates, i))
                    avg.append(int(round(mean(counts))))

                newshared4.append(avg)

        ''' Correct for background '''
        # for each otu, subtract 3 times the standard deviation of
        # the negative control sample
        for level in levels:
            NC = [row for row in newshared if row[0] == level
                  and row[1].startswith(negative_control)]
            samples = [row for row in newshared2 if row[0] == level
                       and not row[1].startswith(negative_control)]
            num_otus = int(samples[0][2])+3

            for i in range(3, num_otus):
                m = mean(column(NC, i))
                sd = stdev(column(NC, i))
                corr = m + 3*sd

                # clamp to zero: counts cannot go negative
                for s in samples:
                    s[i] = max(0, int(round(s[i] - corr)))

            newshared3 += samples

        # remove Negative control species otu from table
        for i, row in enumerate(newshared3):
            del row[oscil_column]
            if i > 0:
                # column 2 holds the OTU count; one OTU was just removed
                row[2] = int(row[2]) - 1

        # sort file or other mothur tools may segfault :/
        # NOTE(review): the key mixes int 0 ('unique') with strings, which
        # Python 3 cannot compare — confirm intended ordering before porting
        newshared3 = [newshared3[0]] + sorted(newshared3[1:],
                                              key=lambda a_entry: a_entry[0]
                                              if a_entry[0] != 'unique' else 0)

        # re-read the taxonomy from the start, dropping the control OTU
        f2.seek(0)
        taxonomy_out = [row for row in taxonomy_file if row and row[0] != otu]
        write_output(outdir, 'taxonomy_corrected.tsv', taxonomy_out)
        write_output(outdir, 'shared_corrected.tsv', newshared3)
        write_output(outdir, 'shared_averaged.tsv', newshared4)
396 | |
397 | |
def make_multi_otutable(taxonomy_file, shared_file, level, outdir):
    """
    Create an otu table from shared file and taxonomy file

    example output:

    OTU sample1 sample2 .. sampleX Kingdom Phylum Class Order Family Genus
    Otu001 13 8 .. 91 Bacteria Bacteroidetes Bacteroidia ..
    ...

    :param taxonomy_file: list of paths; only taxonomy_file[0] is used
    :param shared_file: list of paths; only shared_file[0] is used
    :param level: shared-file label to select (e.g. 'unique', '0.03')

    :return:
    """

    outlines = []
    samples = []
    # create multisample taxonomy from counts in shared file
    with open(taxonomy_file[0], 'r') as tax, open(shared_file[0]) as sh:
        taxonomy = csv.reader(tax, delimiter='\t')
        shared = csv.reader(sh, delimiter='\t')
        shared_header = next(shared)
        outlines.append(shared_header[3:])

        # get all taxonomies (skip the header row; strip the '(confidence)'
        # suffix from each level and drop empty entries).  list(filter(...))
        # is required: on Python 3 a bare filter object cannot be
        # concatenated with a list further down.
        taxonomies = []
        for j, t in enumerate(taxonomy):
            if j > 0:
                taxonomies.append(list(filter(None, [x.split('(')[0]
                                                     for x in t[2].split(';')])))

        for i, row in enumerate(shared):
            tax.seek(0)  # make sure to start at beginning of file each time
            if row[0] == level:
                samples.append(row[1])
                outlines.append(row[3:])

        # transpose so each row is one OTU: [name, counts..., taxonomy...].
        # list(map(...)) is required: Python 3's map object has no .sort()
        # and cannot be indexed.
        transposed = list(map(list, zip(*outlines)))
        header = ["OTU"] + samples + ["Kingdom", "Phylum", "Class", "Order",
                                      "Family", "Genus"]

        # drop OTUs whose counts are all zero
        writelines = [header] + [a + b for a, b in zip(transposed, taxonomies)
                                 if a[1:] != ['0'] * len(a[1:])]

        # sort sample columns by name
        lst = list(map(list, zip(*[w[1:-6] for w in writelines])))
        lst.sort(key=lambda x: x[0])
        lst = list(map(list, zip(*lst)))
        writelines2 = [[writelines[i][0]] + lst[i] + writelines[i][-6:]
                       for i in range(0, len(writelines))]

        # output corrected shared file
        write_output(outdir, "multi_otutable.tsv", writelines2)
453 | |
454 | |
def split_multi_otutable(otutable, with_avg=True):
    """Split a multi-sample OTU table into one table per sample.

    Each output file (``<sample>.otutable`` in the cwd) keeps the OTU
    column, that sample's replicate columns, a 'mean' column and the six
    taxonomy columns; rows that are zero in all replicates are dropped.

    :param otutable: list of paths; only otutable[0] is used
    :param with_avg: currently unused; the mean column is always added
    """
    # use a context manager so the input handle is closed
    with open(otutable[0], 'r') as handle:
        fulltable = [line.strip().split('\t') for line in handle if line]
    # sample names: strip the replicate suffix after '_' from each
    # replicate column header (columns between OTU and the 6 taxonomy cols)
    samples = [s.split('_')[0] for s in fulltable[0][1:-6]]
    numcols = len(fulltable[0])
    # integer division is required: numreplicates is used as a
    # list-repeat count below, and '/' yields a float on Python 3
    numreplicates = (numcols - 7) // len(set(samples))

    for sample in set(samples):
        outlines = []
        # column indices for this sample: OTU, its replicates, taxonomy
        cols = [0] + [i+1 for i, s in enumerate(samples) if sample in s] \
            + [i for i in range(numcols-6, numcols)]
        for i, line in enumerate(fulltable):
            out = [line[j] for j in cols]
            if out[1:-6] != ['0'] * numreplicates:
                # insert the replicate mean ('mean' label on the header
                # row); list(map(...)) so len() works inside mean() on Py3
                out.insert(-6, 'mean' if i == 0
                           else int(round(mean(list(map(int, out[1:-6]))))))
                outlines.append(out)

        write_output('.', sample+'.otutable', outlines)
474 | |
475 | |
def create_krona_plot_multisample(taxonomy_file, shared_file, level, outdir,
                                  with_otu):
    """
    Create krona plots from a multisample taxonomy plot and a shared file.
    Create one multisample plot and a plot per individual sample

    :param taxonomy_file: list of mothur taxonomy paths; only [0] is used
    :param shared_file: list of mothur shared-file paths; only [0] is used
    :param level: which level to use, e.g. unique/0.03/..
    :param outdir: directory for the per-sample taxonomy files
    :param with_otu: add OTU number as a level in the Krona plot?
    :return:
    """

    taxonomies = []

    # create taxonomy file per sample: for every shared row at the chosen
    # level, re-read the taxonomy and substitute that sample's counts
    with open(taxonomy_file[0], 'r') as tax, open(shared_file[0]) as sh:
        taxonomy = csv.reader(tax, delimiter='\t')
        shared = csv.reader(sh, delimiter='\t')
        shared_header = next(shared)

        for i, row in enumerate(shared):
            tax.seek(0)  # make sure to start at beginning of file each time
            if row[0] == level:
                sample = row[1]

                outfile = os.path.join(outdir, sample+".tsv")
                taxonomies.append(outfile)
                with open(outfile, 'w+') as of:
                    out_table = csv.writer(of, delimiter='\t')
                    out_table.writerow(next(taxonomy))  # header line
                    for j, t in enumerate(taxonomy):
                        # shared columns 3+ correspond to taxonomy rows in
                        # order; bail out loudly if the files disagree
                        assert t[0] == shared_header[j+3], \
                            "OTU mismatch between taxonomy and shared file"
                        # replace the count column with this sample's count
                        t[1] = row[j+3]
                        out_table.writerow(t + [shared_header[j+3]])

    # make krona plot (one per generated taxonomy file plus combined)
    create_krona_plot(taxonomies, outdir, with_otu)
515 | |
516 | |
def create_krona_plot(taxonomy_files, outdir, with_otu):
    """
    Create a krona plot from one or more mothur taxonomy files

    :param taxonomy_files: mothur taxonomy file (output from classify.otu)
    :param outdir: directory to store krona-formatted outputs. Default=cwd
    :param with_otu: add OTU number as a level in the Krona plot? Default=True
    :return:
    """
    krona_input_files = []

    # convert each taxonomy file to ktImportText input
    for tax in taxonomy_files:
        with open(tax, 'r') as infile:
            reader = csv.reader(infile, delimiter='\t')
            next(reader)  # skip header line

            out_rows = []
            for record in reader:
                # count, then the taxonomy levels, optionally the OTU id;
                # empty cells are dropped
                cells = [record[1]] + record[2].rstrip(";\n").split(';')
                if with_otu:
                    cells.append(record[0])
                out_rows.append([cell for cell in cells if cell])

        outfile = os.path.join(outdir, tax.split("/")[-1]+"krona")
        krona_input_files.append(outfile)

        with open(outfile, 'w+') as krona_file:
            writer = csv.writer(krona_file, delimiter='\t')
            for out_row in out_rows:
                writer.writerow(out_row)

    # execute krona command
    call(["ktImportText"] + krona_input_files)
550 | |
551 | |
# Standard script entry point: run the CLI when executed directly.
if __name__ == "__main__":
    main()