comparison mycrobiota.py @ 0:607c5e7e0a64 draft

planemo upload for repository https://github.com/ErasmusMC-Bioinformatics/galaxytools-emc/tree/master/tools/mycrobiota commit 1c4c58018b64ff3531a719e789ce71cb0a1244c5
author erasmus-medical-center
date Wed, 13 Dec 2017 10:09:50 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:607c5e7e0a64
1 import argparse
2 import csv
3 import math
4 import os
5 from subprocess import call
6
7
def main():
    """Parse command-line options and dispatch to the requested
    MYcrobiota sub-command."""
    print("Welcome to the MYcrobiota suite")

    argp = argparse.ArgumentParser()
    # (flags, keyword-options) for every supported argument
    arg_specs = [
        (('-c', '--command'),
         dict(required=True, help="What action to perform")),
        (('-ct', '--count_table'),
         dict(action='append', help="mothur count table")),
        (('-cp', '--copies'), dict(help="copies of NC for samples")),
        (('-nccp', '--nc_copies'), dict(help="copies of NC for itself")),
        (('-t', '--taxonomy'),
         dict(action='append', help="mothur taxonomy file")),
        (('-s', '--shared_file'),
         dict(action='append', help="mothur shared file")),
        (('-otu', '--otutable'),
         dict(action='append', help="mothur OTU table")),
        (('-f', '--fasta'), dict(action='append', help="fasta")),
        (('-sl', '--summary_log'),
         dict(action='append', help="mothur summary log file")),
        (('-o', '--outfile'), dict(help="output file")),
        (('-od', '--outdir'), dict(help="output directory", default="")),
        (('-lv', '--level'), dict(help="taxonomy level")),
        (('-nc', '--negative_control'),
         dict(help="sample name of the negative control")),
        (('-ncs', '--negative_control_species'),
         dict(help="species name of the negative control",
              default="Oscillatoria")),
        (('-r', '--replicate_suffix'),
         dict(help="suffix to identify replicates")),
        (('-l', '--label'),
         dict(action='append', help="label for count table")),
        (('--with-otu',),
         dict(dest='with_otu', action='store_true', default=False)),
    ]
    for flags, options in arg_specs:
        argp.add_argument(*flags, **options)
    args = argp.parse_args()

    # make sure the output directory exists; "already exists" is fine
    try:
        os.mkdir(args.outdir)
    except OSError:
        pass

    print("Running command: " + args.command)

    cmd = args.command
    if cmd == 'counttable_totals':
        count_table_totals(args.count_table[0], args.outdir, args.outfile)
    elif cmd == 'qc_report':
        # a QC report can be built from count tables or from summary logs
        if args.count_table:
            qc_report(args.count_table, args.label, 'counttables', args.outdir)
        elif args.summary_log:
            qc_report(args.summary_log, args.label, 'summarylogs', args.outdir)
    elif cmd == 'create_krona_plot':
        create_krona_plot(args.taxonomy, args.outdir, args.with_otu)
    elif cmd == 'create_krona_plot_multisample':
        create_krona_plot_multisample(args.taxonomy, args.shared_file,
                                      args.level, args.outdir, args.with_otu)
    elif cmd == 'correct_replicates':
        correct_replicates(args.shared_file, args.taxonomy, args.outdir,
                           args.replicate_suffix, args.copies,
                           args.negative_control, args.nc_copies,
                           args.negative_control_species)
    elif cmd == 'make_multi_otutable':
        make_multi_otutable(args.taxonomy, args.shared_file, args.level,
                            args.outdir)
    elif cmd == 'otutable_add_blast_links':
        otutable_add_blast_links(args.otutable, args.fasta)
    elif cmd == 'split_multi_otutable':
        split_multi_otutable(args.otutable)
    else:
        print("unknown command. exiting")
83
84
def make_url(seq, baseurl):
    """Build an NCBI megablast submission URL for *seq* (lower-cased)."""
    query_params = ("?DATABASE=nr&PERC_IDENT=97&EXCLUDE_SEQ_UNCULT=on&"
                    "HITLIST_SIZE=10&FILTER=L&FILTER=m&FILTER=R&EXPECT=10&"
                    "FORMAT_TYPE=HTML&PROGRAM=blastn&CLIENT=web&"
                    "SERVICE=megablast&PAGE=Nucleotides&CMD=Put&QUERY=")
    return "{0}{1}{2}".format(baseurl, query_params, seq.lower())
91
92
def make_RIDlink(RID, baseurl):
    """Return an HTML anchor that opens the BLAST result page for *RID*."""
    href = '{0}?CMD=Get&RID={1}'.format(baseurl, RID)
    return '<a target="_blank" href="' + href + '">view results</a>'
96
97
def make_rerun_link(seq, baseurl):
    """Return an HTML anchor that resubmits *seq* (lower-cased) to BLAST."""
    query_params = ("?DATABASE=nr&EXCLUDE_SEQ_UNCULT=yes&FILTER=L&"
                    "FORMAT_TYPE=HTML&PROGRAM=blastn&CLIENT=web&"
                    "SERVICE=megablast&PAGE=Nucleotides&CMD=Web&QUERY=")
    href = baseurl + query_params + seq.lower()
    return '<a target="_blank" href="' + href + '">send to BLAST</a>'
103
104
def otutable_add_blast_links(otutable, otureps):
    """Annotate an OTU table with per-OTU BLAST resubmission links.

    Reads the representative-sequence fasta (otureps[0]) and the OTU
    table (otutable[0]); writes 'otutable_with_blast.tsv' (original rows
    plus a BLAST column) and 'filtered_otureps.fasta' (only reps whose
    OTU appears in the table), both in the current working directory.

    :param otutable: list whose first element is the OTU table path
    :param otureps: list whose first element is the otureps fasta path
    """
    baseurl = "http://www.ncbi.nlm.nih.gov/blast/Blast.cgi"

    # for each otu create blast search of corresponding representative sequence
    reps = [line for line in open(otureps[0], "r")]

    # sequence lines (alignment gaps stripped) and header lines, in file
    # order; assumes strictly alternating >header/sequence records — TODO
    # confirm the fasta is never wrapped across multiple sequence lines
    seqs = [r.rstrip('\n').replace('-', '') for r in reps if '>' not in r]
    seq_names = [r for r in reps if '>' in r]
    otulines = [line for line in open(otutable[0], "r")]

    # Add RID link and rerun link to table
    with open("otutable_with_blast.tsv", "w+") as outfile, \
            open("filtered_otureps.fasta", "w+") as repsout:
        outfile.write(otulines[0].rstrip() + "\tBLAST\n")

        for otuline in otulines[1:]:
            otu = otuline.split('\t')[0]
            # NOTE(review): substring match — 'Otu1' would also match
            # 'Otu10'; presumably OTU ids are zero-padded so this cannot
            # happen — verify against mothur's naming.
            for i, seq in enumerate(seq_names):
                if otu in seq:
                    outfile.write(otuline.rstrip() + "\t" +
                                  make_rerun_link(seqs[i], baseurl) + "\n")
            # output otureps for these otus
            for i, seq in enumerate(reps):
                if otu in seq:
                    repsout.write(reps[i])
                    # the header line is followed by its sequence line
                    repsout.write(reps[i+1])
131
132
def summarylog_total(infile):
    """Extract the total sequence count from a mothur summary log.

    Scans *infile* for the first line starting with '# of Seqs:' or
    'total # of seqs:' and returns its last tab-separated field as an
    int; returns None when no such line exists.
    """
    markers = ('# of Seqs:', 'total # of seqs:')
    with open(infile) as handle:
        for row in handle:
            if row.startswith(markers):
                return int(row.split('\t')[-1])
    return None
141
142
def count_table_totals(infile, outdir='', outfile=''):
    """
    Given a Mothur counttable, calculate the total number of sequences for each
    sample. This can be appended as additional row in the count table by
    providing a file name.

    :param infile: Mothur count table.
    :param outfile: Optional. Write the count table with an additional row with
                    totals to this file
    :param outdir: Optional. Write output do this directory
    :return: A list with totals for all columns (samples) in the count table
    """
    # assume a single input file for now
    out_rows = []
    with open(infile) as f:
        count_table = csv.reader(f, delimiter='\t')

        header = next(count_table)
        totals = [0] * (len(header) - 1)

        out_rows.append(header)
        for row in count_table:
            # skip any pre-existing totals row so we never double-count
            if row[0] != 'total':
                out_rows.append(row)
                for i in range(1, len(row)):
                    totals[i-1] += int(row[i])

        # BUG FIX: was `["total"] + map(str, totals)` — on Python 3 map()
        # returns an iterator and list+iterator raises TypeError; the
        # comprehension behaves identically on Python 2 and 3.
        out_rows.append(["total"] + [str(t) for t in totals])

    # write count table with totals to file if requested
    if outfile:
        write_output(outdir, outfile, out_rows)

    return totals
177
178
def qc_report(infiles, label, inputtype, outdir=''):
    """
    Construct QC table from multiple count files or summary logs
    - Report the number of sequences lost at each consecutive QC step
    - Create a multi-sample report and a separate report for each sample

    :param infiles: set of count tables (or summary logs)
    :param label: labels for each step
    :param inputtype: 'summarylogs' or 'counttables'
    :param outdir: directory to place output files. Default: cwd
    :return:
    """
    assert len(infiles) == len(label), \
        "number of files and labels unequal, stopping"

    print("qcreport")
    previous_totals = []
    outlines = []
    lasttotal = None

    for (infile, lab) in zip(infiles, label):
        # BUG FIX: was open(infile, 'rb') — csv.reader requires a
        # text-mode handle on Python 3 ('r' is equivalent on Python 2/Unix).
        with open(infile, 'r') as f:
            count_file = csv.reader(f, delimiter='\t')

            if inputtype == 'summarylogs':
                print("summarylogs")
                if not outlines:
                    outlines = [['Step', 'Total', 'Removed', 'Percentage']]

                # get total count
                total = summarylog_total(infile)

                # difference with the previous step (first step has none)
                if not lasttotal:
                    outlines.append([lab, total, None, None])
                else:
                    diff = total - lasttotal
                    perc = float(diff)/float(lasttotal)*100.0
                    outlines.append([lab, total, diff,
                                     str("%.1f" % perc)+"%"])

                lasttotal = total

            else:
                # add header line to output (sample names from first file)
                if not outlines:
                    outlines = [['step'] + next(count_file)[1:]]

                # calculate totals of each column in the count file
                totals = count_table_totals(infile)

                # calculate difference with previous count file
                if not previous_totals:
                    diffs = [""] * len(totals)
                else:
                    diffs = [" (" + str(t1-t2)+"; "
                             + str("%.1f" % (float(t1-t2)/float(t2)*100.0)) +
                             "%)" for t1, t2 in zip(totals, previous_totals)]

                outlines.append([lab] +
                                [str(a)+b for a, b in zip(totals, diffs)])
                previous_totals = totals

    # write multi-sample output file
    write_output(outdir, 'all_qctable.tsv', outlines)

    # write per-sample output files (columns 2.. are individual samples)
    for j in range(2, len(outlines[0])):
        sample = outlines[0][j]
        sample_outlines = [[outlines_line[0], outlines_line[j]]
                           for outlines_line in outlines]
        write_output(outdir, 'persample_qctable_'+sample+'.tsv',
                     sample_outlines)
250
251
def column(matrix, i):
    """Return the i-th column of *matrix* (a list of rows) as a list."""
    values = []
    for single_row in matrix:
        values.append(single_row[i])
    return values
254
255
def mean(data):
    """Arithmetic mean of *data*, always as a float."""
    total = float(len(data))
    return sum(data) / total
258
259
def stdev(data):
    """Sample standard deviation of *data* (n-1 denominator)."""
    size = len(data)
    center = sum(data) / float(size)
    squared_devs = [(value - center) ** 2 for value in data]
    return math.sqrt(sum(squared_devs) / (size - 1))
265
266
def write_output(outdir, filename, outlines):
    """Write *outlines* as a tab-separated file under *outdir*.

    :param outdir: target directory (must already exist; '' means cwd)
    :param filename: name of the file to create or overwrite
    :param outlines: iterable of rows, each an iterable of cell values
    """
    # BUG FIX: was open(..., 'wb') — csv.writer requires a text-mode
    # handle on Python 3 ('w' is equivalent on Python 2/Unix).
    with open(os.path.join(outdir, filename), 'w') as of:
        out_table = csv.writer(of, delimiter='\t', lineterminator='\n')
        for row in outlines:
            out_table.writerow(row)
272
273
def correct_replicates(shared, taxonomy, outdir, replicate_suffix,
                       sample_copies, negative_control='', nc_copies=-1,
                       negative_control_species='Oscillatoria'):
    """Correct OTU counts using a spiked-in control species, average across
    replicates, and subtract negative-control background.

    Writes 'taxonomy_corrected.tsv', 'shared_corrected.tsv' and
    'shared_averaged.tsv' into *outdir*.

    :param shared: list whose first element is the mothur shared file path
    :param taxonomy: list whose first element is the taxonomy file path
    :param outdir: output directory
    :param replicate_suffix: suffix separating sample name from replicate id
    :param sample_copies: known spike-in copy number for regular samples
    :param negative_control: sample-name prefix of the negative control
    :param nc_copies: known spike-in copy number for the negative control
    :param negative_control_species: genus used as the spike-in marker
    :return: 1 when the control species is missing from the taxonomy,
             otherwise None
    """
    # NOTE(review): 'rb' + csv.reader is Python-2 style; on Python 3 csv
    # requires text-mode handles — confirm the intended interpreter.
    with open(shared[0], 'rb') as f, open(taxonomy[0], 'rb') as f2:
        shared_file = csv.reader(f, delimiter='\t')
        taxonomy_file = csv.reader(f2, delimiter='\t')

        # determine which OTU number is the control, Oscillatoria by default
        # (Bacteria;Cyanobacteria;Cyanobacteria;..;Oscillatoria)
        try:
            line = next(taxonomy_file)
            while negative_control_species not in line[2]:
                line = next(taxonomy_file)
            otu = line[0]
        except StopIteration:
            print("negative control species not found in taxonomy, Exiting")
            return 1

        ''' Calculate Copies '''
        # per replicate of sample and NC, determine correction factor,
        # (number Oscillatoria seqs/known copies of it)
        # correct all sequence counts with that
        myshared = [row for row in shared_file if row]
        # newshared:  per-replicate copy counts (floats)
        # newshared2: copy counts averaged across replicates
        # newshared3: background-corrected copy counts (final shared file)
        # newshared4: raw sequence counts averaged across replicates
        newshared = [myshared[0]]
        newshared2 = [myshared[0]]
        newshared3 = [myshared[0]]
        newshared4 = [myshared[0]]
        oscil_column = myshared[0].index(otu)

        for row in myshared[1:]:
            # the NC has its own known copy number; samples share one
            if row[1].startswith(negative_control):
                copies = nc_copies
            else:
                copies = sample_copies

            correction_factor = float(row[oscil_column]) / float(copies)

            # columns 0-2 are label/sample/numOtus; the rest are counts
            new_row = row[0:3]
            for count in row[3:]:
                try:
                    newval = float(count) / correction_factor
                except ZeroDivisionError:
                    # no control sequences observed: leave count unscaled
                    newval = float(count)
                new_row.append(newval)
            newshared.append(new_row)

        ''' Average copy counts across replicates '''
        levels = set([row[0] for row in newshared[1:]])
        samples = set([row[1].split(replicate_suffix)[0]
                       for row in newshared[1:]])

        for level in levels:
            for sample in samples:
                neg = True if sample.startswith(negative_control) else False
                # NOTE(review): startswith-based grouping — a sample name
                # that is a prefix of another would merge their replicates
                replicates = [row for row in newshared if row[0] == level
                              and row[1].startswith(sample)]
                num_otus = int(replicates[0][2])+3
                total = replicates[0][2]
                avg = [level, sample, total]

                # an OTU absent from any replicate is zeroed, except for
                # the negative control, which keeps its mean
                for i in range(3, num_otus):
                    counts = column(replicates, i)
                    avg.append(mean(counts)) if 0 not in counts or neg \
                        else avg.append(0)

                newshared2.append(avg)

        ''' Average *sequence* counts across replicates '''
        levels = set([row[0] for row in myshared[1:]])
        samples = set([row[1].split(replicate_suffix)[0]
                       for row in myshared[1:]])

        for level in levels:
            for sample in samples:
                replicates = [row for row in myshared if row[0] == level
                              and row[1].startswith(sample)]
                num_otus = int(replicates[0][2]) + 3
                total = replicates[0][2]
                avg = [level, sample, total]

                for i in range(3, num_otus):
                    # NOTE(review): map() here feeds mean(), which calls
                    # len() — Python-2 only; on 3 map() has no len()
                    counts = map(int, column(replicates, i))
                    avg.append(int(round(mean(counts))))

                newshared4.append(avg)

        ''' Correct for background '''
        # for each otu, subtract 3 times the standard deviation of
        # the negative control sample
        for level in levels:
            NC = [row for row in newshared if row[0] == level
                  and row[1].startswith(negative_control)]
            # `samples` is rebound here to the non-NC averaged rows
            samples = [row for row in newshared2 if row[0] == level
                       and not row[1].startswith(negative_control)]
            num_otus = int(samples[0][2])+3

            for i in range(3, num_otus):
                m = mean(column(NC, i))
                sd = stdev(column(NC, i))
                corr = m + 3*sd

                # clamp at zero so background never produces negatives
                for s in samples:
                    s[i] = max(0, int(round(s[i] - corr)))

            newshared3 += samples

        # remove Negative control species otu from table
        for i, row in enumerate(newshared3):
            del row[oscil_column]
            if i > 0:
                # keep the numOtus column consistent after the deletion
                row[2] = int(row[2]) - 1

        # sort file or other mothur tools may segfault :/
        # NOTE(review): 'unique' maps to int 0 among string keys —
        # comparing str and int only works on Python 2
        newshared3 = [newshared3[0]] + sorted(newshared3[1:],
                                              key=lambda a_entry: a_entry[0]
                                              if a_entry[0] != 'unique' else 0)

        # rewind so the full taxonomy can be re-read for filtering
        f2.seek(0)
        taxonomy_out = [row for row in taxonomy_file if row and row[0] != otu]
        write_output(outdir, 'taxonomy_corrected.tsv', taxonomy_out)
        write_output(outdir, 'shared_corrected.tsv', newshared3)
        write_output(outdir, 'shared_averaged.tsv', newshared4)
396
397
def make_multi_otutable(taxonomy_file, shared_file, level, outdir):
    """
    Create an otu table from shared file and taxonomy file

    example output:

    OTU sample1 sample2 .. sampleX Kingdom Phylum Class Order Family Genus
    Otu001 13 8 .. 91 Bacteria Bacteroidetes Bacteroidia ..
    ...

    Writes 'multi_otutable.tsv' into *outdir*.

    NOTE(review): relies on Python-2 semantics of filter()/map()
    returning lists (list concatenation and .sort() below) — on Python 3
    these return iterators and this function would fail; confirm the
    intended interpreter.

    :param taxonomy_file: list whose first element is the taxonomy path
    :param shared_file: list whose first element is the shared-file path
    :param level: which shared-file level to use, e.g. unique/0.03/..

    :return:
    """

    outlines = []
    samples = []
    # create multisample taxonomy from counts in shared file
    with open(taxonomy_file[0], 'r') as tax, open(shared_file[0]) as sh:
        taxonomy = csv.reader(tax, delimiter='\t')
        shared = csv.reader(sh, delimiter='\t')
        shared_header = next(shared)
        # first collected row: OTU ids (shared columns 3..)
        outlines.append(shared_header[3:])

        # get all taxonomies; strip '(confidence)' suffixes and drop
        # empty trailing fields
        taxonomies = []
        for j, t in enumerate(taxonomy):
            if j > 0:
                taxonomies.append(filter(None, [x.split('(')[0]
                                                for x in t[2].split(';')]))

        # one row of counts per sample at the requested level
        for i, row in enumerate(shared):
            tax.seek(0)  # make sure to start at beginning of file each time
            if row[0] == level:
                samples.append(row[1])
                outlines.append(row[3:])

    # columns become rows: one entry per OTU (id + per-sample counts)
    transposed = map(list, zip(*outlines))
    header = ["OTU"] + samples + ["Kingdom", "Phylum", "Class", "Order",
                                  "Family", "Genus"]

    # append the 6 taxonomy levels and drop OTUs that are zero everywhere
    writelines = [header] + [a + b for a, b in zip(transposed, taxonomies)
                             if a[1:] != ['0'] * len(a[1:])]

    # sort sample columns by name
    lst = map(list, zip(*[w[1:-6] for w in writelines]))
    lst.sort(key=lambda x: x[0])
    lst = map(list, zip(*lst))
    writelines2 = [[writelines[i][0]] + lst[i] + writelines[i][-6:]
                   for i in range(0, len(writelines))]

    # output corrected shared file
    write_output(outdir, "multi_otutable.tsv", writelines2)
453
454
def split_multi_otutable(otutable, with_avg=True):
    """Split a multi-sample OTU table into one table per sample.

    For every sample (column label up to the first '_'), keeps the OTU id
    column, that sample's replicate columns and the 6 trailing taxonomy
    columns, inserts the replicate mean before the taxonomy block, and
    writes '<sample>.otutable' to the current directory. Rows that are
    all-zero for the sample are dropped.

    :param otutable: list whose first element is the table path
    :param with_avg: unused; kept for interface compatibility
    :return:
    """
    fulltable = [line.strip().split('\t')
                 for line in open(otutable[0], 'r') if line]
    # sample name = column label up to the first '_' (replicate suffix)
    samples = [s.split('_')[0] for s in fulltable[0][1:-6]]
    numcols = len(fulltable[0])
    # BUG FIX: use floor division — on Python 3 '/' yields a float, which
    # breaks the ['0'] * numreplicates comparison below ('//' is identical
    # for ints on Python 2).
    numreplicates = (numcols - 7) // len(set(samples))

    for sample in set(samples):
        outlines = []
        # column 0 (OTU id) + this sample's replicate columns
        # + the 6 trailing taxonomy columns
        cols = [0] + [i+1 for i, s in enumerate(samples) if sample in s] \
            + [i for i in range(numcols-6, numcols)]
        for i, line in enumerate(fulltable):
            out = [line[j] for j in cols]
            if out[1:-6] != ['0'] * numreplicates:
                # header row gets the 'mean' label; data rows get the
                # rounded replicate mean. BUG FIX: build a list instead of
                # map() — mean() calls len(), which map objects lack on
                # Python 3 (identical result on Python 2).
                out.insert(-6, 'mean' if i == 0
                           else int(round(mean([int(v) for v in out[1:-6]]))))
                outlines.append(out)

        write_output('.', sample+'.otutable', outlines)
474
475
def create_krona_plot_multisample(taxonomy_file, shared_file, level, outdir,
                                  with_otu):
    """
    Create krona plots from a multisample taxonomy plot and a shared file.
    Create one multisample plot and a plot per individual sample

    :param taxonomy_file: list whose first element is the taxonomy path
    :param shared_file: list whose first element is the shared-file path
    :param level: which level to use, e.g. unique/0.03/..
    :param outdir: directory for the per-sample .tsv files
    :param with_otu: add OTU number as a level in the Krona plot?
    :return:
    """

    # paths of the per-sample taxonomy files generated below
    taxonomies = []

    # create taxonomy file per sample
    with open(taxonomy_file[0], 'r') as tax, open(shared_file[0]) as sh:
        taxonomy = csv.reader(tax, delimiter='\t')
        shared = csv.reader(sh, delimiter='\t')
        shared_header = next(shared)

        for i, row in enumerate(shared):
            tax.seek(0)  # make sure to start at beginning of file each time
            if row[0] == level:
                sample = row[1]

                outfile = os.path.join(outdir, sample+".tsv")
                taxonomies.append(outfile)
                with open(outfile, 'w+') as of:
                    out_table = csv.writer(of, delimiter='\t')
                    out_table.writerow(next(taxonomy))  # header line
                    # taxonomy rows are assumed to be in the same order
                    # as the shared-file OTU columns (offset by 3)
                    for j, t in enumerate(taxonomy):
                        assert t[0] == shared_header[j+3], \
                            "OTU mismatch between taxonomy and shared file"
                        # replace the size column with this sample's count
                        t[1] = row[j+3]
                        out_table.writerow(t + [shared_header[j+3]])

    # make krona plot
    create_krona_plot(taxonomies, outdir, with_otu)
515
516
def create_krona_plot(taxonomy_files, outdir, with_otu):
    """
    Create a krona plot from one or more mothur taxonomy files.

    :param taxonomy_files: mothur taxonomy files (output from classify.otu)
    :param outdir: directory to store krona-formatted outputs. Default=cwd
    :param with_otu: add OTU number as a level in the Krona plot? Default=True
    :return:
    """
    krona_input_files = []

    # convert each taxonomy file into Krona's tabular text input
    for tax in taxonomy_files:
        converted_rows = []
        with open(tax, 'r') as handle:
            reader = csv.reader(handle, delimiter='\t')
            next(reader)  # skip header line
            for record in reader:
                # count, taxonomy levels, optional trailing OTU id
                fields = [record[1]] + record[2].rstrip(";\n").split(';')
                fields.append(record[0] if with_otu else None)
                # drop empty levels and the OTU placeholder when unused
                converted_rows.append([value for value in fields if value])

        target = os.path.join(outdir, tax.split("/")[-1] + "krona")
        krona_input_files.append(target)

        with open(target, 'w+') as out_handle:
            writer = csv.writer(out_handle, delimiter='\t')
            for converted in converted_rows:
                writer.writerow(converted)

    # execute krona command
    call(["ktImportText"] + krona_input_files)
550
551
# run the command-line interface only when executed as a script
if __name__ == "__main__":
    main()