comparison mycrobiota.py @ 0:607c5e7e0a64 draft

planemo upload for repository https://github.com/ErasmusMC-Bioinformatics/galaxytools-emc/tree/master/tools/mycrobiota commit 1c4c58018b64ff3531a719e789ce71cb0a1244c5
author erasmus-medical-center
date Wed, 13 Dec 2017 10:09:50 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:607c5e7e0a64
1 import argparse
2 import csv
3 import math
4 import os
5 from subprocess import call
6
7
def main():
    """Parse command-line options and dispatch to the requested
    MYcrobiota sub-command."""
    print("Welcome to the MYcrobiota suite")

    argp = argparse.ArgumentParser()
    # (flags, keyword-options) for every supported argument
    arg_specs = [
        (('-c', '--command'),
         dict(required=True, help="What action to perform")),
        (('-ct', '--count_table'),
         dict(action='append', help="mothur count table")),
        (('-cp', '--copies'), dict(help="copies of NC for samples")),
        (('-nccp', '--nc_copies'), dict(help="copies of NC for itself")),
        (('-t', '--taxonomy'),
         dict(action='append', help="mothur taxonomy file")),
        (('-s', '--shared_file'),
         dict(action='append', help="mothur shared file")),
        (('-otu', '--otutable'),
         dict(action='append', help="mothur OTU table")),
        (('-f', '--fasta'), dict(action='append', help="fasta")),
        (('-sl', '--summary_log'),
         dict(action='append', help="mothur summary log file")),
        (('-o', '--outfile'), dict(help="output file")),
        (('-od', '--outdir'), dict(help="output directory", default="")),
        (('-lv', '--level'), dict(help="taxonomy level")),
        (('-nc', '--negative_control'),
         dict(help="sample name of the negative control")),
        (('-ncs', '--negative_control_species'),
         dict(help="species name of the negative control",
              default="Oscillatoria")),
        (('-r', '--replicate_suffix'),
         dict(help="suffix to identify replicates")),
        (('-l', '--label'),
         dict(action='append', help="label for count table")),
        (('--with-otu',),
         dict(dest='with_otu', action='store_true', default=False)),
    ]
    for flags, options in arg_specs:
        argp.add_argument(*flags, **options)
    args = argp.parse_args()

    # make sure the output directory exists; "already exists" is fine
    try:
        os.mkdir(args.outdir)
    except OSError:
        pass

    print("Running command: " + args.command)

    cmd = args.command
    if cmd == 'counttable_totals':
        count_table_totals(args.count_table[0], args.outdir, args.outfile)
    elif cmd == 'qc_report':
        # a QC report can be built from count tables or from summary logs
        if args.count_table:
            qc_report(args.count_table, args.label, 'counttables', args.outdir)
        elif args.summary_log:
            qc_report(args.summary_log, args.label, 'summarylogs', args.outdir)
    elif cmd == 'create_krona_plot':
        create_krona_plot(args.taxonomy, args.outdir, args.with_otu)
    elif cmd == 'create_krona_plot_multisample':
        create_krona_plot_multisample(args.taxonomy, args.shared_file,
                                      args.level, args.outdir, args.with_otu)
    elif cmd == 'correct_replicates':
        correct_replicates(args.shared_file, args.taxonomy, args.outdir,
                           args.replicate_suffix, args.copies,
                           args.negative_control, args.nc_copies,
                           args.negative_control_species)
    elif cmd == 'make_multi_otutable':
        make_multi_otutable(args.taxonomy, args.shared_file, args.level,
                            args.outdir)
    elif cmd == 'otutable_add_blast_links':
        otutable_add_blast_links(args.otutable, args.fasta)
    elif cmd == 'split_multi_otutable':
        split_multi_otutable(args.otutable)
    else:
        print("unknown command. exiting")
83
84
def make_url(seq, baseurl):
    """Build an NCBI megablast submission URL for *seq* (lower-cased)."""
    query_params = ("?DATABASE=nr&PERC_IDENT=97&EXCLUDE_SEQ_UNCULT=on&"
                    "HITLIST_SIZE=10&FILTER=L&FILTER=m&FILTER=R&EXPECT=10&"
                    "FORMAT_TYPE=HTML&PROGRAM=blastn&CLIENT=web&"
                    "SERVICE=megablast&PAGE=Nucleotides&CMD=Put&QUERY=")
    return "{0}{1}{2}".format(baseurl, query_params, seq.lower())
91
92
def make_RIDlink(RID, baseurl):
    """Return an HTML anchor that opens the BLAST result page for *RID*."""
    href = '{0}?CMD=Get&RID={1}'.format(baseurl, RID)
    return '<a target="_blank" href="' + href + '">view results</a>'
96
97
def make_rerun_link(seq, baseurl):
    """Return an HTML anchor that resubmits *seq* (lower-cased) to BLAST."""
    query_params = ("?DATABASE=nr&EXCLUDE_SEQ_UNCULT=yes&FILTER=L&"
                    "FORMAT_TYPE=HTML&PROGRAM=blastn&CLIENT=web&"
                    "SERVICE=megablast&PAGE=Nucleotides&CMD=Web&QUERY=")
    href = baseurl + query_params + seq.lower()
    return '<a target="_blank" href="' + href + '">send to BLAST</a>'
103
104
def otutable_add_blast_links(otutable, otureps):
    """Annotate an OTU table with per-OTU BLAST resubmission links.

    Reads the representative-sequence fasta (otureps[0]) and the OTU
    table (otutable[0]); writes 'otutable_with_blast.tsv' (original rows
    plus a BLAST column) and 'filtered_otureps.fasta' (only reps whose
    OTU appears in the table), both in the current working directory.

    :param otutable: list whose first element is the OTU table path
    :param otureps: list whose first element is the otureps fasta path
    """
    baseurl = "http://www.ncbi.nlm.nih.gov/blast/Blast.cgi"

    # for each otu create blast search of corresponding representative sequence
    reps = [line for line in open(otureps[0], "r")]

    # sequence lines (alignment gaps stripped) and header lines, in file
    # order; assumes strictly alternating >header/sequence records — TODO
    # confirm the fasta is never wrapped across multiple sequence lines
    seqs = [r.rstrip('\n').replace('-', '') for r in reps if '>' not in r]
    seq_names = [r for r in reps if '>' in r]
    otulines = [line for line in open(otutable[0], "r")]

    # Add RID link and rerun link to table
    with open("otutable_with_blast.tsv", "w+") as outfile, \
            open("filtered_otureps.fasta", "w+") as repsout:
        outfile.write(otulines[0].rstrip() + "\tBLAST\n")

        for otuline in otulines[1:]:
            otu = otuline.split('\t')[0]
            # NOTE(review): substring match — 'Otu1' would also match
            # 'Otu10'; presumably OTU ids are zero-padded so this cannot
            # happen — verify against mothur's naming.
            for i, seq in enumerate(seq_names):
                if otu in seq:
                    outfile.write(otuline.rstrip() + "\t" +
                                  make_rerun_link(seqs[i], baseurl) + "\n")
            # output otureps for these otus
            for i, seq in enumerate(reps):
                if otu in seq:
                    repsout.write(reps[i])
                    # the header line is followed by its sequence line
                    repsout.write(reps[i+1])
131
132
def summarylog_total(infile):
    """Extract the total sequence count from a mothur summary log.

    Scans *infile* for the first line starting with '# of Seqs:' or
    'total # of seqs:' and returns its last tab-separated field as an
    int; returns None when no such line exists.
    """
    markers = ('# of Seqs:', 'total # of seqs:')
    with open(infile) as handle:
        for row in handle:
            if row.startswith(markers):
                return int(row.split('\t')[-1])
    return None
141
142
def count_table_totals(infile, outdir='', outfile=''):
    """
    Given a Mothur counttable, calculate the total number of sequences for each
    sample. This can be appended as additional row in the count table by
    providing a file name.

    :param infile: Mothur count table.
    :param outfile: Optional. Write the count table with an additional row with
                    totals to this file
    :param outdir: Optional. Write output do this directory
    :return: A list with totals for all columns (samples) in the count table
    """
    # assume a single input file for now
    out_rows = []
    with open(infile) as f:
        count_table = csv.reader(f, delimiter='\t')

        header = next(count_table)
        totals = [0] * (len(header) - 1)

        out_rows.append(header)
        for row in count_table:
            # skip any pre-existing totals row so we never double-count
            if row[0] != 'total':
                out_rows.append(row)
                for i in range(1, len(row)):
                    totals[i-1] += int(row[i])

        # BUG FIX: was `["total"] + map(str, totals)` — on Python 3 map()
        # returns an iterator and list+iterator raises TypeError; the
        # comprehension behaves identically on Python 2 and 3.
        out_rows.append(["total"] + [str(t) for t in totals])

    # write count table with totals to file if requested
    if outfile:
        write_output(outdir, outfile, out_rows)

    return totals
177
178
def qc_report(infiles, label, inputtype, outdir=''):
    """
    Construct QC table from multiple count files or summary logs
    - Report the number of sequences lost at each consecutive QC step
    - Create a multi-sample report and a separate report for each sample

    :param infiles: set of count tables (or summary logs)
    :param label: labels for each step
    :param inputtype: 'summarylogs' or 'counttables'
    :param outdir: directory to place output files. Default: cwd
    :return:
    """
    assert len(infiles) == len(label), \
        "number of files and labels unequal, stopping"

    print("qcreport")
    previous_totals = []
    outlines = []
    lasttotal = None

    for (infile, lab) in zip(infiles, label):
        # BUG FIX: was open(infile, 'rb') — csv.reader requires a
        # text-mode handle on Python 3 ('r' is equivalent on Python 2/Unix).
        with open(infile, 'r') as f:
            count_file = csv.reader(f, delimiter='\t')

            if inputtype == 'summarylogs':
                print("summarylogs")
                if not outlines:
                    outlines = [['Step', 'Total', 'Removed', 'Percentage']]

                # get total count
                total = summarylog_total(infile)

                # difference with the previous step (first step has none)
                if not lasttotal:
                    outlines.append([lab, total, None, None])
                else:
                    diff = total - lasttotal
                    perc = float(diff)/float(lasttotal)*100.0
                    outlines.append([lab, total, diff,
                                     str("%.1f" % perc)+"%"])

                lasttotal = total

            else:
                # add header line to output (sample names from first file)
                if not outlines:
                    outlines = [['step'] + next(count_file)[1:]]

                # calculate totals of each column in the count file
                totals = count_table_totals(infile)

                # calculate difference with previous count file
                if not previous_totals:
                    diffs = [""] * len(totals)
                else:
                    diffs = [" (" + str(t1-t2)+"; "
                             + str("%.1f" % (float(t1-t2)/float(t2)*100.0)) +
                             "%)" for t1, t2 in zip(totals, previous_totals)]

                outlines.append([lab] +
                                [str(a)+b for a, b in zip(totals, diffs)])
                previous_totals = totals

    # write multi-sample output file
    write_output(outdir, 'all_qctable.tsv', outlines)

    # write per-sample output files (columns 2.. are individual samples)
    for j in range(2, len(outlines[0])):
        sample = outlines[0][j]
        sample_outlines = [[outlines_line[0], outlines_line[j]]
                           for outlines_line in outlines]
        write_output(outdir, 'persample_qctable_'+sample+'.tsv',
                     sample_outlines)
250
251
def column(matrix, i):
    """Return the i-th column of *matrix* (a list of rows) as a list."""
    values = []
    for single_row in matrix:
        values.append(single_row[i])
    return values
254
255
def mean(data):
    """Arithmetic mean of *data*, always as a float."""
    total = float(len(data))
    return sum(data) / total
258
259
def stdev(data):
    """Sample standard deviation of *data* (n-1 denominator)."""
    size = len(data)
    center = sum(data) / float(size)
    squared_devs = [(value - center) ** 2 for value in data]
    return math.sqrt(sum(squared_devs) / (size - 1))
265
266
def write_output(outdir, filename, outlines):
    """Write *outlines* as a tab-separated file under *outdir*.

    :param outdir: target directory (must already exist; '' means cwd)
    :param filename: name of the file to create or overwrite
    :param outlines: iterable of rows, each an iterable of cell values
    """
    # BUG FIX: was open(..., 'wb') — csv.writer requires a text-mode
    # handle on Python 3 ('w' is equivalent on Python 2/Unix).
    with open(os.path.join(outdir, filename), 'w') as of:
        out_table = csv.writer(of, delimiter='\t', lineterminator='\n')
        for row in outlines:
            out_table.writerow(row)
272
273
def correct_replicates(shared, taxonomy, outdir, replicate_suffix,
                       sample_copies, negative_control='', nc_copies=-1,
                       negative_control_species='Oscillatoria'):
    """Correct OTU counts using a spiked-in control species, average across
    replicates, and subtract negative-control background.

    Writes 'taxonomy_corrected.tsv', 'shared_corrected.tsv' and
    'shared_averaged.tsv' into *outdir*.

    :param shared: list whose first element is the mothur shared file path
    :param taxonomy: list whose first element is the taxonomy file path
    :param outdir: output directory
    :param replicate_suffix: suffix separating sample name from replicate id
    :param sample_copies: known spike-in copy number for regular samples
    :param negative_control: sample-name prefix of the negative control
    :param nc_copies: known spike-in copy number for the negative control
    :param negative_control_species: genus used as the spike-in marker
    :return: 1 when the control species is missing from the taxonomy,
             otherwise None
    """
    # NOTE(review): 'rb' + csv.reader is Python-2 style; on Python 3 csv
    # requires text-mode handles — confirm the intended interpreter.
    with open(shared[0], 'rb') as f, open(taxonomy[0], 'rb') as f2:
        shared_file = csv.reader(f, delimiter='\t')
        taxonomy_file = csv.reader(f2, delimiter='\t')

        # determine which OTU number is the control, Oscillatoria by default
        # (Bacteria;Cyanobacteria;Cyanobacteria;..;Oscillatoria)
        try:
            line = next(taxonomy_file)
            while negative_control_species not in line[2]:
                line = next(taxonomy_file)
            otu = line[0]
        except StopIteration:
            print("negative control species not found in taxonomy, Exiting")
            return 1

        ''' Calculate Copies '''
        # per replicate of sample and NC, determine correction factor,
        # (number Oscillatoria seqs/known copies of it)
        # correct all sequence counts with that
        myshared = [row for row in shared_file if row]
        # newshared:  per-replicate copy counts (floats)
        # newshared2: copy counts averaged across replicates
        # newshared3: background-corrected copy counts (final shared file)
        # newshared4: raw sequence counts averaged across replicates
        newshared = [myshared[0]]
        newshared2 = [myshared[0]]
        newshared3 = [myshared[0]]
        newshared4 = [myshared[0]]
        oscil_column = myshared[0].index(otu)

        for row in myshared[1:]:
            # the NC has its own known copy number; samples share one
            if row[1].startswith(negative_control):
                copies = nc_copies
            else:
                copies = sample_copies

            correction_factor = float(row[oscil_column]) / float(copies)

            # columns 0-2 are label/sample/numOtus; the rest are counts
            new_row = row[0:3]
            for count in row[3:]:
                try:
                    newval = float(count) / correction_factor
                except ZeroDivisionError:
                    # no control sequences observed: leave count unscaled
                    newval = float(count)
                new_row.append(newval)
            newshared.append(new_row)

        ''' Average copy counts across replicates '''
        levels = set([row[0] for row in newshared[1:]])
        samples = set([row[1].split(replicate_suffix)[0]
                       for row in newshared[1:]])

        for level in levels:
            for sample in samples:
                neg = True if sample.startswith(negative_control) else False
                # NOTE(review): startswith-based grouping — a sample name
                # that is a prefix of another would merge their replicates
                replicates = [row for row in newshared if row[0] == level
                              and row[1].startswith(sample)]
                num_otus = int(replicates[0][2])+3
                total = replicates[0][2]
                avg = [level, sample, total]

                # an OTU absent from any replicate is zeroed, except for
                # the negative control, which keeps its mean
                for i in range(3, num_otus):
                    counts = column(replicates, i)
                    avg.append(mean(counts)) if 0 not in counts or neg \
                        else avg.append(0)

                newshared2.append(avg)

        ''' Average *sequence* counts across replicates '''
        levels = set([row[0] for row in myshared[1:]])
        samples = set([row[1].split(replicate_suffix)[0]
                       for row in myshared[1:]])

        for level in levels:
            for sample in samples:
                replicates = [row for row in myshared if row[0] == level
                              and row[1].startswith(sample)]
                num_otus = int(replicates[0][2]) + 3
                total = replicates[0][2]
                avg = [level, sample, total]

                for i in range(3, num_otus):
                    # NOTE(review): map() here feeds mean(), which calls
                    # len() — Python-2 only; on 3 map() has no len()
                    counts = map(int, column(replicates, i))
                    avg.append(int(round(mean(counts))))

                newshared4.append(avg)

        ''' Correct for background '''
        # for each otu, subtract 3 times the standard deviation of
        # the negative control sample
        for level in levels:
            NC = [row for row in newshared if row[0] == level
                  and row[1].startswith(negative_control)]
            # `samples` is rebound here to the non-NC averaged rows
            samples = [row for row in newshared2 if row[0] == level
                       and not row[1].startswith(negative_control)]
            num_otus = int(samples[0][2])+3

            for i in range(3, num_otus):
                m = mean(column(NC, i))
                sd = stdev(column(NC, i))
                corr = m + 3*sd

                # clamp at zero so background never produces negatives
                for s in samples:
                    s[i] = max(0, int(round(s[i] - corr)))

            newshared3 += samples

        # remove Negative control species otu from table
        for i, row in enumerate(newshared3):
            del row[oscil_column]
            if i > 0:
                # keep the numOtus column consistent after the deletion
                row[2] = int(row[2]) - 1

        # sort file or other mothur tools may segfault :/
        # NOTE(review): 'unique' maps to int 0 among string keys —
        # comparing str and int only works on Python 2
        newshared3 = [newshared3[0]] + sorted(newshared3[1:],
                                              key=lambda a_entry: a_entry[0]
                                              if a_entry[0] != 'unique' else 0)

        # rewind so the full taxonomy can be re-read for filtering
        f2.seek(0)
        taxonomy_out = [row for row in taxonomy_file if row and row[0] != otu]
        write_output(outdir, 'taxonomy_corrected.tsv', taxonomy_out)
        write_output(outdir, 'shared_corrected.tsv', newshared3)
        write_output(outdir, 'shared_averaged.tsv', newshared4)
396
397
def make_multi_otutable(taxonomy_file, shared_file, level, outdir):
    """
    Create an otu table from shared file and taxonomy file

    example output:

    OTU sample1 sample2 .. sampleX Kingdom Phylum Class Order Family Genus
    Otu001 13 8 .. 91 Bacteria Bacteroidetes Bacteroidia ..
    ...

    Writes 'multi_otutable.tsv' into *outdir*.

    NOTE(review): relies on Python-2 semantics of filter()/map()
    returning lists (list concatenation and .sort() below) — on Python 3
    these return iterators and this function would fail; confirm the
    intended interpreter.

    :param taxonomy_file: list whose first element is the taxonomy path
    :param shared_file: list whose first element is the shared-file path
    :param level: which shared-file level to use, e.g. unique/0.03/..

    :return:
    """

    outlines = []
    samples = []
    # create multisample taxonomy from counts in shared file
    with open(taxonomy_file[0], 'r') as tax, open(shared_file[0]) as sh:
        taxonomy = csv.reader(tax, delimiter='\t')
        shared = csv.reader(sh, delimiter='\t')
        shared_header = next(shared)
        # first collected row: OTU ids (shared columns 3..)
        outlines.append(shared_header[3:])

        # get all taxonomies; strip '(confidence)' suffixes and drop
        # empty trailing fields
        taxonomies = []
        for j, t in enumerate(taxonomy):
            if j > 0:
                taxonomies.append(filter(None, [x.split('(')[0]
                                                for x in t[2].split(';')]))

        # one row of counts per sample at the requested level
        for i, row in enumerate(shared):
            tax.seek(0)  # make sure to start at beginning of file each time
            if row[0] == level:
                samples.append(row[1])
                outlines.append(row[3:])

    # columns become rows: one entry per OTU (id + per-sample counts)
    transposed = map(list, zip(*outlines))
    header = ["OTU"] + samples + ["Kingdom", "Phylum", "Class", "Order",
                                  "Family", "Genus"]

    # append the 6 taxonomy levels and drop OTUs that are zero everywhere
    writelines = [header] + [a + b for a, b in zip(transposed, taxonomies)
                             if a[1:] != ['0'] * len(a[1:])]

    # sort sample columns by name
    lst = map(list, zip(*[w[1:-6] for w in writelines]))
    lst.sort(key=lambda x: x[0])
    lst = map(list, zip(*lst))
    writelines2 = [[writelines[i][0]] + lst[i] + writelines[i][-6:]
                   for i in range(0, len(writelines))]

    # output corrected shared file
    write_output(outdir, "multi_otutable.tsv", writelines2)
453
454
def split_multi_otutable(otutable, with_avg=True):
    """Split a multi-sample OTU table into one table per sample.

    For every sample (column label up to the first '_'), keeps the OTU id
    column, that sample's replicate columns and the 6 trailing taxonomy
    columns, inserts the replicate mean before the taxonomy block, and
    writes '<sample>.otutable' to the current directory. Rows that are
    all-zero for the sample are dropped.

    :param otutable: list whose first element is the table path
    :param with_avg: unused; kept for interface compatibility
    :return:
    """
    fulltable = [line.strip().split('\t')
                 for line in open(otutable[0], 'r') if line]
    # sample name = column label up to the first '_' (replicate suffix)
    samples = [s.split('_')[0] for s in fulltable[0][1:-6]]
    numcols = len(fulltable[0])
    # BUG FIX: use floor division — on Python 3 '/' yields a float, which
    # breaks the ['0'] * numreplicates comparison below ('//' is identical
    # for ints on Python 2).
    numreplicates = (numcols - 7) // len(set(samples))

    for sample in set(samples):
        outlines = []
        # column 0 (OTU id) + this sample's replicate columns
        # + the 6 trailing taxonomy columns
        cols = [0] + [i+1 for i, s in enumerate(samples) if sample in s] \
            + [i for i in range(numcols-6, numcols)]
        for i, line in enumerate(fulltable):
            out = [line[j] for j in cols]
            if out[1:-6] != ['0'] * numreplicates:
                # header row gets the 'mean' label; data rows get the
                # rounded replicate mean. BUG FIX: build a list instead of
                # map() — mean() calls len(), which map objects lack on
                # Python 3 (identical result on Python 2).
                out.insert(-6, 'mean' if i == 0
                           else int(round(mean([int(v) for v in out[1:-6]]))))
                outlines.append(out)

        write_output('.', sample+'.otutable', outlines)
474
475
def create_krona_plot_multisample(taxonomy_file, shared_file, level, outdir,
                                  with_otu):
    """
    Create krona plots from a multisample taxonomy plot and a shared file.
    Create one multisample plot and a plot per individual sample

    :param taxonomy_file: list whose first element is the taxonomy path
    :param shared_file: list whose first element is the shared-file path
    :param level: which level to use, e.g. unique/0.03/..
    :param outdir: directory for the per-sample .tsv files
    :param with_otu: add OTU number as a level in the Krona plot?
    :return:
    """

    # paths of the per-sample taxonomy files generated below
    taxonomies = []

    # create taxonomy file per sample
    with open(taxonomy_file[0], 'r') as tax, open(shared_file[0]) as sh:
        taxonomy = csv.reader(tax, delimiter='\t')
        shared = csv.reader(sh, delimiter='\t')
        shared_header = next(shared)

        for i, row in enumerate(shared):
            tax.seek(0)  # make sure to start at beginning of file each time
            if row[0] == level:
                sample = row[1]

                outfile = os.path.join(outdir, sample+".tsv")
                taxonomies.append(outfile)
                with open(outfile, 'w+') as of:
                    out_table = csv.writer(of, delimiter='\t')
                    out_table.writerow(next(taxonomy))  # header line
                    # taxonomy rows are assumed to be in the same order
                    # as the shared-file OTU columns (offset by 3)
                    for j, t in enumerate(taxonomy):
                        assert t[0] == shared_header[j+3], \
                            "OTU mismatch between taxonomy and shared file"
                        # replace the size column with this sample's count
                        t[1] = row[j+3]
                        out_table.writerow(t + [shared_header[j+3]])

    # make krona plot
    create_krona_plot(taxonomies, outdir, with_otu)
515
516
def create_krona_plot(taxonomy_files, outdir, with_otu):
    """
    Create a krona plot from one or more mothur taxonomy files.

    :param taxonomy_files: mothur taxonomy files (output from classify.otu)
    :param outdir: directory to store krona-formatted outputs. Default=cwd
    :param with_otu: add OTU number as a level in the Krona plot? Default=True
    :return:
    """
    krona_input_files = []

    # convert each taxonomy file into Krona's tabular text input
    for tax in taxonomy_files:
        converted_rows = []
        with open(tax, 'r') as handle:
            reader = csv.reader(handle, delimiter='\t')
            next(reader)  # skip header line
            for record in reader:
                # count, taxonomy levels, optional trailing OTU id
                fields = [record[1]] + record[2].rstrip(";\n").split(';')
                fields.append(record[0] if with_otu else None)
                # drop empty levels and the OTU placeholder when unused
                converted_rows.append([value for value in fields if value])

        target = os.path.join(outdir, tax.split("/")[-1] + "krona")
        krona_input_files.append(target)

        with open(target, 'w+') as out_handle:
            writer = csv.writer(out_handle, delimiter='\t')
            for converted in converted_rows:
                writer.writerow(converted)

    # execute krona command
    call(["ktImportText"] + krona_input_files)
550
551
# run the command-line interface only when executed as a script
if __name__ == "__main__":
    main()