Mercurial > repos > mheinzl > hd
diff hd.py @ 29:6b15b3b6405c draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit 5b3ab8c6467fe3a52e89f5a7d175bd8a0189018a-dirty
author | mheinzl |
---|---|
date | Wed, 24 Jul 2019 05:58:15 -0400 |
parents | 1fa7342a140d |
children | 46bfbec0f9e6 |
line wrap: on
line diff
--- a/hd.py Mon Jun 03 05:37:01 2019 -0400 +++ b/hd.py Wed Jul 24 05:58:15 2019 -0400 @@ -5,16 +5,17 @@ # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) # Contact: monika.heinzl@edumail.at # -# Takes at least one TABULAR file with tags before the alignment to the SSCS and optionally a second TABULAR file as input. -# The program produces a plot which shows a histogram of Hamming distances separated after family sizes, -# a family size distribution separated after Hamming distances for all (sample_size=0) or a given sample of SSCSs or SSCSs, which form a DCS. -# In additon, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads -# and finally a CSV file with the data of the plots. -# It is also possible to perform the HD analysis with shortened tags with given sizes as input. +# Takes at least one TABULAR file with tags before the alignment to the SSCS and +# optionally a second TABULAR file as input. The program produces a plot which shows a histogram of Hamming distances +# separated after family sizes, a family size distribution separated after Hamming distances for all (sample_size=0) +# or a given sample of SSCSs or SSCSs, which form a DCS. In additon, the tool produces HD and FSD plots for the +# difference between the HDs of both parts of the tags and for the chimeric reads and finally a CSV file with the +# data of the plots. It is also possible to perform the HD analysis with shortened tags with given sizes as input. # The tool can run on a certain number of processors, which can be defined by the user. # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int / -# --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outptufile_name_tabular +# --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int +# --nr_above_bars True/False --output_tabular outptufile_name_tabular import argparse import itertools @@ -34,7 +35,7 @@ def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts, - title_file1, subtitle, pdf, relative=False, diff=True): + subtitle, pdf, relative=False, diff=True, rel_freq=False): if diff is False: colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"] labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8", "HD>8"] @@ -48,36 +49,37 @@ fig = plt.figure(figsize=(6, 7)) ax = fig.add_subplot(111) plt.subplots_adjust(bottom=0.1) - p1 = numpy.bincount(numpy.concatenate((familySizeList1))) + p1 = numpy.bincount(numpy.concatenate(familySizeList1)) maximumY = numpy.amax(p1) if len(range(minimumXFS, maximumXFS)) == 0: range1 = range(minimumXFS - 1, minimumXFS + 2) else: range1 = range(0, maximumXFS + 2) - counts = plt.hist(familySizeList1, label=labels, - color=colors, stacked=True, - rwidth=0.8, alpha=1, align="left", - edgecolor="None", bins=range1) + + if rel_freq: + w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(familySizeList1)) for data in familySizeList1] + counts = plt.hist(familySizeList1, label=labels, weights=w, color=colors, stacked=True, + rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1) + plt.ylabel("Relative Frequency", fontsize=14) + plt.ylim((0, (float(maximumY) / sum(p1)) * 1.1)) + else: + counts = plt.hist(familySizeList1, label=labels, color=colors, stacked=True, + rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1) + if len(numpy.concatenate(familySizeList1)) != 0: + plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1)) + plt.ylabel("Absolute Frequency", fontsize=14) + plt.ylim((0, maximumY * 1.2)) plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) - - # plt.title(title_file1, fontsize=12) plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) plt.xlabel("Family size", fontsize=14) - plt.ylabel("Absolute Frequency", fontsize=14) - ticks = numpy.arange(0, maximumXFS + 1, 1) ticks1 = map(str, ticks) if maximumXFS >= 20: ticks1[len(ticks1) - 1] = ">=20" plt.xticks(numpy.array(ticks), ticks1) [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0] - plt.xlim((0, maximumXFS + 1)) - if len(numpy.concatenate(familySizeList1)) != 0: - plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1)) - - plt.ylim((0, maximumY * 1.2)) legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: " plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure) @@ -86,17 +88,17 @@ max_count = ">= 20" else: max_count = max(originalCounts) - legend1 = "{}\n{}\n{:.5f}".format(max_count, count[len(count) - 1], float(count[len(count) - 1]) / sum(count)) + legend1 = "{}\n{}\n{:.5f}".format(max_count, p1[len(p1) - 1], float(p1[len(p1) - 1]) / sum(p1)) plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure) - legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]), float(counts[0][len(counts[0]) - 1][1]) / sum(counts[0][len(counts[0]) - 1])) + legend3 = "singletons\n{:,}\n{:.5f}".format(int(p1[1]), float(p1[1]) / sum(p1)) plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12) plt.grid(b=True, which='major', color='#424242', linestyle=':') - pdf.savefig(fig, bbox_inches="tight") plt.close("all") -def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, title_file1, pdf, xlabel, relative=False, nr_above_bars=True, nr_unique_chimeras=0, len_sample=0): +def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False, + nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False): if relative is True: step = 0.1 else: @@ -104,70 +106,153 @@ fig = plt.figure(figsize=(6, 8)) plt.subplots_adjust(bottom=0.1) - con_list1 = numpy.concatenate(list1) - p1 = numpy.array([v for k, v in sorted(Counter(con_list1).iteritems())]) + p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())]) maximumY = numpy.amax(p1) - maximumX = int(maximumX) - print("max X", maximumX ) - if relative is True: # relative difference bin1 = numpy.arange(-1, maximumX + 0.2, 0.1) else: bin1 = maximumX + 1 - counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, - label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", - "FS>10"], rwidth=0.8, - color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"], - stacked=True, alpha=1, - align="left", - range=(0, maximumX + 1)) + if rel_freq: + w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1] + counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w, + label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8, + color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"], + stacked=True, alpha=1, align="left", range=(0, maximumX + 1)) + plt.ylim((0, (float(maximumY) / sum(p1)) * 1.2)) + plt.ylabel("Relative Frequency", fontsize=14) + bins = counts[1] # width of bins + counts = numpy.array(map(float, counts[0][5])) + + else: + counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, + label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8, + color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"], + stacked=True, alpha=1, align="left", range=(0, maximumX + 1)) + maximumY = numpy.amax(p1) + plt.ylim((0, maximumY * 1.2)) + plt.ylabel("Absolute Frequency", fontsize=14) + bins = counts[1] # width of bins + counts = numpy.array(map(int, counts[0][5])) + plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) - bins = counts[1] # width of bins - counts = numpy.array(map(int, counts[0][5])) plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) - # plt.title(title_file1, fontsize=12) plt.xlabel(xlabel, fontsize=14) - plt.ylabel("Absolute Frequency", fontsize=14) - plt.grid(b=True, which='major', color='#424242', linestyle=':') - plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) + plt.xlim((minimumX - step, maximumX + step)) + # plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) plt.xticks(numpy.arange(0, maximumX + step, step)) - plt.ylim((0, maximumY * 1.2)) - - if nr_above_bars is True: + if nr_above_bars: bin_centers = -0.4 * numpy.diff(bins) + bins[:-1] for x_label, label in zip(counts, bin_centers): # labels for values if x_label == 0: continue else: - plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1), - xy=(label, x_label + len(con_list1) * 0.01), - xycoords="data", color="#000066", fontsize=10) + if rel_freq: + plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))), + float(x_label)), + xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001), + xycoords="data", color="#000066", fontsize=10) + else: + plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)), + xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01), + xycoords="data", color="#000066", fontsize=10) + + if nr_unique_chimeras != 0: + if (relative and ((counts[len(counts)-1] / nr_unique_chimeras) == 2)) or \ + (sum(counts) / nr_unique_chimeras) == 2: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})"\ + .format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2) + else: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras) + else: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(list1))) - legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, sum(counts)) - plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure) + plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure) + pdf.savefig(fig, bbox_inches="tight") + plt.close("all") + plt.clf() + + +def plotHDwithDCS(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False, + nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False): + step = 1 + fig = plt.figure(figsize=(6, 8)) + plt.subplots_adjust(bottom=0.1) + p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())]) + maximumY = numpy.amax(p1) + bin1 = maximumX + 1 + if rel_freq: + w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1] + counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w, + label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"], + stacked=True, alpha=1, align="left", range=(0, maximumX + 1)) + plt.ylim((0, (float(maximumY) / sum(p1)) * 1.2)) + plt.ylabel("Relative Frequency", fontsize=14) + bins = counts[1] # width of bins + counts = numpy.array(map(float, counts[0][2])) - # if nr_unique_chimeras != 0 and len_sample != 0: - # if relative == True: - # legend = "nr. of unique chimeric tags= {:,} ({:.5f}) (rel.diff=1)".format(nr_unique_chimeras, - # int(nr_unique_chimeras) / float(len_sample)) - # else: - # legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(nr_unique_chimeras, int(nr_unique_chimeras) / float(len_sample)) - # plt.text(0.14, -0.09, legend, size=12, transform=plt.gcf().transFigure) + else: + counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, + label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"], + stacked=True, alpha=1, align="left", range=(0, maximumX + 1)) + plt.ylim((0, maximumY * 1.2)) + plt.ylabel("Absolute Frequency", fontsize=14) + bins = counts[1] # width of bins + counts = numpy.array(map(int, counts[0][2])) + + plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) + plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) + plt.xlabel(xlabel, fontsize=14) + plt.grid(b=True, which='major', color='#424242', linestyle=':') + plt.xlim((minimumX - step, maximumX + step)) + # plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) + plt.xticks(numpy.arange(0, maximumX + step, step)) + + if nr_above_bars: + bin_centers = -0.4 * numpy.diff(bins) + bins[:-1] + for x_label, label in zip(counts, bin_centers): # labels for values + if x_label == 0: + continue + else: + if rel_freq: + plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))), + float(x_label)), + xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001), + xycoords="data", color="#000066", fontsize=10) + else: + plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)), + xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01), + xycoords="data", color="#000066", fontsize=10) + + if nr_unique_chimeras != 0: + if (sum(counts) / nr_unique_chimeras) == 2: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})".\ + format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2) + else: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras) + else: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(list1))) + plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure) + + legend2 = "SSCS ab = {:,}\nSSCS ba = {:,}\nDCS = {:,}".format(len(list1[1]), len(list1[2]), len(list1[0])) + plt.text(0.6, -0.047, legend2, size=12, transform=plt.gcf().transFigure) pdf.savefig(fig, bbox_inches="tight") plt.close("all") plt.clf() -def plotHDwithinSeq_Sum2(sum1, sum1min, sum2, sum2min, min_value, lenTags, title_file1, pdf, len_sample): +def plotHDwithinSeq(sum1, sum1min, sum2, sum2min, min_value, lenTags, pdf, len_sample, rel_freq=False): fig = plt.figure(figsize=(6, 8)) plt.subplots_adjust(bottom=0.1) ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags - maximumX = numpy.amax(numpy.concatenate(ham_partial)) minimumX = numpy.amin(numpy.concatenate(ham_partial)) maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda x: numpy.bincount(x), ham_partial)))) @@ -177,21 +262,30 @@ else: range1 = range(minimumX, maximumX + 2) - plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b"], bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], edgecolor='black', linewidth=1) + if rel_freq: + w = [numpy.zeros_like(data) + 1. / len(data) for data in ham_partial] + plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, weights=w, + label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b', a'+b"], + bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], + edgecolor='black', linewidth=1) + plt.ylabel("Relative Frequency", fontsize=14) + # plt.ylim(-0.1, (float(maximumY) / len(numpy.concatenate(ham_partial))) * 1.2) + else: + plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, + label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b', a'+b"], + bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], + edgecolor='black', linewidth=1) + plt.ylabel("Absolute Frequency", fontsize=14) plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) plt.suptitle('Hamming distances within tags', fontsize=14) - # plt.title(title_file1, fontsize=12) plt.xlabel("HD", fontsize=14) - plt.ylabel("Absolute Frequency", fontsize=14) plt.grid(b=True, which='major', color='#424242', linestyle=':') - - plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2)) + plt.xlim((minimumX - 1, maximumX + 1)) + # plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2)) plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) - # plt.ylim(0, maximumY * 1.2) - legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, len(numpy.concatenate(ham_partial))) - - # legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags) + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(ham_partial))) plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure) pdf.savefig(fig, bbox_inches="tight") plt.close("all") @@ -206,7 +300,6 @@ count = numpy.zeros((len(uniqueFS), 6)) else: count = numpy.zeros((len(uniqueFS), 7)) - state = 1 for i in list1: counts = list(Counter(i).items()) @@ -218,57 +311,48 @@ continue else: if state == 1: - for i, l in zip(uniqueFS, nr): + for k, l in zip(uniqueFS, nr): for j in table: if j[0] == uniqueFS[l]: count[l, 0] = j[1] if state == 2: - for i, l in zip(uniqueFS, nr): + for k, l in zip(uniqueFS, nr): for j in table: if j[0] == uniqueFS[l]: count[l, 1] = j[1] - if state == 3: - for i, l in zip(uniqueFS, nr): + for k, l in zip(uniqueFS, nr): for j in table: if j[0] == uniqueFS[l]: count[l, 2] = j[1] - if state == 4: - for i, l in zip(uniqueFS, nr): + for k, l in zip(uniqueFS, nr): for j in table: if j[0] == uniqueFS[l]: count[l, 3] = j[1] - if state == 5: - for i, l in zip(uniqueFS, nr): + for k, l in zip(uniqueFS, nr): for j in table: if j[0] == uniqueFS[l]: count[l, 4] = j[1] - if state == 6: - for i, l in zip(uniqueFS, nr): + for k, l in zip(uniqueFS, nr): for j in table: if j[0] == uniqueFS[l]: count[l, 5] = j[1] - if state == 7: - for i, l in zip(uniqueFS, nr): + for k, l in zip(uniqueFS, nr): for j in table: if j[0] == uniqueFS[l]: count[l, 6] = j[1] state = state + 1 - sumRow = count.sum(axis=1) sumCol = count.sum(axis=0) - uniqueFS = uniqueFS.astype(str) if uniqueFS[len(uniqueFS) - 1] == "20": uniqueFS[len(uniqueFS) - 1] = ">20" - first = ["FS={}".format(i) for i in uniqueFS] final = numpy.column_stack((first, count, sumRow)) - return (final, sumCol) @@ -276,13 +360,15 @@ output_file.write(name) output_file.write("\n") if diff is False: - output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep)) + output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format( + sep, sep, sep, sep, sep, sep, sep, sep)) else: if rel is False: - output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) + output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format( + sep, sep, sep, sep, sep, sep, sep, sep, sep)) else: - output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) - + output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n". + format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) for item in summary: for nr in item: if "FS" not in nr and "diff" not in nr: @@ -314,46 +400,40 @@ continue else: if state == 1: - for i, l in zip(uniqueHD, nr): + for k, l in zip(uniqueHD, nr): for j in table: if j[0] == uniqueHD[l]: count[l, 0] = j[1] if state == 2: - for i, l in zip(uniqueHD, nr): + for k, l in zip(uniqueHD, nr): for j in table: if j[0] == uniqueHD[l]: count[l, 1] = j[1] - if state == 3: - for i, l in zip(uniqueHD, nr): + for k, l in zip(uniqueHD, nr): for j in table: if j[0] == uniqueHD[l]: count[l, 2] = j[1] - if state == 4: - for i, l in zip(uniqueHD, nr): + for k, l in zip(uniqueHD, nr): for j in table: if j[0] == uniqueHD[l]: count[l, 3] = j[1] - if state == 5: - for i, l in zip(uniqueHD, nr): + for k, l in zip(uniqueHD, nr): for j in table: if j[0] == uniqueHD[l]: count[l, 4] = j[1] - if state == 6: - for i, l in zip(uniqueHD, nr): + for k, l in zip(uniqueHD, nr): for j in table: if j[0] == uniqueHD[l]: count[l, 5] = j[1] state = state + 1 - sumRow = count.sum(axis=1) sumCol = count.sum(axis=0) first = ["{}{}".format(row_label, i) for i in uniqueHD] final = numpy.column_stack((first, count, sumRow)) - return (final, sumCol) @@ -362,7 +442,6 @@ uniqueHD = numpy.unique(selfAB) nr = numpy.arange(0, len(uniqueHD), 1) count = numpy.zeros((len(uniqueHD), 5)) - state = 1 for i in list1: counts = list(Counter(i).items()) @@ -374,45 +453,81 @@ continue else: if state == 1: - for i, l in zip(uniqueHD, nr): + for k, l in zip(uniqueHD, nr): for j in table: if j[0] == uniqueHD[l]: count[l, 0] = j[1] if state == 2: - for i, l in zip(uniqueHD, nr): + for k, l in zip(uniqueHD, nr): for j in table: if j[0] == uniqueHD[l]: count[l, 1] = j[1] if state == 3: - for i, l in zip(uniqueHD, nr): + for k, l in zip(uniqueHD, nr): for j in table: if j[0] == uniqueHD[l]: count[l, 2] = j[1] if state == 4: - for i, l in zip(uniqueHD, nr): + for k, l in zip(uniqueHD, nr): for j in table: if j[0] == uniqueHD[l]: count[l, 3] = j[1] if state == 5: - for i, l in zip(uniqueHD, nr): + for k, l in zip(uniqueHD, nr): for j in table: if j[0] == uniqueHD[l]: count[l, 4] = j[1] - state = state + 1 - sumRow = count.sum(axis=1) sumCol = count.sum(axis=0) first = ["HD={}".format(i) for i in uniqueHD] final = numpy.column_stack((first, count, sumRow)) + return (final, sumCol) + +def createTableHDwithDCS(list1): + selfAB = numpy.concatenate(list1) + uniqueHD = numpy.unique(selfAB) + nr = numpy.arange(0, len(uniqueHD), 1) + count = numpy.zeros((len(uniqueHD), len(list1))) + state = 1 + for i in list1: + counts = list(Counter(i).items()) + hd = [item[0] for item in counts] + c = [item[1] for item in counts] + table = numpy.column_stack((hd, c)) + if len(table) == 0: + state = state + 1 + continue + else: + if state == 1: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 0] = j[1] + if state == 2: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 1] = j[1] + if state == 3: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 2] = j[1] + state = state + 1 + sumRow = count.sum(axis=1) + sumCol = count.sum(axis=0) + first = ["HD={}".format(i) for i in uniqueHD] + final = numpy.column_stack((first, count, sumRow)) return (final, sumCol) def createFileHD(summary, sumCol, overallSum, output_file, name, sep): output_file.write(name) output_file.write("\n") - output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep)) + output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format( + sep, sep, sep, sep, sep, sep, sep, sep)) for item in summary: for nr in item: if "HD" not in nr and "diff" not in nr: @@ -428,10 +543,29 @@ output_file.write("\n\n") +def createFileHDwithDCS(summary, sumCol, overallSum, output_file, name, sep): + output_file.write(name) + output_file.write("\n") + output_file.write("{}DCS{}SSCS ab{}SSCS ba{}sum{}\n".format(sep, sep, sep, sep, sep)) + for item in summary: + for nr in item: + if "HD" not in nr: + nr = nr.astype(float) + nr = nr.astype(int) + output_file.write("{}{}".format(nr, sep)) + output_file.write("\n") + output_file.write("sum{}".format(sep)) + sumCol = map(int, sumCol) + for el in sumCol: + output_file.write("{}{}".format(el, sep)) + output_file.write("{}{}".format(overallSum.astype(int), sep)) + output_file.write("\n\n") + + def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep): output_file.write(name) output_file.write("\n") - output_file.write("{}HD a{}HD b'{}HD b{}HD a'{}HD a+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep)) + output_file.write("{}HD DCS{}HD b'{}HD b{}HD a'{}HD a+b', a'+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep)) for item in summary: for nr in item: if "HD" not in nr: @@ -454,17 +588,14 @@ for a in array1: dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2]) # fastest res[i] = numpy.amin(dist[dist > 0]) # pick min distance greater than zero - # print(i) i += 1 return res def hamming_difference(array1, array2, mate_b): array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time - array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 - array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 @@ -499,6 +630,7 @@ half2_mate1 = array1_half half1_mate2 = array2_half2 half2_mate2 = array2_half + # half1_mate1, index_halves = numpy.unique(half1_mate1, return_index=True) # print(len(half1_mate1)) # half2_mate1 = half2_mate1[index_halves] @@ -514,16 +646,18 @@ array2_half_withoutSame = half1_mate2[index_withoutSame] array2_half2_withoutSame = half2_mate2[index_withoutSame] array2_withoutSame = array2[index_withoutSame] # whole tag (=not splitted into 2 halfs) - + # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in - array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" + array2_half_withoutSame]) min_index = numpy.where(dist == dist.min())[0] # get index of min HD min_value = dist.min() # min_value = dist[min_index] # get minimum HDs - min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD + # get all "b's" of the tag or all "a's" of the tag with minimum HD + min_tag_half2 = array2_half2_withoutSame[min_index] min_tag_array2 = array2_withoutSame[min_index] # get whole tag with min HD - dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" + dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in + min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" max_value = dist_second_half.max() max_index = numpy.where(dist_second_half == dist_second_half.max())[0] # get index of max HD max_tag = min_tag_array2[max_index] @@ -548,14 +682,11 @@ min_tagsList_zeros.append(numpy.array(tag)) difference1_zeros = abs(min_value - max_value) # hd of non-identical part diff11_zeros.append(difference1_zeros) - max_tag_list.append(max_tag) + max_tag_list.append(numpy.array(max_tag)) else: min_tagsList_zeros.append(None) diff11_zeros.append(None) - max_tag_list.append(numpy.array(["None"])) - - # max_tag_list.append(numpy.array(max_tag)) - + max_tag_list.append(None) i += 1 # print(i) @@ -567,7 +698,8 @@ # relativeDiffList = [st for st in relativeDiffList if st != 999] # diff11_zeros = [st for st in diff11_zeros if st != 999] # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] - return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min, max_tag_list]) + return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, + min_tagsList_zeros, ham1min, ham2min, max_tag_list]) def readFileReferenceFree(file): @@ -608,7 +740,6 @@ def familySizeDistributionWithHD(fs, ham, diff=False, rel=True): hammingDistances = numpy.unique(ham) fs = numpy.asarray(fs) - ham = numpy.asarray(ham) bigFamilies2 = numpy.where(fs > 19)[0] if len(bigFamilies2) != 0: @@ -663,6 +794,53 @@ return(list1, hammingDistances, maximum, minimum) +def hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array): + diff_zeros = numpy.array(diff_zeros) + maximum = numpy.amax(diff_zeros) + minimum = numpy.amin(diff_zeros) + minHD_tags_zeros = numpy.array(minHD_tags_zeros) + + idx = numpy.concatenate([numpy.where(data_array[:, 1] == i)[0] for i in minHD_tags_zeros]) + subset_data = data_array[idx, :] + + seq = numpy.array(subset_data[:, 1]) + + # find all unique tags and get the indices for ALL tags, but only once + u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) + DCS_tags = u[c == 2] + rest_tags = u[c == 1] + + dcs = numpy.repeat("DCS", len(DCS_tags)) + idx_sscs = numpy.concatenate([numpy.where(subset_data[:, 1] == i)[0] for i in rest_tags]) + sscs = subset_data[idx_sscs, 2] + + all_tags = numpy.column_stack((numpy.concatenate((DCS_tags, subset_data[idx_sscs, 1])), + numpy.concatenate((dcs, sscs)))) + hd_DCS = [] + ab_SSCS = [] + ba_SSCS = [] + + for i in range(len(all_tags)): + tag = all_tags[i, :] + hd = diff_zeros[numpy.where(minHD_tags_zeros == tag[0])[0]] + + if tag[1] == "DCS": + hd_DCS.append(hd) + elif tag[1] == "ab": + ab_SSCS.append(hd) + elif tag[1] == "ba": + ba_SSCS.append(hd) + + if len(hd_DCS) != 0: + hd_DCS = numpy.concatenate(hd_DCS) + if len(ab_SSCS) != 0: + ab_SSCS = numpy.concatenate(ab_SSCS) + if len(ba_SSCS) != 0: + ba_SSCS = numpy.concatenate(ba_SSCS) + list1 = [hd_DCS, ab_SSCS, ba_SSCS] # list for plotting + return(list1, maximum, minimum) + + def make_argparser(): parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data') parser.add_argument('--inputFile', @@ -678,11 +856,15 @@ help='Only tags of the DCSs are included in the HD analysis') parser.add_argument('--minFS', default=1, type=int, - help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis') + help='Only tags, which have a family size greater or equal than specified, ' + 'are included in the HD analysis') parser.add_argument('--maxFS', default=0, type=int, - help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis') + help='Only tags, which have a family size smaller or equal than specified, ' + 'are included in the HD analysis') parser.add_argument('--nr_above_bars', action="store_true", - help='If no, values above bars in the histograms are removed') + help='If False, values above bars in the histograms are removed') + parser.add_argument('--rel_freq', action="store_false", + help='If True, the relative frequencies are displayed.') parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.') @@ -695,36 +877,33 @@ def Hamming_Distance_Analysis(argv): - +# def Hamming_Distance_Analysis(file1, name1, index_size, title_savedFile_pdf, title_savedFile_csv, +# output_chimeras_tabular, onlyDuplicates, minFS=1, maxFS=0, nr_above_bars=True, +# subset=False, nproc=12, rel_freq=False): parser = make_argparser() args = parser.parse_args(argv[1:]) - file1 = args.inputFile name1 = args.inputName1 - index_size = args.sample_size title_savedFile_pdf = args.output_pdf title_savedFile_csv = args.output_tabular output_chimeras_tabular = args.output_chimeras_tabular - - sep = "\t" onlyDuplicates = args.only_DCS + rel_freq = args.rel_freq minFS = args.minFS maxFS = args.maxFS nr_above_bars = args.nr_above_bars - subset = args.subset_tag nproc = args.nproc + sep = "\t" # input checks if index_size < 0: print("index_size is a negative integer.") exit(2) - if nproc <= 0: print("nproc is smaller or equal zero") exit(3) - if subset < 0: print("subset_tag is smaller or equal zero.") exit(5) @@ -735,22 +914,31 @@ plt.rcParams['ytick.labelsize'] = 14 plt.rcParams['patch.edgecolor'] = "#000000" plt.rc('figure', figsize=(11.69, 8.27)) # A4 format - name1 = name1.split(".tabular")[0] with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: print("dataset: ", name1) integers, data_array = readFileReferenceFree(file1) data_array = numpy.array(data_array) - print("total nr of tags with Ns:", len(data_array)) - n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x] - if len(n) != 0: # delete tags with N in the tag from data - print("nr of tags with N's within tag:", len(n), float(len(n)) / len(data_array)) + print("total nr of tags:", len(data_array)) + + # filter tags out which contain any other character than ATCG + valid_bases = ["A", "T", "G", "C"] + tagsToDelete = [] + for idx, t in enumerate(data_array[:, 1]): + for char in t: + if char not in valid_bases: + tagsToDelete.append(idx) + break + + if len(tagsToDelete) != 0: # delete tags with N in the tag from data + print("nr of tags with any other character than A, T, C, G:", len(tagsToDelete), + float(len(tagsToDelete)) / len(data_array)) index_whole_array = numpy.arange(0, len(data_array), 1) - index_withoutN_inTag = numpy.delete(index_whole_array, n) + index_withoutN_inTag = numpy.delete(index_whole_array, tagsToDelete) data_array = data_array[index_withoutN_inTag, :] integers = integers[index_withoutN_inTag] - print("total nr of tags without Ns:", len(data_array)) + print("total nr of filtered tags:", len(data_array)) int_f = numpy.array(data_array[:, 0]).astype(int) data_array = data_array[numpy.where(int_f >= minFS)] @@ -768,7 +956,7 @@ # find all unique tags and get the indices for ALL tags, but only once u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) - d = u[c > 1] + d = u[c == 2] # get family sizes, tag for duplicates duplTags_double = integers[numpy.in1d(seq, d)] @@ -779,9 +967,9 @@ duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags if minFS > 1: - duplTags_tag = duplTags_tag[(duplTags >= 3) & (duplTagsBA >= 3)] - duplTags_seq = duplTags_seq[(duplTags >= 3) & (duplTagsBA >= 3)] - duplTags = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3 + duplTags_tag = duplTags_tag[(duplTags >= minFS) & (duplTagsBA >= minFS)] + duplTags_seq = duplTags_seq[(duplTags >= minFS) & (duplTagsBA >= minFS)] + duplTags = duplTags[(duplTags >= minFS) & (duplTagsBA >= minFS)] # ab+ba with FS>=3 data_array = numpy.column_stack((duplTags, duplTags_seq)) data_array = numpy.column_stack((data_array, duplTags_tag)) @@ -819,15 +1007,32 @@ else: numpy.random.shuffle(data_array) unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags - result = numpy.random.choice(unique_indices, size=index_size, replace=False) # array of random sequences of size=index.size + result = numpy.random.choice(unique_indices, size=index_size, + replace=False) # array of random sequences of size=index.size # result = numpy.random.choice(len(integers), size=index_size, # replace=False) # array of random sequences of size=index.size # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0] - # with open("index_result1_{}.pkl".format(app_f), "wb") as o: + # with open("index_result.pkl", "wb") as o: # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) + # save counts + # with open(data_folder + "index_sampleTags1000_Barcode3_DCS.pkl", "wb") as f: + # pickle.dump(result, f, pickle.HIGHEST_PROTOCOL) + # with open(data_folder + "dataArray_sampleTags1000_Barcode3_DCS.pkl", "wb") as f1: + # pickle.dump(data_array, f1, pickle.HIGHEST_PROTOCOL) + # + # with open(data_folder + "index_sampleTags100.pkl", "rb") as f: + # result = pickle.load(f) + # + # with open(data_folder + "dataArray_sampleTags100.pkl", "rb") as f1: + # data_array = pickle.load(f1) + + # with open(data_folder + "index_result.txt", "w") as t: + # for text in result: + # t.write("{}\n".format(text)) + # comparison random tags to whole dataset result1 = data_array[result, 1] # random tags result2 = data_array[:, 1] # all tags @@ -873,10 +1078,9 @@ minHD_tags = numpy.concatenate([item[4] for item in diff_list_a]) minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a]) minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b]) - chim_tags = [item[10] for item in diff_list_a] - chim_tags2 = [item[10] for item in diff_list_b] - chimera_tags1 = [ii if isinstance(i, list) else i for i in chim_tags for ii in i] - chimera_tags2 = [ii if isinstance(i, list) else i for i in chim_tags2 for ii in i] + + chimera_tags1 = sum([item[10] for item in diff_list_a], []) + chimera_tags2 = numpy.concatenate([item[10] for item in diff_list_b]) rel_Diff = [] diff_zeros = [] @@ -884,7 +1088,8 @@ diff = [] chimera_tags = [] for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \ - zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2, chimera_tags1, chimera_tags2): + zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2, + chimera_tags1, chimera_tags2): rel_Diff.append(max(rel1, rel2)) diff.append(max(d1, d2)) @@ -903,7 +1108,7 @@ chimera_tags.append(ctag2) chimera_tags_new = chimera_tags - #data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new)) + data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new)) # chimeras_dic = defaultdict(list) # # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new): @@ -911,71 +1116,63 @@ # t2 = numpy.concatenate(t2) # chimeras_dic[t1].append(t2) + checked_tags = [] + stat_maxTags = [] + with open(output_chimeras_tabular, "w") as output_file1: - output_file1.write("chimera tag\tsimilar tag with HD=0\n") - for i in range(len(minHD_tags_zeros)): - tag1 = minHD_tags_zeros[i] + output_file1.write("chimera tag\tfamily size, read direction\tsimilar tag with HD=0\n") + for i in range(len(data_chimeraAnalysis)): + tag1 = data_chimeraAnalysis[i, 0] + + info_tag1 = data_array[data_array[:, 1] == tag1, :] + fs_tag1 = ["{} {}".format(t[0], t[2]) for t in info_tag1] + + if tag1 in checked_tags: # skip tag if already written to file + continue + sample_half_a = tag1[0:(len(tag1)) / 2] sample_half_b = tag1[len(tag1) / 2:len(tag1)] - max_tags = chimera_tags_new[i] - if isinstance(max_tags, list) and len(max_tags) > 1: + max_tags = data_chimeraAnalysis[i, 1] + if len(max_tags) > 1 and type(max_tags) is not numpy.ndarray: max_tags = numpy.concatenate(max_tags) - #if isinstance(max_tags, list): #and type(max_tags) is not numpy.ndarray: - # print(max_tags) - # max_tags = numpy.concatenate(max_tags) max_tags = numpy.unique(max_tags) + stat_maxTags.append(len(max_tags)) - chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags]) # mate1 part1 - chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags]) # mate1 part 2 + info_maxTags = [data_array[data_array[:, 1] == t, :] for t in max_tags] + + chimera_half_a = numpy.array([t[0:(len(t)) / 2] for t in max_tags]) # mate1 part1 + chimera_half_b = numpy.array([t[len(t) / 2:len(t)] for t in max_tags]) # mate1 part 2 new_format = [] for j in range(len(max_tags)): + fs_maxTags = ["{} {}".format(t[0], t[2]) for t in info_maxTags[j]] + if sample_half_a == chimera_half_a[j]: - max_tag = "*{}* {}".format(chimera_half_a[j], chimera_half_b[j]) + max_tag = "*{}* {} {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags)) new_format.append(max_tag) elif sample_half_b == chimera_half_b[j]: - max_tag = "{} *{}*".format(chimera_half_a[j], chimera_half_b[j]) + max_tag = "{} *{}* {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags)) new_format.append(max_tag) + checked_tags.append(max_tags[j]) - sample_tag = "{} {}".format(sample_half_a, sample_half_b) + sample_tag = "{} {}\t{}".format(sample_half_a, sample_half_b, ", ".join(fs_tag1)) output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format))) - output_file1.write( - "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n " - "The tags were separated by an empty space into their halves and the * marks the identical half.") + + checked_tags.append(tag1) - # unique_chimeras = numpy.array(minHD_tags_zeros) - # - # sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras]) # mate1 part1 - # sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras]) # mate1 part 2 - # - # output_file1.write("sample tag\tsimilar tag\n") - # for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b): - # max_tags = numpy.concatenate(chimeras_dic.get(tag1)) - # max_tags = numpy.unique(max_tags) - # - # chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags]) # mate1 part1 - # chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags]) # mate1 part 2 - # - # new_format = [] - # for i in range(len(max_tags)): - # if a == chimera_half_a[i]: - # max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i]) - # new_format.append(max_tag) - # - # elif b == chimera_half_b[i]: - # max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i]) - # new_format.append(max_tag) - # - # sample_tag = "{} {}".format(a, b) - # output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format))) - # output_file1.write( - # "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n " - # "The tags were separated by an empty space into their halves and the * marks the identical half.") - - nr_chimeric_tags = len(minHD_tags_zeros) - print("nr of unique chimeras", nr_chimeric_tags) + output_file1.write( + "This file contains all tags that were identified as chimeras as the first column and the " + "corresponding tags which returned a Hamming distance of zero in either the first or the second " + "half of the sample tag as the second column.\n" + "The tags were separated by an empty space into their halves and the * marks the identical half.") + output_file1.write("\n\nStatistics of nr. of tags that returned max. HD (2nd column)\n") + output_file1.write("minimum\t{}\ttag(s)\n".format(numpy.amin(numpy.array(stat_maxTags)))) + output_file1.write("mean\t{}\ttag(s)\n".format(numpy.mean(numpy.array(stat_maxTags)))) + output_file1.write("median\t{}\ttag(s)\n".format(numpy.median(numpy.array(stat_maxTags)))) + output_file1.write("maximum\t{}\ttag(s)\n".format(numpy.amax(numpy.array(stat_maxTags)))) + output_file1.write("sum\t{}\ttag(s)\n".format(numpy.sum(numpy.array(stat_maxTags)))) lenTags = len(data_array) len_sample = len(result1) @@ -992,6 +1189,9 @@ rel_Diff = numpy.tile(rel_Diff, 2) diff_zeros = numpy.tile(diff_zeros, 2) + nr_chimeric_tags = len(data_chimeraAnalysis) + print("nr of chimeras", nr_chimeric_tags) + # prepare data for different kinds of plots # distribution of FSs separated after HD familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) @@ -1011,8 +1211,8 @@ lst_minHD_tags.append(seqDic.get(i)) if onlyDuplicates: - lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], [item_b[1] for item_b in lst_minHD_tags])).astype(int) - + lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], + [item_b[1] for item_b in lst_minHD_tags])).astype(int) # histogram with absolute and relative difference between HDs of both parts of the tag listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, @@ -1023,43 +1223,55 @@ for i in minHD_tags_zeros: lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads if onlyDuplicates: - lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int) + lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], + [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int) # histogram with HD of non-identical half - listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros) + listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( + lst_minHD_tags_zeros, diff_zeros) + + if onlyDuplicates is False: + listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros = hammingDistanceWithDCS(minHD_tags_zeros, + diff_zeros, data_array) # plot Hamming Distance with Family size distribution - plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, - subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags, + plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, rel_freq=rel_freq, + subtitle="Hamming distance separated by family size", lenTags=lenTags, xlabel="HD", nr_above_bars=nr_above_bars, len_sample=len_sample) # Plot FSD with separation after - plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, + plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, rel_freq=rel_freq, originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", - pdf=pdf, relative=False, title_file1=name1, diff=False) + pdf=pdf, relative=False, diff=False) # Plot HD within tags - plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, - title_file1=name1, len_sample=len_sample) + plotHDwithinSeq(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, + rel_freq=rel_freq, len_sample=len_sample) # Plot difference between HD's separated after FSD plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, - subtitle="Delta Hamming distance within tags", - title_file1=name1, lenTags=lenTags, + subtitle="Delta Hamming distance within tags", lenTags=lenTags, rel_freq=rel_freq, xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample) plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, - subtitle="Chimera Analysis: relative delta Hamming distances", - title_file1=name1, lenTags=lenTags, - xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) + subtitle="Chimera Analysis: relative delta Hamming distance", lenTags=lenTags, rel_freq=rel_freq, + xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, + nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) # plots for chimeric reads if len(minHD_tags_zeros) != 0: # HD - plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, subtitle="Hamming distance of chimeras", - title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False, + plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, + subtitle="Hamming distance of chimeric families (CF)", rel_freq=rel_freq, + lenTags=lenTags, xlabel="HD", relative=False, nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) + if onlyDuplicates is False: + plotHDwithDCS(listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros, pdf=pdf, + subtitle="Hamming distance of chimeric families (CF)", rel_freq=rel_freq, + lenTags=lenTags, xlabel="HD", relative=False, + nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) + # print all data to a CSV file # HD summary, sumCol = createTableHD(list1, "HD=") @@ -1087,9 +1299,12 @@ summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=") overallSum15 = sum(sumCol15) + if onlyDuplicates is False: + summary16, sumCol16 = createTableHDwithDCS(listDCS_zeros) + overallSum16 = sum(sumCol16) + output_file.write("{}\n".format(name1)) - output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( - numpy.concatenate(list1)), lenTags, lenTags)) + output_file.write("nr of tags{}{:,}\nsample size{}{:,}\n\n".format(sep, lenTags, sep, len_sample)) # HD createFileHD(summary, sumCol, overallSum, output_file, @@ -1109,27 +1324,36 @@ # HD within tags output_file.write( - "The Hamming distances were calculated by comparing the first halve against all halves and selected the minimum value (HD a).\n" - "For the second half of the tag, we compared them against all tags which resulted in the minimum HD of the previous step and selected the maximum value (HD b').\n" - "Finally, it was possible to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).\n" - "These calculations were repeated, but starting with the second half in the first step to find all possible chimeras in the data (HD b and HD For simplicity we used the maximum value between the delta values in the end.\n" - "When only tags that can form DCS were allowed in the analysis, family sizes for the forward and reverse (ab and ba) will be included in the plots.\n") + "The Hamming distances were calculated by comparing the first halve against all halves and selected the " + "minimum value (HD a).\nFor the second half of the tag, we compared them against all tags which resulted " + "in the minimum HD of the previous step and selected the maximum value (HD b').\nFinally, it was possible " + "to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).\n" + "These calculations were repeated, but starting with the second half in the first step to find all " + "possible chimeras in the data (HD b and HD For simplicity we used the maximum value between the delta " + "values in the end.\nWhen only tags that can form DCS were allowed in the analysis, family sizes for the " + "forward and reverse (ab and ba) will be included in the plots.\n") - output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) + output_file.write("\nlength of one half of the tag{}{}\n\n".format(sep, len(data_array[0, 1]) / 2)) createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, "Hamming distance of each half in the tag", sep) createFileHD(summary11, sumCol11, overallSum11, output_file, - "Absolute delta Hamming distances within the tag", sep) + "Absolute delta Hamming distance within the tag", sep) createFileHD(summary13, sumCol13, overallSum13, output_file, - "Chimera analysis: relative delta Hamming distances", sep) + "Chimera analysis: relative delta Hamming distance", sep) if len(minHD_tags_zeros) != 0: output_file.write( - "Chimeras:\nAll tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, had a relative delta of 1 were kept. These tags are considered as chimeric.\nSo the Hamming distances of the chimeric tags are shown.\n") + "All tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, " + "had a relative delta of 1 were kept. These tags are considered as chimeric.\nSo the Hamming distances " + "of the chimeric tags are shown.\n") createFileHD(summary15, sumCol15, overallSum15, output_file, - "Hamming distances of chimeras", sep) + "Hamming distance of chimeric families separated after FS", sep) + + if onlyDuplicates is False: + createFileHDwithDCS(summary16, sumCol16, overallSum16, output_file, + "Hamming distance of chimeric families separated after DCS and single SSCS", sep) output_file.write("\n")