# HG changeset patch
# User iuc
# Date 1571924169 14400
# Node ID 7fc3c8704c05afa0eee963e8ce2681eadc64178f
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fsd commit 4cdd9c109b2c38474ef084ec92f3b56e0f73a760"
diff -r 000000000000 -r 7fc3c8704c05 fsd.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fsd.py Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,797 @@
+#!/usr/bin/env python
+
+# Family size distribution of SSCSs
+#
+# Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
+# Contact: monika.heinzl@edumail.at
+#
+# Takes at least one TABULAR file with tags before the alignment to the SSCS as input; up to 4 files can be provided.
+# The program produces a plot which shows the distribution of family sizes of all SSCSs from the input files and
+# a tabular file with the data of the plot, as well as a TXT file with all tags of the DCS and their family sizes.
+# If only one file is provided, a family size distribution is produced that separates SSCSs without a partner from DCSs.
+# If more than one file (up to 4) is given, a family size distribution with multiple datasets in one plot is produced.
+
+# USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --log_axis --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf
+
+import argparse
+import sys
+
+import matplotlib.pyplot as plt
+import numpy
+from matplotlib.backends.backend_pdf import PdfPages
+
+plt.switch_backend('agg')
+
+
+def readFileReferenceFree(file):
+    with open(file, 'r') as dest_f:
+        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string')
+        return data_array
+
+
+def make_argparser():
+    parser = argparse.ArgumentParser(description='Family Size Distribution of duplex sequencing data')
+    parser.add_argument('--inputFile1', help='Tabular file with three columns: family size, tag and ab or ba.')
+    parser.add_argument('--inputName1')
+    parser.add_argument('--inputFile2', default=None, help='Tabular file with three columns: family size, tag and ab or ba.')
+    parser.add_argument('--inputName2')
+    parser.add_argument('--inputFile3', default=None, help='Tabular file with three columns: family size, tag and ab or ba.')
+    parser.add_argument('--inputName3')
+    parser.add_argument('--inputFile4', default=None, help='Tabular file with three columns: family size, tag and ab or ba.')
+    parser.add_argument('--inputName4')
+    parser.add_argument('--log_axis', action="store_false", help='If this flag is set, the y axis is plotted on a linear instead of a log scale.')
+    parser.add_argument('--rel_freq', action="store_false", help='If this flag is set, absolute instead of relative frequencies are displayed.')
+    parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.')
+    parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.')
+    return parser
+
+
+def compare_read_families(argv):
+    parser = make_argparser()
+    args = parser.parse_args(argv[1:])
+
+    firstFile = args.inputFile1
+    name1 = args.inputName1
+
+    secondFile = args.inputFile2
+    name2 = args.inputName2
+    thirdFile = args.inputFile3
+    name3 = args.inputName3
+    fourthFile = args.inputFile4
+    name4 = args.inputName4
+    log_axis = args.log_axis
+    rel_freq = args.rel_freq
+
+    title_file = args.output_tabular
+    title_file2 = args.output_pdf
+
+    sep = "\t"
+
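+    # Note on binning: for all plots below, family sizes are clipped into the
+    # range 1-21, where the last bin collects every family with more than 20
+    # members and is labelled ">20". A minimal sketch of this step on assumed
+    # toy data, for illustration only:
+    #
+    #     >>> import numpy
+    #     >>> numpy.clip(numpy.array([1, 3, 25, 100]), 1, 21)
+    #     array([ 1,  3, 21, 21])
+    #
+    # The unclipped sizes are kept in list_to_plot_original so that PE-read
+    # totals can still be summed from the exact values.
+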
plt.rc('figure', figsize=(11.69, 8.27)) # A4 format + plt.rcParams['patch.edgecolor'] = "black" + plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color + plt.rcParams['xtick.labelsize'] = 14 + plt.rcParams['ytick.labelsize'] = 14 + + list_to_plot = [] + label = [] + data_array_list = [] + list_to_plot_original = [] + colors = [] + bins = numpy.arange(1, 22) + with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: + fig = plt.figure() + fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) + fig2 = plt.figure() + fig2.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) + + if firstFile is not None: + file1 = readFileReferenceFree(firstFile) + integers = numpy.array(file1[:, 0]).astype(int) # keep original family sizes + list_to_plot_original.append(integers) + colors.append("#0000FF") + + # for plot: replace all big family sizes by 22 + data1 = numpy.clip(integers, bins[0], bins[-1]) + name1 = name1.split(".tabular")[0] + if len(name1) > 40: + name1 = name1[:40] + list_to_plot.append(data1) + label.append(name1) + data_array_list.append(file1) + + legend = "\n\n\n{}".format(name1) + fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) + + legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], + float(numpy.bincount(data1)[1]) / len(data1)) + fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) + + legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], + float(numpy.bincount(data1)[1]) / sum(integers)) + fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) + + legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(len(integers[integers > 20]), + float(len(integers[integers > 20])) / len(integers)) + fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) + + legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), + float(sum(integers[integers > 20])) / sum(integers)) + fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) + + legend6 = "total nr. 
of\ntags\n{:,}".format(len(data1)) + fig.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) + + legend6b = "PE reads\n{:,}".format(sum(integers)) + fig.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) + + if secondFile is not None: + file2 = readFileReferenceFree(secondFile) + integers2 = numpy.array(file2[:, 0]).astype(int) # keep original family sizes + list_to_plot_original.append(integers2) + colors.append("#298A08") + + data2 = numpy.clip(integers2, bins[0], bins[-1]) + list_to_plot.append(data2) + name2 = name2.split(".tabular")[0] + if len(name2) > 40: + name2 = name2[:40] + label.append(name2) + data_array_list.append(file2) + + fig.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) + + legend1 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / len(data2)) + fig.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) + + legend3 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / sum(integers2)) + fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) + + legend4 = "{:,} ({:.3f})".format(len(integers2[integers2 > 20]), + float(len(integers2[integers2 > 20])) / len(integers2)) + fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) + + legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), + float(sum(integers2[integers2 > 20])) / sum(integers2)) + fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) + + legend6 = "{:,}".format(len(data2)) + fig.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) + + legend6b = "{:,}".format(sum(integers2)) + fig.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) + + if thirdFile is not None: + file3 = readFileReferenceFree(thirdFile) + integers3 = numpy.array(file3[:, 0]).astype(int) # keep original family sizes + list_to_plot_original.append(integers3) + colors.append("#DF0101") + + data3 = numpy.clip(integers3, bins[0], bins[-1]) + list_to_plot.append(data3) + name3 = name3.split(".tabular")[0] + if len(name3) > 40: + name3 = name3[:40] + label.append(name3) + data_array_list.append(file3) + + fig.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) + + legend1 = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / len(data3)) + fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) + + legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], + float(numpy.bincount(data3)[1]) / sum(integers3)) + fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) + + legend4 
= "{:,} ({:.3f})".format(len(integers3[integers3 > 20]), + float(len(integers3[integers3 > 20])) / len(integers3)) + fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) + + legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]), + float(sum(integers3[integers3 > 20])) / sum(integers3)) + fig.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) + + legend6 = "{:,}".format(len(data3)) + fig.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) + + legend6b = "{:,}".format(sum(integers3)) + fig.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) + + if fourthFile is not None: + file4 = readFileReferenceFree(fourthFile) + integers4 = numpy.array(file4[:, 0]).astype(int) # keep original family sizes + list_to_plot_original.append(integers4) + colors.append("#04cec7") + + data4 = numpy.clip(integers4, bins[0], bins[-1]) + list_to_plot.append(data4) + name4 = name4.split(".tabular")[0] + if len(name4) > 40: + name4 = name4[:40] + label.append(name4) + data_array_list.append(file4) + + fig.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) + + legend1 = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / len(data4)) + fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) + + legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], + float(numpy.bincount(data4)[1]) / sum(integers4)) + fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) + + legend4 = "{:,} ({:.3f})".format(len(integers4[integers4 > 20]), + float(len(integers4[integers4 > 20])) / len(integers4)) + fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) + + legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]), + float(sum(integers4[integers4 > 20])) / sum(integers4)) + fig.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) + + legend6 = "{:,}".format(len(data4)) + fig.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) + + legend6b = "{:,}".format(sum(integers4)) + fig.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) + + list_to_plot2 = list_to_plot + + if rel_freq: + ylab = "Relative Frequency" + else: + ylab = "Absolute Frequency" + + # PLOT FSD based on tags + fig.suptitle('Family Size Distribution (FSD) based on families', fontsize=14) + ax = fig.add_subplot(1, 1, 1) + ticks = numpy.arange(1, 22, 1) + ticks1 = map(str, ticks) + ticks1[len(ticks1) - 1] = ">20" + ax.set_xticks([], []) + if rel_freq: + w = [numpy.zeros_like(data) + 1. 
/ len(data) for data in list_to_plot2] + counts = ax.hist(list_to_plot2, weights=w, bins=numpy.arange(1, 23), stacked=False, edgecolor="black", color=colors, linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8) + ax.set_ylim(0, 1.07) + else: + counts = ax.hist(list_to_plot2, bins=numpy.arange(1, 23), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8, color=colors) + ax.set_xticks(numpy.array(ticks)) + ax.set_xticklabels(ticks1) + ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) + + ax.set_ylabel(ylab, fontsize=14) + ax.set_xlabel("Family size", fontsize=14) + if log_axis: + ax.set_yscale('log') + ax.grid(b=True, which="major", color="#424242", linestyle=":") + ax.margins(0.01, None) + pdf.savefig(fig) + + # PLOT FSD based on PE reads + fig2.suptitle('Family Size Distribution (FSD) based on PE reads', fontsize=14) + ax2 = fig2.add_subplot(1, 1, 1) + ticks = numpy.arange(1, 22) + ticks1 = map(str, ticks) + ticks1[len(ticks1) - 1] = ">20" + reads = [] + reads_rel = [] + + barWidth = 0 - (len(list_to_plot) + 1) / 2 * 1. / (len(list_to_plot) + 1) + ax2.set_xticks([], []) + + for i in range(len(list_to_plot2)): + x = list(numpy.arange(1, 22).astype(float)) + unique, c = numpy.unique(list_to_plot2[i], return_counts=True) + y = unique * c + if sum(list_to_plot_original[i] > 20) > 0: + y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20]) + y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))] + reads.append(y) + reads_rel.append(list(numpy.float_(y)) / sum(y)) + + if len(list_to_plot2) == 1: + x = [xi * 0.5 for xi in x] + w = 0.4 + else: + x = [xi + barWidth for xi in x] + w = 1. / (len(list_to_plot) + 1) + if rel_freq: + ax2.bar(x, list(numpy.float_(y)) / numpy.sum(y), align="edge", width=w, edgecolor="black", label=label[i], linewidth=1, alpha=0.7, color=colors[i]) + ax2.set_ylim(0, 1.07) + else: + ax2.bar(x, y, align="edge", width=w, edgecolor="black", label=label[i], linewidth=1, alpha=0.7, color=colors[i]) + if i == len(list_to_plot2) - 1: + barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1) + else: + barWidth += 1. 
/ (len(list_to_plot) + 1) + + ax2.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) + + if len(list_to_plot2) == 1: + ax2.set_xticks(numpy.array([xi + 0.2 for xi in x])) + else: + ax2.set_xticks(numpy.array(ticks)) + ax2.set_xticklabels(ticks1) + ax2.set_xlabel("Family size", fontsize=14) + ax2.set_ylabel(ylab, fontsize=14) + if log_axis: + ax2.set_yscale('log') + ax2.grid(b=True, which="major", color="#424242", linestyle=":") + ax2.margins(0.01, None) + + pdf.savefig(fig2) + plt.close() + + # write data to CSV file tags + counts = [numpy.bincount(di, minlength=22)[1:] for di in list_to_plot2] # original counts of family sizes + output_file.write("Values from family size distribution with all datasets based on families\n") + output_file.write("\nFamily size") + for i in label: + output_file.write("{}{}".format(sep, i)) + output_file.write("\n") + j = 0 + for fs in bins: + if fs == 21: + fs = ">20" + else: + fs = "={}".format(fs) + output_file.write("FS{}{}".format(fs, sep)) + for n in range(len(label)): + output_file.write("{}{}".format(int(counts[n][j]), sep)) + output_file.write("\n") + j += 1 + output_file.write("sum{}".format(sep)) + for i in counts: + output_file.write("{}{}".format(int(sum(i)), sep)) + + # write data to CSV file PE reads + output_file.write("\n\nValues from family size distribution with all datasets based on PE reads\n") + output_file.write("\nFamily size") + for i in label: + output_file.write("{}{}".format(sep, i)) + output_file.write("\n") + j = 0 + + for fs in bins: + if fs == 21: + fs = ">20" + else: + fs = "={}".format(fs) + output_file.write("FS{}{}".format(fs, sep)) + if len(label) == 1: + output_file.write("{}{}".format(int(reads[0][j]), sep)) + else: + for n in range(len(label)): + output_file.write("{}{}".format(int(reads[n][j]), sep)) + output_file.write("\n") + j += 1 + output_file.write("sum{}".format(sep)) + if len(label) == 1: + output_file.write("{}{}".format(int(sum(numpy.concatenate(reads))), sep)) + else: + for i in reads: + output_file.write("{}{}".format(int(sum(i)), sep)) + output_file.write("\n") + + # Family size distribution after DCS and SSCS + for dataset, data_o, name_file in zip(list_to_plot, data_array_list, label): + tags = numpy.array(data_o[:, 2]) + seq = numpy.array(data_o[:, 1]) + data = numpy.array(dataset) + data_o = numpy.array(data_o[:, 0]).astype(int) + # find all unique tags and get the indices for ALL tags, but only once + u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) + d = u[c > 1] + + # get family sizes, tag for duplicates + duplTags_double = data[numpy.in1d(seq, d)] + duplTags_double_o = data_o[numpy.in1d(seq, d)] + + duplTags = duplTags_double[0::2] # ab of DCS + duplTags_o = duplTags_double_o[0::2] # ab of DCS + + duplTagsBA = duplTags_double[1::2] # ba of DCS + duplTagsBA_o = duplTags_double_o[1::2] # ba of DCS + + # get family sizes for SSCS with no partner + ab = numpy.where(tags == "ab")[0] + abSeq = seq[ab] + ab_o = data_o[ab] + ab = data[ab] + + ba = numpy.where(tags == "ba")[0] + baSeq = seq[ba] + ba_o = data_o[ba] + ba = data[ba] + + dataAB = ab[numpy.in1d(abSeq, d, invert=True)] + dataAB_o = ab_o[numpy.in1d(abSeq, d, invert=True)] + + dataBA = ba[numpy.in1d(baSeq, d, invert=True)] + dataBA_o = ba_o[numpy.in1d(baSeq, d, invert=True)] + + list1 = [duplTags_double, dataAB, dataBA] # list for plotting + list1_o = [duplTags_double_o, dataAB_o, dataBA_o] # list for plotting + + # information for family size >= 3 + dataAB_FS3 = dataAB[dataAB >= 
3] + dataAB_FS3_o = dataAB_o[dataAB_o >= 3] + dataBA_FS3 = dataBA[dataBA >= 3] + dataBA_FS3_o = dataBA_o[dataBA_o >= 3] + + duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3 + duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)] # ba+ab with FS>=3 + duplTags_double_FS3 = len(duplTags_FS3) + len(duplTags_FS3_BA) # both ab and ba strands with FS>=3 + + # original FS + duplTags_FS3_o = duplTags_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ab+ba with FS>=3 + duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ba+ab with FS>=3 + duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3 + + fig = plt.figure() + plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0) + + if rel_freq: + w = [numpy.zeros_like(dj) + 1. / len(numpy.concatenate(list1)) for dj in list1] + plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"], weights=w, edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], rwidth=0.8) + plt.ylim(0, 1.07) + else: + plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"], edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], rwidth=0.8) + + # tick labels of x axis + ticks = numpy.arange(1, 22, 1) + ticks1 = map(str, ticks) + ticks1[len(ticks1) - 1] = ">20" + plt.xticks(numpy.array(ticks), ticks1) + singl = len(data_o[data_o == 1]) + last = len(data_o[data_o > 20]) # large families + if log_axis: + plt.yscale('log') + plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) + plt.title("{}: FSD based on families".format(name_file), fontsize=14) + plt.xlabel("Family size", fontsize=14) + plt.ylabel(ylab, fontsize=14) + plt.margins(0.01, None) + plt.grid(b=True, which="major", color="#424242", linestyle=":") + + # extra information beneath the plot + legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags=" + plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure) + + legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA), + len(duplTags), len(duplTags_double), (len(dataAB) + len(dataBA) + len(duplTags)), + (len(ab) + len(ba))) + plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure) + + legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o), + sum(duplTags_o), sum(duplTags_double_o), + (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), + (sum(ab_o) + sum(ba_o))) + plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure) + + legend = "rel. freq. 
of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format( + float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), + float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)), + float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), + (len(dataAB) + len(dataBA) + len(duplTags))) + plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure) + + legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)), + float(len(dataBA)) / (len(ab) + len(ba)), + float(len(duplTags)) / (len(ab) + len(ba)), + float(len(duplTags_double)) / (len(ab) + len(ba)), + (len(ab) + len(ba))) + plt.text(0.64, 0.09, legend, size=10, transform=plt.gcf().transFigure) + + legend1 = "\nsingletons:\nfamily size > 20:" + plt.text(0.1, 0.03, legend1, size=10, transform=plt.gcf().transFigure) + + legend4 = "{:,}\n{:,}".format(singl, last) + plt.text(0.23, 0.03, legend4, size=10, transform=plt.gcf().transFigure) + legend3 = "{:.3f}\n{:.3f}".format(float(singl) / len(data), float(last) / len(data)) + plt.text(0.64, 0.03, legend3, size=10, transform=plt.gcf().transFigure) + + legend3 = "\n\n{:,}".format(sum(data_o[data_o > 20])) + plt.text(0.38, 0.03, legend3, size=10, transform=plt.gcf().transFigure) + + legend3 = "{:.3f}\n{:.3f}".format(float(singl) / sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o)) + plt.text(0.84, 0.03, legend3, size=10, transform=plt.gcf().transFigure) + + legend = "PE reads\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format( + float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), + float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), + float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), + (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o))) + plt.text(0.74, 0.09, legend, size=10, transform=plt.gcf().transFigure) + + legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format( + float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o)), + float(sum(dataBA_o)) / (sum(ab_o) + sum(ba_o)), + float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)), + float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o)), (sum(ab_o) + sum(ba_o))) + plt.text(0.84, 0.09, legend, size=10, transform=plt.gcf().transFigure) + + pdf.savefig(fig) + plt.close() + + # PLOT FSD based on PE reads + fig3 = plt.figure() + plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0) + + fig3.suptitle("{}: FSD based on PE reads".format(name_file), fontsize=14) + ax2 = fig3.add_subplot(1, 1, 1) + ticks = numpy.arange(1, 22) + ticks1 = map(str, ticks) + ticks1[len(ticks1) - 1] = ">20" + reads = [] + reads_rel = [] + + # barWidth = 0 - (len(list_to_plot) + 1) / 2 * 1. 
/ (len(list_to_plot) + 1) + ax2.set_xticks([], []) + + list_y = [] + label = ["duplex", "ab", "ba"] + col = ["#FF0000", "#5FB404", "#FFBF00"] + for i in range(len(list1)): + x = list(numpy.arange(1, 22).astype(float)) + unique, c = numpy.unique(list1[i], return_counts=True) + y = unique * c + if sum(list1_o[i] > 20) > 0: + y[len(y) - 1] = sum(list1_o[i][list1_o[i] > 20]) + y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))] + reads.append(y) + reads_rel.append(list(numpy.float_(y)) / sum(numpy.concatenate(list1_o))) + + if rel_freq: + y = list(numpy.float_(y)) / sum(numpy.concatenate(list1_o)) + ax2.set_ylim(0, 1.07) + else: + y = y + + list_y.append(y) + if i == 0: + ax2.bar(x, y, align="center", width=0.8, edgecolor="black", label=label[0], linewidth=1, alpha=1, color=col[0]) + elif i == 1: + ax2.bar(x, y, bottom=list_y[i - 1], align="center", width=0.8, edgecolor="black", label=label[1], linewidth=1, alpha=1, color=col[1]) + elif i == 2: + bars = numpy.add(list_y[0], list_y[1]).tolist() + ax2.bar(x, y, bottom=bars, align="center", width=0.8, edgecolor="black", label=label[2], linewidth=1, alpha=1, color=col[2]) + + ax2.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) + + singl = len(data_o[data_o == 1]) + last = len(data_o[data_o > 20]) # large families + + ax2.set_xticks(numpy.array(ticks)) + ax2.set_xticklabels(ticks1) + ax2.set_xlabel("Family size", fontsize=14) + ax2.set_ylabel(ylab, fontsize=14) + if log_axis: + ax2.set_yscale('log') + ax2.grid(b=True, which="major", color="#424242", linestyle=":") + ax2.margins(0.01, None) + + # extra information beneath the plot + legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags=" + plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure) + + legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA), + len(duplTags), len(duplTags_double), (len(dataAB) + len(dataBA) + len(duplTags)), + (len(ab) + len(ba))) + plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure) + + legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o), + sum(duplTags_o), sum(duplTags_double_o), + (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), + (sum(ab_o) + sum(ba_o))) + plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure) + + legend = "rel. freq. 
of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format( + float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), + float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)), + float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), + (len(dataAB) + len(dataBA) + len(duplTags))) + plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure) + + legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)), + float(len(dataBA)) / (len(ab) + len(ba)), + float(len(duplTags)) / (len(ab) + len(ba)), + float(len(duplTags_double)) / (len(ab) + len(ba)), + (len(ab) + len(ba))) + plt.text(0.64, 0.09, legend, size=10, transform=plt.gcf().transFigure) + + legend1 = "\nsingletons:\nfamily size > 20:" + plt.text(0.1, 0.03, legend1, size=10, transform=plt.gcf().transFigure) + + legend4 = "{:,}\n{:,}".format(singl, last) + plt.text(0.23, 0.03, legend4, size=10, transform=plt.gcf().transFigure) + legend3 = "{:.3f}\n{:.3f}".format(float(singl) / len(data), float(last) / len(data)) + plt.text(0.64, 0.03, legend3, size=10, transform=plt.gcf().transFigure) + + legend3 = "\n\n{:,}".format(sum(data_o[data_o > 20])) + plt.text(0.38, 0.03, legend3, size=10, transform=plt.gcf().transFigure) + + legend3 = "{:.3f}\n{:.3f}".format(float(singl) / sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o)) + plt.text(0.84, 0.03, legend3, size=10, transform=plt.gcf().transFigure) + + legend = "PE reads\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format( + float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), + float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), + float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), + (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o))) + plt.text(0.74, 0.09, legend, size=10, transform=plt.gcf().transFigure) + + legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format( + float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o)), + float(sum(dataBA_o)) / (sum(ab_o) + sum(ba_o)), + float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)), + float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o)), (sum(ab_o) + sum(ba_o))) + plt.text(0.84, 0.09, legend, size=10, transform=plt.gcf().transFigure) + + pdf.savefig(fig3) + plt.close() + + # write same information to a csv file + count = numpy.bincount(data_o) # original counts of family sizes + + output_file.write("\nDataset:{}{}\n".format(sep, name_file)) + output_file.write("max. family size:{}{}\n".format(sep, max(data_o))) + output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) + output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) + + output_file.write("median family size:{}{}\n".format(sep, numpy.median(numpy.array(data_o)))) + output_file.write("mean family size:{}{}\n\n".format(sep, numpy.mean(numpy.array(data_o)))) + + output_file.write( + "{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep, + sep, sep)) + output_file.write( + "{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. 
of PE reads\n".format( + sep, sep, sep, sep, sep, sep, sep, sep, sep)) + output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format( + name_file, sep, singl, sep, float(singl) / len(data), sep, float(singl) / sum(data_o), sep, + last, sep, float(last) / len(data), sep, sum(data_o[data_o > 20]), sep, float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data), + sep, sum(data_o))) + + # information for FS >= 1 + output_file.write( + "The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)\n" + "Whereas the total frequencies were calculated from the whole dataset (=including the DCS).\n\n") + output_file.write( + "FS >= 1{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep, + sep, sep, + sep)) + output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep)) + output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format( + sep, len(dataAB), sep, sum(dataAB_o), sep, + float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), + sep, float(len(dataAB)) / (len(ab) + len(ba)), sep, float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), + sep, float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o)))) + output_file.write("SSCS ba{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format( + sep, len(dataBA), sep, sum(dataBA_o), sep, + float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)), + sep, float(len(dataBA)) / (len(ab) + len(ba)), sep, + float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), + sep, float(sum(dataBA_o)) / (sum(ab_o) + sum(ba_o)))) + output_file.write( + "DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format( + sep, len(duplTags), len(duplTags_double), sep, sum(duplTags_o), sum(duplTags_double_o), sep, + float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), sep, + float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), sep, + float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, + float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)), + float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o)))) + output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format( + sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep, + (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, + (len(dataAB) + len(dataBA) + len(duplTags)), sep, (len(ab) + len(ba)), sep, + (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, (sum(ab_o) + sum(ba_o)))) + # information for FS >= 3 + output_file.write( + "\nFS >= 3{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. 
freq of PE reads:\n".format(sep, + sep, + sep, + sep, + sep)) + output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep)) + output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format( + sep, len(dataAB_FS3), sep, sum(dataAB_FS3_o), sep, + float(len(dataAB_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, + float(len(dataAB_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3), + sep, float(sum(dataAB_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), + sep, float(sum(dataAB_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o))) + output_file.write("SSCS ba{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format( + sep, len(dataBA_FS3), sep, sum(dataBA_FS3_o), sep, + float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), + sep, float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3), + sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), + sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o))) + output_file.write( + "DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format( + sep, len(duplTags_FS3), duplTags_double_FS3, sep, sum(duplTags_FS3_o), duplTags_double_FS3_o, sep, + float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, + float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3), + float(duplTags_double_FS3) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3), + sep, float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), + sep, + float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o), + float(duplTags_double_FS3_o) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o))) + output_file.write("total nr. 
of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format(
+                sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+                (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
+                sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+                (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+                sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep,
+                (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
+
+            counts = [numpy.bincount(dk, minlength=22)[1:] for dk in list1]  # original counts of family sizes
+            output_file.write("\nValues from family size distribution based on families\n")
+            output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep))
+
+            j = 0
+            for fs in bins:
+                if fs == 21:
+                    fs = ">20"
+                else:
+                    fs = "={}".format(fs)
+                output_file.write("FS{}{}".format(fs, sep))
+                for n in range(3):
+                    output_file.write("{}{}".format(int(counts[n][j]), sep))
+                output_file.write("{}\n".format(counts[0][j] + counts[1][j] + counts[2][j]))
+                j += 1
+            output_file.write("sum{}".format(sep))
+            for i in counts:
+                output_file.write("{}{}".format(int(sum(i)), sep))
+            output_file.write("{}\n".format(sum(counts[0] + counts[1] + counts[2])))
+
+            output_file.write("\nValues from family size distribution based on PE reads\n")
+            output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep))
+            j = 0
+            for fs in bins:
+                if fs == 21:
+                    fs = ">20"
+                else:
+                    fs = "={}".format(fs)
+                output_file.write("FS{}{}".format(fs, sep))
+                for n in range(3):
+                    output_file.write("{}{}".format(int(reads[n][j]), sep))
+                output_file.write("{}\n".format(reads[0][j] + reads[1][j] + reads[2][j]))
+                j += 1
+            output_file.write("sum{}".format(sep))
+            for i in reads:
+                output_file.write("{}{}".format(int(sum(i)), sep))
+            output_file.write("{}\n".format(sum(reads[0] + reads[1] + reads[2])))
+
+    print("Files successfully created!")
+
+
+if __name__ == '__main__':
+    sys.exit(compare_read_families(sys.argv))
diff -r 000000000000 -r 7fc3c8704c05 fsd.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fsd.xml Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,128 @@
+
+
+    Family Size Distribution of duplex sequencing tags
+
+        fsd_macros.xml
+
+
+        python
+        matplotlib
+
+
+python '$__tool_directory__/fsd.py'
+#for $i, $s in enumerate($series)
+    --inputFile${i + 1} '${s.file}'
+    --inputName${i + 1} '${s.file.element_identifier}'
+#end for
+$log_axis
+$rel_freq
+--output_pdf '$output_pdf'
+--output_tabular '$output_tabular'
+
+
+
+
+
+
+
+
+
+
+
+ 20 members. This information is very useful for early decisions on analysis parameters, such as the minimum number of PE reads required to build the single stranded consensus sequence (SSCS). Moreover, this tool can compare several datasets or different steps of the analysis pipeline to monitor data loss or gain (e.g. families re-united with the barcode correction tool from the `Du Novo Analysis Pipeline `_). In an extension of this tool, each family is stratified into SSCS (ab/ba) and DCS, and the allocation of DCSs relative to SSCS-ab and SSCS-ba is visualized. This is quite handy for better understanding the relationship of SSCS to DCS per family and for identifying sources of bias (e.g. more SSCS to DCS at a particular family size, or more forward ab than reverse ba reads).
+
+**Input**
+
+This tool expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands. At least one input file must be provided and up to 4 different datasets can be compared with this tool. All input files are produced in the same way (see section **How to generate the input**)::
+
+ 1 2                        3
+ -----------------------------
+ 1 AAAAAAAAAAAAAAAAATGGTATG ba
+ 3 AAAAAAAAAAAAAATGGTATGGAC ab
+
+.. class:: infomark
+
+**How to generate the input**
+
+The first step of the `Du Novo Analysis Pipeline `_ is the **Make Families** tool or the **Correct Barcodes** tool that produces output in this form::
+
+ 1                        2  3     4
+ ------------------------------------------------------
+ AAAAAAAAAAAAAAATAGCTCGAT ab read1 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAATAGCTCGAT ab read2 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAATAGCTCGAT ab read3 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAAAATGGTATG ba read3 CGCTACGTGACTAAAACATG
+
+We only need columns 1 and 2. These two columns can be extracted from this dataset using the **Cut** tool::
+
+ 1                        2
+ ---------------------------
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAAAATGGTATG ba
+
+Next, the tags are sorted in ascending or descending order using the **Sort** tool::
+
+ 1                        2
+ ---------------------------
+ AAAAAAAAAAAAAAAAATGGTATG ba
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+
+Finally, unique occurrences of each tag are counted. This is done with the **Unique lines** tool, which adds an additional column with the counts that also represent the family size (column 1)::
+
+ 1 2                        3
+ -----------------------------
+ 1 AAAAAAAAAAAAAAAAATGGTATG ba
+ 3 AAAAAAAAAAAAAATGGTATGGAC ab
+
+These data can now be used in this tool.
+
+**Output**
+
+The output is a PDF file with a plot and a summary of the data in the plot. The tool analyzes the family size distribution, that is, the number of PE reads per tag or family. This information is presented graphically as a histogram. The output can be interpreted in the following way:
+
+The **first part** of the output is “FSD based on families” and “FSD based on PE reads”, depending on whether the data is compared to the total number of unique tags/families or to the total number of PE reads. The data is presented in a histogram showing the frequency of family sizes (FS) ranging from FS=1 to FS>20 members in a step-wise manner. Multiple datasets or steps of an analysis can be merged in one histogram (e.g. tags before and after barcode correction). The family size distribution gives the user an insight into the allocation of DCS in the library; information that can be used to decide on the optimal family size parameter for building SSCSs.
+
+The **second part** of the output shows the family size distribution (FSD) for each of the datasets in more detail. The tags are categorized based on whether they form DCS (duplex = complementary tags in the ab and ba direction) or form only SSCS-ab or SSCS-ba with no matching complement.
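+
+.. class:: infomark
+
+**Scripting the input preparation (optional)**
+
+The **Cut**, **Sort** and **Unique lines** steps above can also be reproduced outside of Galaxy. The following is only a minimal sketch of the idea in Python; the file names ``families.tabular`` and ``fsd_input.tabular`` are placeholders, not outputs of the pipeline::
+
+ import collections
+
+ counts = collections.Counter()
+ with open("families.tabular") as handle:  # 4-column Make Families output
+     for line in handle:
+         fields = line.rstrip("\n").split("\t")
+         counts[(fields[0], fields[1])] += 1  # count (tag, ab/ba) occurrences
+
+ with open("fsd_input.tabular", "w") as out:  # 3-column input of this tool
+     for (tag, strand), size in sorted(counts.items()):
+         out.write("{}\t{}\t{}\n".format(size, tag, strand))
+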
+]]> + + + diff -r 000000000000 -r 7fc3c8704c05 fsd_beforevsafter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fsd_beforevsafter.py Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,304 @@ +#!/usr/bin/env python +# Family size distribution of DCS from various steps of the Galaxy pipeline +# +# Author: Monika Heinzl & Gundula Povysil, Johannes-Kepler University Linz (Austria) +# Contact: monika.heinzl@edumail.at +# +# Takes a TXT file with tags of reads that were aligned to certain regions of the reference genome (optional), +# a TABULAR file with tags before the alignment to the SSCS, a FASTA file with reads that were part of the DCS and +# a FASTA file with tags after trimming as input (optional). +# The program produces a plot which shows the distribution of family sizes of the DCS from the input files and +# a CSV file with the data of the plot. +# USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming --alignedTags DCSbamFile +# --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf + +import argparse +import sys +from collections import Counter + +import matplotlib.pyplot as plt +import numpy +import pysam +from Bio import SeqIO +from matplotlib.backends.backend_pdf import PdfPages + +plt.switch_backend('agg') + + +def readFileReferenceFree(file, delim): + with open(file, 'r') as dest_f: + data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string') + return data_array + + +def readFasta(file): + tag_consensus = [] + fs_consensus = [] + with open(file, "r") as consFile: + for record in SeqIO.parse(consFile, "fasta"): + tag_consensus.append(record.id) + line = record.description + a, b = line.split(" ") + fs1, fs2 = b.split("-") + fs_consensus.extend([fs1, fs2]) + fs_consensus = numpy.array(fs_consensus).astype(int) + return (tag_consensus, fs_consensus) + + +def make_argparser(): + parser = argparse.ArgumentParser(description='Analysis of read loss in duplex sequencing data') + parser.add_argument('--inputFile_SSCS', + help='Tabular File with three columns: ab or ba, tag and family size.') + parser.add_argument('--inputName1') + parser.add_argument('--makeDCS', + help='FASTA File with information about tag and family size in the header.') + parser.add_argument('--afterTrimming', default=None, + help='FASTA File with information about tag and family size in the header.') + parser.add_argument('--bamFile', + help='BAM file with aligned reads.') + parser.add_argument('--output_pdf', default="data.pdf", type=str, + help='Name of the pdf and tabular file.') + parser.add_argument('--output_tabular', default="data.tabular", type=str, + help='Name of the pdf and tabular file.') + return parser + + +def compare_read_families_read_loss(argv): + parser = make_argparser() + args = parser.parse_args(argv[1:]) + SSCS_file = args.inputFile_SSCS + SSCS_file_name = args.inputName1 + makeConsensus = args.makeDCS + afterTrimming = args.afterTrimming + ref_genome = args.bamFile + title_file = args.output_tabular + title_file2 = args.output_pdf + sep = "\t" + + with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: + # PLOT + plt.rc('figure', figsize=(11.69, 8.27)) # A4 format + plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color + plt.rcParams['xtick.labelsize'] = 14 + plt.rcParams['ytick.labelsize'] = 14 + plt.rcParams['patch.edgecolor'] = "black" + fig = plt.figure() + 
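+        # The DCS counting below relies on each duplex tag occurring twice in
+        # the SSCS table: once on the ab strand and once on the ba strand, so
+        # tags with a count > 1 in numpy.unique() mark the duplexes. A minimal
+        # sketch of the idea on assumed toy data, for illustration only:
+        #
+        #     >>> import numpy
+        #     >>> seq = numpy.array(["AAT", "AAT", "CCG"])
+        #     >>> u, c = numpy.unique(seq, return_counts=True)
+        #     >>> u[c > 1]  # tags seen as both ab and ba -> duplex tags
+        #     array(['AAT'], ...)
+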
plt.subplots_adjust(bottom=0.3) + + list1 = [] + colors = [] + labels = [] + +# data with tags of SSCS + data_array = readFileReferenceFree(SSCS_file, "\t") + seq = numpy.array(data_array[:, 1]) + tags = numpy.array(data_array[:, 2]) + quant = numpy.array(data_array[:, 0]).astype(int) + + # split data with all tags of SSCS after ab and ba strands + all_ab = seq[numpy.where(tags == "ab")[0]] + all_ba = seq[numpy.where(tags == "ba")[0]] + quant_ab_sscs = quant[numpy.where(tags == "ab")[0]] + quant_ba_sscs = quant[numpy.where(tags == "ba")[0]] + + seqDic_ab = dict(zip(all_ab, quant_ab_sscs)) + seqDic_ba = dict(zip(all_ba, quant_ba_sscs)) + + # get tags of the SSCS which form a DCS + # group large family sizes + bigFamilies = numpy.where(quant > 20)[0] + quant[bigFamilies] = 22 + maximumX = numpy.amax(quant) + + # find all unique tags and get the indices for ALL tags (ab AND ba) + u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) + d = u[c > 1] + + # get family sizes, tag for the duplicates + duplTags_double = quant[numpy.in1d(seq, d)] + list1.append(duplTags_double) + colors.append("#0000FF") + labels.append("before SSCS building") + + duplTags = duplTags_double[0::2] # ab of DCS + duplTagsBA = duplTags_double[1::2] # ba of DCS + + d2 = d[(duplTags >= 3) & (duplTagsBA >= 3)] # ab and ba FS>=3 + + # all SSCSs FS>=3 + seq_unique, seqUnique_index = numpy.unique(seq, return_index=True) + seq_unique_FS = quant[seqUnique_index] + seq_unique_FS3 = seq_unique_FS[seq_unique_FS >= 3] + + legend1 = "\ntotal nr. of tags (unique, FS>=1):\nDCS (before SSCS building, FS>=1):\ntotal nr. of tags (unique, FS>=3):\nDCS (before SSCS building, FS>=3):" + legend2 = "total numbers * \n{:,}\n{:,}\n{:,}\n{:,}".format(len(seq_unique_FS), len(duplTags), + len(seq_unique_FS3), len(d2)) + plt.text(0.55, 0.14, legend1, size=11, transform=plt.gcf().transFigure) + plt.text(0.88, 0.14, legend2, size=11, transform=plt.gcf().transFigure) + + # data make DCS + tag_consensus, fs_consensus = readFasta(makeConsensus) + # group large family sizes in the plot of fasta files + bigFamilies = numpy.where(fs_consensus > 20)[0] + fs_consensus[bigFamilies] = 22 + list1.append(fs_consensus) + colors.append("#298A08") + labels.append("after DCS building") + legend3 = "after DCS building:" + legend4 = "{:,}".format(len(tag_consensus)) + plt.text(0.55, 0.11, legend3, size=11, transform=plt.gcf().transFigure) + plt.text(0.88, 0.11, legend4, size=11, transform=plt.gcf().transFigure) + + # data after trimming + if afterTrimming is not None: + tag_trimming, fs_trimming = readFasta(afterTrimming) + bigFamilies = numpy.where(fs_trimming > 20)[0] + fs_trimming[bigFamilies] = 22 + list1.append(fs_trimming) + colors.append("#DF0101") + labels.append("after trimming") + legend5 = "after trimming:" + legend6 = "{:,}".format(len(tag_trimming)) + plt.text(0.55, 0.09, legend5, size=11, transform=plt.gcf().transFigure) + plt.text(0.88, 0.09, legend6, size=11, transform=plt.gcf().transFigure) + + # data of tags aligned to reference genome + if ref_genome is not None: + pysam.index(ref_genome) + bam = pysam.AlignmentFile(ref_genome, "rb") + seq_mut = [] + for read in bam.fetch(): + if not read.is_unmapped: + if '_' in read.query_name: + tags = read.query_name.split('_')[0] + else: + tags = read.query_name + seq_mut.append(tags) + + # use only unique tags that were alignment to the reference genome + seq_mut = numpy.array(seq_mut) + seq_mut, seqMut_index = numpy.unique(seq_mut, return_index=True) + # get family sizes 
for each tag in the BAM file + quant_ab = [] + quant_ba = [] + for i in seq_mut: + quant_ab.append(seqDic_ab.get(i)) + quant_ba.append(seqDic_ba.get(i)) + + quant_ab_ref = numpy.array(quant_ab) + quant_ba_ref = numpy.array(quant_ba) + quant_all_ref = numpy.concatenate((quant_ab_ref, quant_ba_ref)) + bigFamilies = numpy.where(quant_all_ref > 20)[0] # group large family sizes + quant_all_ref[bigFamilies] = 22 + + list1.append(quant_all_ref) + colors.append("#04cec7") + labels.append("after alignment\nto reference") + legend7 = "after alignment to reference:" + length_DCS_ref = len(quant_ba_ref) # count of duplex tags that were aligned to reference genome + legend8 = "{:,}".format(length_DCS_ref) + plt.text(0.55, 0.07, legend7, size=11, transform=plt.gcf().transFigure) + plt.text(0.88, 0.07, legend8, size=11, transform=plt.gcf().transFigure) + + counts = plt.hist(list1, bins=range(-1, maximumX + 1), stacked=False, label=labels, color=colors, align="left", alpha=1, edgecolor="black", linewidth=1) + ticks = numpy.arange(0, maximumX, 1) + ticks1 = map(str, ticks) + ticks1[len(ticks1) - 1] = ">20" + plt.xticks(numpy.array(ticks), ticks1) + if ref_genome is not None: + count = numpy.array([v for k, v in sorted(Counter(quant_ab_ref).iteritems())]) # count all family sizes from all ab strands + legend = "max. family size:\nabsolute frequency:\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)" + plt.text(0.1, 0.085, legend, size=11, transform=plt.gcf().transFigure) + + legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}" \ + .format(max(quant_ab_ref), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), + sum(numpy.array(data_array[:, 0]).astype(int))) + plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure) + + count2 = numpy.array( + [v for k, v in sorted(Counter(quant_ba_ref).iteritems())]) # count all family sizes from all ba strands + legend = "BA\n{}\n{}\n{:.5f}" \ + .format(max(quant_ba_ref), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) + plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure) + + legend4 = "* In the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags." + plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) + + plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) + plt.title("Family size distribution of tags from various steps of the Du Novo pipeline", fontsize=14) + plt.xlabel("Family size", fontsize=14) + plt.ylabel("Absolute Frequency", fontsize=14) + plt.grid(b=True, which="major", color="#424242", linestyle=":") + plt.margins(0.01, None) + + pdf.savefig(fig, bbox_inch="tight") + plt.close() + + # write information about plot into a csv file + output_file.write("Dataset:{}{}\n".format(sep, SSCS_file_name)) + if ref_genome is not None: + output_file.write("{}AB{}BA\n".format(sep, sep)) + output_file.write("max. family size:{}{}{}{}\n".format(sep, max(quant_ab_ref), sep, max(quant_ba_ref))) + output_file.write( + "absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) + output_file.write( + "relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, + float(count2[len(count2) - 1]) / sum(count2))) + + output_file.write("\ntotal nr. 
of reads before SSCS building{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
+        output_file.write("\n\nValues from family size distribution\n")
+
+        if afterTrimming is None and ref_genome is None:
+            output_file.write("{}before SSCS building{}after DCS building\n".format(sep, sep))
+
+            for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])]):
+                if fs == 21:
+                    fs = ">20"
+                else:
+                    fs = "={}".format(fs)
+                output_file.write("FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs)))
+            output_file.write("sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1]))))
+
+        elif afterTrimming is None or ref_genome is None:
+            if afterTrimming is None:
+                output_file.write("{}before SSCS building{}after DCS building{}after alignment to reference\n".format(sep, sep, sep))
+            elif ref_genome is None:
+                output_file.write("{}before SSCS building{}after DCS building{}after trimming\n".format(sep, sep, sep))
+
+            for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])]):
+                if fs == 21:
+                    fs = ">20"
+                else:
+                    fs = "={}".format(fs)
+                output_file.write("FS{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(reference)))
+            output_file.write("sum{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2]))))
+
+        else:
+            output_file.write("{}before SSCS building{}after DCS building{}after trimming{}after alignment to reference\n".format(sep, sep, sep, sep))
+            for fs, sscs, dcs, trim, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])], counts[0][3][2:len(counts[0][3])]):
+                if fs == 21:
+                    fs = ">20"
+                else:
+                    fs = "={}".format(fs)
+                output_file.write("FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(trim), sep, int(reference)))
+            output_file.write("sum{}{}{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2])), sep, int(sum(counts[0][3]))))
+
+        output_file.write("\n\nIn the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags.\n")
+        output_file.write("total nr. of tags (unique, FS>=1){}{}\n".format(sep, len(seq_unique_FS)))
+        output_file.write("DCS (before SSCS building, FS>=1){}{}\n".format(sep, len(duplTags)))
+        output_file.write("total nr. of tags (unique, FS>=3){}{}\n".format(sep, len(seq_unique_FS3)))
+        output_file.write("DCS (before SSCS building, FS>=3){}{}\n".format(sep, len(d2)))
+        output_file.write("after DCS building{}{}\n".format(sep, len(tag_consensus)))
+        if afterTrimming is not None:
+            output_file.write("after trimming{}{}\n".format(sep, len(tag_trimming)))
+        if ref_genome is not None:
+            output_file.write("after alignment to reference{}{}\n".format(sep, length_DCS_ref))
+
+    print("Files successfully created!")
+
+
+if __name__ == '__main__':
+    sys.exit(compare_read_families_read_loss(sys.argv))
diff -r 000000000000 -r 7fc3c8704c05 fsd_beforevsafter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fsd_beforevsafter.xml Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,122 @@
+
+
+    Family Size Distribution of duplex sequencing tags during Du Novo analysis
+
+        fsd_macros.xml
+
+
+        python
+        matplotlib
+        biopython
+        pysam
+
+
+python '$__tool_directory__/fsd_beforevsafter.py'
+--inputFile_SSCS '$file1'
+--inputName1 '$file1.element_identifier'
+--makeDCS '$makeDCS'
+#if $afterTrimming:
+    --afterTrimming '$afterTrimming'
+#end if
+#if $bamFile:
+--bamFile '$bamFile'
+#end if
+--output_pdf '$output_pdf'
+--output_tabular '$output_tabular'
+
+
+
+
+
+
+
+
+
+
+ `_.
+
+**Input**
+
+**Dataset 1:** This tool expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands::
+
+ 1 2                        3
+ -----------------------------
+ 1 AAAAAAAAAAAAAAAAATGGTATG ba
+ 3 AAAAAAAAAAAAAATGGTATGGAC ab
+
+.. class:: infomark
+
+**How to generate the input**
+
+The first step of the `Du Novo Analysis Pipeline `_ is the **Make Families** tool or the **Correct Barcodes** tool that produces output in this form::
+
+ 1                        2  3     4
+ ------------------------------------------------------
+ AAAAAAAAAAAAAAATAGCTCGAT ab read1 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAATAGCTCGAT ab read2 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAATAGCTCGAT ab read3 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAAAATGGTATG ba read3 CGCTACGTGACTAAAACATG
+
+We only need columns 1 and 2. These two columns can be extracted from this dataset using the **Cut** tool::
+
+ 1                        2
+ ---------------------------
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAAAATGGTATG ba
+
+Next, the tags are sorted in ascending or descending order using the **Sort** tool::
+
+ 1                        2
+ ---------------------------
+ AAAAAAAAAAAAAAAAATGGTATG ba
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+
+Finally, unique occurrences of each tag are counted. This is done with the **Unique lines** tool, which adds an additional column with the counts that also represent the family size (column 1)::
+
+ 1 2                        3
+ -----------------------------
+ 1 AAAAAAAAAAAAAAAAATGGTATG ba
+ 3 AAAAAAAAAAAAAATGGTATGGAC ab
+
+These data can now be used in this tool.
+
+**Dataset 2:** A fasta file is required with all tags and the associated family size of both strands (forward and reverse) in the header and the read itself in the next line. This input file can be obtained by the `Du Novo Analysis Pipeline `_ with the tool **Make consensus reads**.
+
+**Dataset 3 (optional):** In addition, the fasta file with all tags, which were not filtered out after trimming, can be included. This file can also be obtained by the `Du Novo Analysis Pipeline` with the tool **Sequence Content Trimmer**.
+ +For both input files (dataset 2 and 3), only the DCSs of one mate are necessary (these tools give information on both mates in the output file), since both files have the same tags and family sizes, but different DCSs, which are not required in this tool:: + + >AAAAAAAAATAGATCATAGACTCT 7-10 + CTAGACTCACTGGCGTTACTGACTGCGAGACCCTCCACGTG + >AAAAAAAAGGCAGAAGATATACGC 11-3 + CNCNGGCCCCCCGCTCCGTGCACAGACGNNGCNACTGACAA + +**Dataset 4 (optional):** BAM file of the aligned reads. This file can be obtained by the tool `Map with BWA-MEM `_. + +**Output** + +The output is a PDF file with a plot and a summary of the data of the plots. The tool compares various datasets from the `Du Novo Analysis Pipeline `_ and helps in choosing various parameters (e.g., family size, minimum read length). For example, re-running trimming with different parameters makes it possible to recover reads that would otherwise be lost to stringent filtering by read length. This tool also allows assessing the number of on-target reads. The tool extracts the tags of reads and their family sizes before SSCS building, after DCS building, after trimming and finally after the alignment to the reference genome. In the plot, the family sizes for both SSCS-ab and SSCS-ba are shown, whereas the summary represents only counts either of SSCS-ab or of SSCS-ba. + +]]> + + + diff -r 000000000000 -r 7fc3c8704c05 fsd_macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fsd_macros.xml Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,13 @@ + + + + + @misc{duplex, + author = {Povysil, Gundula and Heinzl, Monika and Salazar, Renato and Stoler, Nicholas and Nekrutenko, Anton and Tiemann-Boege, Irene}, + year = {2019}, + title = {{Variant Analyzer: a quality control for variant calling in duplex sequencing data (manuscript)}} + } + + + + \ No newline at end of file diff -r 000000000000 -r 7fc3c8704c05 fsd_regions.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fsd_regions.py Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,270 @@ +#!/usr/bin/env python + +# Family size distribution of tags which were aligned to the reference genome +# +# Author: Monika Heinzl & Gundula Povysil, Johannes-Kepler University Linz (Austria) +# Contact: monika.heinzl@edumail.at +# +# Takes at least one TABULAR file with tags before the alignment to the SSCS, +# a BAM file with tags of reads that overlap the regions of the reference genome and +# an optional BED file with chromosome, start and stop position of the regions as input. +# The program produces a plot which shows the distribution of family sizes of the tags from the input files and +# a tabular file with the data of the plot. 
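+#
+# Example rows of the two main inputs (tab-separated; values are illustrative
+# only, taken from the examples in the tool help):
+#   tag file:  3   AAAAAAAAAAAAAATGGTATGGAC   ab    (family size, tag, ab/ba)
+#   BED file:  ACH_TDII   90   633                  (chromosome, start, stop)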
+ +# USAGE: python FSD_regions.py --inputFile filenameSSCS --inputName1 filenameSSCS +# --bamFile DCSbamFile --rangesFile BEDfile --output_tabular outputfile_name_tabular +# --output_pdf outputfile_name_pdf + +import argparse +import collections +import re +import sys + +import matplotlib.pyplot as plt +import numpy as np +import pysam +from matplotlib.backends.backend_pdf import PdfPages + +plt.switch_backend('agg') + + +def readFileReferenceFree(file, delim): + with open(file, 'r') as dest_f: + data_array = np.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string') + return data_array + + +def make_argparser(): + parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome') + parser.add_argument('--inputFile', help='Tabular File with three columns: ab or ba, tag and family size.') + parser.add_argument('--inputName1') + parser.add_argument('--bamFile', help='BAM file with aligned reads.') + parser.add_argument('--rangesFile', default=None, help='BED file with chromosome, start and stop positions.') + parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.') + parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.') + return parser + + +def compare_read_families_refGenome(argv): + parser = make_argparser() + args = parser.parse_args(argv[1:]) + + firstFile = args.inputFile + name1 = args.inputName1 + name1 = name1.split(".tabular")[0] + bamFile = args.bamFile + + rangesFile = args.rangesFile + title_file = args.output_pdf + title_file2 = args.output_tabular + sep = "\t" + + with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf: + data_array = readFileReferenceFree(firstFile, "\t") + pysam.index(bamFile) + + bam = pysam.AlignmentFile(bamFile, "rb") + qname_dict = collections.OrderedDict() + + if rangesFile is not None: + with open(rangesFile, 'r') as regs: + range_array = np.genfromtxt(regs, skip_header=0, delimiter='\t', comments='#', dtype='string') + + if range_array.ndim == 0: + print("Error: file has 0 lines") + exit(2) + + if range_array.ndim == 1: + chrList = range_array[0] + start_posList = range_array[1].astype(int) + stop_posList = range_array[2].astype(int) + chrList = [chrList.tolist()] + start_posList = [start_posList.tolist()] + stop_posList = [stop_posList.tolist()] + else: + chrList = range_array[:, 0] + start_posList = range_array[:, 1].astype(int) + stop_posList = range_array[:, 2].astype(int) + + if len(start_posList) != len(stop_posList): + print("start_positions and end_positions do not have the same length") + exit(3) + + chrList = np.array(chrList) + start_posList = np.array(start_posList).astype(int) + stop_posList = np.array(stop_posList).astype(int) + for chr, start_pos, stop_pos in zip(chrList, start_posList, stop_posList): + chr_start_stop = "{}_{}_{}".format(chr, start_pos, stop_pos) + qname_dict[chr_start_stop] = [] + for read in bam.fetch(chr.tobytes(), start_pos, stop_pos): + if not read.is_unmapped: + if re.search('_', read.query_name): + tags = re.split('_', read.query_name)[0] + else: + tags = read.query_name + qname_dict[chr_start_stop].append(tags) + + else: + for read in bam.fetch(): + if not read.is_unmapped: + if re.search(r'_', read.query_name): + tags = re.split('_', read.query_name)[0] + else: + tags = read.query_name + + if read.reference_name not in qname_dict.keys(): + qname_dict[read.reference_name] = [tags] + else: + 
qname_dict[read.reference_name].append(tags) + + seq = np.array(data_array[:, 1]) + tags = np.array(data_array[:, 2]) + quant = np.array(data_array[:, 0]).astype(int) + group = np.array(qname_dict.keys()) + + all_ab = seq[np.where(tags == "ab")[0]] + all_ba = seq[np.where(tags == "ba")[0]] + quant_ab = quant[np.where(tags == "ab")[0]] + quant_ba = quant[np.where(tags == "ba")[0]] + + seqDic_ab = dict(zip(all_ab, quant_ab)) + seqDic_ba = dict(zip(all_ba, quant_ba)) + + lst_ab = [] + lst_ba = [] + quantAfterRegion = [] + length_regions = 0 + for i in group: + lst_ab_r = [] + lst_ba_r = [] + seq_mut = qname_dict[i] + if rangesFile is None: + seq_mut, seqMut_index = np.unique(np.array(seq_mut), return_index=True) + length_regions = length_regions + len(seq_mut) * 2 + for r in seq_mut: + count_ab = seqDic_ab.get(r) + count_ba = seqDic_ba.get(r) + lst_ab_r.append(count_ab) + lst_ab.append(count_ab) + lst_ba_r.append(count_ba) + lst_ba.append(count_ba) + + dataAB = np.array(lst_ab_r) + dataBA = np.array(lst_ba_r) + bigFamilies = np.where(dataAB > 20)[0] + dataAB[bigFamilies] = 22 + bigFamilies = np.where(dataBA > 20)[0] + dataBA[bigFamilies] = 22 + + quantAll = np.concatenate((dataAB, dataBA)) + quantAfterRegion.append(quantAll) + + quant_ab = np.array(lst_ab) + quant_ba = np.array(lst_ba) + + maximumX = np.amax(np.concatenate(quantAfterRegion)) + minimumX = np.amin(np.concatenate(quantAfterRegion)) + + # PLOT + plt.rc('figure', figsize=(11.69, 8.27)) # A4 format + plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color + plt.rcParams['xtick.labelsize'] = 14 + plt.rcParams['ytick.labelsize'] = 14 + plt.rcParams['patch.edgecolor'] = "black" + fig = plt.figure() + plt.subplots_adjust(bottom=0.3) + + colors = ["#6E6E6E", "#0431B4", "#5FB404", "#B40431", "#F4FA58", "#DF7401", "#81DAF5"] + + col = [] + for i in range(0, len(group)): + col.append(colors[i]) + + counts = plt.hist(quantAfterRegion, bins=range(minimumX, maximumX + 1), stacked=False, label=group, + align="left", alpha=1, color=col, edgecolor="black", linewidth=1) + ticks = np.arange(minimumX - 1, maximumX, 1) + + ticks1 = map(str, ticks) + ticks1[len(ticks1) - 1] = ">20" + plt.xticks(np.array(ticks), ticks1) + count = np.bincount(map(int, quant_ab)) # original counts + + legend = "max. family size:\nabsolute frequency:\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)" + plt.text(0.15, 0.085, legend, size=11, transform=plt.gcf().transFigure) + + legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}".format(max(map(int, quant_ab)), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), sum(np.array(data_array[:, 0]).astype(int))) + plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure) + + count2 = np.bincount(map(int, quant_ba)) # original counts + + legend = "BA\n{}\n{}\n{:.5f}" \ + .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) + plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure) + + plt.text(0.55, 0.2125, "total nr. 
of tags:", size=11, transform=plt.gcf().transFigure) + plt.text(0.8, 0.2125, "{:,} ({:,})".format(length_regions, length_regions / 2), size=11, + transform=plt.gcf().transFigure) + + legend4 = "* In the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n" + plt.text(0.1, 0.01, legend4, size=11, transform=plt.gcf().transFigure) + + space = 0 + for i, count in zip(group, quantAfterRegion): + plt.text(0.55, 0.15 - space, "{}:\n".format(i), size=11, transform=plt.gcf().transFigure) + plt.text(0.8, 0.15 - space, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure) + space = space + 0.02 + + plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) + plt.xlabel("Family size", fontsize=14) + plt.ylabel("Absolute Frequency", fontsize=14) + plt.grid(b=True, which="major", color="#424242", linestyle=":") + plt.margins(0.01, None) + + pdf.savefig(fig, bbox_inch="tight") + plt.close() + + output_file.write("Dataset:{}{}\n".format(sep, name1)) + output_file.write("{}AB{}BA\n".format(sep, sep)) + output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba)))) + output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) + output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) + output_file.write("total nr. of reads{}{}\n".format(sep, sum(np.array(data_array[:, 0]).astype(int)))) + output_file.write("total nr. of tags{}{} ({})\n".format(sep, length_regions, length_regions / 2)) + output_file.write("\n\nValues from family size distribution\n") + output_file.write("{}".format(sep)) + for i in group: + output_file.write("{}{}".format(i, sep)) + output_file.write("\n") + + j = 0 + for fs in counts[1][0:len(counts[1]) - 1]: + if fs == 21: + fs = ">20" + else: + fs = "={}".format(fs) + output_file.write("FS{}{}".format(fs, sep)) + if len(group) == 1: + output_file.write("{}{}".format(int(counts[0][j]), sep)) + else: + for n in range(len(group)): + output_file.write("{}{}".format(int(counts[0][n][j]), sep)) + output_file.write("\n") + j += 1 + output_file.write("sum{}".format(sep)) + + if len(group) == 1: + output_file.write("{}{}".format(int(sum(counts[0])), sep)) + else: + for i in counts[0]: + output_file.write("{}{}".format(int(sum(i)), sep)) + output_file.write("\n") + output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n") + output_file.write("Region{}total nr. 
of tags per region\n".format(sep)) + for i, count in zip(group, quantAfterRegion): + output_file.write("{}{}{}\n".format(i, sep, len(count) / 2)) + + print("Files successfully created!") + + +if __name__ == '__main__': + sys.exit(compare_read_families_refGenome(sys.argv)) diff -r 000000000000 -r 7fc3c8704c05 fsd_regions.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fsd_regions.xml Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,112 @@ + + + Family size distribution of user-specified regions in the reference genome + + fsd_macros.xml + + + python + matplotlib + pysam + + +python '$__tool_directory__/fsd_regions.py' +--inputFile '$file1' +--inputName1 '$file1.element_identifier' +--bamFile '$file2' +#if $file3: + --rangesFile '$file3' +#end if +--output_pdf '$output_pdf' +--output_tabular '$output_tabular' + + + + + + + + + + + + + + + + + + + + `_ is the **Make Families** tool or the **Correct Barcodes** tool, either of which produces output in this form:: + + 1 2 3 4 + ------------------------------------------------------ + AAAAAAAAAAAAAAATAGCTCGAT ab read1 CGCTACGTGACTGGGTCATG + AAAAAAAAAAAAAAATAGCTCGAT ab read2 CGCTACGTGACTGGGTCATG + AAAAAAAAAAAAAAATAGCTCGAT ab read3 CGCTACGTGACTGGGTCATG + AAAAAAAAAAAAAAAAATGGTATG ba read3 CGCTACGTGACTAAAACATG + +We only need columns 1 and 2. These two columns can be extracted from this dataset using the **Cut** tool:: + + 1 2 + --------------------------- + AAAAAAAAAAAAAAATAGCTCGAT ab + AAAAAAAAAAAAAAATAGCTCGAT ab + AAAAAAAAAAAAAAATAGCTCGAT ab + AAAAAAAAAAAAAAAAATGGTATG ba + +Next, the tags are sorted in ascending or descending order using the **Sort** tool:: + + 1 2 + --------------------------- + AAAAAAAAAAAAAAAAATGGTATG ba + AAAAAAAAAAAAAAATAGCTCGAT ab + AAAAAAAAAAAAAAATAGCTCGAT ab + AAAAAAAAAAAAAAATAGCTCGAT ab + +Finally, unique occurrences of each tag are counted. This is done using the **Unique lines** tool, which adds an additional column with the counts that also represent the family size (column 1):: + + 1 2 3 + ----------------------------- + 1 AAAAAAAAAAAAAAAAATGGTATG ba + 3 AAAAAAAAAAAAAATGGTATGGAC ab + +These data can now be used in this tool. + +**Dataset 2:** BAM file of the aligned reads. This file can be obtained by the tool `Map with BWA-MEM `_. + +**Dataset 3 (optional):** BED file with start and stop positions of the regions. If it is not provided, then all aligned reads of the BAM file are used in the distribution of family sizes:: + + 1 2 3 + --------------------- + ACH_TDII 90 633 + ACH_TDII 659 1140 + ACH_TDII 1144 1561 + +**Output** + +The output is a PDF file with the plot and the summarized data of the plot. The tool creates a distribution of family sizes of tags that were aligned to the reference genome. Note that tags that overlap different regions of the reference genome are counted for each region. This tool is useful for examining differences in coverage among targeted regions. The plot includes both complementary SSCS-ab and SSCS-ba that form a DCS; whereas the table shows only single counts of the tags per region. + + ]]> + + + diff -r 000000000000 -r 7fc3c8704c05 td.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/td.py Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,1296 @@ +#!/usr/bin/env python + +# Tag distance analysis of SSCSs +# +# Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) +# Contact: monika.heinzl@edumail.at +# +# Takes at least one TABULAR file with tags before the alignment to the SSCS and +# optionally a second TABULAR file as input. 
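+# The expected column layout matches the other FSD tools (tab-separated;
+# the row below is illustrative only): family size, tag, read direction, e.g.
+#   3   AAAAAAAAAAAAAATGGTATGGAC   ab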
The program produces a plot which shows a histogram of Hamming distances +# separated after family sizes, a family size distribution separated after Hamming distances for all SSCSs (sample_size=0), +# a given sample of SSCSs, or SSCSs which form a DCS. In addition, the tool produces HD and FSD plots for the +# difference between the HDs of both parts of the tags and for the chimeric reads and finally a CSV file with the +# data of the plots. It is also possible to perform the HD analysis with shortened tags with given sizes as input. +# The tool can run on a certain number of processors, which can be defined by the user. + +# USAGE: python td.py --inputFile filename --inputName1 filename --sample_size int / +# --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int +# --nr_above_bars True/False --output_tabular outputfile_name_tabular + +import argparse +import itertools +import operator +import sys +from collections import Counter, defaultdict +from functools import partial +from multiprocessing.pool import Pool + +import matplotlib.pyplot as plt +import numpy +from matplotlib.backends.backend_pdf import PdfPages + +plt.switch_backend('agg') + + +def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts, + subtitle, pdf, relative=False, diff=True, rel_freq=False): + if diff is False: + colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"] + labels = ["TD=1", "TD=2", "TD=3", "TD=4", "TD=5-8", "TD>8"] + else: + colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"] + if relative is True: + labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"] + else: + labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"] + + fig = plt.figure(figsize=(6, 7)) + ax = fig.add_subplot(111) + plt.subplots_adjust(bottom=0.1) + p1 = numpy.bincount(numpy.concatenate(familySizeList1)) + maximumY = numpy.amax(p1) + + if len(range(minimumXFS, maximumXFS)) == 0: + range1 = range(minimumXFS - 1, minimumXFS + 2) + else: + range1 = range(0, maximumXFS + 2) + + if rel_freq: + w = [numpy.zeros_like(data) + 1. 
/ len(numpy.concatenate(familySizeList1)) for data in familySizeList1] + plt.hist(familySizeList1, label=labels, weights=w, color=colors, stacked=True, rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1) + plt.ylabel("Relative Frequency", fontsize=14) + plt.ylim((0, 1.07)) + else: + plt.hist(familySizeList1, label=labels, color=colors, stacked=True, rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1) + if len(numpy.concatenate(familySizeList1)) != 0: + plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1)) + plt.ylabel("Absolute Frequency", fontsize=14) + plt.ylim((0, maximumY * 1.2)) + plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) + plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) + plt.xlabel("Family size", fontsize=14) + ticks = numpy.arange(0, maximumXFS + 1, 1) + ticks1 = map(str, ticks) + if maximumXFS >= 20: + ticks1[len(ticks1) - 1] = ">=20" + plt.xticks(numpy.array(ticks), ticks1) + [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0] + plt.xlim((0, maximumXFS + 1)) + legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: " + plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure) + + # count = numpy.bincount(originalCounts) # original counts + if max(originalCounts) >= 20: + max_count = ">= 20" + else: + max_count = max(originalCounts) + legend1 = "{}\n{}\n{:.5f}".format(max_count, p1[len(p1) - 1], float(p1[len(p1) - 1]) / sum(p1)) + plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure) + legend3 = "singletons\n{:,}\n{:.5f}".format(int(p1[1]), float(p1[1]) / sum(p1)) + plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12) + plt.grid(b=True, which='major', color='#424242', linestyle=':') + pdf.savefig(fig, bbox_inches="tight") + plt.close("all") + + +def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False, + nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False): + if relative is True: + step = 0.1 + else: + step = 1 + + fig = plt.figure(figsize=(6, 8)) + plt.subplots_adjust(bottom=0.1) + p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())]) + maximumY = numpy.amax(p1) + if relative is True: # relative difference + bin1 = numpy.arange(-1, maximumX + 0.2, 0.1) + else: + bin1 = maximumX + 1 + + if rel_freq: + w = [numpy.zeros_like(data) + 1. 
/ len(numpy.concatenate(list1)) for data in list1] + counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w, + label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8, + color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"], + stacked=True, alpha=1, align="left", range=(0, maximumX + 1)) + plt.ylim((0, 1.07)) + plt.ylabel("Relative Frequency", fontsize=14) + bins = counts[1] # width of bins + counts = numpy.array(map(float, counts[0][5])) + + else: + counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, + label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8, + color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"], + stacked=True, alpha=1, align="left", range=(0, maximumX + 1)) + maximumY = numpy.amax(p1) + plt.ylim((0, maximumY * 1.2)) + plt.ylabel("Absolute Frequency", fontsize=14) + bins = counts[1] # width of bins + counts = numpy.array(map(int, counts[0][5])) + + plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) + plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) + plt.xlabel(xlabel, fontsize=14) + plt.grid(b=True, which='major', color='#424242', linestyle=':') + plt.xlim((minimumX - step, maximumX + step)) + plt.xticks(numpy.arange(0, maximumX + step, step)) + + if nr_above_bars: + bin_centers = -0.4 * numpy.diff(bins) + bins[:-1] + for x_label, label in zip(counts, bin_centers): # labels for values + if x_label == 0: + continue + else: + if rel_freq: + plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))), + float(x_label)), + xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001), + xycoords="data", color="#000066", fontsize=10) + else: + plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)), + xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01), + xycoords="data", color="#000066", fontsize=10) + + if nr_unique_chimeras != 0: + if (relative and ((counts[len(counts) - 1] / nr_unique_chimeras) == 2)) or \ + (sum(counts) / nr_unique_chimeras) == 2: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})"\ + .format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2) + else: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras) + else: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(list1))) + + plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure) + pdf.savefig(fig, bbox_inches="tight") + plt.close("all") + plt.clf() + + +def plotHDwithDCS(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False, + nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False): + step = 1 + fig = plt.figure(figsize=(6, 8)) + plt.subplots_adjust(bottom=0.1) + p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())]) + maximumY = numpy.amax(p1) + bin1 = maximumX + 1 + if rel_freq: + w = [numpy.zeros_like(data) + 1. 
/ len(numpy.concatenate(list1)) for data in list1] + counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w, + label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"], + stacked=True, alpha=1, align="left", range=(0, maximumX + 1)) + plt.ylim((0, 1.07)) + plt.ylabel("Relative Frequency", fontsize=14) + bins = counts[1] # width of bins + counts = numpy.array(map(float, counts[0][2])) + + else: + counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, + label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"], + stacked=True, alpha=1, align="left", range=(0, maximumX + 1)) + plt.ylim((0, maximumY * 1.2)) + plt.ylabel("Absolute Frequency", fontsize=14) + bins = counts[1] # width of bins + counts = numpy.array(map(int, counts[0][2])) + + plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) + plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) + plt.xlabel(xlabel, fontsize=14) + plt.grid(b=True, which='major', color='#424242', linestyle=':') + plt.xlim((minimumX - step, maximumX + step)) + plt.xticks(numpy.arange(0, maximumX + step, step)) + + if nr_above_bars: + bin_centers = -0.4 * numpy.diff(bins) + bins[:-1] + for x_label, label in zip(counts, bin_centers): # labels for values + if x_label == 0: + continue + else: + if rel_freq: + plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))), + float(x_label)), + xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001), + xycoords="data", color="#000066", fontsize=10) + else: + plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)), + xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01), + xycoords="data", color="#000066", fontsize=10) + + if nr_unique_chimeras != 0: + if (sum(counts) / nr_unique_chimeras) == 2: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})".\ + format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2) + else: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras) + else: + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(list1))) + plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure) + + legend2 = "SSCS ab = {:,} ({:.5f})\nSSCS ba = {:,} ({:.5f})\nDCS = {:,} ({:.5f})".format( + len(list1[1]), len(list1[1]) / float(nr_unique_chimeras), + len(list1[2]), len(list1[2]) / float(nr_unique_chimeras), + len(list1[0]), len(list1[0]) / float(nr_unique_chimeras)) + plt.text(0.6, -0.047, legend2, size=12, transform=plt.gcf().transFigure) + + pdf.savefig(fig, bbox_inches="tight") + plt.close("all") + plt.clf() + + +def plotHDwithinSeq(sum1, sum1min, sum2, sum2min, min_value, lenTags, pdf, len_sample, rel_freq=False): + fig = plt.figure(figsize=(6, 8)) + plt.subplots_adjust(bottom=0.1) + + ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags + maximumX = numpy.amax(numpy.concatenate(ham_partial)) + minimumX = numpy.amin(numpy.concatenate(ham_partial)) + + if len(range(minimumX, maximumX)) == 0: + range1 = minimumX + else: + range1 = range(minimumX, maximumX + 2) + + if rel_freq: + w = [numpy.zeros_like(data) + 1. 
/ len(data) for data in ham_partial] + plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, weights=w, + label=["TD a.min", "TD b.max", "TD b.min", "TD a.max", "TD a.min + b.max,\nTD a.max + b.min"], + bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], + edgecolor='black', linewidth=1) + plt.ylabel("Relative Frequency", fontsize=14) + plt.ylim(0, 1.07) + else: + plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, + label=["TD a.min", "TD b.max", "TD b.min", "TD a.max", "TD a.min + b.max,\nTD a.max + b.min"], + bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], + edgecolor='black', linewidth=1) + plt.ylabel("Absolute Frequency", fontsize=14) + + plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.6, 1)) + plt.suptitle('Tag distances within tags', fontsize=14) + plt.xlabel("TD", fontsize=14) + plt.grid(b=True, which='major', color='#424242', linestyle=':') + plt.xlim((minimumX - 1, maximumX + 1)) + plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) + legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format( + lenTags, len_sample, len(numpy.concatenate(ham_partial))) + plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure) + pdf.savefig(fig, bbox_inches="tight") + plt.close("all") + plt.clf() + + +def createTableFSD2(list1, diff=True): + selfAB = numpy.concatenate(list1) + uniqueFS = numpy.unique(selfAB) + nr = numpy.arange(0, len(uniqueFS), 1) + if diff is False: + count = numpy.zeros((len(uniqueFS), 6)) + else: + count = numpy.zeros((len(uniqueFS), 7)) + state = 1 + for i in list1: + counts = list(Counter(i).items()) + hd = [item[0] for item in counts] + c = [item[1] for item in counts] + table = numpy.column_stack((hd, c)) + if len(table) == 0: + state = state + 1 + continue + else: + if state == 1: + for k, l in zip(uniqueFS, nr): + for j in table: + if j[0] == uniqueFS[l]: + count[l, 0] = j[1] + if state == 2: + for k, l in zip(uniqueFS, nr): + for j in table: + if j[0] == uniqueFS[l]: + count[l, 1] = j[1] + if state == 3: + for k, l in zip(uniqueFS, nr): + for j in table: + if j[0] == uniqueFS[l]: + count[l, 2] = j[1] + if state == 4: + for k, l in zip(uniqueFS, nr): + for j in table: + if j[0] == uniqueFS[l]: + count[l, 3] = j[1] + if state == 5: + for k, l in zip(uniqueFS, nr): + for j in table: + if j[0] == uniqueFS[l]: + count[l, 4] = j[1] + if state == 6: + for k, l in zip(uniqueFS, nr): + for j in table: + if j[0] == uniqueFS[l]: + count[l, 5] = j[1] + if state == 7: + for k, l in zip(uniqueFS, nr): + for j in table: + if j[0] == uniqueFS[l]: + count[l, 6] = j[1] + state = state + 1 + sumRow = count.sum(axis=1) + sumCol = count.sum(axis=0) + uniqueFS = uniqueFS.astype(str) + if uniqueFS[len(uniqueFS) - 1] == "20": + uniqueFS[len(uniqueFS) - 1] = ">20" + first = ["FS={}".format(i) for i in uniqueFS] + final = numpy.column_stack((first, count, sumRow)) + return (final, sumCol) + + +def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True): + output_file.write(name) + output_file.write("\n") + if diff is False: + output_file.write("{}TD=1{}TD=2{}TD=3{}TD=4{}TD=5-8{}TD>8{}sum{}\n".format( + sep, sep, sep, sep, sep, sep, sep, sep)) + else: + if rel is False: + output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format( + sep, sep, sep, sep, sep, sep, sep, sep, sep)) + else: + output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n". 
+ format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) + for item in summary: + for nr in item: + if "FS" not in nr and "diff" not in nr: + nr = nr.astype(float) + nr = nr.astype(int) + output_file.write("{}{}".format(nr, sep)) + output_file.write("\n") + output_file.write("sum{}".format(sep)) + sumCol = map(int, sumCol) + for el in sumCol: + output_file.write("{}{}".format(el, sep)) + output_file.write("{}{}".format(overallSum.astype(int), sep)) + output_file.write("\n\n") + + +def createTableHD(list1, row_label): + selfAB = numpy.concatenate(list1) + uniqueHD = numpy.unique(selfAB) + nr = numpy.arange(0, len(uniqueHD), 1) + count = numpy.zeros((len(uniqueHD), 6)) + state = 1 + for i in list1: + counts = list(Counter(i).items()) + hd = [item[0] for item in counts] + c = [item[1] for item in counts] + table = numpy.column_stack((hd, c)) + if len(table) == 0: + state = state + 1 + continue + else: + if state == 1: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 0] = j[1] + if state == 2: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 1] = j[1] + if state == 3: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 2] = j[1] + if state == 4: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 3] = j[1] + if state == 5: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 4] = j[1] + if state == 6: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 5] = j[1] + state = state + 1 + sumRow = count.sum(axis=1) + sumCol = count.sum(axis=0) + first = ["{}{}".format(row_label, i) for i in uniqueHD] + final = numpy.column_stack((first, count, sumRow)) + return (final, sumCol) + + +def createTableHDwithTags(list1): + selfAB = numpy.concatenate(list1) + uniqueHD = numpy.unique(selfAB) + nr = numpy.arange(0, len(uniqueHD), 1) + count = numpy.zeros((len(uniqueHD), 5)) + state = 1 + for i in list1: + counts = list(Counter(i).items()) + hd = [item[0] for item in counts] + c = [item[1] for item in counts] + table = numpy.column_stack((hd, c)) + if len(table) == 0: + state = state + 1 + continue + else: + if state == 1: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 0] = j[1] + if state == 2: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 1] = j[1] + if state == 3: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 2] = j[1] + if state == 4: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 3] = j[1] + if state == 5: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 4] = j[1] + state = state + 1 + sumRow = count.sum(axis=1) + sumCol = count.sum(axis=0) + first = ["TD={}".format(i) for i in uniqueHD] + final = numpy.column_stack((first, count, sumRow)) + return (final, sumCol) + + +def createTableHDwithDCS(list1): + selfAB = numpy.concatenate(list1) + uniqueHD = numpy.unique(selfAB) + nr = numpy.arange(0, len(uniqueHD), 1) + count = numpy.zeros((len(uniqueHD), len(list1))) + state = 1 + for i in list1: + counts = list(Counter(i).items()) + hd = [item[0] for item in counts] + c = [item[1] for item in counts] + table = numpy.column_stack((hd, c)) + if len(table) == 0: + state = state + 1 + continue + else: + if state == 1: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + 
count[l, 0] = j[1] + if state == 2: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 1] = j[1] + if state == 3: + for k, l in zip(uniqueHD, nr): + for j in table: + if j[0] == uniqueHD[l]: + count[l, 2] = j[1] + state = state + 1 + sumRow = count.sum(axis=1) + sumCol = count.sum(axis=0) + first = ["TD={}".format(i) for i in uniqueHD] + final = numpy.column_stack((first, count, sumRow)) + return (final, sumCol) + + +def createFileHD(summary, sumCol, overallSum, output_file, name, sep): + output_file.write(name) + output_file.write("\n") + output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format( + sep, sep, sep, sep, sep, sep, sep, sep)) + for item in summary: + for nr in item: + if "TD" not in nr and "diff" not in nr: + nr = nr.astype(float) + nr = nr.astype(int) + output_file.write("{}{}".format(nr, sep)) + output_file.write("\n") + output_file.write("sum{}".format(sep)) + sumCol = map(int, sumCol) + for el in sumCol: + output_file.write("{}{}".format(el, sep)) + output_file.write("{}{}".format(overallSum.astype(int), sep)) + output_file.write("\n\n") + + +def createFileHDwithDCS(summary, sumCol, overallSum, output_file, name, sep): + output_file.write(name) + output_file.write("\n") + output_file.write("{}DCS{}SSCS ab{}SSCS ba{}sum{}\n".format(sep, sep, sep, sep, sep)) + for item in summary: + for nr in item: + if "TD" not in nr: + nr = nr.astype(float) + nr = nr.astype(int) + output_file.write("{}{}".format(nr, sep)) + output_file.write("\n") + output_file.write("sum{}".format(sep)) + sumCol = map(int, sumCol) + for el in sumCol: + output_file.write("{}{}".format(el, sep)) + output_file.write("{}{}".format(overallSum.astype(int), sep)) + output_file.write("\n\n") + + +def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep): + output_file.write(name) + output_file.write("\n") + output_file.write("{}TD a.min{}TD b.max{}TD b.min{}TD a.max{}TD a.min + b.max, TD a.max + b.min{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep)) + for item in summary: + for nr in item: + if "TD" not in nr: + nr = nr.astype(float) + nr = nr.astype(int) + output_file.write("{}{}".format(nr, sep)) + output_file.write("\n") + output_file.write("sum{}".format(sep)) + sumCol = map(int, sumCol) + for el in sumCol: + output_file.write("{}{}".format(el, sep)) + output_file.write("{}{}".format(overallSum.astype(int), sep)) + output_file.write("\n\n") + + +def hamming(array1, array2): + res = 99 * numpy.ones(len(array1)) + i = 0 + array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time + for a in array1: + dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2]) # fastest + res[i] = numpy.amin(dist[dist > 0]) # pick min distance greater than zero + i += 1 + return res + + +def hamming_difference(array1, array2, mate_b): + array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time + array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1 + array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 + array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 + array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 + + diff11 = [] + relativeDiffList = [] + ham1 = [] + ham2 = [] + ham1min = [] + ham2min = [] + min_valueList = [] + min_tagsList = [] + diff11_zeros = [] + min_tagsList_zeros = [] + max_tag_list = [] + i = 0 # counter, only used to see how many HDs of tags were already 
calculated + if mate_b is False: # HD calculation for all a's + half1_mate1 = array1_half + half2_mate1 = array1_half2 + half1_mate2 = array2_half + half2_mate2 = array2_half2 + elif mate_b is True: # HD calculation for all b's + half1_mate1 = array1_half2 + half2_mate1 = array1_half + half1_mate2 = array2_half2 + half2_mate2 = array2_half + + for a, b, tag in zip(half1_mate1, half2_mate1, array1): + # exclude identical tag from array2, to prevent comparison to itself + sameTag = numpy.where(array2 == tag)[0] + indexArray2 = numpy.arange(0, len(array2), 1) + index_withoutSame = numpy.delete(indexArray2, sameTag) # delete identical tag from the data + + # all tags without identical tag + array2_half_withoutSame = half1_mate2[index_withoutSame] + array2_half2_withoutSame = half2_mate2[index_withoutSame] + array2_withoutSame = array2[index_withoutSame] # whole tag (= not split into 2 halves) + # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" + dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in + array2_half_withoutSame]) + min_index = numpy.where(dist == dist.min())[0] # get index of min HD + min_value = dist.min() + # get all "b's" of the tag or all "a's" of the tag with minimum HD + min_tag_half2 = array2_half2_withoutSame[min_index] + min_tag_array2 = array2_withoutSame[min_index] # get whole tag with min HD + + dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in + min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" + max_value = dist_second_half.max() + max_index = numpy.where(dist_second_half == dist_second_half.max())[0] # get index of max HD + max_tag = min_tag_array2[max_index] + + # for d, d2 in zip(min_value, max_value): + if mate_b is True: # half2, corrects the variable of the HD from both halves if it is a or b + ham2.append(min_value) + ham2min.append(max_value) + else: # half1, corrects the variable of the HD from both halves if it is a or b + ham1.append(min_value) + ham1min.append(max_value) + + min_valueList.append(min_value + max_value) + min_tagsList.append(tag) + difference1 = abs(min_value - max_value) + diff11.append(difference1) + rel_difference = round(float(difference1) / (min_value + max_value), 1) + relativeDiffList.append(rel_difference) + + # tags which have identical parts: + if min_value == 0 or max_value == 0: + min_tagsList_zeros.append(numpy.array(tag)) + difference1_zeros = abs(min_value - max_value) # TD of non-identical part + diff11_zeros.append(difference1_zeros) + max_tag_list.append(numpy.array(max_tag)) + else: + min_tagsList_zeros.append(None) + diff11_zeros.append(None) + max_tag_list.append(None) + i += 1 + return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, + min_tagsList_zeros, ham1min, ham2min, max_tag_list]) + + +def readFileReferenceFree(file): + with open(file, 'r') as dest_f: + data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') + integers = numpy.array(data_array[:, 0]).astype(int) + return (integers, data_array) + + +def hammingDistanceWithFS(fs, ham): + fs = numpy.asarray(fs) + maximum = max(ham) + minimum = min(ham) + ham = numpy.asarray(ham) + + singletons = numpy.where(fs == 1)[0] + data = ham[singletons] + + hd2 = numpy.where(fs == 2)[0] + data2 = ham[hd2] + + hd3 = numpy.where(fs == 3)[0] + data3 = ham[hd3] + + hd4 = numpy.where(fs == 4)[0] + data4 = ham[hd4] + + hd5 = numpy.where((fs >= 5) & (fs <= 10))[0] + data5 = ham[hd5] + + hd6 = numpy.where(fs > 10)[0] 
+ data6 = ham[hd6] + + list1 = [data, data2, data3, data4, data5, data6] + return (list1, maximum, minimum) + + +def familySizeDistributionWithHD(fs, ham, diff=False, rel=True): + hammingDistances = numpy.unique(ham) + fs = numpy.asarray(fs) + ham = numpy.asarray(ham) + bigFamilies2 = numpy.where(fs > 19)[0] + if len(bigFamilies2) != 0: + fs[bigFamilies2] = 20 + maximum = max(fs) + minimum = min(fs) + if diff is True: + hd0 = numpy.where(ham == 0)[0] + data0 = fs[hd0] + + if rel is True: + hd1 = numpy.where(ham == 0.1)[0] + else: + hd1 = numpy.where(ham == 1)[0] + data = fs[hd1] + + if rel is True: + hd2 = numpy.where(ham == 0.2)[0] + else: + hd2 = numpy.where(ham == 2)[0] + data2 = fs[hd2] + + if rel is True: + hd3 = numpy.where(ham == 0.3)[0] + else: + hd3 = numpy.where(ham == 3)[0] + data3 = fs[hd3] + + if rel is True: + hd4 = numpy.where(ham == 0.4)[0] + else: + hd4 = numpy.where(ham == 4)[0] + data4 = fs[hd4] + + if rel is True: + hd5 = numpy.where((ham >= 0.5) & (ham <= 0.8))[0] + else: + hd5 = numpy.where((ham >= 5) & (ham <= 8))[0] + data5 = fs[hd5] + + if rel is True: + hd6 = numpy.where(ham > 0.8)[0] + else: + hd6 = numpy.where(ham > 8)[0] + data6 = fs[hd6] + + if diff is True: + list1 = [data0, data, data2, data3, data4, data5, data6] + else: + list1 = [data, data2, data3, data4, data5, data6] + + return (list1, hammingDistances, maximum, minimum) + + +def hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array): + diff_zeros = numpy.array(diff_zeros) + maximum = numpy.amax(diff_zeros) + minimum = numpy.amin(diff_zeros) + minHD_tags_zeros = numpy.array(minHD_tags_zeros) + + idx = numpy.concatenate([numpy.where(data_array[:, 1] == i)[0] for i in minHD_tags_zeros]) + subset_data = data_array[idx, :] + + seq = numpy.array(subset_data[:, 1]) + + # find all unique tags and get the indices for ALL tags, but only once + u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) + DCS_tags = u[c == 2] + rest_tags = u[c == 1] + + dcs = numpy.repeat("DCS", len(DCS_tags)) + idx_sscs = numpy.concatenate([numpy.where(subset_data[:, 1] == i)[0] for i in rest_tags]) + sscs = subset_data[idx_sscs, 2] + + all_tags = numpy.column_stack((numpy.concatenate((DCS_tags, subset_data[idx_sscs, 1])), + numpy.concatenate((dcs, sscs)))) + hd_DCS = [] + ab_SSCS = [] + ba_SSCS = [] + + for i in range(len(all_tags)): + tag = all_tags[i, :] + hd = diff_zeros[numpy.where(minHD_tags_zeros == tag[0])[0]] + + if tag[1] == "DCS": + hd_DCS.append(hd) + elif tag[1] == "ab": + ab_SSCS.append(hd) + elif tag[1] == "ba": + ba_SSCS.append(hd) + + if len(hd_DCS) != 0: + hd_DCS = numpy.concatenate(hd_DCS) + if len(ab_SSCS) != 0: + ab_SSCS = numpy.concatenate(ab_SSCS) + if len(ba_SSCS) != 0: + ba_SSCS = numpy.concatenate(ba_SSCS) + list1 = [hd_DCS, ab_SSCS, ba_SSCS] # list for plotting + return (list1, maximum, minimum) + + +def make_argparser(): + parser = argparse.ArgumentParser(description='Tag distance analysis of duplex sequencing data') + parser.add_argument('--inputFile', + help='Tabular File with three columns: ab or ba, tag and family size.') + parser.add_argument('--inputName1') + parser.add_argument('--sample_size', default=1000, type=int, + help='Sample size of Tag distance analysis.') + parser.add_argument('--subset_tag', default=0, type=int, + help='The tag is shortened to the given number.') + parser.add_argument('--nproc', default=4, type=int, + help='The tool runs with the given number of processors.') + parser.add_argument('--only_DCS', action="store_false", + 
help='Only tags of the DCSs are included in the HD analysis') + + parser.add_argument('--minFS', default=1, type=int, + help='Only tags with a family size greater than or equal to the specified value ' + 'are included in the HD analysis') + parser.add_argument('--maxFS', default=0, type=int, + help='Only tags with a family size smaller than or equal to the specified value ' + 'are included in the HD analysis') + parser.add_argument('--nr_above_bars', action="store_true", + help='If False, values above bars in the histograms are removed') + parser.add_argument('--rel_freq', action="store_false", + help='If True, the relative frequencies are displayed.') + + parser.add_argument('--output_tabular', default="data.tabular", type=str, + help='Name of the tabular file.') + parser.add_argument('--output_pdf', default="data.pdf", type=str, + help='Name of the pdf file.') + parser.add_argument('--output_chimeras_tabular', default="data.tabular", type=str, + help='Name of the tabular file with all chimeric tags.') + + return parser + + +def Hamming_Distance_Analysis(argv): + parser = make_argparser() + args = parser.parse_args(argv[1:]) + file1 = args.inputFile + name1 = args.inputName1 + index_size = args.sample_size + title_savedFile_pdf = args.output_pdf + title_savedFile_csv = args.output_tabular + output_chimeras_tabular = args.output_chimeras_tabular + onlyDuplicates = args.only_DCS + rel_freq = args.rel_freq + minFS = args.minFS + maxFS = args.maxFS + nr_above_bars = args.nr_above_bars + subset = args.subset_tag + nproc = args.nproc + sep = "\t" + + # input checks + if index_size < 0: + print("index_size is a negative integer.") + exit(2) + if nproc <= 0: + print("nproc is smaller than or equal to zero.") + exit(3) + if subset < 0: + print("subset_tag is a negative integer.") + exit(5) + + # PLOT + plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color + plt.rcParams['xtick.labelsize'] = 14 + plt.rcParams['ytick.labelsize'] = 14 + plt.rcParams['patch.edgecolor'] = "#000000" + plt.rc('figure', figsize=(11.69, 8.27)) # A4 format + name1 = name1.split(".tabular")[0] + + with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: + print("dataset: ", name1) + integers, data_array = readFileReferenceFree(file1) + data_array = numpy.array(data_array) + print("total nr of tags:", len(data_array)) + + # filter tags out which contain any other character than ATCG + valid_bases = ["A", "T", "G", "C"] + tagsToDelete = [] + for idx, t in enumerate(data_array[:, 1]): + for char in t: + if char not in valid_bases: + tagsToDelete.append(idx) + break + + if len(tagsToDelete) != 0: # delete tags with N in the tag from data + print("nr of tags with any other character than A, T, C, G:", len(tagsToDelete), + float(len(tagsToDelete)) / len(data_array)) + index_whole_array = numpy.arange(0, len(data_array), 1) + index_withoutN_inTag = numpy.delete(index_whole_array, tagsToDelete) + data_array = data_array[index_withoutN_inTag, :] + integers = integers[index_withoutN_inTag] + print("total nr of filtered tags:", len(data_array)) + + int_f = numpy.array(data_array[:, 0]).astype(int) + data_array = data_array[numpy.where(int_f >= minFS)] + integers = integers[integers >= minFS] + + # select family size for tags + if maxFS > 0: + int_f2 = numpy.array(data_array[:, 0]).astype(int) + data_array = data_array[numpy.where(int_f2 <= maxFS)] + integers = integers[integers <= maxFS] + + if onlyDuplicates is True: + tags = data_array[:, 2] + seq = data_array[:, 1] + + # find all unique tags and get the 
indices for ALL tags, but only once + u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) + d = u[c == 2] + + # get family sizes, tag for duplicates + duplTags_double = integers[numpy.in1d(seq, d)] + duplTags = duplTags_double[0::2] # ab of DCS + duplTagsBA = duplTags_double[1::2] # ba of DCS + + duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab + duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags + + if minFS > 1: + duplTags_tag = duplTags_tag[(duplTags >= minFS) & (duplTagsBA >= minFS)] + duplTags_seq = duplTags_seq[(duplTags >= minFS) & (duplTagsBA >= minFS)] + duplTags = duplTags[(duplTags >= minFS) & (duplTagsBA >= minFS)] # ab+ba with FS >= minFS + + data_array = numpy.column_stack((duplTags, duplTags_seq)) + data_array = numpy.column_stack((data_array, duplTags_tag)) + integers = numpy.array(data_array[:, 0]).astype(int) + print("DCS in whole dataset", len(data_array)) + + print("min FS", min(integers)) + print("max FS", max(integers)) + + # HD analysis for a subset of the tag + if subset > 0: + tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) + tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) + + flanking_region_float = float((len(tag1[0]) - subset)) / 2 + flanking_region = int(flanking_region_float) + if flanking_region_float % 2 == 0: + tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1]) + tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2]) + else: + flanking_region_rounded = int(round(flanking_region, 1)) + flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded + tag1_shorten = numpy.array( + [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1]) + tag2_shorten = numpy.array( + [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2]) + + data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)]) + data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2])) + + print("length of tag= ", len(data_array[0, 1])) + # select sample: if no size given --> all vs. 
all comparison + if index_size == 0: + result = numpy.arange(0, len(data_array), 1) + else: + numpy.random.shuffle(data_array) + unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags + result = numpy.random.choice(unique_indices, size=index_size, + replace=False) # array of random sequences of size=index_size + + # comparison of random tags to the whole dataset + result1 = data_array[result, 1] # random tags + result2 = data_array[:, 1] # all tags + print("sample size= ", len(result1)) + + # HD analysis of whole tag + proc_pool = Pool(nproc) + chunks_sample = numpy.array_split(result1, nproc) + ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) + proc_pool.close() + proc_pool.join() + ham = numpy.concatenate(ham).astype(int) + # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: + # for h, tag in zip(ham, result1): + # output_file1.write("{}\t{}\n".format(tag, h)) + + proc_pool_b = Pool(nproc) + diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) + diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) + proc_pool_b.close() + proc_pool_b.join() + HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]), + numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int) + HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]), + numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int) + minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]), + numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int) + HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), + numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) + HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), + numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) + + rel_Diff1 = numpy.concatenate([item[5] for item in diff_list_a]) + rel_Diff2 = numpy.concatenate([item[5] for item in diff_list_b]) + diff1 = numpy.concatenate([item[0] for item in diff_list_a]) + diff2 = numpy.concatenate([item[0] for item in diff_list_b]) + + diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a]) + diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b]) + minHD_tags = numpy.concatenate([item[4] for item in diff_list_a]) + minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a]) + minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b]) + + chimera_tags1 = sum([item[10] for item in diff_list_a], []) + chimera_tags2 = sum([item[10] for item in diff_list_b], []) + + rel_Diff = [] + diff_zeros = [] + minHD_tags_zeros = [] + diff = [] + chimera_tags = [] + for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \ + zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2, + chimera_tags1, chimera_tags2): + relatives = numpy.array([rel1, rel2]) + absolutes = numpy.array([d1, d2]) + max_idx = numpy.argmax(relatives) + rel_Diff.append(relatives[max_idx]) + diff.append(absolutes[max_idx]) + + if all(i is not None for i in [zeros1, zeros2]): + diff_zeros.append(max(zeros1, zeros2)) + minHD_tags_zeros.append(str(tag1)) + tags = [ctag1, ctag2] + chimera_tags.append(tags) + elif zeros1 is not None and zeros2 is None: + diff_zeros.append(zeros1) + 
minHD_tags_zeros.append(str(tag1)) + chimera_tags.append(ctag1) + elif zeros1 is None and zeros2 is not None: + diff_zeros.append(zeros2) + minHD_tags_zeros.append(str(tag2)) + chimera_tags.append(ctag2) + + chimera_tags_new = chimera_tags + data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new)) + + checked_tags = [] + stat_maxTags = [] + + with open(output_chimeras_tabular, "w") as output_file1: + output_file1.write("chimera tag\tfamily size, read direction\tsimilar tag with TD=0\n") + for i in range(len(data_chimeraAnalysis)): + tag1 = data_chimeraAnalysis[i, 0] + + info_tag1 = data_array[data_array[:, 1] == tag1, :] + fs_tag1 = ["{} {}".format(t[0], t[2]) for t in info_tag1] + + if tag1 in checked_tags: # skip tag if already written to file + continue + + sample_half_a = tag1[0:(len(tag1)) / 2] + sample_half_b = tag1[len(tag1) / 2:len(tag1)] + + max_tags = data_chimeraAnalysis[i, 1] + if len(max_tags) > 1 and len(max_tags) != len(data_array[0, 1]) and type(max_tags) is not numpy.ndarray: + max_tags = numpy.concatenate(max_tags) + max_tags = numpy.unique(max_tags) + stat_maxTags.append(len(max_tags)) + + info_maxTags = [data_array[data_array[:, 1] == t, :] for t in max_tags] + + chimera_half_a = numpy.array([t[0:(len(t)) / 2] for t in max_tags]) # first half of chimera tag + chimera_half_b = numpy.array([t[len(t) / 2:len(t)] for t in max_tags]) # second half of chimera tag + + new_format = [] + for j in range(len(max_tags)): + fs_maxTags = ["{} {}".format(t[0], t[2]) for t in info_maxTags[j]] + + if sample_half_a == chimera_half_a[j]: + max_tag = "*{}* {} {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags)) + new_format.append(max_tag) + + elif sample_half_b == chimera_half_b[j]: + max_tag = "{} *{}* {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags)) + new_format.append(max_tag) + checked_tags.append(max_tags[j]) + + sample_tag = "{} {}\t{}".format(sample_half_a, sample_half_b, ", ".join(fs_tag1)) + output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format))) + checked_tags.append(tag1) + + output_file1.write( + "This file contains all tags that were identified as chimeras as the first column and the " + "corresponding tags which returned a Hamming distance of zero in either the first or the second " + "half of the sample tag as the second column.\n" + "The tags were separated by an empty space into their halves and the * marks the identical half.") + output_file1.write("\n\nStatistics of nr. of tags that returned max. 
TD (2nd column)\n") + output_file1.write("minimum\t{}\ttag(s)\n".format(numpy.amin(numpy.array(stat_maxTags)))) + output_file1.write("mean\t{}\ttag(s)\n".format(numpy.mean(numpy.array(stat_maxTags)))) + output_file1.write("median\t{}\ttag(s)\n".format(numpy.median(numpy.array(stat_maxTags)))) + output_file1.write("maximum\t{}\ttag(s)\n".format(numpy.amax(numpy.array(stat_maxTags)))) + output_file1.write("sum\t{}\ttag(s)\n".format(numpy.sum(numpy.array(stat_maxTags)))) + + lenTags = len(data_array) + len_sample = len(result1) + + quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags + seq = numpy.array(data_array[result, 1]) # tags of sample + ham = numpy.asarray(ham) # HD for sample of tags + + if onlyDuplicates is True: # ab and ba strands of DCSs + quant = numpy.concatenate((quant, duplTagsBA[result])) + seq = numpy.tile(seq, 2) + ham = numpy.tile(ham, 2) + diff = numpy.tile(diff, 2) + rel_Diff = numpy.tile(rel_Diff, 2) + diff_zeros = numpy.tile(diff_zeros, 2) + + nr_chimeric_tags = len(data_chimeraAnalysis) + print("nr of chimeras", nr_chimeric_tags) + + # prepare data for different kinds of plots + # distribution of FSs separated after HD + familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) + list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS + + # get FS for all tags with min HD of analysis of chimeric reads + # there are more tags than sample size in the plot, because one tag can have multiple minimas + if onlyDuplicates: + seqDic = defaultdict(list) + for s, q in zip(seq, quant): + seqDic[s].append(q) + else: + seqDic = dict(zip(seq, quant)) + + lst_minHD_tags = [] + for i in minHD_tags: + lst_minHD_tags.append(seqDic.get(i)) + + if onlyDuplicates: + lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], + [item_b[1] for item_b in lst_minHD_tags])).astype(int) + # histogram with absolute and relative difference between HDs of both parts of the tag + listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) + listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, rel_Diff) + # chimeric read analysis: tags which have TD=0 in one of the halfs + if len(minHD_tags_zeros) != 0: + lst_minHD_tags_zeros = [] + for i in minHD_tags_zeros: + lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads + if onlyDuplicates: + lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], + [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int) + + # histogram with HD of non-identical half + listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( + lst_minHD_tags_zeros, diff_zeros) + + if onlyDuplicates is False: + listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros = hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array) + + # plot Hamming Distance with Family size distribution + plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, rel_freq=rel_freq, + subtitle="Tag distance separated by family size", lenTags=lenTags, + xlabel="TD", nr_above_bars=nr_above_bars, len_sample=len_sample) + + # Plot FSD with separation after + plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, rel_freq=rel_freq, + originalCounts=quant, subtitle="Family size distribution separated by Tag distance", + pdf=pdf, relative=False, 
diff=False) + + # Plot HD within tags + plotHDwithinSeq(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, + rel_freq=rel_freq, len_sample=len_sample) + + # Plot difference between HD's separated after FSD + plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, + subtitle="Delta Tag distance within tags", lenTags=lenTags, rel_freq=rel_freq, + xlabel="absolute delta TD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample) + + plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, + subtitle="Chimera Analysis: relative delta Tag distance", lenTags=lenTags, rel_freq=rel_freq, + xlabel="relative delta TD", relative=True, nr_above_bars=nr_above_bars, + nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) + + # plots for chimeric reads + if len(minHD_tags_zeros) != 0: + # HD + plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, + subtitle="Tag distance of chimeric families (CF)", rel_freq=rel_freq, + lenTags=lenTags, xlabel="TD", relative=False, + nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) + + if onlyDuplicates is False: + plotHDwithDCS(listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros, pdf=pdf, + subtitle="Tag distance of chimeric families (CF)", rel_freq=rel_freq, + lenTags=lenTags, xlabel="TD", relative=False, + nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) + + # print all data to a CSV file + # HD + summary, sumCol = createTableHD(list1, "TD=") + overallSum = sum(sumCol) # sum of columns in table + + # FSD + summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False) + overallSum5 = sum(sumCol5) + + # HD of both parts of the tag + summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)]) + overallSum9 = sum(sumCol9) + + # HD + # absolute difference + summary11, sumCol11 = createTableHD(listDifference1, "diff=") + overallSum11 = sum(sumCol11) + # relative difference and all tags + summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") + overallSum13 = sum(sumCol13) + + # chimeric reads + if len(minHD_tags_zeros) != 0: + # absolute difference and tags where at least one half has HD=0 + summary15, sumCol15 = createTableHD(listDifference1_zeros, "TD=") + overallSum15 = sum(sumCol15) + + if onlyDuplicates is False: + summary16, sumCol16 = createTableHDwithDCS(listDCS_zeros) + overallSum16 = sum(sumCol16) + + output_file.write("{}\n".format(name1)) + output_file.write("nr of tags{}{:,}\nsample size{}{:,}\n\n".format(sep, lenTags, sep, len_sample)) + + # HD + createFileHD(summary, sumCol, overallSum, output_file, + "Tag distance separated by family size", sep) + # FSD + createFileFSD2(summary5, sumCol5, overallSum5, output_file, + "Family size distribution separated by Tag distance", sep, + diff=False) + + # output_file.write("{}{}\n".format(sep, name1)) + output_file.write("\n") + max_fs = numpy.bincount(integers[result]) + output_file.write("max. 
+        output_file.write("max. family size in sample:{}{}\n".format(sep, max(integers[result])))
+        output_file.write("absolute frequency:{}{}\n".format(sep, max_fs[len(max_fs) - 1]))
+        output_file.write(
+            "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs)))
+
+        # HD within tags
+        output_file.write(
+            "Chimera Analysis:\nThe tags are split into two halves (part a and b) for which the Tag distances (TD) are calculated separately.\n"
+            "The tag distance of the first half (part a) is calculated by comparing part a of the tag in the sample against all a parts in the dataset and by selecting the minimum value (TD a.min).\n"
+            "In the next step, we select those tags that showed the minimum TD and estimate the TD for the second half (part b) of the tag by comparing part b against the previously selected subset.\n"
+            "The maximum value then represents TD b.max. Finally, this process is repeated starting with part b instead, and TD b.min and TD a.max are calculated.\n"
+            "Next, the absolute differences between TD a.min & TD b.max and TD b.min & TD a.max are estimated (delta TD).\n"
+            "These are then divided by the sum of both parts (TD a.min + TD b.max or TD b.min + TD a.max, respectively), which gives the relative differences between the partial TDs (rel. delta TD).\n"
+            "For simplicity, we use the maximum value of the relative differences and the respective delta TD.\n"
+            "Note that when only tags that can form a DCS are included in the analysis, the family sizes for both directions (ab and ba) of the strand will be included in the plots.\n")
+
+        output_file.write("\nlength of one half of the tag{}{}\n\n".format(sep, len(data_array[0, 1]) / 2))
+
+        createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
+                              "Tag distance of each half in the tag", sep)
+        createFileHD(summary11, sumCol11, overallSum11, output_file,
+                     "Absolute delta Tag distance within the tag", sep)
+
+        createFileHD(summary13, sumCol13, overallSum13, output_file,
+                     "Chimera analysis: relative delta Tag distance", sep)
+
+        if len(minHD_tags_zeros) != 0:
+            output_file.write(
+                "All tags are filtered: only those tags where one half is identical (TD=0), and which therefore have a relative delta TD of 1, are kept.\n"
+                "These tags are considered as chimeras.\n")
+            createFileHD(summary15, sumCol15, overallSum15, output_file,
+                         "Tag distance of chimeric families separated by FS", sep)
+
+            if onlyDuplicates is False:
+                createFileHDwithDCS(summary16, sumCol16, overallSum16, output_file,
+                                    "Tag distance of chimeric families separated by DCS and single SSCS (ab, ba)", sep)
+
+        output_file.write("\n")
+
+
+if __name__ == '__main__':
+    sys.exit(Hamming_Distance_Analysis(sys.argv))
diff -r 000000000000 -r 7fc3c8704c05 td.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/td.xml	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,145 @@
+ + + Tag distance analysis of duplex tags + + fsd_macros.xml + + + python + matplotlib + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+`_. In this context the Hamming distance is simply the number of differences between two tags. For a randomly selected subset of tags (default n=1000), the tool compares each tag of the subset with all tags of the complete dataset. Each tag will differ from the other tags by a certain number of nucleotides; the tool keeps the smallest difference observed with any other tag.
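+
+As a minimal sketch, this idea can be written as follows (an illustration only, not this tool's exact implementation; the function names are chosen for this example)::
+
+    import random
+
+    def hamming(a, b):
+        # number of positions at which two equal-length tags differ
+        return sum(x != y for x, y in zip(a, b))
+
+    def min_tag_distances(tags, sample_size=1000):
+        # for each tag of a random subset, keep the smallest Hamming
+        # distance observed against any other tag of the whole dataset
+        subset = random.sample(tags, min(sample_size, len(tags)))
+        return [min(hamming(t, other) for other in tags if other != t)
+                for t in subset]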
+
+**Input**
+
+This tool expects a tabular file with the tags of all families, the family sizes and information about forward (ab) and reverse (ba) strands::
+
+ 1 2                        3
+ -----------------------------
+ 1 AAAAAAAAAAAAAAAAATGGTATG ba
+ 3 AAAAAAAAAAAAAATGGTATGGAC ab
+
+.. class:: infomark
+
+**How to generate the input**
+
+The first step of the `Du Novo Analysis Pipeline `_ is the **Make Families** tool or the **Correct Barcodes** tool that produces output in this form::
+
+ 1                        2  3     4
+ ------------------------------------------------------
+ AAAAAAAAAAAAAAATAGCTCGAT ab read1 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAATAGCTCGAT ab read2 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAATAGCTCGAT ab read3 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAAAATGGTATG ba read3 CGCTACGTGACTAAAACATG
+
+We only need columns 1 and 2. These two columns can be extracted from this dataset using the **Cut** tool::
+
+ 1                        2
+ ---------------------------
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAAAATGGTATG ba
+
+Next, the tags are sorted in ascending or descending order using the **Sort** tool::
+
+ 1                        2
+ ---------------------------
+ AAAAAAAAAAAAAAAAATGGTATG ba
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+
+Finally, unique occurrences of each tag are counted. This is done with the **Unique lines** tool, which adds an additional column with the counts; these also represent the family size (column 1)::
+
+ 1 2                        3
+ -----------------------------
+ 1 AAAAAAAAAAAAAAAAATGGTATG ba
+ 3 AAAAAAAAAAAAAATGGTATGGAC ab
+
+These data can now be used in this tool.
+
+**Output**
+
+The output is one PDF file with various plots of the tag distance, a tabular file with the summarized data of the plots and a tabular file with the chimeras. The PDF file contains several pages:
+
+ 1. The first page contains a graph representing the minimum tag distance (smallest number of differences) categorized by family size.
+
+ 2. The second page contains the same information as the first page, but plots the family size categorized by the minimum tag distance.
+
+ 3. The third page contains the **first step** of the **chimera analysis**, which examines the differences between the tags at both ends of a read (a/b). Chimeras can be distinguished by carrying the same tag at one end combined with multiple different tags at the other end of a read. Here, we describe the calculation of the TDs for only one tag in detail, but the process is repeated for each tag in the sample (default n=1000). First, the tool splits the tag into its upstream and downstream part (named a and b) and compares part a with all other a parts of the families in the dataset. Next, the tool estimates the sequence differences (TD) among the a parts, extracts those tags with the smallest difference (TD a.min) and calculates the TD of the b part within this subset. The tags with the largest differences are extracted to estimate the maximum TD (TD b.max). The process is repeated starting with the b part instead and estimates TD b.min and TD a.max. Next, we calculate the sum of TD a.min and TD b.max.
+
+ 4. The fourth page contains the **second step** of the **chimera analysis**: the absolute difference (=delta TD) between the partial TDs (TD a.min & TD b.max and TD b.min & TD a.max). The partial TDs of chimeric tags are normally very different, which means that multiple combinations of the same a part with different b parts are likely. But it is possible that small delta TDs occur when one half of a tag is identical to other halves in the data. For this purpose, the relative difference between the partial TDs is estimated in the next step.
+
+ 5. The fifth page contains the **third step** of the **chimera analysis**: the relative differences of the partial TDs (=relative delta TD). These start from the absolute difference between TD a.min and TD b.max (the delta TD of step 4). Since it is not known whether this absolute difference originates from a low and a very large TD within a tag or from an identical half (TD=0), the tool estimates the relative delta TD as the ratio of the difference to the sum of the partial TDs. In a chimera, it is expected that only one end of the tag contributes to the TD of the whole tag. In other words, if the same a part is observed in combination with several different b parts, then one end will have a TD = 0. Thus, the absolute TD difference between the parts (|TD a.min - TD b.max| or |TD b.min - TD a.max|) is the same as the sum of the parts, and the ratio of the difference to the sum (relative delta TD = |TD a.min - TD b.max| / (TD a.min + TD b.max) or |TD b.min - TD a.max| / (TD b.min + TD a.max)) will equal 1 in chimeric families. Here, the maximum value of the relative delta TD and the respective delta TD (in step 4) are selected, and the plot can be interpreted as follows (a sketch of this computation is given after this list):
+
+  - A low relative difference indicates that the total TD is equally distributed over the two partial TDs. This case would be expected if all tags originate from different molecules.
+  - A relative delta TD of 1 means that one part of the tags is identical. Since it is very unlikely that two different tags have a TD of 0 by chance, the TDs in the other half were probably artificially introduced and represent chimeric families.
+
+ 6. The sixth page is an analysis only of **chimeric tags** (relative delta TD = 1) from step 5.
+
+ 7. The last page is only generated when the parameter **"Include only DCS in the analysis?"** is set to **False (NO)**. The graph represents the **TD of the chimeric tags** that form a DCS (complementary ab and ba).
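+
+As a minimal sketch, steps 3-5 for a single tag can be condensed as follows (a simplified illustration under the definitions above, not this tool's exact implementation; hamming is the mismatch count from the earlier sketch)::
+
+    def chimera_metrics(tag, tags):
+        # split the tag into its a (upstream) and b (downstream) half
+        half = len(tag) // 2
+        part_a, part_b = tag[:half], tag[half:]
+        others = [t for t in tags if t != tag]
+        # first step: smallest TD of part a against all other a parts (TD a.min)
+        td_a_min = min(hamming(part_a, t[:half]) for t in others)
+        # second step: among the tags reaching TD a.min, largest TD of part b (TD b.max)
+        subset = [t for t in others if hamming(part_a, t[:half]) == td_a_min]
+        td_b_max = max(hamming(part_b, t[half:]) for t in subset)
+        # third step: absolute and relative difference of the partial TDs;
+        # the tool repeats this starting from part b and keeps the maximum
+        delta_td = abs(td_a_min - td_b_max)
+        total = td_a_min + td_b_max
+        rel_delta_td = float(delta_td) / total if total > 0 else 0.0
+        return td_a_min, td_b_max, delta_td, rel_delta_td
+
+In a chimeric family, TD a.min = 0 and TD b.max > 0, so the delta TD equals the sum of the partial TDs and the relative delta TD equals 1.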
+
+.. class:: infomark
+
+**Note:**
+Chimeras can be identical in the first or second part of the tag and can have an identical TD with multiple tags. Therefore, the second column of the output file can have multiple tag entries. The file also contains the family sizes and the direction of the read (ab, ba). The asterisks mark the identical part of the tag::
+
+ 1                                    2
+ --------------------------------------------------------------------------------------------------
+ GAAAGGGAGG GCGCTTCACG 1 ba           GCAATCGACG *GCGCTTCACG* 1 ba
+ CCCTCCCTGA GGTTCGTTAT 1 ba           CGTCCTTTTC *GGTTCGTTAT* 1 ba, GCACCTCCTT *GGTTCGTTAT* 1 ba
+ ATGCTGATCT CGAATGCATA 55 ba, 59 ab   AGGTGCCGCC *CGAATGCATA* 27 ba, *ATGCTGATCT* GAATGTTTAC 1 ba
+    ]]>
+
+
+
diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_ba.bam
Binary file test-data/fsd_ba.bam has changed
diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_ba_DCS.fna
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fsd_ba_DCS.fna	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,50 @@
+>AAAAAAGGACCCTACCACCAACGT 8-5
+CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGCGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGNGTCCAACGCGTCCATGAGCTNCNACNCNCCACTGGTGCGCNNCGCNNGNCTNNNNNCAGNNNANNNCCNCANNNNNNCCNNNNNCNNNNNNNNNNNNNNNCCNNNNNNNNNNNNNN
+>AAAAAAGGATTCCAAATCTCTGGA 3-7
+TACTCCATGCCCCGGGCCACCTGGTAGGCACAGGACACCAGGTCCTTGAAGGTGAGCTGCTCCTCGGGCGGCTTGCAGGTGTCGAAGGAGTAGTCCAGGCCCGGGGGCCGCCGCGCCCGCAGAAACTCCCGCAGGTTACCCTTGGCCGCGTACTCCACCAGCACGTACAGGGNCCCTGGGGACACGGGCTCCTCAGACGGGNTGCCAGGCNCNGGAGGNCCGCNCAGCCGGNNNCCACCGCNNSNNNCCNNNNCCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNN
+>AAAAAAGGCCAGTTTAAAAAAACT 37-3
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTNGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCNNNTGGGCCTGGCANCCCNNCNGAGGAGCCNGNNNCCNCAGGTCCCCTGTACNNNCTNGTG
+>AAAAAAGGCCCAGATCTCTTAAAA 194-31
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTCTGAGGAGCCCGTGTCCCCAGGGCCCCTGTACGTGCTGGTGGAGTACGCGGCCAAGGGTAACCTGCGGGAGTTTCAGTCATTTTAAG
+>AAAAAAGGGGCCTCATGCGTCAGT 4-3
+CTAGGCTCTACATGGTGAGCAGAGACGAGGAGAGGGGAGCCCGCCTGGCTGCAGAGAGGGCTCACACAGCCCAGGACCAGCGTGGGCCGAGGTGGGGCTCCAGGAGGCCTGGCGGGCAGGCAGCTCAGAACCTGGTATCTACTTTCTGTTACCTGTCGCTTGAGCNGGAAGNGGGAGANCTTGTGCACGGTGGNNGANCCNAGGCCTTNCTTGGGGGGNNTGCGNNNNNNNNNNNNNNNNNCNNNNNNNNNNNNNNGGNNNANNNGNNNNNNNNNNNNNNNNT
+>AAAAAAGTCCTTCGACTCAAGCGG 4-5
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGAGTACACCTGCCTGGCGGGCAATTCTATTGGGTTTTCTCATCACTCTGCGTGGCTGGTGGTNCTGCCAGGTACCGGCNTCTNCTGCTGCTGNNGNNCNNCNNTNNCNNNNNNNNNNTNNCNNNNNNNNNNNN
+>AAAAAAGTGGGATCGGGTTGCAGC 11-6
+CTGGGGTCCTGGCTCTGCCCAGTTCCCGCCTCCACCCCTGAAGCCTGAGCTCTGCAGGACACGTACACGTCACTCTGGTGAGTGTAGACTCNGTCAAACAAGGCCTCAGGCGCCATCCACTTCACGGGCAGCCGGCCCTGGGAGGGTGTGGGAAGGCGGTGTTGGCGCCAGGCGTCCTACTGGCATGACCCCCACCCCCGCNCCCCAGGGCCGGGCNCACGTTGGTTGTCTTCTTGNANTAGTCNNNNTNGTGCNCGTNNNNNNNCNNNNNNNNNNNNNCNNNN
+>AAAAAATAATTTCGCCCTCGAGTA 16-4
+CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGCGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGAGTCCAACGCGTCCATGAGCTCCANCNCACCACTGGNNNGNNTCNCANNGCTNTCNNCAGNNNNNGGCCNNNNNCTGNNCNANNTCNCC
+>AAAAAATAGTTAAGCCGACACACT 5-7
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGNGTACACCTGCCTGGCGGGCAATTCTATNGGGTTTTCTCATCACTCTGCGTGGNNGGTGGTGCTGCCAGGTACCGGCNTCTGCTGCTNNNGCTGNNCCGCNNNNNNNNNNNNNNNNNGNNNNNNNNNNCN
+>AAAAAATCAGAATAGAGCGCTTTC 4-3
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGNGGACGCCGGGGAGTANACCTGCCTGGCGGGCAATTCTATTGGGTTTTCTCATCACTCTGCGTGNCTGGTGGNGCTGCCNGNNNCCNNNNNCTNNNNNNNNNGCTNNNNNNNNNTNNNNNNNNNNNNNNNNNCTNNCNNNNNNNN +>AAAAAATCATAAACATTTTAACAA 65-21 +CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGTGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGAGTCCAACGCGTCCATGAGCTCCAACACACCACTGGTGCGCATCGCAAGGCTGTCCTCAGGGGAGGGCCCCACGCTGGCCAATGTCTCCNNNCNNGNGCNGCCTGCCNNNNNCAAANGG +>AAAAAATCTTGACTCGGTACACAA 9-3 +CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGMCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCNNCNGAGGAGCCNNNNTCNCCAGGNCCNNTGNANNNNCNGGNGGAGTANNNGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>AAAAAATGACGAACGATTCGTCAT 3-6 +CTAGAGGGCCAGACCCTGGAGAGAAGGAGCCCAGCAGAGCCAGCCAGTCCCACACCGCCACCAGGCGCCCGGGAGACACCAGAGCCACAGGAGAGGCCTTTGGGGACCCAGATGGGAAGTGGGCTCGAGGGGGCTGAGGGGGCCCCTCTGGGACCAGGACCGGGCCAGGCCAACTTTGTCCCCACNNTGGGCACAGGGNCAGGAGNNNNNGCNCAAGNANNNNNNNNNNNNNNNNNNTCNNNNNNNNNNNCNNNNNNNNNNKNNNNNNNNNNNNNNNNNNNCNNNNNNNN +>AAAAAATGTGCCGAACCTTGGCGA 7-10 +CTAGAGGGCCAGACCCTGGAGAGAAGGAGCCCAGCAGAGCCAGCCAGTCCCACACCGCCACCAGGCGCCCGGGAGACACCAGAGCCACAGGAGAGGCCTTTGGGGACCCAGATGGGAAGTGGGCTCGAGGGGGCTGAGGGGGCCCCTCTGGGACCAGGACCGGGCCAGGCCAACTTTGTCCCCACACTGGGCACAGGGCCAGGAGTGAGGGCTCAAGAAGCGGGNNGNNNGNAAGTCNCAGGATTNNNNNCNNNNNTNNNANNTTTGGCNNNNNNNNNNNNANN +>AAAAAATTGAATCCGTGGATATAG 3-8 +CTAGACTCACTGGCGTTACTGACTGCGAGACCCTCCAGACAAGGCGCGTGCTGAGGTTCTGAGCCCCCTTCCGCTCCCAGTGGTGCCTGCGGCTCTGGGCCAGGGGCATCCATGGGAGCCCCGTGGGGGGGGGGNNCCNNNCCNGGCCNNAACGCCCATNTCTTTNCNGCNNNGNNNGANCNGGTGGAGGNNNGACNAGGCGGGNNGTGNNTNNNNMNGNCNNNNNNNNNNNNGNNNNGNGNNNNNNNNNTGTNNNTCMNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>AAAAAATTGCATGCATCGTCCCTG 6-9 +CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGCGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGAGTCCAACGCGTCCATGAGCTCCAACACACCACTGGTGCGNATCGCAAGGCTGTCCTCAGGGGNGGNCCNCNNNNTGNCNNATNNCNCCGNGCNNNNGCNNCCTGCNNNNNNCNNNN +>AAAAAATTGGCATTGTGTATGCAT 18-5 +CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCNGNNCNNNNGNNNANNCCGTGNCCCCAGGGNCNNTGNNNNNNNNGGTNNNNNACGNNNNCNANNNTANCCNNNNNNNGT +>AAAAAATTTTCCCACCAAAATTTC 18-3 +CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTNNNGGNTNGGCGGCCCTCCTGGGCCTGGCNGCNCGTCTGAGGNNCCNNNGTCNNCAGGNNNNCNNTNNNNNNNGGNGNNNNANNNNGNNNNGNNNNNNNNNNNNNNNNNNCNGNNNNNNNNNNNN +>AAAAAATTTTTCTTTACCACCTGT 4-4 +CTAGACTCACTGGCGTTACTGACTGCGAGACCCTCCAGACAAGGCGCGTGCTGAGGTTCTGAGCCCCCTTCCGCTCCCAGTGGTGCCTGCGGCTCTGGGCCAGGGGCATCCATGGGAGCCCCGTGGGGGGGGGGGCCNGGCNNNNCNNNAACGCCCANNNTNTTTNNANCNNNNGNGNNNNNGNNNNNNNSNNNCNNNNNNNNNNGNNNNNNNNNNNNNCNNNNNNNNCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>AAAAACAACATAGCTTGAAAATTT 7-4 +CTAGAGGGCCAGACCCTGGAGAGAAGGAGCCCAGCAGAGCCAGCCAGTCCCACACCGCCACCAGGCGCCCGGGAGACACCAGAGCCACAGGAGAGGCCTTTGGGGACCCAGATGGGAAGTGGGCTCGAGGGGGCTGAGGGGGCCCCTCTGGGACCAGGACCGGGCCAGGCCAACTTTGTCCCCACACTGGGCNCAGGGCCAGGAGTGAGGGCNCNAGNAGCNGGACGGNNGTAAGTNNNNNNANTNNNNNNNNTNNNNGCNNNNNTNNNNNNNCNNNNNNNAGN +>AAAAACAACCAACGTTCTATCTCT 18-4 
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCNNNNGNGGNNNNNGNGTCCCCAGGGCCCCTGNNNNNNNNGGNNGNNNNNNNGGNCNNGNNNNNCNTGNNNNANNNNNNNNNNNNNNNNNNN +>AAAAACAAGATAATTGGCGCCCGT 5-22 +CTGCCATACACCCGTCCCAGGAGCATGTCCACAGAACCCCAGCCACACCCAACATCCGCCACATCCCTGACGGCCCCTAAACCCAGCCGGGCCTCTGACTGGTGGCTGTTTCACCCCCACCACCAAGCCCCCTACAGCCAACGCTGGCCCTCAGCACCACTGACCGGGCCCGAGACAGCTCCCATTTGGGGTCGGCAGGCAGCTCGAGCTCGGANACATTGGCCAGCGTGGNGNNNNNCCCNNNGNNCNNCCNTNNNNNNYNNNNNANNNNNNNNNNNNNNNMNNN +>AAAAACAAGCATCTGTCGACACTA 69-60 +TGCCGCCTGCGCAGCCCCCCCAAGAAAGGCCTGGGCTCCCCCACCGTGCACAAGATCTCCCGCTTCCCGCTCAAGCGACAGGTAACAGAAAGTAGATACCAGGTTCTGAGCTGCCTGCCCGCCAGGCCTCCTGGAGCCCCACCTCGGCCCACGCTGGTCCTGGGCTGTGTGAGCCCTCTCTGCAGCCAGGCGGGCTCCCCTCTCCTCGTCTCTGCTCACCATGTAGAGCCTAGGGAGTCATAGTGTCGACAG +>AAAAACAATCTTAACCGCGATCTA 10-4 +CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTCTGAGGNGCCNGNNTCCCCNGGTCCCCTGTNNGTGCTGGNGGANTACNNGGNCNAGGGTNNCCNGCNNNNGNTNNTGNNNGNNNNNNNN +>AAAAACACGCGGACTTTCCGCATT 4-7 +ACTCCATGCCCCGGGCCACCTGGTAGGCACAGGACACCAGGTCCTTGAAGGTGAGCTGCTCCTCGGGCGGCTTGCAGGTGTCGAAGGAGTAGTCCAGGCCCGGGGGCCGCCGCGCCCGCAGAAACTCCCGCAGGTTACCCTTGGCCGCGTACTCCACCAGCACGTACAGGGGCCCTGGGGACACGGGCTCCTCAGACGGGCTGCCAGGCCCAGGNGGGCCGCCCAGCCGGCACCACCGCCGCTACCGCNNCTACCNCCNNNNCGTGCNNNNNNCNNNCAGNNNNNN \ No newline at end of file diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_ba_data.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fsd_ba_data.tab Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,32 @@ +10 AAAAAACATCCCAATAAGAAATCA ab +9 AAAAAACATCCCAATAAGAAATCA ba +4 AAAAAAGTCCTTCGACTCAAGCGG ab +5 AAAAAAGTCCTTCGACTCAAGCGG ba +5 AAAAAATAGTTAAGCCGACACACT ab +7 AAAAAATAGTTAAGCCGACACACT ba +7 AAAAAATGTGCCGAACCTTGGCGA ab +10 AAAAAATGTGCCGAACCTTGGCGA ba +7 AAAAACAACATAGCTTGAAAATTT ab +4 AAAAACAACATAGCTTGAAAATTT ba +81 ATTCGGATAATTCGACGCAACATT ab +11 ATTCGGATAATTCGACGCAACATT ba +41 ATTCGTCGACAATACAAAGGGGCC ab +226 ATTCGTCGACAATACAAAGGGGCC ba +6 ATTGCCAGTGTGGGCTGGTTAGTA ab +41 ATTGCCAGTGTGGGCTGGTTAGTA ba +50 ATTTCGCGACCATCCGCCACTTTG ab +332 ATTTCGCGACCATCCGCCACTTTG ba +64 CAAACTTTAGCACAGTGTGTGTCC ab +57 CAAACTTTAGCACAGTGTGTGTCC ba +85 ATAAACGGCCTTCGACATTGTGAC ab +15 ATAAACGGCCTTCGACATTGTGAC ba +11 ATAAAGTCACCTGTGAATACGTTG ab +35 ATAAAGTCACCTGTGAATACGTTG ba +83 ATAAATCGAAACCGTGCCCAACAA ab +63 ATAAATCGAAACCGTGCCCAACAA ba +9 ATTTAGATATTTTCTTCTTTTTCT ab +7 ATTTAGATATTTTCTTCTTTTTCT ba +7 ATTTAGTTATCCGTCGGCGACGAA ab +3 ATTTAGTTATCCGTCGGCGACGAA ba +8 ATTTAGTTTGAATTGCCCTGCGTC ab +9 ATTTAGTTTGAATTGCCCTGCGTC ba \ No newline at end of file diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_ba_output.pdf Binary file test-data/fsd_ba_output.pdf has changed diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_ba_output.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fsd_ba_output.tab Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,45 @@ +Dataset: fsd_ba_data.tab + AB BA +max. family size: 81 226 +absolute frequency: 1 1 +relative frequency: 0.100 0.100 + + +total nr. 
of reads before SSCS building 1312 + + +Values from family size distribution + before SSCS building after DCS building after trimming after alignment to reference +FS=1 0 0 0 0 +FS=2 0 0 0 0 +FS=3 1 8 0 0 +FS=4 2 10 6 2 +FS=5 2 5 3 2 +FS=6 1 3 3 0 +FS=7 5 5 3 4 +FS=8 1 2 0 1 +FS=9 3 2 1 3 +FS=10 2 2 3 2 +FS=11 2 1 0 2 +FS=12 0 0 1 0 +FS=13 0 0 4 0 +FS=14 0 0 0 0 +FS=15 1 0 1 0 +FS=16 0 1 3 0 +FS=17 0 0 3 0 +FS=18 0 3 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 12 8 3 4 +sum 32 50 34 20 + + +In the plot, the family sizes of ab and ba strands and of both duplex tags were used. +Whereas the total numbers indicate only the single count of the formed duplex tags. +total nr. of tags (unique, FS>=1) 16 +DCS (before SSCS building, FS>=1) 16 +total nr. of tags (unique, FS>=3) 16 +DCS (before SSCS building, FS>=3) 16 +after DCS building 25 +after trimming 17 +after alignment to reference 10 diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_ba_trimmed.fna --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fsd_ba_trimmed.fna Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,34 @@ +>AAAAAGAAAAACGATGCTTGACCA 4-6 +CTGGGGTCCTGGCTCTGCCCAGTTCCCGCCTCCACCCCTGAAGCCTGAGCTCTGCAGGACACGTACACGTCACTCTGGTGAGTGTAGACTCGGTCAAACAAGGCCTCAGGCGCCATCCACTTCACGGGCAGCCGGCCCTGGGAGGGTGTGGGAAGGCGGTGTTGGCGCCAGGCGTCCTACTGGCATGACCCCCACCCCCGC +>AAAAAGAAAAGTTTGCTTTTTCTT 13-17 +CTCCATGCCCCGGGCCACCTGGTAGGCACAGGACACCAGGTCCTTGAAGGTGAGCTGCTCCTCGGGCGGCTTGCAGGTGTCGAAGGAGTAGTCCAGGCCCGGGGGCCGCCGCGCCCGCAGAAACTCCCGCAGGTTACCCTTGGCCGCGTACTCCACCAGCACGTACAGGGGACCTGGGGACACGGGCTCCTCAGACGGGCTGCCAGGCCCAGGAGGGCCGCCCAGCCGGCACCACCGCCGCT +>AAAAAGAAATGAATTGGTCCTAGA 24-7 +CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTNTGAGGAGCC +>AAAAAGACAGCCTGAATTCCTTGT 17-4 +CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGNGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCNCCTGGGCCTGGCAGCCCGTCNGAG +>AAAAAGACGATTACACAATAACCT 16-7 +CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGTGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGAGTCCAACGCGTCCATGAGCTCCAACACACCACTGGT +>AAAAAGATACGGGAGGTGAATTGT 75-6 +CTCTGCGTGGCTGGTGGTGCTGCCAGGTACCGGCTTCTGCTGCTGCTGCTGCTCCGCACTGTCTGGGGGACGCTGGCTCGGGACACGCCAAAGCTGCCAGGACGGACGGGAATCCTGTGACTTACGGCCGTCCCGCTTCTTGAGCCCTCACTCCTGGCCCTGTGCCCAGTGTGGGGACAAAGTTGGCCTGGCCCGGTCCTGGTCCCAGAGGGGC +>AAAAAGATATTTTAATCGGCCCGA 7-6 +GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTMTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGAGTACACCTGCCTGGCGGGCAATTCTATTGGGTTTTCTCATCACTCTGCGTGGCTGGTGGTGCTGC +>AAAAAGATTACACTGAAATCTTTT 25-5 +CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCNTCTGAGGAGCCCNTGTCCCCAGGGCC +>AAAAAGCCATATGGTCGAAGAGAT 13-10 +ACTCCATGCCCCGGGCCACCTGGTAGGCACAGGACACCAGGTCCTTGAAGGTGAGCTGCTCCTCGGGCGGCTTGCAGGTGTCGAAGGAGTAGTCCAGGCCCGGGGGCCGCCGCGCCCGCAGAAACTCCCGCAGGTTACCCTTGGCCGCGTACTCCACCAGCACGTACAGGGGACCTGGGGACACGGGCTCCTCAGACGGGCTGCCAGGCCCAGGAGGGCCGCCCAGCCGGCACCACCGCC +>AAAAAGCGAAAGTGCCCCATATTT 13-16 
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCNTCTGAGGAGCCCGTGTCCCCAGGGCCCCTGTACGTNCTGGTGG +>AAAAAGCGATTTAACTGAAATTAT 5-4 +CTAGAGGGCCAGACCCTGGAGAGAAGGAGCCCAGCAGAGCCAGCCAGTCCCACACCGCCACCAGGCGCCCGGGAGACACCAGAGCCACAGGAGAGGCCTTTGGGGACCCAGATGGGAAGTGGGCTCGAGGGGGCTGAGGGGGCCCCTCTGGGACCAGGACCGGGCCAGGCCAACTTTGTCCCCACACTGGGCACAGGGCCAGGAGTGAGGGC +>AAAAAGCGGGGTGGCCTTACGCCC 17-10 +CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTCTGAGGAGCCCGNGTCCCCAGGGCCNCTGTACGT +>AAAAAGCTCTACCCCCACGAAGCG 5-10 +GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGAGTACACCTGCCTGGCGGGCAATTCTATTGGGTTTTCTCATCACTCTGCGTGGCTGGTGGTGCTGCCAGGTACCGNCTTCTGCTGCTGCTGC +>AAAAAGGATATGTCTAACATCCCT 15-16 +CTAGGCTCTACATGGTGAGCAGAGACGAGGAGAGGGGAGCCCGCCTGGCTGCAGAGAGGGCTCACACAGCCCAGGACCAGCGTGGGCCGAGGTGGGGCTCCAGGAGGCCTGGCGGGCAGGCAGCTCAGAACCTGGTATCTACTTTCTGTTACCTGTCGCTTGAGCGGGAAGCGGGAGATCTTGTGCACGGTGGGGGAGCCCAGGCCTTTCTTGGGGGGGCTGCGCAGGCGGCAGAGCGTCACAGCCGCCACCACCAGGATGAACAGGAAGAAGCCCACCCCGT +>AAAAAGGTACACCCGAGATGAACT 13-9 +CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTCTGAGGNGCCCGTGTCCCCAGGTCCCCTGTACGTGCTGGTGGAGTAC +>AAAAAGTAGCTTCGGTTCGGGTCT 12-4 +GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGANTACACCTGCCTGGCGGGCANTTCTATTGGGTTTTCTCATCACTCTGCGTG +>AAAAAGTAGGGACATAATTGACTT 4-4 +CTGGGGTCCTGGCTCTGCCCAGTTCCCGCCTCCACCCCTGAAGCCTGAGCTCTGCAGGACACGTACACGTCACTCTGGTGAGTGTAGACTCGGTCAAACAAGGCCTCAGGCGCCATCCACTTCACGGGCAGCCGGCCCTGGGAGGGTGTGGGAAGGCGRTGTTGGCGCCAGGCGTCCTACTGGCATGACCCCCACCCCCGCACCCCA \ No newline at end of file diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_data1.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fsd_data1.tab Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,112 @@ +1 AAAAAAAAAAAAAACCAAAACTTC ba +1 AAAAAAAAAAAAACCAGGCGTCGA ba +1 AAAAAAAAAAAAAGCTCCACGTTG ba +1 AAAAAAAAAAAAATCGTGGTTTGT ba +1 AAAAAAAAAAAAATTCACCCTTGT ba +7 AAAAAAAAAAAACACACTTAACTT ba +1 AAAAAAAAAAAACAGTGTTGAGAC ba +4 AAAAAAAAAAAACCGCTCCTCACA ba +1 AAAAAAAAAAAAGGCAACACAGAA ab +2 AAAAAAAAAAAATCTTTCTTTGAG ab +1 AAAAAAAAAAAATTGGGTTCCTTA ab +1 AAAAAAAAAAAGAGTCGCACCCAG ba +21 AAAAAAAAAAAGATCGTGGTTTGT ba +1 AAAAAAAAAAAGCGCAACACAGAA ab +3 AAAAAAAAAAAGGGCAACACAGAA ab +1 AAAAAAAAAAAGTAGCCCTAAACG ab +1 AAAAAAAAAAAGTCTTTCTTTGAG ab +1 AAAAAAAAAAATATCATAGACTCT ab +6 AAAAAAAAAAATATTCACCCTTGT ba +1 AAAAAAAAAAATATTCGAAAGTTA ba +3 AAAAAAAAAAATCACACTTAACTT ba +1 AAAAAAAAAAATCCGCTCCTCACA ba +1 AAAAAAAAAAATTAACTAAACTTA ab +1 AAAAAAAAAACAAATTCTATTATT ab +1 AAAAAAAAAACTCCCAGATTTTTT ab +1 AAAAAAAAAACTTCTGCTTGGCGG ba +11 AAAAAAAAAAGAATCGTGGTTTGT ba +5 AAAAAAAAAAGATAGCCCTAAACG ab +1 AAAAAAAAAAGCAATAATGCCAGT ab +2 AAAAAAAAAAGTACCGCACTCTCA ba +1 AAAAAAAAAAGTTCTTTCTTTGAG ab +1 AAAAAAAAAATAACTTCAATAATG ba +2 AAAAAAAAAATAATCATAGACTCT ab +1 AAAAAAAAAATAGTCTCACATTTA ab +1 AAAAAAAAAATATAACCTTTGGCG ab +3 AAAAAAAAACAAAATTCTATTATT ab +1 AAAAAAAAACAAGTACGCGGCATT ab +1 AAAAAAAAACAAGTACGCGGTATT ab +1 AAAAAAAAACAATATCGAATTAAC ab +3 AAAAAAAAACACGGTGAGACAAGG 
ba +1 AAAAAAAAACACGTTTCTCCCCTT ba +1 AAAAAAAAACATATCGTCCCGAGC ba +1 AAAAAAAAACCTACCTGAGGCCCC ab +3 AAAAAAAAACCTTATTACAGCGGA ab +1 AAAAAAAAACGATTCTCTGTATCT ba +1 AAAAAAAAACGTACCGCACTCTCA ba +4 AAAAAAAAACTACCCAGATTTTTT ba +1 AAAAAAAAACTAGATGAGACGACC ba +4 AAAAAAAAACTGTCTGCTTGGCGG ba +1 AAAAAAAAAGAAGTTTAATTTTAA ab +1 AAAAAAAAAGAATGCCTAAGACGA ba +6 AAAAAAAAAGACCGGCCTTAGACA ba +1 AAAAAAAAAGATATCGTGGTTTGT ba +1 AAAAAAAAAGCAATACTCAAGCTG ba +6 AAAAAAAAAGCAATGTCTAAGCCT ba +1 AAAAAAAAAGCACTGTCTAAGCCT ab +2 AAAAAAAAAGCTAATAATGCCAGT ab +1 AAAAAAAAAGTTTCGTGAAGGTCC ba +1 AAAAAAAAATAAAGGTCCGAATCT ab +1 AAAAAAAAATAAATGAGAGTGTAA ba +8 AAAAAAAAATAAGTCTCACATTTA ab +1 AAAAAAAAATAATAACCTCTGGCG ab +10 AAAAAAAAATAATAACCTTTGGCG ab +1 AAAAAAAAATAATCCCCTTTGTCG ab +6 AAAAAAAAATACGCAAACGCTGAG ab +4 AAAAAAAAATAGATCATAGACTCT ab +10 AAAAAAAAATAGATCATAGACTCT ba +10 AAAAAAAAATAGTAGGATTTCATG ba +7 AAAAAAAAATATGAATACCCTCGT ba +1 AAAAAAAAATATGCCACTTGATCC ba +1 AAAAAAAAATATTCTGCCACTTGA ba +3 AAAAAAAAATCAAACCAAGAGGAC ba +1 AAAAAAAAATCAGTACCCCTAAAC ab +12 AAAAAAAAATCCTAGTTAATGAAG ba +1 AAAAAAAAATCGATTCTTTATGCG ab +1 AAAAAAAAATGTCTGAAAATATCT ab +4 AAAAAAAAATGTCTGAAAATATCT ba +1 AAAAAAAAATTTCCGCAGACCGTT ba +8 AAAAAAAAATTTGGGCTACTACAA ba +1 AAAAAAAACAAAATTAGAACCCTT ab +1 AAAAAAAACAAACCGCTCCTCACA ba +5 AAAAAAAACAACGTACGCGGTATT ab +4 AAAAAAAACAATATCGTTGATATG ba +4 AAAAAAAACAATCACGTTAATAGG ab +1 AAAAAAAACAGAATCGTGGTTTGT ba +1 AAAAAAAACCAAATCGTTGATATG ba +9 AAAAAAAACCAAGTCCAGGCATCT ba +2 AAAAAAAACCACGGTGAGACAAGG ba +1 AAAAAAAACCGCCCAACTGCCGGT ab +5 AAAAAAAACCTCTCAACCCCAAAT ba +7 AAAAAAAACCTCTTGCGATGTTGT ab +1 AAAAAAAACCTCTTGCGCTGTTGT ab +1 AAAAAAAACCTCTTGTGATGTTGT ab +12 AAAAAAAACCTGAGCAATGGTTCC ab +3 AAAAAAAACCTTGACCCTCACATG ba +6 AAAAAAAACCTTGCACTCGTCCTA ba +9 AAAAAAAACGAAATAAAAAAACCT ba +1 AAAAAAAACGACCGGCCTTAGACA ba +4 AAAAAAAACGCCACCACCCCCTTT ab +12 AAAAAAAACGCCACGGGCACTATT ba +13 AAAAAAAACGTATCAGTAGATCCT ab +1 AAAAAAAACTAGTAGGATTTCATG ba +3 AAAAAAAACTATAGAAAATCCATT ba +1 AAAAAAAACTATTCTATTTCCGAT ba +13 AAAAAAAACTGATCTGCTTGGCGG ba +8 AAAAAAAACTTGCGAATAGCATCG ba +4 AAAAAAAACTTGTTATCAAAACGT ab +1 AAAAAAAAGAAAAGTTCAACACGC ba +1 AAAAAAAAGAAGTTCGCCCTCCGA ab +13 AAAAAAAAGAGAGTTTAGTCATGG ab +1 AAAAAAAAGAGAGTTTAGTCATGG ba +1 AAAAAAAAGAGAGTTTAGTCCTGG ab \ No newline at end of file diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_data2.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fsd_data2.tab Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,112 @@ +1 AAAAAAAAAAAAAACCAAAACTTC ba +1 AAAAAAAAAAAAACCAGGCGTCGA ba +1 AAAAAAAAAAAAAGCTCCACGTTG ba +1 AAAAAAAAAAAAATCGTGGTTTGT ba +1 AAAAAAAAAAAAATTCACCCTTGT ba +7 AAAAAAAAAAAACACACTTAACTT ba +1 AAAAAAAAAAAACAGTGTTGAGAC ba +4 AAAAAAAAAAAACCGCTCCTCACA ba +1 AAAAAAAAAAAAGGCAACACAGAA ab +2 AAAAAAAAAAAATCTTTCTTTGAG ab +1 AAAAAAAAAAAATTGGGTTCCTTA ab +1 AAAAAAAAAAAGAGTCGCACCCAG ba +21 AAAAAAAAAAAGATCGTGGTTTGT ba +1 AAAAAAAAAAAGCGCAACACAGAA ab +3 AAAAAAAAAAAGGGCAACACAGAA ab +1 AAAAAAAAAAAGTAGCCCTAAACG ab +1 AAAAAAAAAAAGTCTTTCTTTGAG ab +1 AAAAAAAAAAATATCATAGACTCT ab +6 AAAAAAAAAAATATTCACCCTTGT ba +1 AAAAAAAAAAATATTCGAAAGTTA ba +3 AAAAAAAAAAATCACACTTAACTT ba +1 AAAAAAAAAAATCCGCTCCTCACA ba +1 AAAAAAAAAAATTAACTAAACTTA ab +1 AAAAAAAAAACAAATTCTATTATT ab +1 AAAAAAAAAACTCCCAGATTTTTT ab +1 AAAAAAAAAACTTCTGCTTGGCGG ba +11 AAAAAAAAAAGAATCGTGGTTTGT ba +5 AAAAAAAAAAGATAGCCCTAAACG ab +1 AAAAAAAAAAGCAATAATGCCAGT ab +2 AAAAAAAAAAGTACCGCACTCTCA ba +1 AAAAAAAAAAGTTCTTTCTTTGAG ab +1 AAAAAAAAAATAACTTCAATAATG ba +2 AAAAAAAAAATAATCATAGACTCT ab +1 AAAAAAAAAATAGTCTCACATTTA ab +1 AAAAAAAAAATATAACCTTTGGCG ab +3 
AAAAAAAAACAAAATTCTATTATT ab +1 AAAAAAAAACAAGTACGCGGCATT ab +1 AAAAAAAAACAAGTACGCGGTATT ab +1 AAAAAAAAACAATATCGAATTAAC ab +3 AAAAAAAAACACGGTGAGACAAGG ba +1 AAAAAAAAACACGTTTCTCCCCTT ba +1 AAAAAAAAACATATCGTCCCGAGC ba +1 AAAAAAAAACCTACCTGAGGCCCC ab +3 AAAAAAAAACCTTATTACAGCGGA ab +1 AAAAAAAAACGATTCTCTGTATCT ba +1 AAAAAAAAACGTACCGCACTCTCA ba +4 AAAAAAAAACTACCCAGATTTTTT ba +1 AAAAAAAAACTAGATGAGACGACC ba +4 AAAAAAAAACTGTCTGCTTGGCGG ba +1 AAAAAAAAAGAAGTTTAATTTTAA ab +1 AAAAAAAAAGAATGCCTAAGACGA ba +6 AAAAAAAAAGACCGGCCTTAGACA ba +1 AAAAAAAAAGATATCGTGGTTTGT ba +1 AAAAAAAAAGCAATACTCAAGCTG ba +6 AAAAAAAAAGCAATGTCTAAGCCT ba +1 AAAAAAAAAGCACTGTCTAAGCCT ab +2 AAAAAAAAAGCTAATAATGCCAGT ab +1 AAAAAAAAAGTTTCGTGAAGGTCC ba +1 AAAAAAAAATAAAGGTCCGAATCT ab +1 AAAAAAAAATAAATGAGAGTGTAA ba +8 AAAAAAAAATAAGTCTCACATTTA ab +1 AAAAAAAAATAATAACCTCTGGCG ab +10 AAAAAAAAATAATAACCTTTGGCG ab +1 AAAAAAAAATAATCCCCTTTGTCG ab +6 AAAAAAAAATACGCAAACGCTGAG ab +4 AAAAAAAAATAGATCATAGACTCT ab +10 AAAAAAAAATAGATCATAGACTCT ba +10 AAAAAAAAATAGTAGGATTTCATG ba +7 AAAAAAAAATATGAATACCCTCGT ba +1 AAAAAAAAATATGCCACTTGATCC ba +1 AAAAAAAAATATTCTGCCACTTGA ba +3 AAAAAAAAATCAAACCAAGAGGAC ba +1 AAAAAAAAATCAGTACCCCTAAAC ab +12 AAAAAAAAATCCTAGTTAATGAAG ba +1 AAAAAAAAATCGATTCTTTATGCG ab +1 AAAAAAAAATGTCTGAAAATATCT ab +4 AAAAAAAAATGTCTGAAAATATCT ba +1 AAAAAAAAATTTCCGCAGACCGTT ba +8 AAAAAAAAATTTGGGCTACTACAA ba +1 AAAAAAAACAAAATTAGAACCCTT ab +1 AAAAAAAACAAACCGCTCCTCACA ba +5 AAAAAAAACAACGTACGCGGTATT ab +4 AAAAAAAACAATATCGTTGATATG ba +4 AAAAAAAACAATCACGTTAATAGG ab +1 AAAAAAAACAGAATCGTGGTTTGT ba +1 AAAAAAAACCAAATCGTTGATATG ba +9 AAAAAAAACCAAGTCCAGGCATCT ba +2 AAAAAAAACCACGGTGAGACAAGG ba +1 AAAAAAAACCGCCCAACTGCCGGT ab +5 AAAAAAAACCTCTCAACCCCAAAT ba +7 AAAAAAAACCTCTTGCGATGTTGT ab +1 AAAAAAAACCTCTTGCGCTGTTGT ab +1 AAAAAAAACCTCTTGTGATGTTGT ab +12 AAAAAAAACCTGAGCAATGGTTCC ab +3 AAAAAAAACCTTGACCCTCACATG ba +6 AAAAAAAACCTTGCACTCGTCCTA ba +9 AAAAAAAACGAAATAAAAAAACCT ba +1 AAAAAAAACGACCGGCCTTAGACA ba +4 AAAAAAAACGCCACCACCCCCTTT ab +12 AAAAAAAACGCCACGGGCACTATT ba +13 AAAAAAAACGTATCAGTAGATCCT ab +1 AAAAAAAACTAGTAGGATTTCATG ba +3 AAAAAAAACTATAGAAAATCCATT ba +1 AAAAAAAACTATTCTATTTCCGAT ba +13 AAAAAAAACTGATCTGCTTGGCGG ba +8 AAAAAAAACTTGCGAATAGCATCG ba +4 AAAAAAAACTTGTTATCAAAACGT ab +1 AAAAAAAAGAAAAGTTCAACACGC ba +1 AAAAAAAAGAAGTTCGCCCTCCGA ab +13 AAAAAAAAGAGAGTTTAGTCATGG ab +1 AAAAAAAAGAGAGTTTAGTCATGG ba +1 AAAAAAAAGAGAGTTTAGTCCTGG ab \ No newline at end of file diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_data3.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fsd_data3.tab Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,112 @@ +1 AAAAAAAAAAAAAACCAAAACTTC ba +1 AAAAAAAAAAAAACCAGGCGTCGA ba +1 AAAAAAAAAAAAAGCTCCACGTTG ba +1 AAAAAAAAAAAAATCGTGGTTTGT ba +1 AAAAAAAAAAAAATTCACCCTTGT ba +7 AAAAAAAAAAAACACACTTAACTT ba +1 AAAAAAAAAAAACAGTGTTGAGAC ba +4 AAAAAAAAAAAACCGCTCCTCACA ba +1 AAAAAAAAAAAAGGCAACACAGAA ab +2 AAAAAAAAAAAATCTTTCTTTGAG ab +1 AAAAAAAAAAAATTGGGTTCCTTA ab +1 AAAAAAAAAAAGAGTCGCACCCAG ba +21 AAAAAAAAAAAGATCGTGGTTTGT ba +1 AAAAAAAAAAAGCGCAACACAGAA ab +3 AAAAAAAAAAAGGGCAACACAGAA ab +1 AAAAAAAAAAAGTAGCCCTAAACG ab +1 AAAAAAAAAAAGTCTTTCTTTGAG ab +1 AAAAAAAAAAATATCATAGACTCT ab +6 AAAAAAAAAAATATTCACCCTTGT ba +1 AAAAAAAAAAATATTCGAAAGTTA ba +3 AAAAAAAAAAATCACACTTAACTT ba +1 AAAAAAAAAAATCCGCTCCTCACA ba +1 AAAAAAAAAAATTAACTAAACTTA ab +1 AAAAAAAAAACAAATTCTATTATT ab +1 AAAAAAAAAACTCCCAGATTTTTT ab +1 AAAAAAAAAACTTCTGCTTGGCGG ba +11 AAAAAAAAAAGAATCGTGGTTTGT ba +5 AAAAAAAAAAGATAGCCCTAAACG ab +1 AAAAAAAAAAGCAATAATGCCAGT ab +2 AAAAAAAAAAGTACCGCACTCTCA ba +1 
AAAAAAAAAAGTTCTTTCTTTGAG ab +1 AAAAAAAAAATAACTTCAATAATG ba +2 AAAAAAAAAATAATCATAGACTCT ab +1 AAAAAAAAAATAGTCTCACATTTA ab +1 AAAAAAAAAATATAACCTTTGGCG ab +3 AAAAAAAAACAAAATTCTATTATT ab +1 AAAAAAAAACAAGTACGCGGCATT ab +1 AAAAAAAAACAAGTACGCGGTATT ab +1 AAAAAAAAACAATATCGAATTAAC ab +3 AAAAAAAAACACGGTGAGACAAGG ba +1 AAAAAAAAACACGTTTCTCCCCTT ba +1 AAAAAAAAACATATCGTCCCGAGC ba +1 AAAAAAAAACCTACCTGAGGCCCC ab +3 AAAAAAAAACCTTATTACAGCGGA ab +1 AAAAAAAAACGATTCTCTGTATCT ba +1 AAAAAAAAACGTACCGCACTCTCA ba +4 AAAAAAAAACTACCCAGATTTTTT ba +1 AAAAAAAAACTAGATGAGACGACC ba +4 AAAAAAAAACTGTCTGCTTGGCGG ba +1 AAAAAAAAAGAAGTTTAATTTTAA ab +1 AAAAAAAAAGAATGCCTAAGACGA ba +6 AAAAAAAAAGACCGGCCTTAGACA ba +1 AAAAAAAAAGATATCGTGGTTTGT ba +1 AAAAAAAAAGCAATACTCAAGCTG ba +6 AAAAAAAAAGCAATGTCTAAGCCT ba +1 AAAAAAAAAGCACTGTCTAAGCCT ab +2 AAAAAAAAAGCTAATAATGCCAGT ab +1 AAAAAAAAAGTTTCGTGAAGGTCC ba +1 AAAAAAAAATAAAGGTCCGAATCT ab +1 AAAAAAAAATAAATGAGAGTGTAA ba +8 AAAAAAAAATAAGTCTCACATTTA ab +1 AAAAAAAAATAATAACCTCTGGCG ab +10 AAAAAAAAATAATAACCTTTGGCG ab +1 AAAAAAAAATAATCCCCTTTGTCG ab +6 AAAAAAAAATACGCAAACGCTGAG ab +4 AAAAAAAAATAGATCATAGACTCT ab +10 AAAAAAAAATAGATCATAGACTCT ba +10 AAAAAAAAATAGTAGGATTTCATG ba +7 AAAAAAAAATATGAATACCCTCGT ba +1 AAAAAAAAATATGCCACTTGATCC ba +1 AAAAAAAAATATTCTGCCACTTGA ba +3 AAAAAAAAATCAAACCAAGAGGAC ba +1 AAAAAAAAATCAGTACCCCTAAAC ab +12 AAAAAAAAATCCTAGTTAATGAAG ba +1 AAAAAAAAATCGATTCTTTATGCG ab +1 AAAAAAAAATGTCTGAAAATATCT ab +4 AAAAAAAAATGTCTGAAAATATCT ba +1 AAAAAAAAATTTCCGCAGACCGTT ba +8 AAAAAAAAATTTGGGCTACTACAA ba +1 AAAAAAAACAAAATTAGAACCCTT ab +1 AAAAAAAACAAACCGCTCCTCACA ba +5 AAAAAAAACAACGTACGCGGTATT ab +4 AAAAAAAACAATATCGTTGATATG ba +4 AAAAAAAACAATCACGTTAATAGG ab +1 AAAAAAAACAGAATCGTGGTTTGT ba +1 AAAAAAAACCAAATCGTTGATATG ba +9 AAAAAAAACCAAGTCCAGGCATCT ba +2 AAAAAAAACCACGGTGAGACAAGG ba +1 AAAAAAAACCGCCCAACTGCCGGT ab +5 AAAAAAAACCTCTCAACCCCAAAT ba +7 AAAAAAAACCTCTTGCGATGTTGT ab +1 AAAAAAAACCTCTTGCGCTGTTGT ab +1 AAAAAAAACCTCTTGTGATGTTGT ab +12 AAAAAAAACCTGAGCAATGGTTCC ab +3 AAAAAAAACCTTGACCCTCACATG ba +6 AAAAAAAACCTTGCACTCGTCCTA ba +9 AAAAAAAACGAAATAAAAAAACCT ba +1 AAAAAAAACGACCGGCCTTAGACA ba +4 AAAAAAAACGCCACCACCCCCTTT ab +12 AAAAAAAACGCCACGGGCACTATT ba +13 AAAAAAAACGTATCAGTAGATCCT ab +1 AAAAAAAACTAGTAGGATTTCATG ba +3 AAAAAAAACTATAGAAAATCCATT ba +1 AAAAAAAACTATTCTATTTCCGAT ba +13 AAAAAAAACTGATCTGCTTGGCGG ba +8 AAAAAAAACTTGCGAATAGCATCG ba +4 AAAAAAAACTTGTTATCAAAACGT ab +1 AAAAAAAAGAAAAGTTCAACACGC ba +1 AAAAAAAAGAAGTTCGCCCTCCGA ab +13 AAAAAAAAGAGAGTTTAGTCATGG ab +1 AAAAAAAAGAGAGTTTAGTCATGG ba +1 AAAAAAAAGAGAGTTTAGTCCTGG ab \ No newline at end of file diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_data4.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fsd_data4.tab Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,112 @@ +1 AAAAAAAAAAAAAACCAAAACTTC ba +1 AAAAAAAAAAAAACCAGGCGTCGA ba +1 AAAAAAAAAAAAAGCTCCACGTTG ba +1 AAAAAAAAAAAAATCGTGGTTTGT ba +1 AAAAAAAAAAAAATTCACCCTTGT ba +7 AAAAAAAAAAAACACACTTAACTT ba +1 AAAAAAAAAAAACAGTGTTGAGAC ba +4 AAAAAAAAAAAACCGCTCCTCACA ba +1 AAAAAAAAAAAAGGCAACACAGAA ab +2 AAAAAAAAAAAATCTTTCTTTGAG ab +1 AAAAAAAAAAAATTGGGTTCCTTA ab +1 AAAAAAAAAAAGAGTCGCACCCAG ba +21 AAAAAAAAAAAGATCGTGGTTTGT ba +1 AAAAAAAAAAAGCGCAACACAGAA ab +3 AAAAAAAAAAAGGGCAACACAGAA ab +1 AAAAAAAAAAAGTAGCCCTAAACG ab +1 AAAAAAAAAAAGTCTTTCTTTGAG ab +1 AAAAAAAAAAATATCATAGACTCT ab +6 AAAAAAAAAAATATTCACCCTTGT ba +1 AAAAAAAAAAATATTCGAAAGTTA ba +3 AAAAAAAAAAATCACACTTAACTT ba +1 AAAAAAAAAAATCCGCTCCTCACA ba +1 AAAAAAAAAAATTAACTAAACTTA ab +1 AAAAAAAAAACAAATTCTATTATT ab +1 AAAAAAAAAACTCCCAGATTTTTT ab +1 
AAAAAAAAAACTTCTGCTTGGCGG ba +11 AAAAAAAAAAGAATCGTGGTTTGT ba +5 AAAAAAAAAAGATAGCCCTAAACG ab +1 AAAAAAAAAAGCAATAATGCCAGT ab +2 AAAAAAAAAAGTACCGCACTCTCA ba +1 AAAAAAAAAAGTTCTTTCTTTGAG ab +1 AAAAAAAAAATAACTTCAATAATG ba +2 AAAAAAAAAATAATCATAGACTCT ab +1 AAAAAAAAAATAGTCTCACATTTA ab +1 AAAAAAAAAATATAACCTTTGGCG ab +3 AAAAAAAAACAAAATTCTATTATT ab +1 AAAAAAAAACAAGTACGCGGCATT ab +1 AAAAAAAAACAAGTACGCGGTATT ab +1 AAAAAAAAACAATATCGAATTAAC ab +3 AAAAAAAAACACGGTGAGACAAGG ba +1 AAAAAAAAACACGTTTCTCCCCTT ba +1 AAAAAAAAACATATCGTCCCGAGC ba +1 AAAAAAAAACCTACCTGAGGCCCC ab +3 AAAAAAAAACCTTATTACAGCGGA ab +1 AAAAAAAAACGATTCTCTGTATCT ba +1 AAAAAAAAACGTACCGCACTCTCA ba +4 AAAAAAAAACTACCCAGATTTTTT ba +1 AAAAAAAAACTAGATGAGACGACC ba +4 AAAAAAAAACTGTCTGCTTGGCGG ba +1 AAAAAAAAAGAAGTTTAATTTTAA ab +1 AAAAAAAAAGAATGCCTAAGACGA ba +6 AAAAAAAAAGACCGGCCTTAGACA ba +1 AAAAAAAAAGATATCGTGGTTTGT ba +1 AAAAAAAAAGCAATACTCAAGCTG ba +6 AAAAAAAAAGCAATGTCTAAGCCT ba +1 AAAAAAAAAGCACTGTCTAAGCCT ab +2 AAAAAAAAAGCTAATAATGCCAGT ab +1 AAAAAAAAAGTTTCGTGAAGGTCC ba +1 AAAAAAAAATAAAGGTCCGAATCT ab +1 AAAAAAAAATAAATGAGAGTGTAA ba +8 AAAAAAAAATAAGTCTCACATTTA ab +1 AAAAAAAAATAATAACCTCTGGCG ab +10 AAAAAAAAATAATAACCTTTGGCG ab +1 AAAAAAAAATAATCCCCTTTGTCG ab +6 AAAAAAAAATACGCAAACGCTGAG ab +4 AAAAAAAAATAGATCATAGACTCT ab +10 AAAAAAAAATAGATCATAGACTCT ba +10 AAAAAAAAATAGTAGGATTTCATG ba +7 AAAAAAAAATATGAATACCCTCGT ba +1 AAAAAAAAATATGCCACTTGATCC ba +1 AAAAAAAAATATTCTGCCACTTGA ba +3 AAAAAAAAATCAAACCAAGAGGAC ba +1 AAAAAAAAATCAGTACCCCTAAAC ab +12 AAAAAAAAATCCTAGTTAATGAAG ba +1 AAAAAAAAATCGATTCTTTATGCG ab +1 AAAAAAAAATGTCTGAAAATATCT ab +4 AAAAAAAAATGTCTGAAAATATCT ba +1 AAAAAAAAATTTCCGCAGACCGTT ba +8 AAAAAAAAATTTGGGCTACTACAA ba +1 AAAAAAAACAAAATTAGAACCCTT ab +1 AAAAAAAACAAACCGCTCCTCACA ba +5 AAAAAAAACAACGTACGCGGTATT ab +4 AAAAAAAACAATATCGTTGATATG ba +4 AAAAAAAACAATCACGTTAATAGG ab +1 AAAAAAAACAGAATCGTGGTTTGT ba +1 AAAAAAAACCAAATCGTTGATATG ba +9 AAAAAAAACCAAGTCCAGGCATCT ba +2 AAAAAAAACCACGGTGAGACAAGG ba +1 AAAAAAAACCGCCCAACTGCCGGT ab +5 AAAAAAAACCTCTCAACCCCAAAT ba +7 AAAAAAAACCTCTTGCGATGTTGT ab +1 AAAAAAAACCTCTTGCGCTGTTGT ab +1 AAAAAAAACCTCTTGTGATGTTGT ab +12 AAAAAAAACCTGAGCAATGGTTCC ab +3 AAAAAAAACCTTGACCCTCACATG ba +6 AAAAAAAACCTTGCACTCGTCCTA ba +9 AAAAAAAACGAAATAAAAAAACCT ba +1 AAAAAAAACGACCGGCCTTAGACA ba +4 AAAAAAAACGCCACCACCCCCTTT ab +12 AAAAAAAACGCCACGGGCACTATT ba +13 AAAAAAAACGTATCAGTAGATCCT ab +1 AAAAAAAACTAGTAGGATTTCATG ba +3 AAAAAAAACTATAGAAAATCCATT ba +1 AAAAAAAACTATTCTATTTCCGAT ba +13 AAAAAAAACTGATCTGCTTGGCGG ba +8 AAAAAAAACTTGCGAATAGCATCG ba +4 AAAAAAAACTTGTTATCAAAACGT ab +1 AAAAAAAAGAAAAGTTCAACACGC ba +1 AAAAAAAAGAAGTTCGCCCTCCGA ab +13 AAAAAAAAGAGAGTTTAGTCATGG ab +1 AAAAAAAAGAGAGTTTAGTCATGG ba +1 AAAAAAAAGAGAGTTTAGTCCTGG ab \ No newline at end of file diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_output1.pdf Binary file test-data/fsd_output1.pdf has changed diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_output1.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fsd_output1.tab Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,367 @@ +Values from family size distribution with all datasets based on families + +Family size fsd_data1.tab fsd_data2.tab fsd_data3.tab fsd_data4.tab +FS=1 63 63 63 63 +FS=2 5 5 5 5 +FS=3 8 8 8 8 +FS=4 9 9 9 9 +FS=5 3 3 3 3 +FS=6 5 5 5 5 +FS=7 3 3 3 3 +FS=8 3 3 3 3 +FS=9 2 2 2 2 +FS=10 3 3 3 3 +FS=11 1 1 1 1 +FS=12 3 3 3 3 +FS=13 3 3 3 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 1 1 1 1 +sum 112 112 112 112 + +Values from family size distribution with 
all datasets based on PE reads + +Family size fsd_data1.tab fsd_data2.tab fsd_data3.tab fsd_data4.tab +FS=1 63 63 63 63 +FS=2 10 10 10 10 +FS=3 24 24 24 24 +FS=4 36 36 36 36 +FS=5 15 15 15 15 +FS=6 30 30 30 30 +FS=7 21 21 21 21 +FS=8 24 24 24 24 +FS=9 18 18 18 18 +FS=10 30 30 30 30 +FS=11 11 11 11 11 +FS=12 36 36 36 36 +FS=13 39 39 39 39 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 21 21 21 21 +sum 378 378 378 378 + +Dataset: fsd_data1.tab +max. family size: 21 +absolute frequency: 1 +relative frequency: 0.009 + +median family size: 1.0 +mean family size: 3.375 + + singletons: family size > 20: length of dataset: + nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads +fsd_data1.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 + +The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) +Whereas the total frequencies were calculated from the whole dataset (=including the DCS). + +FS >= 1 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 47 123 0.431 0.420 0.339 0.325 +SSCS ba 59 222 0.541 0.527 0.612 0.587 +DCS (total) 3 (6) 18 (33) 0.028 0.027 (0.054) 0.050 0.048 (0.087) +total nr. of tags 109 363 109 112 363 378 + +FS >= 3 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 14 87 0.341 0.259 0.313 0.224 +SSCS ba 26 187 0.491 0.481 0.495 0.482 +DCS (total) 1 (2) 4 (14) 0.024 0.024 (0.048) 0.014 0.014 (0.049) +total nr. of tags 41 278 41 42 278 288 + +Values from family size distribution based on families + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 3 2 5 +FS=3 0 3 5 8 +FS=4 2 3 4 9 +FS=5 0 2 1 3 +FS=6 0 1 4 5 +FS=7 0 1 2 3 +FS=8 0 1 2 3 +FS=9 0 0 2 2 +FS=10 1 1 1 3 +FS=11 0 0 1 1 +FS=12 0 1 2 3 +FS=13 1 1 1 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 1 1 +sum 6 47 59 112 + +Values from family size distribution based on PE reads + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 6 4 10 +FS=3 0 9 15 24 +FS=4 8 12 16 36 +FS=5 0 10 5 15 +FS=6 0 6 24 30 +FS=7 0 7 14 21 +FS=8 0 8 16 24 +FS=9 0 0 18 18 +FS=10 10 10 10 30 +FS=11 0 0 11 11 +FS=12 0 12 24 36 +FS=13 13 13 13 39 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 21 21 +sum 33 123 222 378 + +Dataset: fsd_data2.tab +max. family size: 21 +absolute frequency: 1 +relative frequency: 0.009 + +median family size: 1.0 +mean family size: 3.375 + + singletons: family size > 20: length of dataset: + nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads +fsd_data2.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 + +The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) +Whereas the total frequencies were calculated from the whole dataset (=including the DCS). + +FS >= 1 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 47 123 0.431 0.420 0.339 0.325 +SSCS ba 59 222 0.541 0.527 0.612 0.587 +DCS (total) 3 (6) 18 (33) 0.028 0.027 (0.054) 0.050 0.048 (0.087) +total nr. of tags 109 363 109 112 363 378 + +FS >= 3 nr. of tags nr. of PE reads rel. 
freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 14 87 0.341 0.259 0.313 0.224 +SSCS ba 26 187 0.491 0.481 0.495 0.482 +DCS (total) 1 (2) 4 (14) 0.024 0.024 (0.048) 0.014 0.014 (0.049) +total nr. of tags 41 278 41 42 278 288 + +Values from family size distribution based on families + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 3 2 5 +FS=3 0 3 5 8 +FS=4 2 3 4 9 +FS=5 0 2 1 3 +FS=6 0 1 4 5 +FS=7 0 1 2 3 +FS=8 0 1 2 3 +FS=9 0 0 2 2 +FS=10 1 1 1 3 +FS=11 0 0 1 1 +FS=12 0 1 2 3 +FS=13 1 1 1 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 1 1 +sum 6 47 59 112 + +Values from family size distribution based on PE reads + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 6 4 10 +FS=3 0 9 15 24 +FS=4 8 12 16 36 +FS=5 0 10 5 15 +FS=6 0 6 24 30 +FS=7 0 7 14 21 +FS=8 0 8 16 24 +FS=9 0 0 18 18 +FS=10 10 10 10 30 +FS=11 0 0 11 11 +FS=12 0 12 24 36 +FS=13 13 13 13 39 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 21 21 +sum 33 123 222 378 + +Dataset: fsd_data3.tab +max. family size: 21 +absolute frequency: 1 +relative frequency: 0.009 + +median family size: 1.0 +mean family size: 3.375 + + singletons: family size > 20: length of dataset: + nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads +fsd_data3.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 + +The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) +Whereas the total frequencies were calculated from the whole dataset (=including the DCS). + +FS >= 1 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 47 123 0.431 0.420 0.339 0.325 +SSCS ba 59 222 0.541 0.527 0.612 0.587 +DCS (total) 3 (6) 18 (33) 0.028 0.027 (0.054) 0.050 0.048 (0.087) +total nr. of tags 109 363 109 112 363 378 + +FS >= 3 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 14 87 0.341 0.259 0.313 0.224 +SSCS ba 26 187 0.491 0.481 0.495 0.482 +DCS (total) 1 (2) 4 (14) 0.024 0.024 (0.048) 0.014 0.014 (0.049) +total nr. of tags 41 278 41 42 278 288 + +Values from family size distribution based on families + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 3 2 5 +FS=3 0 3 5 8 +FS=4 2 3 4 9 +FS=5 0 2 1 3 +FS=6 0 1 4 5 +FS=7 0 1 2 3 +FS=8 0 1 2 3 +FS=9 0 0 2 2 +FS=10 1 1 1 3 +FS=11 0 0 1 1 +FS=12 0 1 2 3 +FS=13 1 1 1 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 1 1 +sum 6 47 59 112 + +Values from family size distribution based on PE reads + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 6 4 10 +FS=3 0 9 15 24 +FS=4 8 12 16 36 +FS=5 0 10 5 15 +FS=6 0 6 24 30 +FS=7 0 7 14 21 +FS=8 0 8 16 24 +FS=9 0 0 18 18 +FS=10 10 10 10 30 +FS=11 0 0 11 11 +FS=12 0 12 24 36 +FS=13 13 13 13 39 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 21 21 +sum 33 123 222 378 + +Dataset: fsd_data4.tab +max. family size: 21 +absolute frequency: 1 +relative frequency: 0.009 + +median family size: 1.0 +mean family size: 3.375 + + singletons: family size > 20: length of dataset: + nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. 
of PE reads +fsd_data4.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 + +The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) +Whereas the total frequencies were calculated from the whole dataset (=including the DCS). + +FS >= 1 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 47 123 0.431 0.420 0.339 0.325 +SSCS ba 59 222 0.541 0.527 0.612 0.587 +DCS (total) 3 (6) 18 (33) 0.028 0.027 (0.054) 0.050 0.048 (0.087) +total nr. of tags 109 363 109 112 363 378 + +FS >= 3 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 14 87 0.341 0.259 0.313 0.224 +SSCS ba 26 187 0.491 0.481 0.495 0.482 +DCS (total) 1 (2) 4 (14) 0.024 0.024 (0.048) 0.014 0.014 (0.049) +total nr. of tags 41 278 41 42 278 288 + +Values from family size distribution based on families + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 3 2 5 +FS=3 0 3 5 8 +FS=4 2 3 4 9 +FS=5 0 2 1 3 +FS=6 0 1 4 5 +FS=7 0 1 2 3 +FS=8 0 1 2 3 +FS=9 0 0 2 2 +FS=10 1 1 1 3 +FS=11 0 0 1 1 +FS=12 0 1 2 3 +FS=13 1 1 1 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 1 1 +sum 6 47 59 112 + +Values from family size distribution based on PE reads + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 6 4 10 +FS=3 0 9 15 24 +FS=4 8 12 16 36 +FS=5 0 10 5 15 +FS=6 0 6 24 30 +FS=7 0 7 14 21 +FS=8 0 8 16 24 +FS=9 0 0 18 18 +FS=10 10 10 10 30 +FS=11 0 0 11 11 +FS=12 0 12 24 36 +FS=13 13 13 13 39 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 21 21 +sum 33 123 222 378 diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_output2.pdf Binary file test-data/fsd_output2.pdf has changed diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_output2.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fsd_output2.tab Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,288 @@ +Values from family size distribution with all datasets based on families + +Family size fsd_data1.tab fsd_data2.tab fsd_data3.tab +FS=1 63 63 63 +FS=2 5 5 5 +FS=3 8 8 8 +FS=4 9 9 9 +FS=5 3 3 3 +FS=6 5 5 5 +FS=7 3 3 3 +FS=8 3 3 3 +FS=9 2 2 2 +FS=10 3 3 3 +FS=11 1 1 1 +FS=12 3 3 3 +FS=13 3 3 3 +FS=14 0 0 0 +FS=15 0 0 0 +FS=16 0 0 0 +FS=17 0 0 0 +FS=18 0 0 0 +FS=19 0 0 0 +FS=20 0 0 0 +FS>20 1 1 1 +sum 112 112 112 + +Values from family size distribution with all datasets based on PE reads + +Family size fsd_data1.tab fsd_data2.tab fsd_data3.tab +FS=1 63 63 63 +FS=2 10 10 10 +FS=3 24 24 24 +FS=4 36 36 36 +FS=5 15 15 15 +FS=6 30 30 30 +FS=7 21 21 21 +FS=8 24 24 24 +FS=9 18 18 18 +FS=10 30 30 30 +FS=11 11 11 11 +FS=12 36 36 36 +FS=13 39 39 39 +FS=14 0 0 0 +FS=15 0 0 0 +FS=16 0 0 0 +FS=17 0 0 0 +FS=18 0 0 0 +FS=19 0 0 0 +FS=20 0 0 0 +FS>20 21 21 21 +sum 378 378 378 + +Dataset: fsd_data1.tab +max. family size: 21 +absolute frequency: 1 +relative frequency: 0.009 + +median family size: 1.0 +mean family size: 3.375 + + singletons: family size > 20: length of dataset: + nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads +fsd_data1.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 + +The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) +Whereas the total frequencies were calculated from the whole dataset (=including the DCS). + +FS >= 1 nr. 
of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 47 123 0.431 0.420 0.339 0.325 +SSCS ba 59 222 0.541 0.527 0.612 0.587 +DCS (total) 3 (6) 18 (33) 0.028 0.027 (0.054) 0.050 0.048 (0.087) +total nr. of tags 109 363 109 112 363 378 + +FS >= 3 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 14 87 0.341 0.259 0.313 0.224 +SSCS ba 26 187 0.491 0.481 0.495 0.482 +DCS (total) 1 (2) 4 (14) 0.024 0.024 (0.048) 0.014 0.014 (0.049) +total nr. of tags 41 278 41 42 278 288 + +Values from family size distribution based on families + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 3 2 5 +FS=3 0 3 5 8 +FS=4 2 3 4 9 +FS=5 0 2 1 3 +FS=6 0 1 4 5 +FS=7 0 1 2 3 +FS=8 0 1 2 3 +FS=9 0 0 2 2 +FS=10 1 1 1 3 +FS=11 0 0 1 1 +FS=12 0 1 2 3 +FS=13 1 1 1 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 1 1 +sum 6 47 59 112 + +Values from family size distribution based on PE reads + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 6 4 10 +FS=3 0 9 15 24 +FS=4 8 12 16 36 +FS=5 0 10 5 15 +FS=6 0 6 24 30 +FS=7 0 7 14 21 +FS=8 0 8 16 24 +FS=9 0 0 18 18 +FS=10 10 10 10 30 +FS=11 0 0 11 11 +FS=12 0 12 24 36 +FS=13 13 13 13 39 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 21 21 +sum 33 123 222 378 + +Dataset: fsd_data2.tab +max. family size: 21 +absolute frequency: 1 +relative frequency: 0.009 + +median family size: 1.0 +mean family size: 3.375 + + singletons: family size > 20: length of dataset: + nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads +fsd_data2.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 + +The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS) +Whereas the total frequencies were calculated from the whole dataset (=including the DCS). + +FS >= 1 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 47 123 0.431 0.420 0.339 0.325 +SSCS ba 59 222 0.541 0.527 0.612 0.587 +DCS (total) 3 (6) 18 (33) 0.028 0.027 (0.054) 0.050 0.048 (0.087) +total nr. of tags 109 363 109 112 363 378 + +FS >= 3 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 14 87 0.341 0.259 0.313 0.224 +SSCS ba 26 187 0.491 0.481 0.495 0.482 +DCS (total) 1 (2) 4 (14) 0.024 0.024 (0.048) 0.014 0.014 (0.049) +total nr. of tags 41 278 41 42 278 288 + +Values from family size distribution based on families + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 3 2 5 +FS=3 0 3 5 8 +FS=4 2 3 4 9 +FS=5 0 2 1 3 +FS=6 0 1 4 5 +FS=7 0 1 2 3 +FS=8 0 1 2 3 +FS=9 0 0 2 2 +FS=10 1 1 1 3 +FS=11 0 0 1 1 +FS=12 0 1 2 3 +FS=13 1 1 1 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 1 1 +sum 6 47 59 112 + +Values from family size distribution based on PE reads + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 6 4 10 +FS=3 0 9 15 24 +FS=4 8 12 16 36 +FS=5 0 10 5 15 +FS=6 0 6 24 30 +FS=7 0 7 14 21 +FS=8 0 8 16 24 +FS=9 0 0 18 18 +FS=10 10 10 10 30 +FS=11 0 0 11 11 +FS=12 0 12 24 36 +FS=13 13 13 13 39 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 21 21 +sum 33 123 222 378 + +Dataset: fsd_data3.tab +max. 
family size: 21 +absolute frequency: 1 +relative frequency: 0.009 + +median family size: 1.0 +mean family size: 3.375 + + singletons: family size > 20: length of dataset: + nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads +fsd_data3.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 + +The unique frequencies were calculated from the dataset where the tags occurred only once (=ab without DCS, ba without DCS), +whereas the total frequencies were calculated from the whole dataset (=including the DCS). + +FS >= 1 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 47 123 0.431 0.420 0.339 0.325 +SSCS ba 59 222 0.541 0.527 0.612 0.587 +DCS (total) 3 (6) 18 (33) 0.028 0.027 (0.054) 0.050 0.048 (0.087) +total nr. of tags 109 363 109 112 363 378 + +FS >= 3 nr. of tags nr. of PE reads rel. freq of tags rel. freq of PE reads: + unique: total unique total: +SSCS ab 14 87 0.341 0.259 0.313 0.224 +SSCS ba 26 187 0.491 0.481 0.495 0.482 +DCS (total) 1 (2) 4 (14) 0.024 0.024 (0.048) 0.014 0.014 (0.049) +total nr. of tags 41 278 41 42 278 288 + +Values from family size distribution based on families + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 3 2 5 +FS=3 0 3 5 8 +FS=4 2 3 4 9 +FS=5 0 2 1 3 +FS=6 0 1 4 5 +FS=7 0 1 2 3 +FS=8 0 1 2 3 +FS=9 0 0 2 2 +FS=10 1 1 1 3 +FS=11 0 0 1 1 +FS=12 0 1 2 3 +FS=13 1 1 1 3 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 1 1 +sum 6 47 59 112 + +Values from family size distribution based on PE reads + duplex ab ba sum +FS=1 2 30 31 63 +FS=2 0 6 4 10 +FS=3 0 9 15 24 +FS=4 8 12 16 36 +FS=5 0 10 5 15 +FS=6 0 6 24 30 +FS=7 0 7 14 21 +FS=8 0 8 16 24 +FS=9 0 0 18 18 +FS=10 10 10 10 30 +FS=11 0 0 11 11 +FS=12 0 12 24 36 +FS=13 13 13 13 39 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 0 21 21 +sum 33 123 222 378 diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_reg.bam Binary file test-data/fsd_reg.bam has changed diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_reg.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fsd_reg.tab Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,32 @@ +10 AAAAAACATCCCAATAAGAAATCA ab +9 AAAAAACATCCCAATAAGAAATCA ba +4 AAAAAAGTCCTTCGACTCAAGCGG ab +5 AAAAAAGTCCTTCGACTCAAGCGG ba +5 AAAAAATAGTTAAGCCGACACACT ab +7 AAAAAATAGTTAAGCCGACACACT ba +7 AAAAAATGTGCCGAACCTTGGCGA ab +10 AAAAAATGTGCCGAACCTTGGCGA ba +7 AAAAACAACATAGCTTGAAAATTT ab +4 AAAAACAACATAGCTTGAAAATTT ba +81 ATTCGGATAATTCGACGCAACATT ab +11 ATTCGGATAATTCGACGCAACATT ba +41 ATTCGTCGACAATACAAAGGGGCC ab +226 ATTCGTCGACAATACAAAGGGGCC ba +6 ATTGCCAGTGTGGGCTGGTTAGTA ab +41 ATTGCCAGTGTGGGCTGGTTAGTA ba +50 ATTTCGCGACCATCCGCCACTTTG ab +332 ATTTCGCGACCATCCGCCACTTTG ba +64 CAAACTTTAGCACAGTGTGTGTCC ab +57 CAAACTTTAGCACAGTGTGTGTCC ba +85 ATAAACGGCCTTCGACATTGTGAC ab +15 ATAAACGGCCTTCGACATTGTGAC ba +11 ATAAAGTCACCTGTGAATACGTTG ab +35 ATAAAGTCACCTGTGAATACGTTG ba +83 ATAAATCGAAACCGTGCCCAACAA ab +63 ATAAATCGAAACCGTGCCCAACAA ba +9 ATTTAGATATTTTCTTCTTTTTCT ab +7 ATTTAGATATTTTCTTCTTTTTCT ba +7 ATTTAGTTATCCGTCGGCGACGAA ab +3 ATTTAGTTATCCGTCGGCGACGAA ba +8 ATTTAGTTTGAATTGCCCTGCGTC ab +9 ATTTAGTTTGAATTGCCCTGCGTC ba \ No newline at end of file diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_reg_output.pdf Binary file test-data/fsd_reg_output.pdf has changed diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_reg_output.tab --- 
/dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fsd_reg_output.tab Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,38 @@ +Dataset: fsd_reg.tab + AB BA +max. family size: 11 35 +absolute frequency: 22 2 +relative frequency: 0.333 0.167 + +total nr. of reads 1312 +total nr. of tags 24 (12) + + +Values from family size distribution + ACH_TDII_5regions_90_633 ACH_TDII_5regions_659_1140 +FS=4 4 0 +FS=5 4 0 +FS=6 0 0 +FS=7 6 0 +FS=8 0 0 +FS=9 2 0 +FS=10 4 0 +FS=11 0 2 +FS=12 0 0 +FS=13 0 0 +FS=14 0 0 +FS=15 0 0 +FS=16 0 0 +FS=17 0 0 +FS=18 0 0 +FS=19 0 0 +FS=20 0 0 +FS>20 0 2 +sum 20 4 + + +In the plot, both family sizes of the ab and ba strands were used, +whereas the total numbers indicate only the single count of the tags per region. +Region total nr. of tags per region +ACH_TDII_5regions_90_633 10 +ACH_TDII_5regions_659_1140 2 diff -r 000000000000 -r 7fc3c8704c05 test-data/fsd_reg_ranges.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fsd_reg_ranges.bed Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,2 @@ +ACH_TDII_5regions 90 633 +ACH_TDII_5regions 659 1140 diff -r 000000000000 -r 7fc3c8704c05 test-data/td_chimeras_output.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/td_chimeras_output.tab Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,22 @@ +chimera tag family size, read direction similar tag with TD=0 +AAAAAAAAAAAA AACCAAAACTTC 1 ba *AAAAAAAAAAAA* TCTTTCTTTGAG 2 ab +AAAAAAAAAAAA ACCAGGCGTCGA 1 ba *AAAAAAAAAAAA* AACCAAAACTTC 1 ba, *AAAAAAAAAAAA* AGCTCCACGTTG 1 ba, *AAAAAAAAAAAA* CAGTGTTGAGAC 1 ba, *AAAAAAAAAAAA* TCTTTCTTTGAG 2 ab, *AAAAAAAAAAAA* TTGGGTTCCTTA 1 ab +AAAAAAAAAAAA ATCGTGGTTTGT 1 ba *AAAAAAAAAAAA* CAGTGTTGAGAC 1 ba, AAAAAAAAAAAG *ATCGTGGTTTGT* 4 ba +AAAAAAAAAAAA ATTCACCCTTGT 1 ba *AAAAAAAAAAAA* CAGTGTTGAGAC 1 ba, AAAAAAAAAAAT *ATTCACCCTTGT* 6 ba +AAAAAAAAAAAA CACACTTAACTT 7 ba *AAAAAAAAAAAA* ATTCACCCTTGT 1 ba, *AAAAAAAAAAAA* CCGCTCCTCACA 4 ba, *AAAAAAAAAAAA* TCTTTCTTTGAG 2 ab +AAAAAAAAAAAA GGCAACACAGAA 1 ab *AAAAAAAAAAAA* ATCGTGGTTTGT 1 ba, AAAAAAAAAAAG *GGCAACACAGAA* 3 ab +AAAAAAAAAAAG AGTCGCACCCAG 1 ba *AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba +AAAAAAAAAAAG CGCAACACAGAA 1 ab *AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba +AAAAAAAAAAAG TAGCCCTAAACG 1 ab *AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba +AAAAAAAAAAAG TCTTTCTTTGAG 1 ab AAAAAAAAAAAA *TCTTTCTTTGAG* 2 ab, *AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba, *AAAAAAAAAAAG* CGCAACACAGAA 1 ab, *AAAAAAAAAAAG* GGCAACACAGAA 3 ab +AAAAAAAAAAAT ATCATAGACTCT 1 ab *AAAAAAAAAAAT* ATTCACCCTTGT 6 ba +AAAAAAAAAAAT ATTCGAAAGTTA 1 ba *AAAAAAAAAAAT* ATCATAGACTCT 1 ab, *AAAAAAAAAAAT* ATTCACCCTTGT 6 ba +This file lists, in the first column, all tags that were identified as chimeras and, in the second column, the corresponding tags that returned a Hamming distance of zero in either the first or the second half of the sample tag. +The tags were separated by a space into their halves and the * marks the identical half. +
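The TD=0 matches listed above come from comparing the two tag halves directly. A minimal sketch of that lookup, assuming 24 bp tags split into 12 bp halves as in this dataset (`hamming` and `shared_half_partners` are illustrative helpers, not part of the tool):

```python
# Illustrative sketch: collect partner tags that match a candidate tag exactly
# (Hamming distance 0) in either the first or the second half.
def hamming(a, b):
    return sum(x != y for x, y in zip(a, b))

def shared_half_partners(tag, dataset, half=12):
    return [other for other in dataset
            if other != tag and (hamming(tag[:half], other[:half]) == 0 or
                                 hamming(tag[half:], other[half:]) == 0)]
```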
+Statistics of nr. of tags that returned max. TD (2nd column) +minimum 1 tag(s) +mean 2.08333333333 tag(s) +median 2.0 tag(s) +maximum 5 tag(s) +sum 25 tag(s) diff -r 000000000000 -r 7fc3c8704c05 test-data/td_data.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/td_data.tab Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,20 @@ +1 AAAAAAAAAAAAAACCAAAACTTC ba +1 AAAAAAAAAAAAACCAGGCGTCGA ba +1 AAAAAAAAAAAAAGCTCCACGTTG ba +1 AAAAAAAAAAAAATCGTGGTTTGT ba +1 AAAAAAAAAAAAATTCACCCTTGT ba +7 AAAAAAAAAAAACACACTTAACTT ba +1 AAAAAAAAAAAACAGTGTTGAGAC ba +4 AAAAAAAAAAAACCGCTCCTCACA ba +1 AAAAAAAAAAAAGGCAACACAGAA ab +2 AAAAAAAAAAAATCTTTCTTTGAG ab +1 AAAAAAAAAAAATTGGGTTCCTTA ab +1 AAAAAAAAAAAGAGTCGCACCCAG ba +4 AAAAAAAAAAAGATCGTGGTTTGT ba +1 AAAAAAAAAAAGCGCAACACAGAA ab +3 AAAAAAAAAAAGGGCAACACAGAA ab +1 AAAAAAAAAAAGTAGCCCTAAACG ab +1 AAAAAAAAAAAGTCTTTCTTTGAG ab +1 AAAAAAAAAAATATCATAGACTCT ab +6 AAAAAAAAAAATATTCACCCTTGT ba +1 AAAAAAAAAAATATTCGAAAGTTA ba \ No newline at end of file diff -r 000000000000 -r 7fc3c8704c05 test-data/td_output.pdf Binary file test-data/td_output.pdf has changed diff -r 000000000000 -r 7fc3c8704c05 test-data/td_output.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/td_output.tab Thu Oct 24 09:36:09 2019 -0400 @@ -0,0 +1,92 @@ +td_data.tab +nr of tags 20 +sample size 20 + +Tag distance separated by family size + FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum +TD=1 5 1 1 1 1 0 9 +TD=6 3 0 0 0 0 0 3 +TD=7 4 0 0 0 1 0 5 +TD=8 2 0 0 1 0 0 3 +sum 14 1 1 2 2 0 20 + +Family size distribution separated by Tag distance + TD=1 TD=2 TD=3 TD=4 TD=5-8 TD>8 sum +FS=1 5 0 0 0 9 0 14 +FS=2 1 0 0 0 0 0 1 +FS=3 1 0 0 0 0 0 1 +FS=4 1 0 0 0 1 0 2 +FS=6 1 0 0 0 0 0 1 +FS=7 0 0 0 0 1 0 1 +sum 9 0 0 0 11 0 20 + + +max. family size in sample: 7 +absolute frequency: 1 +relative frequency: 0.05 + +Chimera Analysis: +The tags are split into two halves (part a and b) for which the Tag distances (TD) are calculated separately. +The tag distance of the first half (part a) is calculated by comparing part a of the tag in the sample against all a parts in the dataset and by selecting the minimum value (TD a.min). +In the next step, we select those tags that showed the minimum TD and estimate the TD for the second half (part b) of the tag by comparing part b against the previously selected subset. +The maximum value then represents TD b.max. Finally, this process is repeated, but starting with part b instead, and TD b.min and TD a.max are calculated. +Next, the absolute differences between TD a.min & TD b.max and TD b.min & TD a.max are estimated (delta TD). +These are then divided by the sum of both parts (TD a.min + TD b.max or TD b.min + TD a.max, respectively), which gives the relative differences between the partial TDs (rel. delta TD). +For simplicity, we used the maximum value of the relative differences and the respective delta TD. +Note that when only tags that can form a DCS are included in the analysis, the family sizes for both directions (ab and ba) of the strand will be included in the plots.
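The partial tag distances described above might be computed along the following lines; this is a sketch of the stated definitions, not the tool's implementation, and it reuses the illustrative `hamming()` helper from the previous sketch:

```python
# Sketch: TD a.min over all first halves, TD b.max within the subset of tags
# that achieved TD a.min, plus the absolute and relative delta TD.
def partial_tag_distances(tag, dataset, half=12):
    others = [t for t in dataset if t != tag]
    td_a = [hamming(tag[:half], t[:half]) for t in others]
    a_min = min(td_a)
    subset = [t for t, d in zip(others, td_a) if d == a_min]
    b_max = max(hamming(tag[half:], t[half:]) for t in subset)
    delta = abs(a_min - b_max)
    rel_delta = delta / float(a_min + b_max) if a_min + b_max > 0 else 0.0
    return a_min, b_max, delta, rel_delta
```

The same computation is then repeated starting from part b (yielding TD b.min and TD a.max), and the larger of the two relative deltas is the one reported.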
+ +length of one half of the tag 12 + +Tag distance of each half in the tag + TD a.min TD b.max TD b.min TD a.max TD a.min + b.max, TD a.max + b.min sum +TD=0 20 0 8 1 0 29 +TD=1 0 0 1 19 8 28 +TD=2 0 0 0 0 1 1 +TD=5 0 0 3 0 0 3 +TD=6 0 0 2 0 3 5 +TD=7 0 1 6 0 4 11 +TD=8 0 2 0 0 7 9 +TD=9 0 1 0 0 1 2 +TD=10 0 2 0 0 2 4 +TD=11 0 7 0 0 7 14 +TD=12 0 7 0 0 7 14 +sum 20 20 20 20 40 120 + +Absolute delta Tag distance within the tag + FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum +diff=7 1 0 0 0 0 0 1 +diff=8 1 0 0 0 1 0 2 +diff=9 1 0 0 0 0 0 1 +diff=10 2 0 0 0 0 0 2 +diff=11 4 0 1 1 1 0 7 +diff=12 5 1 0 1 0 0 7 +sum 14 1 1 2 2 0 20 + +Chimera analysis: relative delta Tag distance + FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum +diff=1.0 14 1 1 2 2 0 20 +sum 14 1 1 2 2 0 20 + +All tags are filtered, and only those tags in which one half is identical (TD=0), and which therefore have a relative delta TD of 1, are kept. +These tags are considered chimeras. +Tag distance of chimeric families separated by FS + FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum +TD=7 1 0 0 0 0 0 1 +TD=8 1 0 0 0 1 0 2 +TD=9 1 0 0 0 0 0 1 +TD=10 2 0 0 0 0 0 2 +TD=11 4 0 1 1 1 0 7 +TD=12 5 1 0 1 0 0 7 +sum 14 1 1 2 2 0 20 + +Tag distance of chimeric families separated by DCS and single SSCS (ab, ba) + DCS SSCS ab SSCS ba sum +TD=7.0 0 0 1 1 +TD=8.0 0 1 1 2 +TD=9.0 0 1 0 1 +TD=10.0 0 1 1 2 +TD=11.0 0 3 4 7 +TD=12.0 0 2 5 7 +sum 0 8 12 20 + +
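The filtering rule above (one identical half, hence a relative delta TD of 1) could be expressed as a short sketch built on the hypothetical `partial_tag_distances()` helper from the previous block:

```python
# Sketch of the chimera filter: a tag is flagged when the larger of its two
# relative delta TDs equals 1, i.e. one half matches some other tag exactly.
def is_chimera(tag, tags, half=12):
    def swap(t):
        return t[half:] + t[:half]  # swapping halves restarts the procedure from part b
    _, _, _, rel_ab = partial_tag_distances(tag, tags, half)
    _, _, _, rel_ba = partial_tag_distances(swap(tag), [swap(t) for t in tags], half)
    return max(rel_ab, rel_ba) == 1.0

# Hypothetical usage: `all_tags` would hold the 24 bp tag strings of td_data.tab.
# chimeras = [t for t in all_tags if is_chimera(t, all_tags)]
```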