changeset 0:7fc3c8704c05 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fsd commit 4cdd9c109b2c38474ef084ec92f3b56e0f73a760"
author iuc
date Thu, 24 Oct 2019 09:36:09 -0400
parents
children ec5a92514113
files fsd.py fsd.xml fsd_beforevsafter.py fsd_beforevsafter.xml fsd_macros.xml fsd_regions.py fsd_regions.xml td.py td.xml test-data/fsd_ba.bam test-data/fsd_ba_DCS.fna test-data/fsd_ba_data.tab test-data/fsd_ba_output.pdf test-data/fsd_ba_output.tab test-data/fsd_ba_trimmed.fna test-data/fsd_data1.tab test-data/fsd_data2.tab test-data/fsd_data3.tab test-data/fsd_data4.tab test-data/fsd_output1.pdf test-data/fsd_output1.tab test-data/fsd_output2.pdf test-data/fsd_output2.tab test-data/fsd_reg.bam test-data/fsd_reg.tab test-data/fsd_reg_output.pdf test-data/fsd_reg_output.tab test-data/fsd_reg_ranges.bed test-data/td_chimeras_output.tab test-data/td_data.tab test-data/td_output.pdf test-data/td_output.tab
diffstat 32 files changed, 4657 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fsd.py	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,797 @@
+#!/usr/bin/env python
+
+# Family size distribution of SSCSs
+#
+# Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
+# Contact: monika.heinzl@edumail.at
+#
+# Takes at least one TABULAR file with tags before the alignment to the SSCS, but up to 4 files can be provided, as input.
+# The program produces a plot which shows the distribution of family sizes of all the SSCSs from the input files and
+# a tabular file with the data of the plot, as well as a TXT file with all tags of the DCS and their family sizes.
+# If only one file is provided, then a family size distribution, which is separated after SSCSs without a partner and DCSs, is produced.
+# Whereas a family size distribution with multiple data in one plot is produced, when more than one file (up to 4) is given.
+
+# USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --log_axis --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf
+
+import argparse
+import sys
+
+import matplotlib.pyplot as plt
+import numpy
+from matplotlib.backends.backend_pdf import PdfPages
+
+plt.switch_backend('agg')
+
+
+def readFileReferenceFree(file):  # load a tab-separated tags file into a 2-D numpy array of strings; 'file' shadows the builtin name
+    with open(file, 'r') as dest_f:
+        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string')  # NOTE(review): dtype='string' is Python-2-era numpy; modern numpy/py3 needs dtype=str -- confirm target runtime
+        return(data_array)  # parentheses are redundant: 'return' is a statement, not a call
+
+
+def make_argparser():  # build the CLI parser: up to 4 dataset file/name pairs plus plot and output options
+    parser = argparse.ArgumentParser(description='Family Size Distribution of duplex sequencing data')
+    parser.add_argument('--inputFile1', help='Tabular File with three columns: ab or ba, tag and family size.')
+    parser.add_argument('--inputName1')  # display name for dataset 1 (used in plot legends/labels)
+    parser.add_argument('--inputFile2', default=None, help='Tabular File with three columns: ab or ba, tag and family size.')
+    parser.add_argument('--inputName2')  # display name for dataset 2
+    parser.add_argument('--inputFile3', default=None, help='Tabular File with three columns: ab or ba, tag and family size.')
+    parser.add_argument('--inputName3')  # display name for dataset 3
+    parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.')
+    parser.add_argument('--inputName4')  # display name for dataset 4
+    parser.add_argument('--log_axis', action="store_false", help='Transform y axis in log scale.')  # NOTE(review): store_false => default True; passing the flag turns log scale OFF, opposite of what the help implies -- confirm intended
+    parser.add_argument('--rel_freq', action="store_false", help='If False, the relative frequencies are displayed.')  # NOTE(review): same inverted store_false semantics as --log_axis -- confirm intended
+    parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.')
+    parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.')
+    return parser
+
+
+def compare_read_families(argv):
+    parser = make_argparser()
+    args = parser.parse_args(argv[1:])
+
+    firstFile = args.inputFile1
+    name1 = args.inputName1
+
+    secondFile = args.inputFile2
+    name2 = args.inputName2
+    thirdFile = args.inputFile3
+    name3 = args.inputName3
+    fourthFile = args.inputFile4
+    name4 = args.inputName4
+    log_axis = args.log_axis
+    rel_freq = args.rel_freq
+
+    title_file = args.output_tabular
+    title_file2 = args.output_pdf
+
+    sep = "\t"
+
+    plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
+    plt.rcParams['patch.edgecolor'] = "black"
+    plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
+    plt.rcParams['xtick.labelsize'] = 14
+    plt.rcParams['ytick.labelsize'] = 14
+
+    list_to_plot = []
+    label = []
+    data_array_list = []
+    list_to_plot_original = []
+    colors = []
+    bins = numpy.arange(1, 22)
+    with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
+        fig = plt.figure()
+        fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0)
+        fig2 = plt.figure()
+        fig2.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0)
+
+        if firstFile is not None:
+            file1 = readFileReferenceFree(firstFile)
+            integers = numpy.array(file1[:, 0]).astype(int)  # keep original family sizes
+            list_to_plot_original.append(integers)
+            colors.append("#0000FF")
+
+            # for plot: replace all big family sizes by 22
+            data1 = numpy.clip(integers, bins[0], bins[-1])
+            name1 = name1.split(".tabular")[0]
+            if len(name1) > 40:
+                name1 = name1[:40]
+            list_to_plot.append(data1)
+            label.append(name1)
+            data_array_list.append(file1)
+
+            legend = "\n\n\n{}".format(name1)
+            fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1],
+                                                                       float(numpy.bincount(data1)[1]) / len(data1))
+            fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)
+
+            legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1],
+                                                        float(numpy.bincount(data1)[1]) / sum(integers))
+            fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)
+
+            legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(len(integers[integers > 20]),
+                                                                             float(len(integers[integers > 20])) / len(integers))
+            fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)
+
+            legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]),
+                                                       float(sum(integers[integers > 20])) / sum(integers))
+            fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)
+
+            legend6 = "total nr. of\ntags\n{:,}".format(len(data1))
+            fig.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure)
+
+            legend6b = "PE reads\n{:,}".format(sum(integers))
+            fig.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure)
+
+        if secondFile is not None:
+            file2 = readFileReferenceFree(secondFile)
+            integers2 = numpy.array(file2[:, 0]).astype(int)  # keep original family sizes
+            list_to_plot_original.append(integers2)
+            colors.append("#298A08")
+
+            data2 = numpy.clip(integers2, bins[0], bins[-1])
+            list_to_plot.append(data2)
+            name2 = name2.split(".tabular")[0]
+            if len(name2) > 40:
+                name2 = name2[:40]
+            label.append(name2)
+            data_array_list.append(file2)
+
+            fig.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure)
+
+            legend1 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / len(data2))
+            fig.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure)
+
+            legend3 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / sum(integers2))
+            fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure)
+
+            legend4 = "{:,} ({:.3f})".format(len(integers2[integers2 > 20]),
+                                             float(len(integers2[integers2 > 20])) / len(integers2))
+            fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)
+
+            legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]),
+                                             float(sum(integers2[integers2 > 20])) / sum(integers2))
+            fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
+
+            legend6 = "{:,}".format(len(data2))
+            fig.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure)
+
+            legend6b = "{:,}".format(sum(integers2))
+            fig.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure)
+
+        if thirdFile is not None:
+            file3 = readFileReferenceFree(thirdFile)
+            integers3 = numpy.array(file3[:, 0]).astype(int)  # keep original family sizes
+            list_to_plot_original.append(integers3)
+            colors.append("#DF0101")
+
+            data3 = numpy.clip(integers3, bins[0], bins[-1])
+            list_to_plot.append(data3)
+            name3 = name3.split(".tabular")[0]
+            if len(name3) > 40:
+                name3 = name3[:40]
+            label.append(name3)
+            data_array_list.append(file3)
+
+            fig.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure)
+
+            legend1 = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / len(data3))
+            fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)
+
+            legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1],
+                                              float(numpy.bincount(data3)[1]) / sum(integers3))
+            fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)
+
+            legend4 = "{:,} ({:.3f})".format(len(integers3[integers3 > 20]),
+                                             float(len(integers3[integers3 > 20])) / len(integers3))
+            fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)
+
+            legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]),
+                                             float(sum(integers3[integers3 > 20])) / sum(integers3))
+            fig.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure)
+
+            legend6 = "{:,}".format(len(data3))
+            fig.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure)
+
+            legend6b = "{:,}".format(sum(integers3))
+            fig.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure)
+
+        if fourthFile is not None:
+            file4 = readFileReferenceFree(fourthFile)
+            integers4 = numpy.array(file4[:, 0]).astype(int)  # keep original family sizes
+            list_to_plot_original.append(integers4)
+            colors.append("#04cec7")
+
+            data4 = numpy.clip(integers4, bins[0], bins[-1])
+            list_to_plot.append(data4)
+            name4 = name4.split(".tabular")[0]
+            if len(name4) > 40:
+                name4 = name4[:40]
+            label.append(name4)
+            data_array_list.append(file4)
+
+            fig.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure)
+
+            legend1 = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / len(data4))
+            fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)
+
+            legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1],
+                                              float(numpy.bincount(data4)[1]) / sum(integers4))
+            fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)
+
+            legend4 = "{:,} ({:.3f})".format(len(integers4[integers4 > 20]),
+                                             float(len(integers4[integers4 > 20])) / len(integers4))
+            fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)
+
+            legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]),
+                                             float(sum(integers4[integers4 > 20])) / sum(integers4))
+            fig.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure)
+
+            legend6 = "{:,}".format(len(data4))
+            fig.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure)
+
+            legend6b = "{:,}".format(sum(integers4))
+            fig.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure)
+
+        list_to_plot2 = list_to_plot
+
+        if rel_freq:
+            ylab = "Relative Frequency"
+        else:
+            ylab = "Absolute Frequency"
+
+        # PLOT FSD based on tags
+        fig.suptitle('Family Size Distribution (FSD) based on families', fontsize=14)
+        ax = fig.add_subplot(1, 1, 1)
+        ticks = numpy.arange(1, 22, 1)
+        ticks1 = map(str, ticks)
+        ticks1[len(ticks1) - 1] = ">20"
+        ax.set_xticks([], [])
+        if rel_freq:
+            w = [numpy.zeros_like(data) + 1. / len(data) for data in list_to_plot2]
+            counts = ax.hist(list_to_plot2, weights=w, bins=numpy.arange(1, 23), stacked=False, edgecolor="black", color=colors, linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8)
+            ax.set_ylim(0, 1.07)
+        else:
+            counts = ax.hist(list_to_plot2, bins=numpy.arange(1, 23), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8, color=colors)
+        ax.set_xticks(numpy.array(ticks))
+        ax.set_xticklabels(ticks1)
+        ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+
+        ax.set_ylabel(ylab, fontsize=14)
+        ax.set_xlabel("Family size", fontsize=14)
+        if log_axis:
+            ax.set_yscale('log')
+        ax.grid(b=True, which="major", color="#424242", linestyle=":")
+        ax.margins(0.01, None)
+        pdf.savefig(fig)
+
+        # PLOT FSD based on PE reads
+        fig2.suptitle('Family Size Distribution (FSD) based on PE reads', fontsize=14)
+        ax2 = fig2.add_subplot(1, 1, 1)
+        ticks = numpy.arange(1, 22)
+        ticks1 = map(str, ticks)
+        ticks1[len(ticks1) - 1] = ">20"
+        reads = []
+        reads_rel = []
+
+        barWidth = 0 - (len(list_to_plot) + 1) / 2 * 1. / (len(list_to_plot) + 1)
+        ax2.set_xticks([], [])
+
+        for i in range(len(list_to_plot2)):
+            x = list(numpy.arange(1, 22).astype(float))
+            unique, c = numpy.unique(list_to_plot2[i], return_counts=True)
+            y = unique * c
+            if sum(list_to_plot_original[i] > 20) > 0:
+                y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20])
+            y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))]
+            reads.append(y)
+            reads_rel.append(list(numpy.float_(y)) / sum(y))
+
+            if len(list_to_plot2) == 1:
+                x = [xi * 0.5 for xi in x]
+                w = 0.4
+            else:
+                x = [xi + barWidth for xi in x]
+                w = 1. / (len(list_to_plot) + 1)
+            if rel_freq:
+                ax2.bar(x, list(numpy.float_(y)) / numpy.sum(y), align="edge", width=w, edgecolor="black", label=label[i], linewidth=1, alpha=0.7, color=colors[i])
+                ax2.set_ylim(0, 1.07)
+            else:
+                ax2.bar(x, y, align="edge", width=w, edgecolor="black", label=label[i], linewidth=1, alpha=0.7, color=colors[i])
+            if i == len(list_to_plot2) - 1:
+                barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1)
+            else:
+                barWidth += 1. / (len(list_to_plot) + 1)
+
+        ax2.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+
+        if len(list_to_plot2) == 1:
+            ax2.set_xticks(numpy.array([xi + 0.2 for xi in x]))
+        else:
+            ax2.set_xticks(numpy.array(ticks))
+        ax2.set_xticklabels(ticks1)
+        ax2.set_xlabel("Family size", fontsize=14)
+        ax2.set_ylabel(ylab, fontsize=14)
+        if log_axis:
+            ax2.set_yscale('log')
+        ax2.grid(b=True, which="major", color="#424242", linestyle=":")
+        ax2.margins(0.01, None)
+
+        pdf.savefig(fig2)
+        plt.close()
+
+        # write data to CSV file tags
+        counts = [numpy.bincount(di, minlength=22)[1:] for di in list_to_plot2]  # original counts of family sizes
+        output_file.write("Values from family size distribution with all datasets based on families\n")
+        output_file.write("\nFamily size")
+        for i in label:
+            output_file.write("{}{}".format(sep, i))
+        output_file.write("\n")
+        j = 0
+        for fs in bins:
+            if fs == 21:
+                fs = ">20"
+            else:
+                fs = "={}".format(fs)
+            output_file.write("FS{}{}".format(fs, sep))
+            for n in range(len(label)):
+                output_file.write("{}{}".format(int(counts[n][j]), sep))
+            output_file.write("\n")
+            j += 1
+        output_file.write("sum{}".format(sep))
+        for i in counts:
+            output_file.write("{}{}".format(int(sum(i)), sep))
+
+        # write data to CSV file PE reads
+        output_file.write("\n\nValues from family size distribution with all datasets based on PE reads\n")
+        output_file.write("\nFamily size")
+        for i in label:
+            output_file.write("{}{}".format(sep, i))
+        output_file.write("\n")
+        j = 0
+
+        for fs in bins:
+            if fs == 21:
+                fs = ">20"
+            else:
+                fs = "={}".format(fs)
+            output_file.write("FS{}{}".format(fs, sep))
+            if len(label) == 1:
+                output_file.write("{}{}".format(int(reads[0][j]), sep))
+            else:
+                for n in range(len(label)):
+                    output_file.write("{}{}".format(int(reads[n][j]), sep))
+            output_file.write("\n")
+            j += 1
+        output_file.write("sum{}".format(sep))
+        if len(label) == 1:
+            output_file.write("{}{}".format(int(sum(numpy.concatenate(reads))), sep))
+        else:
+            for i in reads:
+                output_file.write("{}{}".format(int(sum(i)), sep))
+        output_file.write("\n")
+
+        # Family size distribution after DCS and SSCS
+        for dataset, data_o, name_file in zip(list_to_plot, data_array_list, label):
+            tags = numpy.array(data_o[:, 2])
+            seq = numpy.array(data_o[:, 1])
+            data = numpy.array(dataset)
+            data_o = numpy.array(data_o[:, 0]).astype(int)
+            # find all unique tags and get the indices for ALL tags, but only once
+            u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
+            d = u[c > 1]
+
+            # get family sizes, tag for duplicates
+            duplTags_double = data[numpy.in1d(seq, d)]
+            duplTags_double_o = data_o[numpy.in1d(seq, d)]
+
+            duplTags = duplTags_double[0::2]  # ab of DCS
+            duplTags_o = duplTags_double_o[0::2]  # ab of DCS
+
+            duplTagsBA = duplTags_double[1::2]  # ba of DCS
+            duplTagsBA_o = duplTags_double_o[1::2]  # ba of DCS
+
+            # get family sizes for SSCS with no partner
+            ab = numpy.where(tags == "ab")[0]
+            abSeq = seq[ab]
+            ab_o = data_o[ab]
+            ab = data[ab]
+
+            ba = numpy.where(tags == "ba")[0]
+            baSeq = seq[ba]
+            ba_o = data_o[ba]
+            ba = data[ba]
+
+            dataAB = ab[numpy.in1d(abSeq, d, invert=True)]
+            dataAB_o = ab_o[numpy.in1d(abSeq, d, invert=True)]
+
+            dataBA = ba[numpy.in1d(baSeq, d, invert=True)]
+            dataBA_o = ba_o[numpy.in1d(baSeq, d, invert=True)]
+
+            list1 = [duplTags_double, dataAB, dataBA]  # list for plotting
+            list1_o = [duplTags_double_o, dataAB_o, dataBA_o]  # list for plotting
+
+            # information for family size >= 3
+            dataAB_FS3 = dataAB[dataAB >= 3]
+            dataAB_FS3_o = dataAB_o[dataAB_o >= 3]
+            dataBA_FS3 = dataBA[dataBA >= 3]
+            dataBA_FS3_o = dataBA_o[dataBA_o >= 3]
+
+            duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)]  # ab+ba with FS>=3
+            duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)]  # ba+ab with FS>=3
+            duplTags_double_FS3 = len(duplTags_FS3) + len(duplTags_FS3_BA)  # both ab and ba strands with FS>=3
+
+            # original FS
+            duplTags_FS3_o = duplTags_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)]  # ab+ba with FS>=3
+            duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)]  # ba+ab with FS>=3
+            duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o)  # both ab and ba strands with FS>=3
+
+            fig = plt.figure()
+            plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0)
+
+            if rel_freq:
+                w = [numpy.zeros_like(dj) + 1. / len(numpy.concatenate(list1)) for dj in list1]
+                plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"], weights=w, edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], rwidth=0.8)
+                plt.ylim(0, 1.07)
+            else:
+                plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"], edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], rwidth=0.8)
+
+            # tick labels of x axis
+            ticks = numpy.arange(1, 22, 1)
+            ticks1 = map(str, ticks)
+            ticks1[len(ticks1) - 1] = ">20"
+            plt.xticks(numpy.array(ticks), ticks1)
+            singl = len(data_o[data_o == 1])
+            last = len(data_o[data_o > 20])  # large families
+            if log_axis:
+                plt.yscale('log')
+            plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
+            plt.title("{}: FSD based on families".format(name_file), fontsize=14)
+            plt.xlabel("Family size", fontsize=14)
+            plt.ylabel(ylab, fontsize=14)
+            plt.margins(0.01, None)
+            plt.grid(b=True, which="major", color="#424242", linestyle=":")
+
+            # extra information beneath the plot
+            legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags="
+            plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA),
+                                                                                  len(duplTags), len(duplTags_double), (len(dataAB) + len(dataBA) + len(duplTags)),
+                                                                                  (len(ab) + len(ba)))
+            plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o),
+                                                                                sum(duplTags_o), sum(duplTags_double_o),
+                                                                                (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+                                                                                (sum(ab_o) + sum(ba_o)))
+            plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
+
+            legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
+                float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                (len(dataAB) + len(dataBA) + len(duplTags)))
+            plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)),
+                                                                           float(len(dataBA)) / (len(ab) + len(ba)),
+                                                                           float(len(duplTags)) / (len(ab) + len(ba)),
+                                                                           float(len(duplTags_double)) / (len(ab) + len(ba)),
+                                                                           (len(ab) + len(ba)))
+            plt.text(0.64, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend1 = "\nsingletons:\nfamily size > 20:"
+            plt.text(0.1, 0.03, legend1, size=10, transform=plt.gcf().transFigure)
+
+            legend4 = "{:,}\n{:,}".format(singl, last)
+            plt.text(0.23, 0.03, legend4, size=10, transform=plt.gcf().transFigure)
+            legend3 = "{:.3f}\n{:.3f}".format(float(singl) / len(data), float(last) / len(data))
+            plt.text(0.64, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+            legend3 = "\n\n{:,}".format(sum(data_o[data_o > 20]))
+            plt.text(0.38, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+            legend3 = "{:.3f}\n{:.3f}".format(float(singl) / sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o))
+            plt.text(0.84, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+            legend = "PE reads\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
+                float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+                float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+                float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+                (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)))
+            plt.text(0.74, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(
+                float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o)),
+                float(sum(dataBA_o)) / (sum(ab_o) + sum(ba_o)),
+                float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)),
+                float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o)), (sum(ab_o) + sum(ba_o)))
+            plt.text(0.84, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            pdf.savefig(fig)
+            plt.close()
+
+            # PLOT FSD based on PE reads
+            fig3 = plt.figure()
+            plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0)
+
+            fig3.suptitle("{}: FSD based on PE reads".format(name_file), fontsize=14)
+            ax2 = fig3.add_subplot(1, 1, 1)
+            ticks = numpy.arange(1, 22)
+            ticks1 = map(str, ticks)
+            ticks1[len(ticks1) - 1] = ">20"
+            reads = []
+            reads_rel = []
+
+            # barWidth = 0 - (len(list_to_plot) + 1) / 2 * 1. / (len(list_to_plot) + 1)
+            ax2.set_xticks([], [])
+
+            list_y = []
+            label = ["duplex", "ab", "ba"]
+            col = ["#FF0000", "#5FB404", "#FFBF00"]
+            for i in range(len(list1)):
+                x = list(numpy.arange(1, 22).astype(float))
+                unique, c = numpy.unique(list1[i], return_counts=True)
+                y = unique * c
+                if sum(list1_o[i] > 20) > 0:
+                    y[len(y) - 1] = sum(list1_o[i][list1_o[i] > 20])
+                y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))]
+                reads.append(y)
+                reads_rel.append(list(numpy.float_(y)) / sum(numpy.concatenate(list1_o)))
+
+                if rel_freq:
+                    y = list(numpy.float_(y)) / sum(numpy.concatenate(list1_o))
+                    ax2.set_ylim(0, 1.07)
+                else:
+                    y = y
+
+                list_y.append(y)
+                if i == 0:
+                    ax2.bar(x, y, align="center", width=0.8, edgecolor="black", label=label[0], linewidth=1, alpha=1, color=col[0])
+                elif i == 1:
+                    ax2.bar(x, y, bottom=list_y[i - 1], align="center", width=0.8, edgecolor="black", label=label[1], linewidth=1, alpha=1, color=col[1])
+                elif i == 2:
+                    bars = numpy.add(list_y[0], list_y[1]).tolist()
+                    ax2.bar(x, y, bottom=bars, align="center", width=0.8, edgecolor="black", label=label[2], linewidth=1, alpha=1, color=col[2])
+
+            ax2.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+
+            singl = len(data_o[data_o == 1])
+            last = len(data_o[data_o > 20])  # large families
+
+            ax2.set_xticks(numpy.array(ticks))
+            ax2.set_xticklabels(ticks1)
+            ax2.set_xlabel("Family size", fontsize=14)
+            ax2.set_ylabel(ylab, fontsize=14)
+            if log_axis:
+                ax2.set_yscale('log')
+            ax2.grid(b=True, which="major", color="#424242", linestyle=":")
+            ax2.margins(0.01, None)
+
+            # extra information beneath the plot
+            legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags="
+            plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA),
+                                                                                  len(duplTags), len(duplTags_double), (len(dataAB) + len(dataBA) + len(duplTags)),
+                                                                                  (len(ab) + len(ba)))
+            plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o),
+                                                                                sum(duplTags_o), sum(duplTags_double_o),
+                                                                                (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+                                                                                (sum(ab_o) + sum(ba_o)))
+            plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
+
+            legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
+                float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                (len(dataAB) + len(dataBA) + len(duplTags)))
+            plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)),
+                                                                           float(len(dataBA)) / (len(ab) + len(ba)),
+                                                                           float(len(duplTags)) / (len(ab) + len(ba)),
+                                                                           float(len(duplTags_double)) / (len(ab) + len(ba)),
+                                                                           (len(ab) + len(ba)))
+            plt.text(0.64, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend1 = "\nsingletons:\nfamily size > 20:"
+            plt.text(0.1, 0.03, legend1, size=10, transform=plt.gcf().transFigure)
+
+            legend4 = "{:,}\n{:,}".format(singl, last)
+            plt.text(0.23, 0.03, legend4, size=10, transform=plt.gcf().transFigure)
+            legend3 = "{:.3f}\n{:.3f}".format(float(singl) / len(data), float(last) / len(data))
+            plt.text(0.64, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+            legend3 = "\n\n{:,}".format(sum(data_o[data_o > 20]))
+            plt.text(0.38, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+            legend3 = "{:.3f}\n{:.3f}".format(float(singl) / sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o))
+            plt.text(0.84, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+            legend = "PE reads\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
+                float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+                float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+                float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+                (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)))
+            plt.text(0.74, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(
+                float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o)),
+                float(sum(dataBA_o)) / (sum(ab_o) + sum(ba_o)),
+                float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)),
+                float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o)), (sum(ab_o) + sum(ba_o)))
+            plt.text(0.84, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            pdf.savefig(fig3)
+            plt.close()
+
+            # write same information to a csv file
+            count = numpy.bincount(data_o)  # original counts of family sizes
+
+            output_file.write("\nDataset:{}{}\n".format(sep, name_file))
+            output_file.write("max. family size:{}{}\n".format(sep, max(data_o)))
+            output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
+            output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))
+
+            output_file.write("median family size:{}{}\n".format(sep, numpy.median(numpy.array(data_o))))
+            output_file.write("mean family size:{}{}\n\n".format(sep, numpy.mean(numpy.array(data_o))))
+
+            output_file.write(
+                "{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep,
+                                                                                          sep, sep))
+            output_file.write(
+                "{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(
+                    sep, sep, sep, sep, sep, sep, sep, sep, sep))
+            output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format(
+                name_file, sep, singl, sep, float(singl) / len(data), sep, float(singl) / sum(data_o), sep,
+                last, sep, float(last) / len(data), sep, sum(data_o[data_o > 20]), sep, float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data),
+                sep, sum(data_o)))
+
+            # information for FS >= 1
+            output_file.write(
+                "The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)\n"
+                "Whereas the total frequencies were calculated from the whole dataset (=including the DCS).\n\n")
+            output_file.write(
+                "FS >= 1{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep,
+                                                                                                              sep, sep,
+                                                                                                              sep))
+            output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep))
+            output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format(
+                sep, len(dataAB), sep, sum(dataAB_o), sep,
+                float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                sep, float(len(dataAB)) / (len(ab) + len(ba)), sep, float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+                sep, float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o))))
+            output_file.write("SSCS ba{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format(
+                sep, len(dataBA), sep, sum(dataBA_o), sep,
+                float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                sep, float(len(dataBA)) / (len(ab) + len(ba)), sep,
+                float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+                sep, float(sum(dataBA_o)) / (sum(ab_o) + sum(ba_o))))
+            output_file.write(
+                "DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
+                    sep, len(duplTags), len(duplTags_double), sep, sum(duplTags_o), sum(duplTags_double_o), sep,
+                    float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), sep,
+                    float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), sep,
+                    float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
+                    float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)),
+                    float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o))))
+            output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format(
+                sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep,
+                (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
+                (len(dataAB) + len(dataBA) + len(duplTags)), sep, (len(ab) + len(ba)), sep,
+                (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, (sum(ab_o) + sum(ba_o))))
+            # information for FS >= 3
+            output_file.write(
+                "\nFS >= 3{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep,
+                                                                                                                sep,
+                                                                                                                sep,
+                                                                                                                sep,
+                                                                                                                sep))
+            output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep))
+            output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format(
+                sep, len(dataAB_FS3), sep, sum(dataAB_FS3_o), sep,
+                float(len(dataAB_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+                float(len(dataAB_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+                sep, float(sum(dataAB_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
+                sep, float(sum(dataAB_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
+            output_file.write("SSCS ba{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format(
+                sep, len(dataBA_FS3), sep, sum(dataBA_FS3_o), sep,
+                float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + len(duplTags_FS3)),
+                sep, float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+                sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
+                sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
+            output_file.write(
+                "DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
+                    sep, len(duplTags_FS3), duplTags_double_FS3, sep, sum(duplTags_FS3_o), duplTags_double_FS3_o, sep,
+                    float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+                    float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+                    float(duplTags_double_FS3) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+                    sep, float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
+                    sep,
+                    float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o),
+                    float(duplTags_double_FS3_o) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
+            output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format(
+                sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+                (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
+                sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+                (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+                sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep,
+                (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
+
+            counts = [numpy.bincount(dk, minlength=22)[1:] for dk in list1]  # original counts of family sizes
+            output_file.write("\nValues from family size distribution based on families\n")
+            output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep))
+
+            j = 0
+            for fs in bins:
+                if fs == 21:
+                    fs = ">20"
+                else:
+                    fs = "={}".format(fs)
+                output_file.write("FS{}{}".format(fs, sep))
+                for n in range(3):
+                    output_file.write("{}{}".format(int(counts[n][j]), sep))
+                output_file.write("{}\n".format(counts[0][j] + counts[1][j] + counts[2][j]))
+                j += 1
+            output_file.write("sum{}".format(sep))
+            for i in counts:
+                output_file.write("{}{}".format(int(sum(i)), sep))
+            output_file.write("{}\n".format(sum(counts[0] + counts[1] + counts[2])))
+
+            output_file.write("\nValues from family size distribution based on PE reads\n")
+            output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep))
+            j = 0
+            for fs in bins:
+                if fs == 21:
+                    fs = ">20"
+                else:
+                    fs = "={}".format(fs)
+                output_file.write("FS{}{}".format(fs, sep))
+                for n in range(3):
+                    output_file.write("{}{}".format(int(reads[n][j]), sep))
+                output_file.write("{}\n".format(reads[0][j] + reads[1][j] + reads[2][j]))
+                j += 1
+            output_file.write("sum{}".format(sep))
+            for i in reads:
+                output_file.write("{}{}".format(int(sum(i)), sep))
+            output_file.write("{}\n".format(sum(reads[0] + reads[1] + reads[2])))
+
+    print("Files successfully created!")
+
+
+if __name__ == '__main__':
+    sys.exit(compare_read_families(sys.argv))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fsd.xml	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,128 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tool id="fsd" name="FSD:" version="1.0.0" profile="19.01">
+    <description>Family Size Distribution of duplex sequencing tags</description>
+    <macros>
+        <import>fsd_macros.xml</import>
+    </macros>
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+        <requirement type="package" version="1.4.0">matplotlib</requirement>
+    </requirements>
+    <command>
+python '$__tool_directory__/fsd.py'
+#for $i, $s in enumerate($series)
+    --inputFile${i + 1} '${s.file}'
+    --inputName${i + 1} '${s.file.element_identifier}'
+#end for
+$log_axis 
+$rel_freq 
+--output_pdf '$output_pdf'
+--output_tabular '$output_tabular'
+    </command>
+    <inputs>
+        <repeat name="series" title="Input tags" min="1" max="4" help="All datasets are generated by post-processing of the output from 'Make Families' or 'Correct Barcodes' tool by extracting the first two columns, sorting the tags (column 1) and adding the counts of unique occurrences of each tag. See Help section below for a detailed explanation.">
+            <param name="file" type="data" format="tabular" label="Input tags"/>
+        </repeat>
+        <param name="log_axis" type="boolean" label="Log scale for y axis?" truevalue="" falsevalue="--log_axis" checked="False" help="Transform y axis in log scale."/>
+        <param name="rel_freq" type="boolean" label="Relative frequency?" truevalue="" falsevalue="--rel_freq" checked="False" help="If True (YES), the relative frequency to the total tags/families is plotted instead of the absolute numbers."/>
+    </inputs>
+    <outputs>
+        <data name="output_pdf" format="pdf" label="${tool.name} on ${on_string}: PDF"/>
+        <data name="output_tabular" format="tabular" label="${tool.name} on ${on_string}: Summary"/>
+    </outputs>
+    <tests>
+        <test>
+            <repeat name="series">
+                <param name="file" value="fsd_data1.tab"/>
+            </repeat>
+            <repeat name="series">
+                <param name="file" value="fsd_data2.tab"/>
+            </repeat>
+            <repeat name="series">
+                <param name="file" value="fsd_data3.tab"/>
+            </repeat>
+            <repeat name="series">
+                <param name="file" value="fsd_data4.tab"/>
+            </repeat>
+            <output name="output_pdf" file="fsd_output1.pdf" lines_diff="285"/>
+            <output name="output_tabular" file="fsd_output1.tab"/>
+        </test>
+        <test>
+            <repeat name="series">
+                <param name="file" value="fsd_data1.tab"/>
+            </repeat>
+            <repeat name="series">
+                <param name="file" value="fsd_data2.tab"/>
+            </repeat>
+            <repeat name="series">
+                <param name="file" value="fsd_data3.tab"/>
+            </repeat>
+            <output name="output_pdf" file="fsd_output2.pdf" lines_diff="285"/>
+            <output name="output_tabular" file="fsd_output2.tab"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+        
+This tool provides a computationally very fast insight into the distribution of the family sizes of ALL tags from a Duplex Sequencing experiment (DS) and gives a first assessment of the distribution of PE-reads in families with 1 member up to >20 members. This information is very useful in early decision steps of the analysis parameters, such as the minimum number of PE-reads to build the single stranded consensus sequence (SSCS). Moreover, this tool can compare several datasets or different steps in the analysis pipeline to monitor data loss or gain (e.g. families re-united with the barcode correction tool from the `Du Novo Analysis Pipeline <https://doi.org/10.1186/s13059-016-1039-4>`_). In an extension of this tool, each family is stratified into SSCS (ab/ba) and DCS, and the allocation of DCSs relative to SSCS-ab and SSCS-ba is visualized. This is quite handy to better understand the relationship of SSCS to DCS per family and to identify sources of bias (e.g. more SSCS to DCS in a particular family size, or more forward ab than reverse ba reads).
+        
+**Input**
+        
+This tool expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands. At least one input file must be provided and up to 4 different datasets can be compared with this tool. All input files are produced in the same way (see section **How to generate the input**)::
+
+ 1 2                        3
+ -----------------------------
+ 1 AAAAAAAAAAAAAAAAATGGTATG ba
+ 3 AAAAAAAAAAAAAATGGTATGGAC ab
+ 
+.. class:: infomark
+
+**How to generate the input**
+
+The first step of the `Du Novo Analysis Pipeline <https://doi.org/10.1186/s13059-016-1039-4>`_ is the **Make Families** tool or the **Correct Barcodes** tool that produces output in this form::
+
+ 1                        2  3     4
+ ------------------------------------------------------
+ AAAAAAAAAAAAAAATAGCTCGAT ab read1 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAATAGCTCGAT ab read2 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAATAGCTCGAT ab read3 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAAAATGGTATG ba read3 CGCTACGTGACTAAAACATG
+
+We only need columns 1 and 2. These two columns can be extracted from this dataset using the **Cut** tool::
+
+ 1                        2 
+ ---------------------------
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAAAATGGTATG ba
+
+Next, the tags are sorted in ascending or descending order using the **Sort** tool::
+
+ 1                        2 
+ ---------------------------
+ AAAAAAAAAAAAAAAAATGGTATG ba 
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+
+Finally, unique occurrences of each tag are counted. This is done using the **Unique lines** tool that adds an additional column with the counts that also represent the family size (column 1)::
+
+ 1 2                        3
+ -----------------------------
+ 1 AAAAAAAAAAAAAAAAATGGTATG ba
+ 3 AAAAAAAAAAAAAATGGTATGGAC ab
+
+These data can now be used in this tool.
+          
+**Output**
+        
+The output is a PDF file with a plot and a summary of the data in the plot. The tool analyzes the family size distribution, that is, the number of PE-reads per tag or family. This information is presented graphically as a histogram. The output can be interpreted in the following way:
+
+The **first part** of the output from this tool is “FSD after tags” and “FSD after PE reads”, depending on whether the data is compared to the total number of unique tags/families or the total number of PE-reads. The data is presented in a histogram showing the frequency of family sizes (FS) ranging from FS=1 to FS>20 members in a step-wise manner. Multiple datasets or steps of an analysis can be merged in one histogram (e.g. tags before and after barcode correction). The family size distribution gives the user an insight into the allocation of DCS in the library; information that can be used to decide on the optimal family size parameter for building SSCSs.
+
+The **second part** of the output shows the family size distribution (FSD) for each of the datasets in more detail. The tags are categorized based on whether they form DCS (duplex = complementary tags in the ab and ba direction) or form only SSCS-ab or SSCS-ba with no matching complement.
+]]> 
+</help>
+ <expand macro="citation" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fsd_beforevsafter.py	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,304 @@
+#!/usr/bin/env python
+# Family size distribution of DCS from various steps of the Galaxy pipeline
+#
+# Author: Monika Heinzl & Gundula Povysil, Johannes-Kepler University Linz (Austria)
+# Contact: monika.heinzl@edumail.at
+#
+# Takes a TXT file with tags of reads that were aligned to certain regions of the reference genome (optional),
+# a TABULAR file with tags before the alignment to the SSCS, a FASTA file with reads that were part of the DCS and
+# a FASTA file with tags after trimming as input (optional).
+# The program produces a plot which shows the distribution of family sizes of the DCS from the input files and
+# a CSV file with the data of the plot.
+# USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming --bamFile DCSbamFile
+# --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf
+
+import argparse
+import sys
+from collections import Counter
+
+import matplotlib.pyplot as plt
+import numpy
+import pysam
+from Bio import SeqIO
+from matplotlib.backends.backend_pdf import PdfPages
+
+plt.switch_backend('agg')
+
+
+def readFileReferenceFree(file, delim):
+    with open(file, 'r') as dest_f:
+        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string')
+        return data_array
+
+
+def readFasta(file):
+    tag_consensus = []
+    fs_consensus = []
+    with open(file, "r") as consFile:
+        for record in SeqIO.parse(consFile, "fasta"):
+            tag_consensus.append(record.id)
+            line = record.description
+            a, b = line.split(" ")
+            fs1, fs2 = b.split("-")
+            fs_consensus.extend([fs1, fs2])
+    fs_consensus = numpy.array(fs_consensus).astype(int)
+    return (tag_consensus, fs_consensus)
+
+
+def make_argparser():
+    parser = argparse.ArgumentParser(description='Analysis of read loss in duplex sequencing data')
+    parser.add_argument('--inputFile_SSCS',
+                        help='Tabular File with three columns: ab or ba, tag and family size.')
+    parser.add_argument('--inputName1')
+    parser.add_argument('--makeDCS',
+                        help='FASTA File with information about tag and family size in the header.')
+    parser.add_argument('--afterTrimming', default=None,
+                        help='FASTA File with information about tag and family size in the header.')
+    parser.add_argument('--bamFile',
+                        help='BAM file with aligned reads.')
+    parser.add_argument('--output_pdf', default="data.pdf", type=str,
+                        help='Name of the pdf and tabular file.')
+    parser.add_argument('--output_tabular', default="data.tabular", type=str,
+                        help='Name of the pdf and tabular file.')
+    return parser
+
+
+def compare_read_families_read_loss(argv):
+    parser = make_argparser()
+    args = parser.parse_args(argv[1:])
+    SSCS_file = args.inputFile_SSCS
+    SSCS_file_name = args.inputName1
+    makeConsensus = args.makeDCS
+    afterTrimming = args.afterTrimming
+    ref_genome = args.bamFile
+    title_file = args.output_tabular
+    title_file2 = args.output_pdf
+    sep = "\t"
+
+    with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
+        # PLOT
+        plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
+        plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
+        plt.rcParams['xtick.labelsize'] = 14
+        plt.rcParams['ytick.labelsize'] = 14
+        plt.rcParams['patch.edgecolor'] = "black"
+        fig = plt.figure()
+        plt.subplots_adjust(bottom=0.3)
+
+        list1 = []
+        colors = []
+        labels = []
+
+# data with tags of SSCS
+        data_array = readFileReferenceFree(SSCS_file, "\t")
+        seq = numpy.array(data_array[:, 1])
+        tags = numpy.array(data_array[:, 2])
+        quant = numpy.array(data_array[:, 0]).astype(int)
+
+        # split data with all tags of SSCS after ab and ba strands
+        all_ab = seq[numpy.where(tags == "ab")[0]]
+        all_ba = seq[numpy.where(tags == "ba")[0]]
+        quant_ab_sscs = quant[numpy.where(tags == "ab")[0]]
+        quant_ba_sscs = quant[numpy.where(tags == "ba")[0]]
+
+        seqDic_ab = dict(zip(all_ab, quant_ab_sscs))
+        seqDic_ba = dict(zip(all_ba, quant_ba_sscs))
+
+        # get tags of the SSCS which form a DCS
+        # group large family sizes
+        bigFamilies = numpy.where(quant > 20)[0]
+        quant[bigFamilies] = 22
+        maximumX = numpy.amax(quant)
+
+        # find all unique tags and get the indices for ALL tags (ab AND ba)
+        u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
+        d = u[c > 1]
+
+        # get family sizes, tag for the duplicates
+        duplTags_double = quant[numpy.in1d(seq, d)]
+        list1.append(duplTags_double)
+        colors.append("#0000FF")
+        labels.append("before SSCS building")
+
+        duplTags = duplTags_double[0::2]  # ab of DCS
+        duplTagsBA = duplTags_double[1::2]  # ba of DCS
+
+        d2 = d[(duplTags >= 3) & (duplTagsBA >= 3)]  # ab and ba FS>=3
+
+        # all SSCSs FS>=3
+        seq_unique, seqUnique_index = numpy.unique(seq, return_index=True)
+        seq_unique_FS = quant[seqUnique_index]
+        seq_unique_FS3 = seq_unique_FS[seq_unique_FS >= 3]
+
+        legend1 = "\ntotal nr. of tags (unique, FS>=1):\nDCS (before SSCS building, FS>=1):\ntotal nr. of tags (unique, FS>=3):\nDCS (before SSCS building, FS>=3):"
+        legend2 = "total numbers * \n{:,}\n{:,}\n{:,}\n{:,}".format(len(seq_unique_FS), len(duplTags),
+                                                                    len(seq_unique_FS3), len(d2))
+        plt.text(0.55, 0.14, legend1, size=11, transform=plt.gcf().transFigure)
+        plt.text(0.88, 0.14, legend2, size=11, transform=plt.gcf().transFigure)
+
+        # data make DCS
+        tag_consensus, fs_consensus = readFasta(makeConsensus)
+        # group large family sizes in the plot of fasta files
+        bigFamilies = numpy.where(fs_consensus > 20)[0]
+        fs_consensus[bigFamilies] = 22
+        list1.append(fs_consensus)
+        colors.append("#298A08")
+        labels.append("after DCS building")
+        legend3 = "after DCS building:"
+        legend4 = "{:,}".format(len(tag_consensus))
+        plt.text(0.55, 0.11, legend3, size=11, transform=plt.gcf().transFigure)
+        plt.text(0.88, 0.11, legend4, size=11, transform=plt.gcf().transFigure)
+
+        # data after trimming
+        if afterTrimming is not None:
+            tag_trimming, fs_trimming = readFasta(afterTrimming)
+            bigFamilies = numpy.where(fs_trimming > 20)[0]
+            fs_trimming[bigFamilies] = 22
+            list1.append(fs_trimming)
+            colors.append("#DF0101")
+            labels.append("after trimming")
+            legend5 = "after trimming:"
+            legend6 = "{:,}".format(len(tag_trimming))
+            plt.text(0.55, 0.09, legend5, size=11, transform=plt.gcf().transFigure)
+            plt.text(0.88, 0.09, legend6, size=11, transform=plt.gcf().transFigure)
+
+        # data of tags aligned to reference genome
+        if ref_genome is not None:
+            pysam.index(ref_genome)
+            bam = pysam.AlignmentFile(ref_genome, "rb")
+            seq_mut = []
+            for read in bam.fetch():
+                if not read.is_unmapped:
+                    if '_' in read.query_name:
+                        tags = read.query_name.split('_')[0]
+                    else:
+                        tags = read.query_name
+                    seq_mut.append(tags)
+
+            # use only unique tags that were alignment to the reference genome
+            seq_mut = numpy.array(seq_mut)
+            seq_mut, seqMut_index = numpy.unique(seq_mut, return_index=True)
+            # get family sizes for each tag in the BAM file
+            quant_ab = []
+            quant_ba = []
+            for i in seq_mut:
+                quant_ab.append(seqDic_ab.get(i))
+                quant_ba.append(seqDic_ba.get(i))
+
+            quant_ab_ref = numpy.array(quant_ab)
+            quant_ba_ref = numpy.array(quant_ba)
+            quant_all_ref = numpy.concatenate((quant_ab_ref, quant_ba_ref))
+            bigFamilies = numpy.where(quant_all_ref > 20)[0]  # group large family sizes
+            quant_all_ref[bigFamilies] = 22
+
+            list1.append(quant_all_ref)
+            colors.append("#04cec7")
+            labels.append("after alignment\nto reference")
+            legend7 = "after alignment to reference:"
+            length_DCS_ref = len(quant_ba_ref)  # count of duplex tags that were aligned to reference genome
+            legend8 = "{:,}".format(length_DCS_ref)
+            plt.text(0.55, 0.07, legend7, size=11, transform=plt.gcf().transFigure)
+            plt.text(0.88, 0.07, legend8, size=11, transform=plt.gcf().transFigure)
+
+        counts = plt.hist(list1, bins=range(-1, maximumX + 1), stacked=False, label=labels, color=colors, align="left", alpha=1, edgecolor="black", linewidth=1)
+        ticks = numpy.arange(0, maximumX, 1)
+        ticks1 = map(str, ticks)
+        ticks1[len(ticks1) - 1] = ">20"
+        plt.xticks(numpy.array(ticks), ticks1)
+        if ref_genome is not None:
+            count = numpy.array([v for k, v in sorted(Counter(quant_ab_ref).iteritems())])  # count all family sizes from all ab strands
+            legend = "max. family size:\nabsolute frequency:\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)"
+            plt.text(0.1, 0.085, legend, size=11, transform=plt.gcf().transFigure)
+
+            legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}" \
+                .format(max(quant_ab_ref), count[len(count) - 1], float(count[len(count) - 1]) / sum(count),
+                        sum(numpy.array(data_array[:, 0]).astype(int)))
+            plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure)
+
+            count2 = numpy.array(
+                [v for k, v in sorted(Counter(quant_ba_ref).iteritems())])  # count all family sizes from all ba strands
+            legend = "BA\n{}\n{}\n{:.5f}" \
+                .format(max(quant_ba_ref), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))
+            plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure)
+
+        legend4 = "* In the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags."
+        plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure)
+
+        plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
+        plt.title("Family size distribution of tags from various steps of the Du Novo pipeline", fontsize=14)
+        plt.xlabel("Family size", fontsize=14)
+        plt.ylabel("Absolute Frequency", fontsize=14)
+        plt.grid(b=True, which="major", color="#424242", linestyle=":")
+        plt.margins(0.01, None)
+
+        pdf.savefig(fig, bbox_inch="tight")
+        plt.close()
+
+        # write information about plot into a csv file
+        output_file.write("Dataset:{}{}\n".format(sep, SSCS_file_name))
+        if ref_genome is not None:
+            output_file.write("{}AB{}BA\n".format(sep, sep))
+            output_file.write("max. family size:{}{}{}{}\n".format(sep, max(quant_ab_ref), sep, max(quant_ba_ref)))
+            output_file.write(
+                "absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))
+            output_file.write(
+                "relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep,
+                                                                 float(count2[len(count2) - 1]) / sum(count2)))
+
+        output_file.write("\ntotal nr. of reads before SSCS building{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
+        output_file.write("\n\nValues from family size distribution\n")
+
+        if afterTrimming is None and ref_genome is None:
+            if afterTrimming is None:
+                output_file.write("{}before SSCS building{}after DCS building\n".format(sep, sep))
+            elif ref_genome is None:
+                output_file.write("{}before SSCS building{}after DCS building\n".format(sep, sep))
+
+            for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])]):
+                if fs == 21:
+                    fs = ">20"
+                else:
+                    fs = "={}".format(fs)
+                output_file.write("FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs)))
+            output_file.write("sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1]))))
+
+        elif afterTrimming is None or ref_genome is None:
+            if afterTrimming is None:
+                output_file.write("{}before SSCS building{}after DCS building{}after alignment to reference\n".format(sep, sep, sep))
+            elif ref_genome is None:
+                output_file.write("{}before SSCS building{}after DCS building{}after trimming\n".format(sep, sep, sep))
+
+            for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])]):
+                if fs == 21:
+                    fs = ">20"
+                else:
+                    fs = "={}".format(fs)
+                output_file.write("FS{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(reference)))
+            output_file.write("sum{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2]))))
+
+        else:
+            output_file.write("{}before SSCS building{}after DCS building{}after trimming{}after alignment to reference\n".format(sep, sep, sep, sep))
+            for fs, sscs, dcs, trim, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])], counts[0][3][2:len(counts[0][3])]):
+                if fs == 21:
+                    fs = ">20"
+                else:
+                    fs = "={}".format(fs)
+                output_file.write("FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(trim), sep, int(reference)))
+            output_file.write("sum{}{}{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2])), sep, int(sum(counts[0][3]))))
+
+        output_file.write("\n\nIn the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags.\n")
+        output_file.write("total nr. of tags (unique, FS>=1){}{}\n".format(sep, len(seq_unique_FS)))
+        output_file.write("DCS (before SSCS building, FS>=1){}{}\n".format(sep, len(duplTags)))
+        output_file.write("total nr. of tags (unique, FS>=3){}{}\n".format(sep, len(seq_unique_FS3)))
+        output_file.write("DCS (before SSCS building, FS>=3){}{}\n".format(sep, len(d2)))
+        output_file.write("after DCS building{}{}\n".format(sep, len(tag_consensus)))
+        if afterTrimming is not None:
+            output_file.write("after trimming{}{}\n".format(sep, len(tag_trimming)))
+        if ref_genome is not None:
+            output_file.write("after alignment to reference{}{}\n".format(sep, length_DCS_ref))
+
+        print("Files successfully created!")
+
+
+if __name__ == '__main__':
+    sys.exit(compare_read_families_read_loss(sys.argv))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fsd_beforevsafter.xml	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,122 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tool id="fsd_beforevsafter" name="FSD Before/After:" version="1.0.0" profile="19.01">
+    <description>Family Size Distribution of duplex sequencing tags during Du Novo analysis</description>
+    <macros>
+        <import>fsd_macros.xml</import>
+    </macros>
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+        <requirement type="package" version="1.4">matplotlib</requirement>
+        <requirement type="package" version="1.71">biopython</requirement>
+        <requirement type="package" version="0.15">pysam</requirement>
+    </requirements>
+    <command>
+python '$__tool_directory__/fsd_beforevsafter.py' 
+--inputFile_SSCS '$file1'
+--inputName1 '$file1.element_identifier' 
+--makeDCS '$makeDCS'
+#if $afterTrimming:
+    --afterTrimming '$afterTrimming'
+#end if
+#if $bamFile:
+--bamFile '$bamFile'
+#end if
+--output_pdf '$output_pdf' 
+--output_tabular '$output_tabular'
+    </command>
+    <inputs>
+        <param name="file1" type="data" format="tabular" label="Input tags of SSCSs" optional="false" help="This dataset is generated by post-processing of the output from 'Make Families' or 'Correct Barcodes' tool by extracting the first two columns, sorting the tags (column 1) and adding the counts of unique occurrences of each tag. See Help section below for a detailed explanation."/>
+        <param name="makeDCS" type="data" format="fasta" label="Input tags after making DCSs" help="Input in fasta format with the tags of the reads, which were aligned to DCSs. This file is produced by the 'Make consensus reads' tool."/>
+        <param name="afterTrimming" type="data" format="fasta" optional="true" label="Input tags after trimming" help="Input in fasta format with the tags of the reads, which were not filtered out after trimming. This file is produced by the 'Sequence Content Trimmer'."/>
+        <param name="bamFile" type="data" format="bam" optional="true" label="Input tags aligned to the reference genome" help="Input in BAM format with the reads that were aligned to the reference genome."/>
+    </inputs>
+    <outputs>
+        <data name="output_pdf" format="pdf" label="${tool.name} on ${on_string}: PDF"/>
+        <data name="output_tabular" format="tabular" label="${tool.name} on ${on_string}: Summary"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="file1" value="fsd_ba_data.tab"/>
+            <param name="makeDCS" value="fsd_ba_DCS.fna"/>
+            <param name="afterTrimming" value="fsd_ba_trimmed.fna"/>
+            <param name="bamFile" value="fsd_ba.bam"/>
+            <output name="output_pdf" file="fsd_ba_output.pdf" lines_diff="183"/>
+            <output name="output_tabular" file="fsd_ba_output.tab"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+**What it does**
+        
+This tool will create a distribution of family sizes from tags of various steps of the `Du Novo Analysis Pipeline <https://doi.org/10.1186/s13059-016-1039-4>`_.
+
+**Input**
+        
+**Dataset 1:** This tool expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands::
+
+ 1 2                        3
+ -----------------------------
+ 1 AAAAAAAAAAAAAAAAATGGTATG ba
+ 3 AAAAAAAAAAAAAATGGTATGGAC ab
+ 
+.. class:: infomark
+
+**How to generate the input**
+
+The first step of the `Du Novo Analysis Pipeline <https://doi.org/10.1186/s13059-016-1039-4>`_ is the **Make Families** tool or the **Correct Barcodes** tool that produces output in this form::
+
+ 1                        2  3     4
+ ------------------------------------------------------
+ AAAAAAAAAAAAAAATAGCTCGAT ab read1 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAATAGCTCGAT ab read2 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAATAGCTCGAT ab read3 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAAAATGGTATG ba read3 CGCTACGTGACTAAAACATG
+
+We only need columns 1 and 2. These two columns can be extracted from this dataset using the **Cut** tool::
+
+ 1                        2 
+ ---------------------------
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAAAATGGTATG ba
+
+Next, the tags are sorted in ascending or descending order using the **Sort** tool::
+
+ 1                        2 
+ ---------------------------
+ AAAAAAAAAAAAAAAAATGGTATG ba
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+
+Finally, unique occurrences of each tag are counted. This is done using **Unique lines** tool that adds an additional column with the counts that also represent the family size (column 1)::
+
+ 1 2                        3
+ -----------------------------
+ 1 AAAAAAAAAAAAAAAAATGGTATG ba
+ 3 AAAAAAAAAAAAAATGGTATGGAC ab
+
+These data can now be used in this tool.
+          
+**Dataset 2:** A fasta file is required with all tags and the associated family size of both strands (forward and reverse) in the header and the read itself in the next line. This input file can be obtained by the `Du Novo Analysis Pipeline <https://doi.org/10.1186/s13059-016-1039-4>`_ with the tool **Make consensus reads**.
+
+**Dataset 3 (optional):** In addition, the fasta file with all tags, which were not filtered out after trimming, can be included. This file can also be obtained by the `Du Novo Analysis Pipeline` with the tool **Sequence Content Trimmer**.
+
+For both input files (dataset 2 and 3), only the DCSs of one mate are necessary (these tools give information on both mates in the output file), since both files have the same tags and family sizes, but different DCSs, which are not required in this tool::
+ 
+ >AAAAAAAAATAGATCATAGACTCT 7-10
+ CTAGACTCACTGGCGTTACTGACTGCGAGACCCTCCACGTG
+ >AAAAAAAAGGCAGAAGATATACGC 11-3
+ CNCNGGCCCCCCGCTCCGTGCACAGACGNNGCNACTGACAA
+	
+**Dataset 4 (optional):** BAM file of the aligned reads. This file can be obtained by the tool `Map with BWA-MEM <https://arxiv.org/abs/1303.3997>`_.	
+
+**Output**
+        
+The output is a PDF file with a plot and a summary of the data of the plots. The tool compares various datasets from the `Du Novo Analysis Pipeline <https://doi.org/10.1186/s13059-016-1039-4>`_ and helps in decision making of various parameters (e.g. family size, minimum read length, etc). For example: Re-running trimming with different parameters allows to recover reads that would be lost due to stringent filtering by read length. This tool also allows to assess reads on target. The tool extracts the tags of reads and their family sizes before SSCS building, after DCS building, after trimming and finally after the alignment to the reference genome. In the plot, the family sizes for both SSCS-ab and SSCS-ba are shown; whereas the summary represents only counts either of SSCS-ab or of SSCS-ba.
+
+]]> 
+    </help>
+    <expand macro="citation" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fsd_macros.xml	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,13 @@
+<macros>
+    <xml name="citation">
+    <citations>
+        <citation type="bibtex">
+            @misc{duplex,
+            author = {Povysil, Gundula and Heinzl, Monika and Salazar, Renato and Stoler, Nicholas and Nekrutenko, Anton and Tiemann-Boege, Irene},
+            year = {2019},
+            title = {{Variant Analyzer: a quality control for variant calling in duplex sequencing data (manuscript)}}
+         }
+        </citation>
+    </citations>
+</xml>
+</macros>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fsd_regions.py	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,270 @@
+#!/usr/bin/env python
+
+# Family size distribution of tags which were aligned to the reference genome
+#
+# Author: Monika Heinzl & Gundula Povysil, Johannes-Kepler University Linz (Austria)
+# Contact: monika.heinzl@edumail.at
+#
+# Takes at least one TABULAR file with tags before the alignment to the SSCS,
+# a BAM file with tags of reads that overlap the regions of the reference genome and
+# an optional BED file with chromosome, start and stop position of the regions as input.
+# The program produces a plot which shows the distribution of family sizes of the tags from the input files and
+# a tabular file with the data of the plot.
+
+# USAGE: python FSD_regions.py --inputFile filenameSSCS --inputName1 filenameSSCS
+# --bamFile DCSbamFile --rangesFile BEDfile --output_tabular outputfile_name_tabular
+# --output_pdf outputfile_name_pdf
+
+import argparse
+import collections
+import re
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pysam
+from matplotlib.backends.backend_pdf import PdfPages
+
+plt.switch_backend('agg')
+
+
+def readFileReferenceFree(file, delim):
+    with open(file, 'r') as dest_f:
+        data_array = np.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string')
+        return data_array
+
+
+def make_argparser():
+    parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome')
+    parser.add_argument('--inputFile', help='Tabular file with three columns: family size, tag, and ab or ba.')
+    parser.add_argument('--inputName1')
+    parser.add_argument('--bamFile', help='BAM file with aligned reads.')
+    parser.add_argument('--rangesFile', default=None, help='BED file with chromosome, start and stop positions.')
+    parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.')
+    parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.')
+    return parser
+
+
+def compare_read_families_refGenome(argv):
+    parser = make_argparser()
+    args = parser.parse_args(argv[1:])
+
+    firstFile = args.inputFile
+    name1 = args.inputName1
+    name1 = name1.split(".tabular")[0]
+    bamFile = args.bamFile
+
+    rangesFile = args.rangesFile
+    title_file = args.output_pdf
+    title_file2 = args.output_tabular
+    sep = "\t"
+
+    with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf:
+        data_array = readFileReferenceFree(firstFile, "\t")
+        pysam.index(bamFile)
+
+        bam = pysam.AlignmentFile(bamFile, "rb")
+        qname_dict = collections.OrderedDict()
+
+        if rangesFile is not None:
+            with open(rangesFile, 'r') as regs:
+                range_array = np.genfromtxt(regs, skip_header=0, delimiter='\t', comments='#', dtype='string')
+
+            if range_array.ndim == 0:
+                print("Error: file has 0 lines")
+                exit(2)
+
+            if range_array.ndim == 1:
+                chrList = range_array[0]
+                start_posList = range_array[1].astype(int)
+                stop_posList = range_array[2].astype(int)
+                chrList = [chrList.tolist()]
+                start_posList = [start_posList.tolist()]
+                stop_posList = [stop_posList.tolist()]
+            else:
+                chrList = range_array[:, 0]
+                start_posList = range_array[:, 1].astype(int)
+                stop_posList = range_array[:, 2].astype(int)
+
+            if len(start_posList) != len(stop_posList):
+                print("start_positions and end_positions do not have the same length")
+                exit(3)
+
+            chrList = np.array(chrList)
+            start_posList = np.array(start_posList).astype(int)
+            stop_posList = np.array(stop_posList).astype(int)
+            for chr, start_pos, stop_pos in zip(chrList, start_posList, stop_posList):
+                chr_start_stop = "{}_{}_{}".format(chr, start_pos, stop_pos)
+                qname_dict[chr_start_stop] = []
+                for read in bam.fetch(chr.tobytes(), start_pos, stop_pos):
+                    if not read.is_unmapped:
+                        if re.search('_', read.query_name):
+                            tags = re.split('_', read.query_name)[0]
+                        else:
+                            tags = read.query_name
+                        qname_dict[chr_start_stop].append(tags)
+
+        else:
+            for read in bam.fetch():
+                if not read.is_unmapped:
+                    if re.search(r'_', read.query_name):
+                        tags = re.split('_', read.query_name)[0]
+                    else:
+                        tags = read.query_name
+
+                    if read.reference_name not in qname_dict.keys():
+                        qname_dict[read.reference_name] = [tags]
+                    else:
+                        qname_dict[read.reference_name].append(tags)
+
+        seq = np.array(data_array[:, 1])
+        tags = np.array(data_array[:, 2])
+        quant = np.array(data_array[:, 0]).astype(int)
+        group = np.array(qname_dict.keys())
+
+        all_ab = seq[np.where(tags == "ab")[0]]
+        all_ba = seq[np.where(tags == "ba")[0]]
+        quant_ab = quant[np.where(tags == "ab")[0]]
+        quant_ba = quant[np.where(tags == "ba")[0]]
+
+        seqDic_ab = dict(zip(all_ab, quant_ab))
+        seqDic_ba = dict(zip(all_ba, quant_ba))
+
+        lst_ab = []
+        lst_ba = []
+        quantAfterRegion = []
+        length_regions = 0
+        for i in group:
+            lst_ab_r = []
+            lst_ba_r = []
+            seq_mut = qname_dict[i]
+            if rangesFile is None:
+                seq_mut, seqMut_index = np.unique(np.array(seq_mut), return_index=True)
+            length_regions = length_regions + len(seq_mut) * 2
+            for r in seq_mut:
+                count_ab = seqDic_ab.get(r)
+                count_ba = seqDic_ba.get(r)
+                lst_ab_r.append(count_ab)
+                lst_ab.append(count_ab)
+                lst_ba_r.append(count_ba)
+                lst_ba.append(count_ba)
+
+            dataAB = np.array(lst_ab_r)
+            dataBA = np.array(lst_ba_r)
+            bigFamilies = np.where(dataAB > 20)[0]
+            dataAB[bigFamilies] = 22
+            bigFamilies = np.where(dataBA > 20)[0]
+            dataBA[bigFamilies] = 22
+
+            quantAll = np.concatenate((dataAB, dataBA))
+            quantAfterRegion.append(quantAll)
+
+        quant_ab = np.array(lst_ab)
+        quant_ba = np.array(lst_ba)
+
+        maximumX = np.amax(np.concatenate(quantAfterRegion))
+        minimumX = np.amin(np.concatenate(quantAfterRegion))
+
+        # PLOT
+        plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
+        plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
+        plt.rcParams['xtick.labelsize'] = 14
+        plt.rcParams['ytick.labelsize'] = 14
+        plt.rcParams['patch.edgecolor'] = "black"
+        fig = plt.figure()
+        plt.subplots_adjust(bottom=0.3)
+
+        colors = ["#6E6E6E", "#0431B4", "#5FB404", "#B40431", "#F4FA58", "#DF7401", "#81DAF5"]
+
+        col = []
+        for i in range(0, len(group)):
+            col.append(colors[i])
+
+        counts = plt.hist(quantAfterRegion, bins=range(minimumX, maximumX + 1), stacked=False, label=group,
+                          align="left", alpha=1, color=col, edgecolor="black", linewidth=1)
+        ticks = np.arange(minimumX - 1, maximumX, 1)
+
+        ticks1 = map(str, ticks)
+        ticks1[len(ticks1) - 1] = ">20"
+        plt.xticks(np.array(ticks), ticks1)
+        count = np.bincount(map(int, quant_ab))  # original counts
+
+        legend = "max. family size:\nabsolute frequency:\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)"
+        plt.text(0.15, 0.085, legend, size=11, transform=plt.gcf().transFigure)
+
+        legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}".format(max(map(int, quant_ab)), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), sum(np.array(data_array[:, 0]).astype(int)))
+        plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure)
+
+        count2 = np.bincount(map(int, quant_ba))  # original counts
+
+        legend = "BA\n{}\n{}\n{:.5f}" \
+            .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))
+        plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure)
+
+        plt.text(0.55, 0.2125, "total nr. of tags:", size=11, transform=plt.gcf().transFigure)
+        plt.text(0.8, 0.2125, "{:,} ({:,})".format(length_regions, length_regions / 2), size=11,
+                 transform=plt.gcf().transFigure)
+
+        legend4 = "* In the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n"
+        plt.text(0.1, 0.01, legend4, size=11, transform=plt.gcf().transFigure)
+
+        space = 0
+        for i, count in zip(group, quantAfterRegion):
+            plt.text(0.55, 0.15 - space, "{}:\n".format(i), size=11, transform=plt.gcf().transFigure)
+            plt.text(0.8, 0.15 - space, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure)
+            space = space + 0.02
+
+        plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
+        plt.xlabel("Family size", fontsize=14)
+        plt.ylabel("Absolute Frequency", fontsize=14)
+        plt.grid(b=True, which="major", color="#424242", linestyle=":")
+        plt.margins(0.01, None)
+
+        pdf.savefig(fig, bbox_inch="tight")
+        plt.close()
+
+        output_file.write("Dataset:{}{}\n".format(sep, name1))
+        output_file.write("{}AB{}BA\n".format(sep, sep))
+        output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba))))
+        output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))
+        output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))
+        output_file.write("total nr. of reads{}{}\n".format(sep, sum(np.array(data_array[:, 0]).astype(int))))
+        output_file.write("total nr. of tags{}{} ({})\n".format(sep, length_regions, length_regions / 2))
+        output_file.write("\n\nValues from family size distribution\n")
+        output_file.write("{}".format(sep))
+        for i in group:
+            output_file.write("{}{}".format(i, sep))
+        output_file.write("\n")
+
+        j = 0
+        for fs in counts[1][0:len(counts[1]) - 1]:
+            if fs == 21:
+                fs = ">20"
+            else:
+                fs = "={}".format(fs)
+            output_file.write("FS{}{}".format(fs, sep))
+            if len(group) == 1:
+                output_file.write("{}{}".format(int(counts[0][j]), sep))
+            else:
+                for n in range(len(group)):
+                    output_file.write("{}{}".format(int(counts[0][n][j]), sep))
+            output_file.write("\n")
+            j += 1
+        output_file.write("sum{}".format(sep))
+
+        if len(group) == 1:
+            output_file.write("{}{}".format(int(sum(counts[0])), sep))
+        else:
+            for i in counts[0]:
+                output_file.write("{}{}".format(int(sum(i)), sep))
+        output_file.write("\n")
+        output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n")
+        output_file.write("Region{}total nr. of tags per region\n".format(sep))
+        for i, count in zip(group, quantAfterRegion):
+            output_file.write("{}{}{}\n".format(i, sep, len(count) / 2))
+
+    print("Files successfully created!")
+
+
+if __name__ == '__main__':
+    sys.exit(compare_read_families_refGenome(sys.argv))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fsd_regions.xml	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tool id="fsd_regions" name="FSD regions:" version="1.0.0" profile="19.01">
+    <description>Family size distribution of user-specified regions in the reference genome</description>
+    <macros>
+        <import>fsd_macros.xml</import>
+    </macros>
+	<requirements>
+        <requirement type="package" version="2.7">python</requirement>
+        <requirement type="package" version="1.4.0">matplotlib</requirement>
+        <requirement type="package" version="0.15">pysam</requirement>
+    </requirements>
+    <command>
+python '$__tool_directory__/fsd_regions.py' 
+--inputFile '$file1'
+--inputName1 '$file1.element_identifier' 
+--bamFile '$file2'
+#if $file3:
+    --rangesFile '$file3'
+#end if
+--output_pdf '$output_pdf' 
+--output_tabular '$output_tabular'
+    </command>
+    <inputs>
+        <param name="file1" type="data" format="tabular" label="Input tags of whole dataset" optional="false" help="This dataset is generated by post-processing of the output from 'Make Families' or 'Correct Barcodes' tool by extracting the first two columns, sorting the tags (column 1) and adding the counts of unique occurrences of each tag. See Help section below for a detailed explanation."/>
+        <param name="file2" type="data" format="bam" label="BAM file of aligned reads." help="Input in BAM format with the reads that were aligned to the reference genome."/>
+        <param name="file3" type="data" format="bed" label="BED file with chromosome, start and stop positions of the targeted regions." optional="true" help="BED file with start and stop positions of regions in the reference genome."/>
+    </inputs>
+    <outputs>
+        <data name="output_pdf" format="pdf" label="${tool.name} on ${on_string}: PDF"/>
+        <data name="output_tabular" format="tabular" label="${tool.name} on ${on_string}: Summary"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="file1" value="fsd_reg.tab"/>
+            <param name="file2" value="fsd_reg.bam"/>
+            <param name="file3" value="fsd_reg_ranges.bed"/>
+            <output name="output_pdf" file="fsd_reg_output.pdf" lines_diff="136"/>
+            <output name="output_tabular" file="fsd_reg_output.tab" lines_diff="2"/>
+        </test>
+    </tests>
+    <help> <![CDATA[
+**What it does**
+
+This tool provides a computationally very fast insight into the distribution of the family sizes of ALL tags from a Duplex Sequencing (DS) experiment that were aligned to different regions targeted in the reference genome.
+
+**Input** 
+
+**Dataset 1:** This tool expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands::
+
+ 1 2                        3
+ -----------------------------
+ 1 AAAAAAAAAAAAAAAAATGGTATG ba
+ 3 AAAAAAAAAAAAAATGGTATGGAC ab
+ 
+.. class:: infomark
+
+**How to generate the input**
+
+The first step of the `Du Novo Analysis Pipeline <https://doi.org/10.1186/s13059-016-1039-4>`_ is the **Make Families** tool or the **Correct Barcodes** tool that produces output in this form::
+
+ 1                        2  3     4
+ ------------------------------------------------------
+ AAAAAAAAAAAAAAATAGCTCGAT ab read1 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAATAGCTCGAT ab read2 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAATAGCTCGAT ab read3 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAAAATGGTATG ba read3 CGCTACGTGACTAAAACATG
+
+We only need columns 1 and 2. These two columns can be extracted from this dataset using the **Cut** tool::
+
+ 1                        2 
+ ---------------------------
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAAAATGGTATG ba
+
+Next, the tags are sorted in ascending or descending order using the **Sort** tool::
+
+ 1                        2 
+ ---------------------------
+ AAAAAAAAAAAAAAAAATGGTATG ba
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+
+Finally, unique occurrences of each tag are counted. This is done using **Unique lines** tool that adds an additional column with the counts that also represent the family size (column 1)::
+
+ 1 2                        3
+ -----------------------------
+ 1 AAAAAAAAAAAAAAAAATGGTATG ba
+ 3 AAAAAAAAAAAAAATGGTATGGAC ab
+
+These data can now be used in this tool.
+
+**Dataset 2:** BAM file of the aligned reads. This file can be obtained by the tool `Map with BWA-MEM <https://arxiv.org/abs/1303.3997>`_.
+
+**Dataset 3 (optional):** BED file with start and stop positions of the regions. If it is not provided, then all aligned reads of the BAM file are used in the distribution of family sizes::
+
+ 1        2      3
+ ---------------------
+ ACH_TDII   90    633
+ ACH_TDII  659   1140
+ ACH_TDII 1144   1561
+
+**Output**
+
+The output is a PDF file with the plot and the summarized data of the plot. The tool creates a distribution of family sizes of tags that were aligned to the reference genome. Note that tags that overlap different regions of the reference genome are counted for each region. This tool is useful to examine differences in coverage among targeted regions. The plot includes both complementary SSCS-ab and SSCS-ba that form a DCS; whereas the table shows only single counts of the tags per region.
+
+    ]]> 
+    </help>
+    <expand macro="citation" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/td.py	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,1296 @@
+#!/usr/bin/env python
+
+# Tag distance analysis of SSCSs
+#
+# Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
+# Contact: monika.heinzl@edumail.at
+#
+# Takes at least one TABULAR file with tags before the alignment to the SSCS and
+# optionally a second TABULAR file as input. The program produces a plot which shows a histogram of Hamming distances
+# separated after family sizes, a family size distribution separated after Hamming distances for all (sample_size=0)
+# or a given sample of SSCSs or SSCSs, which form a DCS. In addition, the tool produces HD and FSD plots for the
+# difference between the HDs of both parts of the tags and for the chimeric reads and finally a CSV file with the
+# data of the plots. It is also possible to perform the HD analysis with shortened tags with given sizes as input.
+# The tool can run on a certain number of processors, which can be defined by the user.
+
+# USAGE: python td.py --inputFile filename --inputName1 filename --sample_size int /
+#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int
+#        --nr_above_bars True/False --output_tabular outptufile_name_tabular
+
+import argparse
+import itertools
+import operator
+import sys
+from collections import Counter, defaultdict
+from functools import partial
+from multiprocessing.pool import Pool
+
+import matplotlib.pyplot as plt
+import numpy
+from matplotlib.backends.backend_pdf import PdfPages
+
+plt.switch_backend('agg')
+
+
def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts,
                   subtitle, pdf, relative=False, diff=True, rel_freq=False):
    """Plot a stacked family-size distribution grouped by tag distance (TD)
    or by the TD difference between the two halves of the tags.

    familySizeList1 -- list of family-size arrays, one per TD/difference class
    maximumXFS, minimumXFS -- x-axis range of the family sizes
    originalCounts -- family sizes before clipping (used only for the legend)
    pdf -- open PdfPages object the figure is appended to
    relative -- label the difference classes with relative values (d=0.1, ...)
    diff -- plot TD-difference classes (7 groups) instead of TD classes (6)
    rel_freq -- plot relative instead of absolute frequencies
    """
    if diff is False:
        colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
        labels = ["TD=1", "TD=2", "TD=3", "TD=4", "TD=5-8", "TD>8"]
    else:
        colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"]
        if relative is True:
            labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"]
        else:
            labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"]

    fig = plt.figure(figsize=(6, 7))
    ax = fig.add_subplot(111)
    plt.subplots_adjust(bottom=0.1)
    p1 = numpy.bincount(numpy.concatenate(familySizeList1))
    maximumY = numpy.amax(p1)

    # degenerate case (min == max): fall back to a narrow bin range
    if len(range(minimumXFS, maximumXFS)) == 0:
        range1 = range(minimumXFS - 1, minimumXFS + 2)
    else:
        range1 = range(0, maximumXFS + 2)

    if rel_freq:
        # per-datapoint weights turn the histogram counts into relative frequencies
        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(familySizeList1)) for data in familySizeList1]
        plt.hist(familySizeList1, label=labels, weights=w, color=colors, stacked=True, rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
        plt.ylabel("Relative Frequency", fontsize=14)
        plt.ylim((0, 1.07))
    else:
        plt.hist(familySizeList1, label=labels, color=colors, stacked=True, rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
        if len(numpy.concatenate(familySizeList1)) != 0:
            plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1))
        plt.ylabel("Absolute Frequency", fontsize=14)
        plt.ylim((0, maximumY * 1.2))
    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel("Family size", fontsize=14)
    ticks = numpy.arange(0, maximumXFS + 1, 1)
    ticks1 = list(map(str, ticks))  # list() needed on Python 3, where map() is lazy
    if maximumXFS >= 20:
        ticks1[-1] = ">=20"  # family sizes >= 20 are clipped into the last bin
    plt.xticks(numpy.array(ticks), ticks1)
    # show only every 5th tick label to avoid clutter
    [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0]
    plt.xlim((0, maximumXFS + 1))
    legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: "
    plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)

    if max(originalCounts) >= 20:
        max_count = ">= 20"
    else:
        max_count = max(originalCounts)
    legend1 = "{}\n{}\n{:.5f}".format(max_count, p1[len(p1) - 1], float(p1[len(p1) - 1]) / sum(p1))
    plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)
    legend3 = "singletons\n{:,}\n{:.5f}".format(int(p1[1]), float(p1[1]) / sum(p1))
    plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
+
+
def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
    """Plot a histogram of tag distances stacked by family-size class.

    list1 -- TD arrays per family-size class (FS=1, 2, 3, 4, 5-10, >10)
    lenTags, len_sample, nr_unique_chimeras -- totals printed below the plot
    pdf -- open PdfPages object the figure is appended to
    relative -- x values are relative differences in 0.1 steps instead of ints
    nr_above_bars -- annotate each non-empty bar with its counts
    rel_freq -- plot relative instead of absolute frequencies
    """
    if relative is True:
        step = 0.1
    else:
        step = 1

    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)
    # .items() (not the Py2-only .iteritems()) works on both Python 2 and 3
    p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).items())])
    maximumY = numpy.amax(p1)
    if relative is True:  # relative difference
        bin1 = numpy.arange(-1, maximumX + 0.2, 0.1)
    else:
        bin1 = maximumX + 1

    if rel_freq:
        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, 1.07))
        plt.ylabel("Relative Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        # totals of the topmost stack; list() needed on Python 3, where map() is lazy
        counts = numpy.array(list(map(float, counts[0][5])))

    else:
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, maximumY * 1.2))
        plt.ylabel("Absolute Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        counts = numpy.array(list(map(int, counts[0][5])))

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - step, maximumX + step))
    plt.xticks(numpy.arange(0, maximumX + step, step))

    if nr_above_bars:
        bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
        for x_label, label in zip(counts, bin_centers):  # labels for values
            if x_label == 0:
                continue
            else:
                if rel_freq:
                    plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
                                                       float(x_label)),
                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
                                 xycoords="data", color="#000066", fontsize=10)
                else:
                    plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
                                 xycoords="data", color="#000066", fontsize=10)

    # chimeric families are counted twice when every CF contributed both mates
    if nr_unique_chimeras != 0:
        if (relative and ((counts[len(counts) - 1] / nr_unique_chimeras) == 2)) or \
                (sum(counts) / nr_unique_chimeras) == 2:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})"\
                .format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
        else:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
                lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
    else:
        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
            lenTags, len_sample, len(numpy.concatenate(list1)))

    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
+
+
def plotHDwithDCS(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
    """Plot a histogram of tag distances stacked by read type (DCS, SSCS ab, SSCS ba).

    list1 -- [TDs of DCS, TDs of SSCS ab, TDs of SSCS ba]
    lenTags, len_sample, nr_unique_chimeras -- totals printed below the plot
    pdf -- open PdfPages object the figure is appended to
    nr_above_bars -- annotate each non-empty bar with its counts
    rel_freq -- plot relative instead of absolute frequencies
    """
    step = 1
    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)
    # .items() (not the Py2-only .iteritems()) works on both Python 2 and 3
    p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).items())])
    maximumY = numpy.amax(p1)
    bin1 = maximumX + 1
    if rel_freq:
        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, 1.07))
        plt.ylabel("Relative Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        # totals of the topmost stack; list() needed on Python 3, where map() is lazy
        counts = numpy.array(list(map(float, counts[0][2])))

    else:
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, maximumY * 1.2))
        plt.ylabel("Absolute Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        counts = numpy.array(list(map(int, counts[0][2])))

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - step, maximumX + step))
    plt.xticks(numpy.arange(0, maximumX + step, step))

    if nr_above_bars:
        bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
        for x_label, label in zip(counts, bin_centers):  # labels for values
            if x_label == 0:
                continue
            else:
                if rel_freq:
                    plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
                                                       float(x_label)),
                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
                                 xycoords="data", color="#000066", fontsize=10)
                else:
                    plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
                                 xycoords="data", color="#000066", fontsize=10)

    if nr_unique_chimeras != 0:
        if (sum(counts) / nr_unique_chimeras) == 2:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})".\
                format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
        else:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
                lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
    else:
        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
            lenTags, len_sample, len(numpy.concatenate(list1)))
    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)

    # NOTE(review): divides by nr_unique_chimeras unconditionally and raises
    # ZeroDivisionError when it is 0 -- confirm callers always pass > 0 here
    legend2 = "SSCS ab = {:,} ({:.5f})\nSSCS ba = {:,} ({:.5f})\nDCS = {:,} ({:.5f})".format(
        len(list1[1]), len(list1[1]) / float(nr_unique_chimeras),
        len(list1[2]), len(list1[2]) / float(nr_unique_chimeras),
        len(list1[0]), len(list1[0]) / float(nr_unique_chimeras))
    plt.text(0.6, -0.047, legend2, size=12, transform=plt.gcf().transFigure)

    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
+
+
def plotHDwithinSeq(sum1, sum1min, sum2, sum2min, min_value, lenTags, pdf, len_sample, rel_freq=False):
    """Plot the distribution of tag distances within the halves of the tags.

    sum1, sum1min, sum2, sum2min -- min/max TDs of the two tag halves
        (assumed order per the legend labels: a.min, b.max, b.min, a.max --
        TODO confirm against callers)
    min_value -- combined TDs (a.min + b.max, a.max + b.min)
    lenTags, len_sample -- totals printed below the plot
    pdf -- open PdfPages object the figure is appended to
    rel_freq -- plot relative instead of absolute frequencies
    """
    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)

    ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)]  # new hd within tags
    maximumX = numpy.amax(numpy.concatenate(ham_partial))
    minimumX = numpy.amin(numpy.concatenate(ham_partial))

    # degenerate case (all TDs identical): a single bin value instead of a range
    if len(range(minimumX, maximumX)) == 0:
        range1 = minimumX
    else:
        range1 = range(minimumX, maximumX + 2)

    if rel_freq:
        # per-datapoint weights turn the histogram counts into relative frequencies
        w = [numpy.zeros_like(data) + 1. / len(data) for data in ham_partial]
        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, weights=w,
                 label=["TD a.min", "TD b.max", "TD b.min", "TD a.max", "TD a.min + b.max,\nTD a.max + b.min"],
                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
                 edgecolor='black', linewidth=1)
        plt.ylabel("Relative Frequency", fontsize=14)
        plt.ylim(0, 1.07)
    else:
        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False,
                 label=["TD a.min", "TD b.max", "TD b.min", "TD a.max", "TD a.min + b.max,\nTD a.max + b.min"],
                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
                 edgecolor='black', linewidth=1)
        plt.ylabel("Absolute Frequency", fontsize=14)

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.6, 1))
    plt.suptitle('Tag distances within tags', fontsize=14)
    plt.xlabel("TD", fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - 1, maximumX + 1))
    plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
    legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
        lenTags, len_sample, len(numpy.concatenate(ham_partial)))
    plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
+
+
def createTableFSD2(list1, diff=True):
    """Cross-tabulate family sizes (rows) against TD/difference classes (columns).

    list1 -- list of family-size arrays, one per class (6 classes without the
             diff=0 column, 7 with it); class i fills column i
    diff -- True adds the extra diff=0 column (7 columns instead of 6)

    Returns (final, sumCol): the table with an "FS=x" label column and a
    trailing row-sum column, and the per-class column sums.

    Replaces the former 7-way copy-pasted state machine with one loop; the
    column sums are now also defined when every class is empty.
    """
    selfAB = numpy.concatenate(list1)
    uniqueFS = numpy.unique(selfAB)
    n_cols = 6 if diff is False else 7
    count = numpy.zeros((len(uniqueFS), n_cols))
    # column i holds the frequencies of class list1[i]; empty classes stay zero
    for col, class_data in enumerate(list1[:n_cols]):
        for fs_value, freq in Counter(class_data).items():
            count[numpy.where(uniqueFS == fs_value)[0], col] = freq
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    uniqueFS = uniqueFS.astype(str)
    if uniqueFS[len(uniqueFS) - 1] == "20":
        uniqueFS[len(uniqueFS) - 1] = ">20"  # sizes >= 20 were clipped into one bin
    first = ["FS={}".format(i) for i in uniqueFS]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
+
+
def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True):
    """Append the FSD summary table produced by createTableFSD2 to output_file.

    summary -- string table (label column, counts, row sums)
    sumCol -- per-class column sums; overallSum -- grand total
    sep -- column separator; rel/diff -- select the matching header labels
    """
    output_file.write(name)
    output_file.write("\n")
    if diff is False:
        header = ["TD=1", "TD=2", "TD=3", "TD=4", "TD=5-8", "TD>8"]
    elif rel is False:
        header = ["diff=0", "diff=1", "diff=2", "diff=3", "diff=4", "diff=5-8", "diff>8"]
    else:
        header = ["diff=0", "diff=0.1", "diff=0.2", "diff=0.3", "diff=0.4", "diff=0.5-0.8", "diff>0.8"]
    output_file.write("{}{}{}\n".format(sep, sep.join(header + ["sum"]), sep))
    for row in summary:
        pieces = []
        for cell in row:
            # numeric cells are stored as float strings -> print them as ints
            if "FS" not in cell and "diff" not in cell:
                cell = cell.astype(float).astype(int)
            pieces.append("{}{}".format(cell, sep))
        output_file.write("".join(pieces))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for total in map(int, sumCol):
        output_file.write("{}{}".format(total, sep))
    output_file.write("{}{}\n\n".format(overallSum.astype(int), sep))
+
+
def createTableHD(list1, row_label):
    """Cross-tabulate tag distances (rows) against family-size classes (columns).

    list1 -- up to 6 arrays of TDs (classes FS=1, 2, 3, 4, 5-10, >10);
             class i fills column i, empty classes stay zero
    row_label -- prefix for the row labels (e.g. "TD=")

    Returns (final, sumCol): the table with a label column and a trailing
    row-sum column, and the per-class column sums.

    Replaces the former 6-way copy-pasted state machine with one loop; the
    outputs are now also defined when every class is empty.
    """
    selfAB = numpy.concatenate(list1)
    uniqueHD = numpy.unique(selfAB)
    count = numpy.zeros((len(uniqueHD), 6))
    for col, class_data in enumerate(list1[:6]):
        for hd_value, freq in Counter(class_data).items():
            count[numpy.where(uniqueHD == hd_value)[0], col] = freq
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["{}{}".format(row_label, i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
+
+
def createTableHDwithTags(list1):
    """Cross-tabulate tag distances (rows) against up to 5 tag classes (columns).

    list1 -- up to 5 arrays of TDs; class i fills column i, empty classes stay zero

    Returns (final, sumCol): the table with a "TD=x" label column and a
    trailing row-sum column, and the per-class column sums.

    Replaces the former 5-way copy-pasted state machine with one loop; the
    outputs are now also defined when every class is empty.
    """
    selfAB = numpy.concatenate(list1)
    uniqueHD = numpy.unique(selfAB)
    count = numpy.zeros((len(uniqueHD), 5))
    for col, class_data in enumerate(list1[:5]):
        for hd_value, freq in Counter(class_data).items():
            count[numpy.where(uniqueHD == hd_value)[0], col] = freq
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["TD={}".format(i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
+
+
def createTableHDwithDCS(list1):
    """Cross-tabulate tag distances (rows) against read-type classes (columns).

    list1 -- one TD array per class (DCS, SSCS ab, SSCS ba); class i fills
             column i, empty classes stay zero

    Returns (final, sumCol): the table with a "TD=x" label column and a
    trailing row-sum column, and the per-class column sums.

    Replaces the former copy-pasted state machine with one loop. The count
    matrix was already allocated with len(list1) columns, but the old code
    only ever filled the first three; now every provided class is filled.
    """
    selfAB = numpy.concatenate(list1)
    uniqueHD = numpy.unique(selfAB)
    count = numpy.zeros((len(uniqueHD), len(list1)))
    for col, class_data in enumerate(list1):
        for hd_value, freq in Counter(class_data).items():
            count[numpy.where(uniqueHD == hd_value)[0], col] = freq
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["TD={}".format(i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
+
+
def createFileHD(summary, sumCol, overallSum, output_file, name, sep):
    """Append a TD-by-family-size summary table to output_file.

    summary -- string table (label column, counts, row sums)
    sumCol -- per-class column sums; overallSum -- grand total
    sep -- column separator
    """
    output_file.write(name)
    output_file.write("\n")
    header = ["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10", "sum"]
    output_file.write("{}{}{}\n".format(sep, sep.join(header), sep))
    for row in summary:
        pieces = []
        for cell in row:
            # numeric cells are stored as float strings -> print them as ints
            if "TD" not in cell and "diff" not in cell:
                cell = cell.astype(float).astype(int)
            pieces.append("{}{}".format(cell, sep))
        output_file.write("".join(pieces))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for total in map(int, sumCol):
        output_file.write("{}{}".format(total, sep))
    output_file.write("{}{}\n\n".format(overallSum.astype(int), sep))
+
+
def createFileHDwithDCS(summary, sumCol, overallSum, output_file, name, sep):
    """Append a TD-by-read-type (DCS / SSCS ab / SSCS ba) summary table to output_file.

    summary -- string table (label column, counts, row sums)
    sumCol -- per-class column sums; overallSum -- grand total
    sep -- column separator
    """
    output_file.write(name)
    output_file.write("\n")
    header = ["DCS", "SSCS ab", "SSCS ba", "sum"]
    output_file.write("{}{}{}\n".format(sep, sep.join(header), sep))
    for row in summary:
        pieces = []
        for cell in row:
            # numeric cells are stored as float strings -> print them as ints
            if "TD" not in cell:
                cell = cell.astype(float).astype(int)
            pieces.append("{}{}".format(cell, sep))
        output_file.write("".join(pieces))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for total in map(int, sumCol):
        output_file.write("{}{}".format(total, sep))
    output_file.write("{}{}\n\n".format(overallSum.astype(int), sep))
+
+
def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep):
    """Append the within-tag TD summary table to output_file.

    summary -- string table (label column, counts, row sums)
    sumCol -- per-class column sums; overallSum -- grand total
    sep -- column separator
    """
    output_file.write(name)
    output_file.write("\n")
    header = ["TD a.min", "TD b.max", "TD b.min", "TD a.max",
              "TD a.min + b.max, TD a.max + b.min", "sum"]
    output_file.write("{}{}{}\n".format(sep, sep.join(header), sep))
    for row in summary:
        pieces = []
        for cell in row:
            # numeric cells are stored as float strings -> print them as ints
            if "TD" not in cell:
                cell = cell.astype(float).astype(int)
            pieces.append("{}{}".format(cell, sep))
        output_file.write("".join(pieces))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for total in map(int, sumCol):
        output_file.write("{}{}".format(total, sep))
    output_file.write("{}{}\n\n".format(overallSum.astype(int), sep))
+
+
def hamming(array1, array2):
    """For each tag in array1, the minimum Hamming distance (> 0) to any tag in array2.

    Tags are compared character-wise; a distance of exactly 0 (the identical
    tag) is excluded from the minimum. Returns a float array of len(array1).
    """
    res = 99 * numpy.ones(len(array1))
    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
    for idx, tag in enumerate(array1):
        # map() instead of the removed itertools.imap keeps this Python-2/3 compatible
        dist = numpy.array([sum(map(operator.ne, tag, other)) for other in array2])
        res[idx] = numpy.amin(dist[dist > 0])  # pick min distance greater than zero
    return res
+
+
def hamming_difference(array1, array2, mate_b):
    """Tag-distance analysis between the two halves (a and b) of each tag.

    For every tag in array1, the half selected by mate_b is compared against
    the same half of all other tags in array2; among the partners with the
    minimum TD in that half, the maximum TD of the complementary half is taken.

    array1, array2 -- tag sequences (both halves concatenated)
    mate_b -- False: analyse half a first; True: analyse half b first

    Returns a list of 11 per-tag result lists:
    [0] abs(min - max) TD, [1]/[2] min TD (half a / half b), [3] min + max TD,
    [4] tags, [5] relative difference, [6] diffs of tags with an identical half
    (None otherwise), [7] those tags, [8]/[9] max TD (half a / half b),
    [10] partner tags with max TD.

    Uses // and map() instead of the Python-2-only / slicing and
    itertools.imap, so the function runs on Python 2 and 3.
    """
    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
    # split each tag into its two halves
    array1_half = numpy.array([i[0:len(i) // 2] for i in array1])  # mate1 part1
    array1_half2 = numpy.array([i[len(i) // 2:len(i)] for i in array1])  # mate1 part 2
    array2_half = numpy.array([i[0:len(i) // 2] for i in array2])  # mate2 part1
    array2_half2 = numpy.array([i[len(i) // 2:len(i)] for i in array2])  # mate2 part2

    diff11 = []
    relativeDiffList = []
    ham1 = []
    ham2 = []
    ham1min = []
    ham2min = []
    min_valueList = []
    min_tagsList = []
    diff11_zeros = []
    min_tagsList_zeros = []
    max_tag_list = []
    if mate_b is False:  # HD calculation for all a's
        half1_mate1 = array1_half
        half2_mate1 = array1_half2
        half1_mate2 = array2_half
        half2_mate2 = array2_half2
    else:  # HD calculation for all b's
        half1_mate1 = array1_half2
        half2_mate1 = array1_half
        half1_mate2 = array2_half2
        half2_mate2 = array2_half

    for a, b, tag in zip(half1_mate1, half2_mate1, array1):
        # exclude identical tag from array2, to prevent comparison to itself
        sameTag = numpy.where(array2 == tag)[0]
        indexArray2 = numpy.arange(0, len(array2), 1)
        index_withoutSame = numpy.delete(indexArray2, sameTag)  # delete identical tag from the data

        # all tags without the identical tag
        array2_half_withoutSame = half1_mate2[index_withoutSame]
        array2_half2_withoutSame = half2_mate2[index_withoutSame]
        array2_withoutSame = array2[index_withoutSame]  # whole tag (=not split into 2 halves)
        # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
        dist = numpy.array([sum(map(operator.ne, a, c)) for c in
                            array2_half_withoutSame])
        min_index = numpy.where(dist == dist.min())[0]  # get index of min HD
        min_value = dist.min()
        # get all "b's" of the tag or all "a's" of the tag with minimum HD
        min_tag_half2 = array2_half2_withoutSame[min_index]
        min_tag_array2 = array2_withoutSame[min_index]  # get whole tag with min HD

        dist_second_half = numpy.array([sum(map(operator.ne, b, e)) for e in
                                        min_tag_half2])  # calculate HD of "b" to all "b's" or "a" to all "a's"
        max_value = dist_second_half.max()
        max_index = numpy.where(dist_second_half == dist_second_half.max())[0]  # get index of max HD
        max_tag = min_tag_array2[max_index]

        if mate_b is True:  # half2, store the HDs under the correct half
            ham2.append(min_value)
            ham2min.append(max_value)
        else:  # half1
            ham1.append(min_value)
            ham1min.append(max_value)

        min_valueList.append(min_value + max_value)
        min_tagsList.append(tag)
        difference1 = abs(min_value - max_value)
        diff11.append(difference1)
        rel_difference = round(float(difference1) / (min_value + max_value), 1)
        relativeDiffList.append(rel_difference)

        # tags which have identical parts:
        if min_value == 0 or max_value == 0:
            min_tagsList_zeros.append(numpy.array(tag))
            difference1_zeros = abs(min_value - max_value)  # TD of the non-identical part
            diff11_zeros.append(difference1_zeros)
            max_tag_list.append(numpy.array(max_tag))
        else:
            min_tagsList_zeros.append(None)
            diff11_zeros.append(None)
            max_tag_list.append(None)
    return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros,
             min_tagsList_zeros, ham1min, ham2min, max_tag_list])
+
+
def readFileReferenceFree(file):
    """Read a tabular tag file (family size, tag sequence, ab/ba direction).

    Returns (integers, data_array): column 0 converted to ints (family sizes)
    and the full string array. Assumes at least two data rows so genfromtxt
    yields a 2-D array -- TODO confirm against callers.
    """
    with open(file, 'r') as dest_f:
        # dtype=str instead of the removed numpy alias 'string': identical on
        # Python 2 (str is bytes there) and works on Python 3 / modern numpy
        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype=str)
        integers = numpy.array(data_array[:, 0]).astype(int)
        return (integers, data_array)
+
+
def hammingDistanceWithFS(fs, ham):
    """Split tag distances into groups by family size (1, 2, 3, 4, 5-10, >10).

    fs -- family size per tag; ham -- tag distance per tag (parallel sequences)
    Returns (groups, max TD, min TD).
    """
    fs = numpy.asarray(fs)
    maximum = max(ham)
    minimum = min(ham)
    ham = numpy.asarray(ham)

    # boolean masks defining the six family-size classes
    masks = [fs == 1,
             fs == 2,
             fs == 3,
             fs == 4,
             (fs >= 5) & (fs <= 10),
             fs > 10]
    list1 = [ham[numpy.where(mask)[0]] for mask in masks]
    return (list1, maximum, minimum)
+
+
def familySizeDistributionWithHD(fs, ham, diff=False, rel=True):
    """Group family sizes by their tag-distance (or difference) classes.

    fs -- family size per tag; ham -- TD or TD difference per tag
    diff -- prepend a class for distance 0 (7 groups instead of 6)
    rel -- class boundaries are relative values (0.1 steps) instead of ints
    Returns (groups, unique distances, max FS, min FS); family sizes above 19
    are clipped to 20 before grouping.
    """
    hammingDistances = numpy.unique(ham)
    fs = numpy.asarray(fs)
    ham = numpy.asarray(ham)
    bigFamilies = numpy.where(fs > 19)[0]
    if len(bigFamilies) != 0:
        fs[bigFamilies] = 20  # clip large families into one bin
    maximum = max(fs)
    minimum = min(fs)

    if rel is True:
        singles = [0.1, 0.2, 0.3, 0.4]
        low, high = 0.5, 0.8
    else:
        singles = [1, 2, 3, 4]
        low, high = 5, 8

    groups = []
    if diff is True:
        groups.append(fs[numpy.where(ham == 0)[0]])
    for value in singles:
        groups.append(fs[numpy.where(ham == value)[0]])
    groups.append(fs[numpy.where((ham >= low) & (ham <= high))[0]])
    groups.append(fs[numpy.where(ham > high)[0]])

    return (groups, hammingDistances, maximum, minimum)
+
+
+def hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array):
+    # Split the chimeric-tag distances `diff_zeros` into three groups depending
+    # on whether the tag forms a DCS (sequence occurs twice in data_array, once
+    # per strand) or is a single SSCS in the "ab" or "ba" direction.
+    # Returns ([DCS distances, ab-SSCS distances, ba-SSCS distances], max, min).
+    diff_zeros = numpy.array(diff_zeros)
+    maximum = numpy.amax(diff_zeros)
+    minimum = numpy.amin(diff_zeros)
+    minHD_tags_zeros = numpy.array(minHD_tags_zeros)
+
+    # rows of data_array whose tag sequence is one of the chimeric tags
+    idx = numpy.concatenate([numpy.where(data_array[:, 1] == i)[0] for i in minHD_tags_zeros])
+    subset_data = data_array[idx, :]
+
+    seq = numpy.array(subset_data[:, 1])
+
+    # find all unique tags and get the indices for ALL tags, but only once
+    u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
+    DCS_tags = u[c == 2]
+    rest_tags = u[c == 1]
+
+    # label duplicated sequences "DCS"; single sequences keep their ab/ba label
+    dcs = numpy.repeat("DCS", len(DCS_tags))
+    idx_sscs = numpy.concatenate([numpy.where(subset_data[:, 1] == i)[0] for i in rest_tags])
+    sscs = subset_data[idx_sscs, 2]
+
+    all_tags = numpy.column_stack((numpy.concatenate((DCS_tags, subset_data[idx_sscs, 1])),
+                                   numpy.concatenate((dcs, sscs))))
+    hd_DCS = []
+    ab_SSCS = []
+    ba_SSCS = []
+
+    # route each tag's distance(s) into the group matching its label
+    for i in range(len(all_tags)):
+        tag = all_tags[i, :]
+        hd = diff_zeros[numpy.where(minHD_tags_zeros == tag[0])[0]]
+
+        if tag[1] == "DCS":
+            hd_DCS.append(hd)
+        elif tag[1] == "ab":
+            ab_SSCS.append(hd)
+        elif tag[1] == "ba":
+            ba_SSCS.append(hd)
+
+    # flatten the per-tag arrays; empty groups stay as plain empty lists
+    if len(hd_DCS) != 0:
+        hd_DCS = numpy.concatenate(hd_DCS)
+    if len(ab_SSCS) != 0:
+        ab_SSCS = numpy.concatenate(ab_SSCS)
+    if len(ba_SSCS) != 0:
+        ba_SSCS = numpy.concatenate(ba_SSCS)
+    list1 = [hd_DCS, ab_SSCS, ba_SSCS]  # list for plotting
+    return (list1, maximum, minimum)
+
+
+def make_argparser():
+    # Build the command-line interface for the tag-distance analysis.
+    # NOTE(review): --only_DCS and --rel_freq use action="store_false", so both
+    # options default to True and PASSING the flag turns the feature OFF; the
+    # help strings read as if the flag enabled it -- confirm against td.xml,
+    # which emits these flags from inverted boolean params.
+    parser = argparse.ArgumentParser(description='Tag distance analysis of duplex sequencing data')
+    parser.add_argument('--inputFile',
+                        help='Tabular File with three columns: ab or ba, tag and family size.')
+    parser.add_argument('--inputName1')
+    parser.add_argument('--sample_size', default=1000, type=int,
+                        help='Sample size of Tag distance analysis.')
+    parser.add_argument('--subset_tag', default=0, type=int,
+                        help='The tag is shortened to the given number.')
+    parser.add_argument('--nproc', default=4, type=int,
+                        help='The tool runs with the given number of processors.')
+    parser.add_argument('--only_DCS', action="store_false",
+                        help='Only tags of the DCSs are included in the HD analysis')
+
+    parser.add_argument('--minFS', default=1, type=int,
+                        help='Only tags, which have a family size greater or equal than specified, '
+                             'are included in the HD analysis')
+    parser.add_argument('--maxFS', default=0, type=int,
+                        help='Only tags, which have a family size smaller or equal than specified, '
+                             'are included in the HD analysis')
+    parser.add_argument('--nr_above_bars', action="store_true",
+                        help='If False, values above bars in the histograms are removed')
+    parser.add_argument('--rel_freq', action="store_false",
+                        help='If True, the relative frequencies are displayed.')
+
+    parser.add_argument('--output_tabular', default="data.tabular", type=str,
+                        help='Name of the tabular file.')
+    parser.add_argument('--output_pdf', default="data.pdf", type=str,
+                        help='Name of the pdf file.')
+    parser.add_argument('--output_chimeras_tabular', default="data.tabular", type=str,
+                        help='Name of the tabular file with all chimeric tags.')
+
+    return parser
+
+
+def Hamming_Distance_Analysis(argv):
+    # Main driver: parse CLI args, filter the tag table, sample tags, compute
+    # Hamming/tag distances (whole tag and per half) in parallel, detect
+    # chimeric families (one identical half, TD=0), and write the chimera
+    # table, the summary tabular file and the PDF plots.
+    # NOTE: this module is Python 2 -- several slice bounds below rely on
+    # integer '/' division (e.g. len(tag) / 2).
+    parser = make_argparser()
+    args = parser.parse_args(argv[1:])
+    file1 = args.inputFile
+    name1 = args.inputName1
+    index_size = args.sample_size
+    title_savedFile_pdf = args.output_pdf
+    title_savedFile_csv = args.output_tabular
+    output_chimeras_tabular = args.output_chimeras_tabular
+    onlyDuplicates = args.only_DCS
+    rel_freq = args.rel_freq
+    minFS = args.minFS
+    maxFS = args.maxFS
+    nr_above_bars = args.nr_above_bars
+    subset = args.subset_tag
+    nproc = args.nproc
+    sep = "\t"
+
+    # input checks
+    if index_size < 0:
+        print("index_size is a negative integer.")
+        exit(2)
+    if nproc <= 0:
+        print("nproc is smaller or equal zero")
+        exit(3)
+    if subset < 0:
+        # NOTE(review): the condition rejects only negative values, but the
+        # message claims "smaller or equal zero" -- message is misleading.
+        print("subset_tag is smaller or equal zero.")
+        exit(5)
+
+    # PLOT
+    plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
+    plt.rcParams['xtick.labelsize'] = 14
+    plt.rcParams['ytick.labelsize'] = 14
+    plt.rcParams['patch.edgecolor'] = "#000000"
+    plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
+    name1 = name1.split(".tabular")[0]
+
+    with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf:
+        print("dataset: ", name1)
+        integers, data_array = readFileReferenceFree(file1)
+        data_array = numpy.array(data_array)
+        print("total nr of tags:", len(data_array))
+
+        # filter tags out which contain any other character than ATCG
+        valid_bases = ["A", "T", "G", "C"]
+        tagsToDelete = []
+        for idx, t in enumerate(data_array[:, 1]):
+            for char in t:
+                if char not in valid_bases:
+                    tagsToDelete.append(idx)
+                    break
+
+        if len(tagsToDelete) != 0:  # delete tags with N in the tag from data
+            print("nr of tags with any other character than A, T, C, G:", len(tagsToDelete),
+                  float(len(tagsToDelete)) / len(data_array))
+            index_whole_array = numpy.arange(0, len(data_array), 1)
+            index_withoutN_inTag = numpy.delete(index_whole_array, tagsToDelete)
+            data_array = data_array[index_withoutN_inTag, :]
+            integers = integers[index_withoutN_inTag]
+            print("total nr of filtered tags:", len(data_array))
+
+        # apply minimum family-size filter to the rows and, with the same
+        # threshold, to the parallel family-size vector (int_f == integers here)
+        int_f = numpy.array(data_array[:, 0]).astype(int)
+        data_array = data_array[numpy.where(int_f >= minFS)]
+        integers = integers[integers >= minFS]
+
+        # select family size for tags
+        if maxFS > 0:
+            int_f2 = numpy.array(data_array[:, 0]).astype(int)
+            data_array = data_array[numpy.where(int_f2 <= maxFS)]
+            integers = integers[integers <= maxFS]
+
+        if onlyDuplicates is True:
+            # keep only tags whose sequence appears exactly twice (ab + ba = DCS);
+            # assumes the ab/ba rows of one DCS are adjacent (sorted input) so the
+            # 0::2 / 1::2 strides pair them up -- TODO confirm input ordering
+            tags = data_array[:, 2]
+            seq = data_array[:, 1]
+
+            # find all unique tags and get the indices for ALL tags, but only once
+            u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
+            d = u[c == 2]
+
+            # get family sizes, tag for duplicates
+            duplTags_double = integers[numpy.in1d(seq, d)]
+            duplTags = duplTags_double[0::2]  # ab of DCS
+            duplTagsBA = duplTags_double[1::2]  # ba of DCS
+
+            duplTags_tag = tags[numpy.in1d(seq, d)][0::2]  # ab
+            duplTags_seq = seq[numpy.in1d(seq, d)][0::2]  # ab - tags
+
+            if minFS > 1:
+                duplTags_tag = duplTags_tag[(duplTags >= minFS) & (duplTagsBA >= minFS)]
+                duplTags_seq = duplTags_seq[(duplTags >= minFS) & (duplTagsBA >= minFS)]
+                duplTags = duplTags[(duplTags >= minFS) & (duplTagsBA >= minFS)]  # ab+ba with FS>=3
+
+            data_array = numpy.column_stack((duplTags, duplTags_seq))
+            data_array = numpy.column_stack((data_array, duplTags_tag))
+            integers = numpy.array(data_array[:, 0]).astype(int)
+            print("DCS in whole dataset", len(data_array))
+
+        print("min FS", min(integers))
+        print("max FS", max(integers))
+
+        # HD analysis for a subset of the tag
+        if subset > 0:
+            # split each tag into halves (integer division, Python 2) and trim
+            # the flanking bases so only `subset` central bases per half remain
+            tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]])
+            tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]])
+
+            flanking_region_float = float((len(tag1[0]) - subset)) / 2
+            flanking_region = int(flanking_region_float)
+            if flanking_region_float % 2 == 0:
+                tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1])
+                tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2])
+            else:
+                # odd trim: remove one base more on the right-hand side
+                flanking_region_rounded = int(round(flanking_region, 1))
+                flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded
+                tag1_shorten = numpy.array(
+                    [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1])
+                tag2_shorten = numpy.array(
+                    [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2])
+
+            data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
+            data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))
+
+        print("length of tag= ", len(data_array[0, 1]))
+        # select sample: if no size given --> all vs. all comparison
+        if index_size == 0:
+            result = numpy.arange(0, len(data_array), 1)
+        else:
+            # shuffle reorders data_array IN PLACE before drawing unique tags
+            numpy.random.shuffle(data_array)
+            unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True)  # get only unique tags
+            result = numpy.random.choice(unique_indices, size=index_size,
+                                         replace=False)  # array of random sequences of size=index.size
+
+        # comparison random tags to whole dataset
+        result1 = data_array[result, 1]  # random tags
+        result2 = data_array[:, 1]  # all tags
+        print("sample size= ", len(result1))
+
+        # HD analysis of whole tag: sample vs. whole dataset, split over workers
+        proc_pool = Pool(nproc)
+        chunks_sample = numpy.array_split(result1, nproc)
+        ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample)
+        proc_pool.close()
+        proc_pool.join()
+        ham = numpy.concatenate(ham).astype(int)
+        # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1:
+        # for h, tag in zip(ham, result1):
+        #     output_file1.write("{}\t{}\n".format(tag, h))
+
+        # per-half analysis, run once starting from each mate (a-first, b-first)
+        proc_pool_b = Pool(nproc)
+        diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
+        diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
+        proc_pool_b.close()
+        proc_pool_b.join()
+        # indices below address the tuple returned by hamming_difference:
+        # 0=abs diff, 1/2=half HDs, 3=min HD, 4=min-HD tags, 5=rel diff,
+        # 6/7=zero-half diff/tags, 8/9=min half HDs, 10=chimera tags
+        HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]),
+                                     numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int)
+        HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]),
+                                     numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int)
+        minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]),
+                                    numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int)
+        HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]),
+                                        numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
+        HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]),
+                                        numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int)
+
+        rel_Diff1 = numpy.concatenate([item[5] for item in diff_list_a])
+        rel_Diff2 = numpy.concatenate([item[5] for item in diff_list_b])
+        diff1 = numpy.concatenate([item[0] for item in diff_list_a])
+        diff2 = numpy.concatenate([item[0] for item in diff_list_b])
+
+        diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a])
+        diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b])
+        minHD_tags = numpy.concatenate([item[4] for item in diff_list_a])
+        minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a])
+        minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b])
+
+        chimera_tags1 = sum([item[10] for item in diff_list_a], [])
+        chimera_tags2 = sum([item[10] for item in diff_list_b], [])
+
+        # per tag: keep the direction (a-first or b-first) with the larger
+        # relative difference; collect chimera candidates where a half had TD=0
+        rel_Diff = []
+        diff_zeros = []
+        minHD_tags_zeros = []
+        diff = []
+        chimera_tags = []
+        for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \
+                zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2,
+                    chimera_tags1, chimera_tags2):
+            relatives = numpy.array([rel1, rel2])
+            absolutes = numpy.array([d1, d2])
+            max_idx = numpy.argmax(relatives)
+            rel_Diff.append(relatives[max_idx])
+            diff.append(absolutes[max_idx])
+
+            if all(i is not None for i in [zeros1, zeros2]):
+                diff_zeros.append(max(zeros1, zeros2))
+                minHD_tags_zeros.append(str(tag1))
+                tags = [ctag1, ctag2]
+                chimera_tags.append(tags)
+            elif zeros1 is not None and zeros2 is None:
+                diff_zeros.append(zeros1)
+                minHD_tags_zeros.append(str(tag1))
+                chimera_tags.append(ctag1)
+            elif zeros1 is None and zeros2 is not None:
+                diff_zeros.append(zeros2)
+                minHD_tags_zeros.append(str(tag2))
+                chimera_tags.append(ctag2)
+
+        chimera_tags_new = chimera_tags
+        data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new))
+
+        checked_tags = []
+        stat_maxTags = []
+
+        # write the chimera table: each chimeric tag with its family size(s) and
+        # the partner tags that share one identical half (marked with *...*)
+        with open(output_chimeras_tabular, "w") as output_file1:
+            output_file1.write("chimera tag\tfamily size, read direction\tsimilar tag with TD=0\n")
+            for i in range(len(data_chimeraAnalysis)):
+                tag1 = data_chimeraAnalysis[i, 0]
+
+                info_tag1 = data_array[data_array[:, 1] == tag1, :]
+                fs_tag1 = ["{} {}".format(t[0], t[2]) for t in info_tag1]
+
+                if tag1 in checked_tags:  # skip tag if already written to file
+                    continue
+
+                sample_half_a = tag1[0:(len(tag1)) / 2]
+                sample_half_b = tag1[len(tag1) / 2:len(tag1)]
+
+                max_tags = data_chimeraAnalysis[i, 1]
+                if len(max_tags) > 1 and len(max_tags) != len(data_array[0, 1]) and type(max_tags) is not numpy.ndarray:
+                    max_tags = numpy.concatenate(max_tags)
+                max_tags = numpy.unique(max_tags)
+                stat_maxTags.append(len(max_tags))
+
+                info_maxTags = [data_array[data_array[:, 1] == t, :] for t in max_tags]
+
+                chimera_half_a = numpy.array([t[0:(len(t)) / 2] for t in max_tags])  # mate1 part1
+                chimera_half_b = numpy.array([t[len(t) / 2:len(t)] for t in max_tags])  # mate1 part 2
+
+                new_format = []
+                for j in range(len(max_tags)):
+                    fs_maxTags = ["{} {}".format(t[0], t[2]) for t in info_maxTags[j]]
+
+                    if sample_half_a == chimera_half_a[j]:
+                        max_tag = "*{}* {} {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
+                        new_format.append(max_tag)
+
+                    elif sample_half_b == chimera_half_b[j]:
+                        max_tag = "{} *{}* {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
+                        new_format.append(max_tag)
+                    checked_tags.append(max_tags[j])
+
+                sample_tag = "{} {}\t{}".format(sample_half_a, sample_half_b, ", ".join(fs_tag1))
+                output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
+                checked_tags.append(tag1)
+
+            output_file1.write(
+                "This file contains all tags that were identified as chimeras as the first column and the "
+                "corresponding tags which returned a Hamming distance of zero in either the first or the second "
+                "half of the sample tag as the second column.\n"
+                "The tags were separated by an empty space into their halves and the * marks the identical half.")
+            output_file1.write("\n\nStatistics of nr. of tags that returned max. TD (2nd column)\n")
+            output_file1.write("minimum\t{}\ttag(s)\n".format(numpy.amin(numpy.array(stat_maxTags))))
+            output_file1.write("mean\t{}\ttag(s)\n".format(numpy.mean(numpy.array(stat_maxTags))))
+            output_file1.write("median\t{}\ttag(s)\n".format(numpy.median(numpy.array(stat_maxTags))))
+            output_file1.write("maximum\t{}\ttag(s)\n".format(numpy.amax(numpy.array(stat_maxTags))))
+            output_file1.write("sum\t{}\ttag(s)\n".format(numpy.sum(numpy.array(stat_maxTags))))
+
+        lenTags = len(data_array)
+        len_sample = len(result1)
+
+        quant = numpy.array(data_array[result, 0]).astype(int)  # family size for sample of tags
+        seq = numpy.array(data_array[result, 1])  # tags of sample
+        ham = numpy.asarray(ham)  # HD for sample of tags
+
+        if onlyDuplicates is True:  # ab and ba strands of DCSs
+            # NOTE(review): `result` indexes the shuffled data_array, while
+            # duplTagsBA was built from the pre-shuffle ordering -- looks like
+            # the ba family sizes may be paired with the wrong tags; verify.
+            quant = numpy.concatenate((quant, duplTagsBA[result]))
+            seq = numpy.tile(seq, 2)
+            ham = numpy.tile(ham, 2)
+            diff = numpy.tile(diff, 2)
+            rel_Diff = numpy.tile(rel_Diff, 2)
+            diff_zeros = numpy.tile(diff_zeros, 2)
+
+        nr_chimeric_tags = len(data_chimeraAnalysis)
+        print("nr of chimeras", nr_chimeric_tags)
+
+        # prepare data for different kinds of plots
+        # distribution of FSs separated after HD
+        familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False)
+        list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham)  # histogram of HDs separated after FS
+
+        # get FS for all tags with min HD of analysis of chimeric reads
+        # there are more tags than sample size in the plot, because one tag can have multiple minimas
+        if onlyDuplicates:
+            seqDic = defaultdict(list)
+            for s, q in zip(seq, quant):
+                seqDic[s].append(q)
+        else:
+            seqDic = dict(zip(seq, quant))
+
+        lst_minHD_tags = []
+        for i in minHD_tags:
+            lst_minHD_tags.append(seqDic.get(i))
+
+        if onlyDuplicates:
+            lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags],
+                                                [item_b[1] for item_b in lst_minHD_tags])).astype(int)
+        # histogram with absolute and relative difference between HDs of both parts of the tag
+        listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
+        listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, rel_Diff)
+        # chimeric read analysis: tags which have TD=0 in one of the halfs
+        if len(minHD_tags_zeros) != 0:
+            lst_minHD_tags_zeros = []
+            for i in minHD_tags_zeros:
+                lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads
+            if onlyDuplicates:
+                lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros],
+                                                          [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)
+
+            # histogram with HD of non-identical half
+            listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
+                lst_minHD_tags_zeros, diff_zeros)
+
+            if onlyDuplicates is False:
+                listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros = hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array)
+
+        # plot Hamming Distance with Family size distribution
+        plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, rel_freq=rel_freq,
+                      subtitle="Tag distance separated by family size", lenTags=lenTags,
+                      xlabel="TD", nr_above_bars=nr_above_bars, len_sample=len_sample)
+
+        # Plot FSD with separation after
+        plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, rel_freq=rel_freq,
+                       originalCounts=quant, subtitle="Family size distribution separated by Tag distance",
+                       pdf=pdf, relative=False, diff=False)
+
+        # Plot HD within tags
+        plotHDwithinSeq(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
+                        rel_freq=rel_freq, len_sample=len_sample)
+
+        # Plot difference between HD's separated after FSD
+        plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
+                      subtitle="Delta Tag distance within tags", lenTags=lenTags, rel_freq=rel_freq,
+                      xlabel="absolute delta TD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample)
+
+        plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
+                      subtitle="Chimera Analysis: relative delta Tag distance", lenTags=lenTags, rel_freq=rel_freq,
+                      xlabel="relative delta TD", relative=True, nr_above_bars=nr_above_bars,
+                      nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
+
+        # plots for chimeric reads
+        if len(minHD_tags_zeros) != 0:
+            # HD
+            plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
+                          subtitle="Tag distance of chimeric families (CF)", rel_freq=rel_freq,
+                          lenTags=lenTags, xlabel="TD", relative=False,
+                          nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
+
+            if onlyDuplicates is False:
+                plotHDwithDCS(listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros, pdf=pdf,
+                              subtitle="Tag distance of chimeric families (CF)", rel_freq=rel_freq,
+                              lenTags=lenTags, xlabel="TD", relative=False,
+                              nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)
+
+        # print all data to a CSV file
+        # HD
+        summary, sumCol = createTableHD(list1, "TD=")
+        overallSum = sum(sumCol)  # sum of columns in table
+
+        # FSD
+        summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False)
+        overallSum5 = sum(sumCol5)
+
+        # HD of both parts of the tag
+        summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)])
+        overallSum9 = sum(sumCol9)
+
+        # HD
+        # absolute difference
+        summary11, sumCol11 = createTableHD(listDifference1, "diff=")
+        overallSum11 = sum(sumCol11)
+        # relative difference and all tags
+        summary13, sumCol13 = createTableHD(listRelDifference1, "diff=")
+        overallSum13 = sum(sumCol13)
+
+        # chimeric reads
+        if len(minHD_tags_zeros) != 0:
+            # absolute difference and tags where at least one half has HD=0
+            summary15, sumCol15 = createTableHD(listDifference1_zeros, "TD=")
+            overallSum15 = sum(sumCol15)
+
+            if onlyDuplicates is False:
+                summary16, sumCol16 = createTableHDwithDCS(listDCS_zeros)
+                overallSum16 = sum(sumCol16)
+
+        output_file.write("{}\n".format(name1))
+        output_file.write("nr of tags{}{:,}\nsample size{}{:,}\n\n".format(sep, lenTags, sep, len_sample))
+
+        # HD
+        createFileHD(summary, sumCol, overallSum, output_file,
+                     "Tag distance separated by family size", sep)
+        # FSD
+        createFileFSD2(summary5, sumCol5, overallSum5, output_file,
+                       "Family size distribution separated by Tag distance", sep,
+                       diff=False)
+
+        # output_file.write("{}{}\n".format(sep, name1))
+        output_file.write("\n")
+        max_fs = numpy.bincount(integers[result])
+        output_file.write("max. family size in sample:{}{}\n".format(sep, max(integers[result])))
+        output_file.write("absolute frequency:{}{}\n".format(sep, max_fs[len(max_fs) - 1]))
+        output_file.write(
+            "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs)))
+
+        # HD within tags
+        output_file.write(
+            "Chimera Analysis:\nThe tags are splitted into two halves (part a and b) for which the Tag distances (TD) are calculated seperately.\n"
+            "The tag distance of the first half (part a) is calculated by comparing part a of the tag in the sample against all a parts in the dataset and by selecting the minimum value (TD a.min).\n"
+            "In the next step, we select those tags that showed the minimum TD and estimate the TD for the second half (part b) of the tag by comparing part b against the previously selected subset.\n"
+            "The maximum value represents then TD b.max. Finally, these process is repeated but starting with part b instead and TD b.min and TD a.max are calculated.\n"
+            "Next, the absolute differences between TD a.min & TD b.max and TD b.min & TD a.max are estimated (delta HD).\n"
+            "These are then divided by the sum of both parts (TD a.min + TD b.max or TD b.min + TD a.max, respectively) which give the relative differences between the partial HDs (rel. delta HD).\n"
+            "For simplicity, we used the maximum value of the relative differences and the respective delta HD.\n"
+            "Note that when only tags that can form a DCS are included in the analysis, the family sizes for both directions (ab and ba) of the strand will be included in the plots.\n")
+
+        output_file.write("\nlength of one half of the tag{}{}\n\n".format(sep, len(data_array[0, 1]) / 2))
+
+        createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
+                              "Tag distance of each half in the tag", sep)
+        createFileHD(summary11, sumCol11, overallSum11, output_file,
+                     "Absolute delta Tag distance within the tag", sep)
+
+        createFileHD(summary13, sumCol13, overallSum13, output_file,
+                     "Chimera analysis: relative delta Tag distance", sep)
+
+        if len(minHD_tags_zeros) != 0:
+            output_file.write(
+                "All tags are filtered and only those tags where one half is identical (TD=0) and therefore, have a relative delta TD of 1, are kept.\n"
+                "These tags are considered as chimeras.\n")
+            createFileHD(summary15, sumCol15, overallSum15, output_file,
+                         "Tag distance of chimeric families separated after FS", sep)
+
+            if onlyDuplicates is False:
+                createFileHDwithDCS(summary16, sumCol16, overallSum16, output_file,
+                                    "Tag distance of chimeric families separated after DCS and single SSCS (ab, ba)", sep)
+
+        output_file.write("\n")
+
+
+if __name__ == '__main__':
+    sys.exit(Hamming_Distance_Analysis(sys.argv))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/td.xml	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,145 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tool id="td" name="TD:" version="1.0.0" profile="19.01">
+    <description>Tag distance analysis of duplex tags</description>
+    <macros>
+        <import>fsd_macros.xml</import>
+    </macros>
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+        <requirement type="package" version="1.4.0">matplotlib</requirement>
+    </requirements>
+    <command><![CDATA[
+        python '$__tool_directory__/td.py' 
+        --inputFile '${inputFile}' 
+        --inputName1 '${inputFile.element_identifier}' 
+        --sample_size '${sampleSize}'
+        --subset_tag '${subsetTag}'
+        --nproc "\${GALAXY_SLOTS:-1}" 
+        $onlyDCS 
+        $rel_freq
+        --minFS '${minFS}' 
+        --maxFS '${maxFS}'
+        $nr_above_bars 
+        --output_pdf '${output_pdf}'
+        --output_tabular '${output_tabular}'
+        --output_chimeras_tabular '${output_chimeras_tabular}'	
+    ]]>
+    </command>
+    <inputs>
+        <param name="inputFile" type="data" format="tabular" label="Input tags" optional="false" help="This dataset is generated by post-processing of the output from 'Make Families' or 'Correct Barcodes' tool by extracting the first two columns, sorting the tags (column 1) and adding the counts of unique occurrences of each tag. See Help section below for a detailed explanation."/>
+        <param name="sampleSize" type="integer" label="Number of tags to sample" value="1000" min="0" help="A typical duplex experiment contains very large number of tags. To reduce the runtime of this tool it can sample a subset of tags from the dataset. This parameter specifies how many tags to sample. 1000 is a good starting default. Set to '0' to sample all tags."/>
+        <param name="minFS" type="integer" label="Minimum family size" min="1" value="1" help="Tags forming families of size smaller than this value will be filtered out. Default = 1"/>
+        <param name="maxFS" type="integer" label="Maximum family size" min="0" value="0" help="Tags forming families of size larger than this value will be filtered out. Set to '0' to turn off this restriction. Default = 0"/>
+        <param name="onlyDCS" type="boolean" label="Include only DCS in the analysis?" truevalue="" falsevalue="--only_DCS" checked="False" help="Include only tags forming duplex families (e.g. present in ab and ba configurations)."/>
+        <param name="rel_freq" type="boolean" label="Relative frequency" truevalue="" falsevalue="--rel_freq" checked="False" help="If True, the relative frequencies instead of the absolute values are displayed in the plots."/>
+        <param name="subsetTag" type="integer" label="Shorten tag in the analysis?" value="0" help="Use this parameter to simulate shortened tag lengths. Set to '0' to keep the original length. Default = 0"/>
+        <param name="nproc" type="integer" label="Number of processors" value="8" help="Number of processors used for computing."/>
+        <param name="nr_above_bars" type="boolean" label="Include numbers above bars?" truevalue="--nr_above_bars" falsevalue="" checked="True" help="The absolute and relative values of the data can be included or removed from the plots. "/>
+    </inputs>
+    <outputs>
+        <data name="output_tabular" format="tabular" label="${tool.name} on ${on_string}: Summary"/>
+        <data name="output_chimeras_tabular" format="tabular" label="${tool.name} on ${on_string}: Tags of chimeras"/>
+        <data name="output_pdf" format="pdf" label="${tool.name} on ${on_string}: PDF" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="inputFile" value="td_data.tab"/>
+            <param name="sampleSize" value="0"/>
+            <output name="output_pdf" file="td_output.pdf" lines_diff="136" />
+            <output name="output_tabular" file="td_output.tab"/>
+            <output name="output_chimeras_tabular" file="td_chimeras_output.tab"/>
+        </test>
+    </tests>
+    <help> <![CDATA[
+**What it does**
+
+Tags used in Duplex Sequencing (DS) are randomized barcodes, e.g. 12 base pairs long. Since each DNA fragment is labeled by two tags at each end there are theoretically 4 to the power of (12+12) unique combinations. However, the input DNA in a typical DS experiment contains only ~1,000,000 molecules creating a large tag-to-input excess (4^24
+≫ 1,000,000). Because of such excess it is highly unlikely to tag distinct input DNA molecules with highly similar barcodes.
+
+This tool calculates the number of nucleotide differences among tags (tag distance), also known as `Hamming distance <https://en.wikipedia.org/wiki/Hamming_distance>`_. In this context the Hamming distance is simply the number of differences between two tags. The tool compares in a randomly selected subset of tags (default n=1000), the difference between each tag of the subset with the tags of the complete dataset. Each tag will differ by a certain number of nucleotides with the other tags; yet the tool uses the smallest difference observed with any other tag.
+    
+**Input**
+    
+This tool expects a tabular file with the tags of all families, the family sizes and information about forward (ab) and reverse (ba) strands::
+
+ 1 2                        3
+ -----------------------------
+ 1 AAAAAAAAAAAAAAAAATGGTATG ba
+ 3 AAAAAAAAAAAAAATGGTATGGAC ab
+
+.. class:: infomark
+
+**How to generate the input**
+
+The first step of the `Du Novo Analysis Pipeline <https://doi.org/10.1186/s13059-016-1039-4>`_ is the **Make Families** tool or the **Correct Barcodes** tool that produces output in this form::
+
+ 1                        2  3     4
+ ------------------------------------------------------
+ AAAAAAAAAAAAAAATAGCTCGAT ab read1 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAATAGCTCGAT ab read2 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAATAGCTCGAT ab read3 CGCTACGTGACTGGGTCATG
+ AAAAAAAAAAAAAAAAATGGTATG ba read3 CGCTACGTGACTAAAACATG
+
+We only need columns 1 and 2. These two columns can be extracted from this dataset using the **Cut** tool::
+
+ 1                        2 
+ ---------------------------
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAAAATGGTATG ba
+
+Next, the tags are sorted in ascending or descending order using the **Sort** tool::
+
+ 1                        2 
+ ---------------------------
+ AAAAAAAAAAAAAAAAATGGTATG ba
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+ AAAAAAAAAAAAAAATAGCTCGAT ab
+
+Finally, unique occurrences of each tag are counted. This is done using **Unique lines** tool that adds an additional column with the counts that also represent the family size (column 1)::
+
+ 1 2                        3
+ -----------------------------
+ 1 AAAAAAAAAAAAAAAAATGGTATG ba
+ 3 AAAAAAAAAAAAAATGGTATGGAC ab
+
+These data can now be used in this tool.
+    
+**Output**
+    
+The output is one PDF file with various plots of the tag distance, a tabular file with the summarized data of the plots and a tabular file with the chimeras. The PDF file contains several pages:
+
+ 1. This first page contains a graph representing the minimum tag distance (smallest number of differences) categorized after the family sizes.
+ 
+ 2. The second page contains the same information as the first page, but plots the family size categorized by the minimum tag distance.
+ 
+ 3. The third page contains the **first step** of the **chimera analysis**, which examines the differences between the tags at both ends of a read (a/b). Chimeras can be distinguished by carrying the same tag at one end combined with multiple different tags at the other end of a read. Here, we describe the calculation of the TDs for only one tag in detail, but the process is repeated for each tag in the sample (default n=1000). First, the tool splits the tag into its upstream and downstream part (named a and b) and compares it with all other a parts of the families in the dataset. Next, the tool estimates the sequence differences (TD) among the a parts and extracts those tags with the smallest difference (TD a.min) and calculates the TD of the b part. The tags with the largest differences are extracted to estimate the maximum TD (TD b.max). The process is repeated starting with the b part instead and estimates TD a.max and TD b.min. Next, we calculate the sum of TD a.min and TD b.max.
+ 
+ 4. The fourth page contains the **second step** of the **chimera analysis**: the absolute difference (=delta TD) between the partial TDs (TD a.min & TD b.max and TD b.min & TD a.max). The partial TDs of chimeric tags are normally very different which means that multiple combinations of the same a part with different b parts are likely. But it is possible that small delta TDs occur due to a half of a tag that is identical to other halves in the data. For this purpose, the relative difference between the partial TDs is estimated in the next step.
+
+ 5. The fifth page contains the **third step** of the **chimera analysis**: the relative differences of the partial TDs (=relative delta TD). These are calculated as the absolute difference between TD a.min and TD b.max equal to TD delta. Since it is not known whether the absolute difference originates due to a low and a very large TD within a tag or an identical half (TD=0), the tool estimates the relative TD delta as the ratio of the difference to the sum of the partial TDs. In a chimera, it is expected that only one end of the tag contributes to the TD of the whole tag. In other words, if the same a part is observed in combination with several different b parts, then one end will have a TD = 0. Thus, the TD difference between the parts (TD a.min - TD b.max, TD a.max - TD b.min) is the same as the sum of the parts (TD a.min + TD b.max, TD a.max + TD b.min) or the ratio of the difference to the sum (relative delta TD = TD a.min - TD b.max / (TD a.min + TD b.max or TD a.max - TD b.min / (TD a.max + TD b.min)) will equal 1 in chimeric families. Here, the maximum value of the relative delta TD and the respective delta TD (in step 4) are selected and the plot can be interpreted as the following:
+
+    - A low relative difference indicates that the total TD is equally distributed in the two partial TDs. This case would be expected, if all tags originate from different molecules.
+    - A relative delta TD of 1 means that one part of the tags is identical. Since it is very unlikely that by chance two different tags have a TD of 0, the TDs in the other half are probably artificially introduced and represent chimeric families.
+
+ 6. The sixth page is an analysis only of **chimeric tags** (relative delta TD =1) from step 5.
+
+ 7. The last page is only generated when the parameter **"Include only DCS in the analysis?"** is set to **False (NO)**. The graph represents the **TD of the chimeric tags** that form a DCS (complementary ab and ba).
+
+ .. class:: infomark
+
+**Note:** 
+Chimeras can be identical in the first or second part of the tag and can have an identical TD with multiple tags. Therefore, the second column of the output file can have multiple tag entries. The file also contains the family sizes and the direction of the read (ab, ba). The asterisks mark the identical part of the tag::
+
+ 1                                      2
+ --------------------------------------------------------------------------------------------------
+ GAAAGGGAGG GCGCTTCACG	1 ba            GCAATCGACG *GCGCTTCACG* 1 ba
+ CCCTCCCTGA GGTTCGTTAT	1 ba            CGTCCTTTTC *GGTTCGTTAT* 1 ba, GCACCTCCTT *GGTTCGTTAT* 1  ba
+ ATGCTGATCT CGAATGCATA	55 ba, 59 ab    AGGTGCCGCC *CGAATGCATA* 27 ba, *ATGCTGATCT* GAATGTTTAC 1 ba
+   ]]> 
+    </help>
+    <expand macro="citation" />
+</tool>
+
Binary file test-data/fsd_ba.bam has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fsd_ba_DCS.fna	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,50 @@
+>AAAAAAGGACCCTACCACCAACGT 8-5
+CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGCGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGNGTCCAACGCGTCCATGAGCTNCNACNCNCCACTGGTGCGCNNCGCNNGNCTNNNNNCAGNNNANNNCCNCANNNNNNCCNNNNNCNNNNNNNNNNNNNNNCCNNNNNNNNNNNNNN
+>AAAAAAGGATTCCAAATCTCTGGA 3-7
+TACTCCATGCCCCGGGCCACCTGGTAGGCACAGGACACCAGGTCCTTGAAGGTGAGCTGCTCCTCGGGCGGCTTGCAGGTGTCGAAGGAGTAGTCCAGGCCCGGGGGCCGCCGCGCCCGCAGAAACTCCCGCAGGTTACCCTTGGCCGCGTACTCCACCAGCACGTACAGGGNCCCTGGGGACACGGGCTCCTCAGACGGGNTGCCAGGCNCNGGAGGNCCGCNCAGCCGGNNNCCACCGCNNSNNNCCNNNNCCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNN
+>AAAAAAGGCCAGTTTAAAAAAACT 37-3
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTNGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCNNNTGGGCCTGGCANCCCNNCNGAGGAGCCNGNNNCCNCAGGTCCCCTGTACNNNCTNGTG
+>AAAAAAGGCCCAGATCTCTTAAAA 194-31
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTCTGAGGAGCCCGTGTCCCCAGGGCCCCTGTACGTGCTGGTGGAGTACGCGGCCAAGGGTAACCTGCGGGAGTTTCAGTCATTTTAAG
+>AAAAAAGGGGCCTCATGCGTCAGT 4-3
+CTAGGCTCTACATGGTGAGCAGAGACGAGGAGAGGGGAGCCCGCCTGGCTGCAGAGAGGGCTCACACAGCCCAGGACCAGCGTGGGCCGAGGTGGGGCTCCAGGAGGCCTGGCGGGCAGGCAGCTCAGAACCTGGTATCTACTTTCTGTTACCTGTCGCTTGAGCNGGAAGNGGGAGANCTTGTGCACGGTGGNNGANCCNAGGCCTTNCTTGGGGGGNNTGCGNNNNNNNNNNNNNNNNNCNNNNNNNNNNNNNNGGNNNANNNGNNNNNNNNNNNNNNNNT
+>AAAAAAGTCCTTCGACTCAAGCGG 4-5
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGAGTACACCTGCCTGGCGGGCAATTCTATTGGGTTTTCTCATCACTCTGCGTGGCTGGTGGTNCTGCCAGGTACCGGCNTCTNCTGCTGCTGNNGNNCNNCNNTNNCNNNNNNNNNNTNNCNNNNNNNNNNNN
+>AAAAAAGTGGGATCGGGTTGCAGC 11-6
+CTGGGGTCCTGGCTCTGCCCAGTTCCCGCCTCCACCCCTGAAGCCTGAGCTCTGCAGGACACGTACACGTCACTCTGGTGAGTGTAGACTCNGTCAAACAAGGCCTCAGGCGCCATCCACTTCACGGGCAGCCGGCCCTGGGAGGGTGTGGGAAGGCGGTGTTGGCGCCAGGCGTCCTACTGGCATGACCCCCACCCCCGCNCCCCAGGGCCGGGCNCACGTTGGTTGTCTTCTTGNANTAGTCNNNNTNGTGCNCGTNNNNNNNCNNNNNNNNNNNNNCNNNN
+>AAAAAATAATTTCGCCCTCGAGTA 16-4
+CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGCGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGAGTCCAACGCGTCCATGAGCTCCANCNCACCACTGGNNNGNNTCNCANNGCTNTCNNCAGNNNNNGGCCNNNNNCTGNNCNANNTCNCC
+>AAAAAATAGTTAAGCCGACACACT 5-7
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGNGTACACCTGCCTGGCGGGCAATTCTATNGGGTTTTCTCATCACTCTGCGTGGNNGGTGGTGCTGCCAGGTACCGGCNTCTGCTGCTNNNGCTGNNCCGCNNNNNNNNNNNNNNNNNGNNNNNNNNNNCN
+>AAAAAATCAGAATAGAGCGCTTTC 4-3
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGNGGACGCCGGGGAGTANACCTGCCTGGCGGGCAATTCTATTGGGTTTTCTCATCACTCTGCGTGNCTGGTGGNGCTGCCNGNNNCCNNNNNCTNNNNNNNNNGCTNNNNNNNNNTNNNNNNNNNNNNNNNNNCTNNCNNNNNNNN
+>AAAAAATCATAAACATTTTAACAA 65-21
+CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGTGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGAGTCCAACGCGTCCATGAGCTCCAACACACCACTGGTGCGCATCGCAAGGCTGTCCTCAGGGGAGGGCCCCACGCTGGCCAATGTCTCCNNNCNNGNGCNGCCTGCCNNNNNCAAANGG
+>AAAAAATCTTGACTCGGTACACAA 9-3
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGMCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCNNCNGAGGAGCCNNNNTCNCCAGGNCCNNTGNANNNNCNGGNGGAGTANNNGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+>AAAAAATGACGAACGATTCGTCAT 3-6
+CTAGAGGGCCAGACCCTGGAGAGAAGGAGCCCAGCAGAGCCAGCCAGTCCCACACCGCCACCAGGCGCCCGGGAGACACCAGAGCCACAGGAGAGGCCTTTGGGGACCCAGATGGGAAGTGGGCTCGAGGGGGCTGAGGGGGCCCCTCTGGGACCAGGACCGGGCCAGGCCAACTTTGTCCCCACNNTGGGCACAGGGNCAGGAGNNNNNGCNCAAGNANNNNNNNNNNNNNNNNNNTCNNNNNNNNNNNCNNNNNNNNNNKNNNNNNNNNNNNNNNNNNNCNNNNNNNN
+>AAAAAATGTGCCGAACCTTGGCGA 7-10
+CTAGAGGGCCAGACCCTGGAGAGAAGGAGCCCAGCAGAGCCAGCCAGTCCCACACCGCCACCAGGCGCCCGGGAGACACCAGAGCCACAGGAGAGGCCTTTGGGGACCCAGATGGGAAGTGGGCTCGAGGGGGCTGAGGGGGCCCCTCTGGGACCAGGACCGGGCCAGGCCAACTTTGTCCCCACACTGGGCACAGGGCCAGGAGTGAGGGCTCAAGAAGCGGGNNGNNNGNAAGTCNCAGGATTNNNNNCNNNNNTNNNANNTTTGGCNNNNNNNNNNNNANN
+>AAAAAATTGAATCCGTGGATATAG 3-8
+CTAGACTCACTGGCGTTACTGACTGCGAGACCCTCCAGACAAGGCGCGTGCTGAGGTTCTGAGCCCCCTTCCGCTCCCAGTGGTGCCTGCGGCTCTGGGCCAGGGGCATCCATGGGAGCCCCGTGGGGGGGGGGNNCCNNNCCNGGCCNNAACGCCCATNTCTTTNCNGCNNNGNNNGANCNGGTGGAGGNNNGACNAGGCGGGNNGTGNNTNNNNMNGNCNNNNNNNNNNNNGNNNNGNGNNNNNNNNNTGTNNNTCMNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+>AAAAAATTGCATGCATCGTCCCTG 6-9
+CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGCGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGAGTCCAACGCGTCCATGAGCTCCAACACACCACTGGTGCGNATCGCAAGGCTGTCCTCAGGGGNGGNCCNCNNNNTGNCNNATNNCNCCGNGCNNNNGCNNCCTGCNNNNNNCNNNN
+>AAAAAATTGGCATTGTGTATGCAT 18-5
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCNGNNCNNNNGNNNANNCCGTGNCCCCAGGGNCNNTGNNNNNNNNGGTNNNNNACGNNNNCNANNNTANCCNNNNNNNGT
+>AAAAAATTTTCCCACCAAAATTTC 18-3
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTNNNGGNTNGGCGGCCCTCCTGGGCCTGGCNGCNCGTCTGAGGNNCCNNNGTCNNCAGGNNNNCNNTNNNNNNNGGNGNNNNANNNNGNNNNGNNNNNNNNNNNNNNNNNNCNGNNNNNNNNNNNN
+>AAAAAATTTTTCTTTACCACCTGT 4-4
+CTAGACTCACTGGCGTTACTGACTGCGAGACCCTCCAGACAAGGCGCGTGCTGAGGTTCTGAGCCCCCTTCCGCTCCCAGTGGTGCCTGCGGCTCTGGGCCAGGGGCATCCATGGGAGCCCCGTGGGGGGGGGGGCCNGGCNNNNCNNNAACGCCCANNNTNTTTNNANCNNNNGNGNNNNNGNNNNNNNSNNNCNNNNNNNNNNGNNNNNNNNNNNNNCNNNNNNNNCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+>AAAAACAACATAGCTTGAAAATTT 7-4
+CTAGAGGGCCAGACCCTGGAGAGAAGGAGCCCAGCAGAGCCAGCCAGTCCCACACCGCCACCAGGCGCCCGGGAGACACCAGAGCCACAGGAGAGGCCTTTGGGGACCCAGATGGGAAGTGGGCTCGAGGGGGCTGAGGGGGCCCCTCTGGGACCAGGACCGGGCCAGGCCAACTTTGTCCCCACACTGGGCNCAGGGCCAGGAGTGAGGGCNCNAGNAGCNGGACGGNNGTAAGTNNNNNNANTNNNNNNNNTNNNNGCNNNNNTNNNNNNNCNNNNNNNAGN
+>AAAAACAACCAACGTTCTATCTCT 18-4
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCNNNNGNGGNNNNNGNGTCCCCAGGGCCCCTGNNNNNNNNGGNNGNNNNNNNGGNCNNGNNNNNCNTGNNNNANNNNNNNNNNNNNNNNNNN
+>AAAAACAAGATAATTGGCGCCCGT 5-22
+CTGCCATACACCCGTCCCAGGAGCATGTCCACAGAACCCCAGCCACACCCAACATCCGCCACATCCCTGACGGCCCCTAAACCCAGCCGGGCCTCTGACTGGTGGCTGTTTCACCCCCACCACCAAGCCCCCTACAGCCAACGCTGGCCCTCAGCACCACTGACCGGGCCCGAGACAGCTCCCATTTGGGGTCGGCAGGCAGCTCGAGCTCGGANACATTGGCCAGCGTGGNGNNNNNCCCNNNGNNCNNCCNTNNNNNNYNNNNNANNNNNNNNNNNNNNNMNNN
+>AAAAACAAGCATCTGTCGACACTA 69-60
+TGCCGCCTGCGCAGCCCCCCCAAGAAAGGCCTGGGCTCCCCCACCGTGCACAAGATCTCCCGCTTCCCGCTCAAGCGACAGGTAACAGAAAGTAGATACCAGGTTCTGAGCTGCCTGCCCGCCAGGCCTCCTGGAGCCCCACCTCGGCCCACGCTGGTCCTGGGCTGTGTGAGCCCTCTCTGCAGCCAGGCGGGCTCCCCTCTCCTCGTCTCTGCTCACCATGTAGAGCCTAGGGAGTCATAGTGTCGACAG
+>AAAAACAATCTTAACCGCGATCTA 10-4
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTCTGAGGNGCCNGNNTCCCCNGGTCCCCTGTNNGTGCTGGNGGANTACNNGGNCNAGGGTNNCCNGCNNNNGNTNNTGNNNGNNNNNNNN
+>AAAAACACGCGGACTTTCCGCATT 4-7
+ACTCCATGCCCCGGGCCACCTGGTAGGCACAGGACACCAGGTCCTTGAAGGTGAGCTGCTCCTCGGGCGGCTTGCAGGTGTCGAAGGAGTAGTCCAGGCCCGGGGGCCGCCGCGCCCGCAGAAACTCCCGCAGGTTACCCTTGGCCGCGTACTCCACCAGCACGTACAGGGGCCCTGGGGACACGGGCTCCTCAGACGGGCTGCCAGGCCCAGGNGGGCCGCCCAGCCGGCACCACCGCCGCTACCGCNNCTACCNCCNNNNCGTGCNNNNNNCNNNCAGNNNNNN
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fsd_ba_data.tab	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,32 @@
+10	AAAAAACATCCCAATAAGAAATCA	ab
+9	AAAAAACATCCCAATAAGAAATCA	ba
+4	AAAAAAGTCCTTCGACTCAAGCGG	ab
+5	AAAAAAGTCCTTCGACTCAAGCGG	ba
+5	AAAAAATAGTTAAGCCGACACACT	ab
+7	AAAAAATAGTTAAGCCGACACACT	ba
+7	AAAAAATGTGCCGAACCTTGGCGA	ab
+10	AAAAAATGTGCCGAACCTTGGCGA	ba
+7	AAAAACAACATAGCTTGAAAATTT	ab
+4	AAAAACAACATAGCTTGAAAATTT	ba
+81	ATTCGGATAATTCGACGCAACATT	ab
+11	ATTCGGATAATTCGACGCAACATT	ba
+41	ATTCGTCGACAATACAAAGGGGCC	ab
+226	ATTCGTCGACAATACAAAGGGGCC	ba
+6	ATTGCCAGTGTGGGCTGGTTAGTA	ab
+41	ATTGCCAGTGTGGGCTGGTTAGTA	ba
+50	ATTTCGCGACCATCCGCCACTTTG	ab
+332	ATTTCGCGACCATCCGCCACTTTG	ba
+64	CAAACTTTAGCACAGTGTGTGTCC	ab
+57	CAAACTTTAGCACAGTGTGTGTCC	ba
+85	ATAAACGGCCTTCGACATTGTGAC	ab
+15	ATAAACGGCCTTCGACATTGTGAC	ba
+11	ATAAAGTCACCTGTGAATACGTTG	ab
+35	ATAAAGTCACCTGTGAATACGTTG	ba
+83	ATAAATCGAAACCGTGCCCAACAA	ab
+63	ATAAATCGAAACCGTGCCCAACAA	ba
+9	ATTTAGATATTTTCTTCTTTTTCT	ab
+7	ATTTAGATATTTTCTTCTTTTTCT	ba
+7	ATTTAGTTATCCGTCGGCGACGAA	ab
+3	ATTTAGTTATCCGTCGGCGACGAA	ba
+8	ATTTAGTTTGAATTGCCCTGCGTC	ab
+9	ATTTAGTTTGAATTGCCCTGCGTC	ba
\ No newline at end of file
Binary file test-data/fsd_ba_output.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fsd_ba_output.tab	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,45 @@
+Dataset:	fsd_ba_data.tab
+	AB	BA
+max. family size:	81	226
+absolute frequency:	1	1
+relative frequency:	0.100	0.100
+
+
+total nr. of reads before SSCS building	1312
+
+
+Values from family size distribution
+	before SSCS building	after DCS building	after trimming	after alignment to reference
+FS=1	0	0	0	0
+FS=2	0	0	0	0
+FS=3	1	8	0	0
+FS=4	2	10	6	2
+FS=5	2	5	3	2
+FS=6	1	3	3	0
+FS=7	5	5	3	4
+FS=8	1	2	0	1
+FS=9	3	2	1	3
+FS=10	2	2	3	2
+FS=11	2	1	0	2
+FS=12	0	0	1	0
+FS=13	0	0	4	0
+FS=14	0	0	0	0
+FS=15	1	0	1	0
+FS=16	0	1	3	0
+FS=17	0	0	3	0
+FS=18	0	3	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	12	8	3	4
+sum	32	50	34	20
+
+
+In the plot, the family sizes of ab and ba strands and of both duplex tags were used.
+Whereas the total numbers indicate only the single count of the formed duplex tags.
+total nr. of tags (unique, FS>=1)	16
+DCS (before SSCS building, FS>=1)	16
+total nr. of tags (unique, FS>=3)	16
+DCS (before SSCS building, FS>=3)	16
+after DCS building	25
+after trimming	17
+after alignment to reference	10
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fsd_ba_trimmed.fna	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,34 @@
+>AAAAAGAAAAACGATGCTTGACCA 4-6
+CTGGGGTCCTGGCTCTGCCCAGTTCCCGCCTCCACCCCTGAAGCCTGAGCTCTGCAGGACACGTACACGTCACTCTGGTGAGTGTAGACTCGGTCAAACAAGGCCTCAGGCGCCATCCACTTCACGGGCAGCCGGCCCTGGGAGGGTGTGGGAAGGCGGTGTTGGCGCCAGGCGTCCTACTGGCATGACCCCCACCCCCGC
+>AAAAAGAAAAGTTTGCTTTTTCTT 13-17
+CTCCATGCCCCGGGCCACCTGGTAGGCACAGGACACCAGGTCCTTGAAGGTGAGCTGCTCCTCGGGCGGCTTGCAGGTGTCGAAGGAGTAGTCCAGGCCCGGGGGCCGCCGCGCCCGCAGAAACTCCCGCAGGTTACCCTTGGCCGCGTACTCCACCAGCACGTACAGGGGACCTGGGGACACGGGCTCCTCAGACGGGCTGCCAGGCCCAGGAGGGCCGCCCAGCCGGCACCACCGCCGCT
+>AAAAAGAAATGAATTGGTCCTAGA 24-7
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTNTGAGGAGCC
+>AAAAAGACAGCCTGAATTCCTTGT 17-4
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGNGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCNCCTGGGCCTGGCAGCCCGTCNGAG
+>AAAAAGACGATTACACAATAACCT 16-7
+CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGTGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGAGTCCAACGCGTCCATGAGCTCCAACACACCACTGGT
+>AAAAAGATACGGGAGGTGAATTGT 75-6
+CTCTGCGTGGCTGGTGGTGCTGCCAGGTACCGGCTTCTGCTGCTGCTGCTGCTCCGCACTGTCTGGGGGACGCTGGCTCGGGACACGCCAAAGCTGCCAGGACGGACGGGAATCCTGTGACTTACGGCCGTCCCGCTTCTTGAGCCCTCACTCCTGGCCCTGTGCCCAGTGTGGGGACAAAGTTGGCCTGGCCCGGTCCTGGTCCCAGAGGGGC
+>AAAAAGATATTTTAATCGGCCCGA 7-6
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTMTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGAGTACACCTGCCTGGCGGGCAATTCTATTGGGTTTTCTCATCACTCTGCGTGGCTGGTGGTGCTGC
+>AAAAAGATTACACTGAAATCTTTT 25-5
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCNTCTGAGGAGCCCNTGTCCCCAGGGCC
+>AAAAAGCCATATGGTCGAAGAGAT 13-10
+ACTCCATGCCCCGGGCCACCTGGTAGGCACAGGACACCAGGTCCTTGAAGGTGAGCTGCTCCTCGGGCGGCTTGCAGGTGTCGAAGGAGTAGTCCAGGCCCGGGGGCCGCCGCGCCCGCAGAAACTCCCGCAGGTTACCCTTGGCCGCGTACTCCACCAGCACGTACAGGGGACCTGGGGACACGGGCTCCTCAGACGGGCTGCCAGGCCCAGGAGGGCCGCCCAGCCGGCACCACCGCC
+>AAAAAGCGAAAGTGCCCCATATTT 13-16
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCNTCTGAGGAGCCCGTGTCCCCAGGGCCCCTGTACGTNCTGGTGG
+>AAAAAGCGATTTAACTGAAATTAT 5-4
+CTAGAGGGCCAGACCCTGGAGAGAAGGAGCCCAGCAGAGCCAGCCAGTCCCACACCGCCACCAGGCGCCCGGGAGACACCAGAGCCACAGGAGAGGCCTTTGGGGACCCAGATGGGAAGTGGGCTCGAGGGGGCTGAGGGGGCCCCTCTGGGACCAGGACCGGGCCAGGCCAACTTTGTCCCCACACTGGGCACAGGGCCAGGAGTGAGGGC
+>AAAAAGCGGGGTGGCCTTACGCCC 17-10
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTCTGAGGAGCCCGNGTCCCCAGGGCCNCTGTACGT
+>AAAAAGCTCTACCCCCACGAAGCG 5-10
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGAGTACACCTGCCTGGCGGGCAATTCTATTGGGTTTTCTCATCACTCTGCGTGGCTGGTGGTGCTGCCAGGTACCGNCTTCTGCTGCTGCTGC
+>AAAAAGGATATGTCTAACATCCCT 15-16
+CTAGGCTCTACATGGTGAGCAGAGACGAGGAGAGGGGAGCCCGCCTGGCTGCAGAGAGGGCTCACACAGCCCAGGACCAGCGTGGGCCGAGGTGGGGCTCCAGGAGGCCTGGCGGGCAGGCAGCTCAGAACCTGGTATCTACTTTCTGTTACCTGTCGCTTGAGCGGGAAGCGGGAGATCTTGTGCACGGTGGGGGAGCCCAGGCCTTTCTTGGGGGGGCTGCGCAGGCGGCAGAGCGTCACAGCCGCCACCACCAGGATGAACAGGAAGAAGCCCACCCCGT
+>AAAAAGGTACACCCGAGATGAACT 13-9
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTCTGAGGNGCCCGTGTCCCCAGGTCCCCTGTACGTGCTGGTGGAGTAC
+>AAAAAGTAGCTTCGGTTCGGGTCT 12-4
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGANTACACCTGCCTGGCGGGCANTTCTATTGGGTTTTCTCATCACTCTGCGTG
+>AAAAAGTAGGGACATAATTGACTT 4-4
+CTGGGGTCCTGGCTCTGCCCAGTTCCCGCCTCCACCCCTGAAGCCTGAGCTCTGCAGGACACGTACACGTCACTCTGGTGAGTGTAGACTCGGTCAAACAAGGCCTCAGGCGCCATCCACTTCACGGGCAGCCGGCCCTGGGAGGGTGTGGGAAGGCGRTGTTGGCGCCAGGCGTCCTACTGGCATGACCCCCACCCCCGCACCCCA
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fsd_data1.tab	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,112 @@
+1	AAAAAAAAAAAAAACCAAAACTTC	ba
+1	AAAAAAAAAAAAACCAGGCGTCGA	ba
+1	AAAAAAAAAAAAAGCTCCACGTTG	ba
+1	AAAAAAAAAAAAATCGTGGTTTGT	ba
+1	AAAAAAAAAAAAATTCACCCTTGT	ba
+7	AAAAAAAAAAAACACACTTAACTT	ba
+1	AAAAAAAAAAAACAGTGTTGAGAC	ba
+4	AAAAAAAAAAAACCGCTCCTCACA	ba
+1	AAAAAAAAAAAAGGCAACACAGAA	ab
+2	AAAAAAAAAAAATCTTTCTTTGAG	ab
+1	AAAAAAAAAAAATTGGGTTCCTTA	ab
+1	AAAAAAAAAAAGAGTCGCACCCAG	ba
+21	AAAAAAAAAAAGATCGTGGTTTGT	ba
+1	AAAAAAAAAAAGCGCAACACAGAA	ab
+3	AAAAAAAAAAAGGGCAACACAGAA	ab
+1	AAAAAAAAAAAGTAGCCCTAAACG	ab
+1	AAAAAAAAAAAGTCTTTCTTTGAG	ab
+1	AAAAAAAAAAATATCATAGACTCT	ab
+6	AAAAAAAAAAATATTCACCCTTGT	ba
+1	AAAAAAAAAAATATTCGAAAGTTA	ba
+3	AAAAAAAAAAATCACACTTAACTT	ba
+1	AAAAAAAAAAATCCGCTCCTCACA	ba
+1	AAAAAAAAAAATTAACTAAACTTA	ab
+1	AAAAAAAAAACAAATTCTATTATT	ab
+1	AAAAAAAAAACTCCCAGATTTTTT	ab
+1	AAAAAAAAAACTTCTGCTTGGCGG	ba
+11	AAAAAAAAAAGAATCGTGGTTTGT	ba
+5	AAAAAAAAAAGATAGCCCTAAACG	ab
+1	AAAAAAAAAAGCAATAATGCCAGT	ab
+2	AAAAAAAAAAGTACCGCACTCTCA	ba
+1	AAAAAAAAAAGTTCTTTCTTTGAG	ab
+1	AAAAAAAAAATAACTTCAATAATG	ba
+2	AAAAAAAAAATAATCATAGACTCT	ab
+1	AAAAAAAAAATAGTCTCACATTTA	ab
+1	AAAAAAAAAATATAACCTTTGGCG	ab
+3	AAAAAAAAACAAAATTCTATTATT	ab
+1	AAAAAAAAACAAGTACGCGGCATT	ab
+1	AAAAAAAAACAAGTACGCGGTATT	ab
+1	AAAAAAAAACAATATCGAATTAAC	ab
+3	AAAAAAAAACACGGTGAGACAAGG	ba
+1	AAAAAAAAACACGTTTCTCCCCTT	ba
+1	AAAAAAAAACATATCGTCCCGAGC	ba
+1	AAAAAAAAACCTACCTGAGGCCCC	ab
+3	AAAAAAAAACCTTATTACAGCGGA	ab
+1	AAAAAAAAACGATTCTCTGTATCT	ba
+1	AAAAAAAAACGTACCGCACTCTCA	ba
+4	AAAAAAAAACTACCCAGATTTTTT	ba
+1	AAAAAAAAACTAGATGAGACGACC	ba
+4	AAAAAAAAACTGTCTGCTTGGCGG	ba
+1	AAAAAAAAAGAAGTTTAATTTTAA	ab
+1	AAAAAAAAAGAATGCCTAAGACGA	ba
+6	AAAAAAAAAGACCGGCCTTAGACA	ba
+1	AAAAAAAAAGATATCGTGGTTTGT	ba
+1	AAAAAAAAAGCAATACTCAAGCTG	ba
+6	AAAAAAAAAGCAATGTCTAAGCCT	ba
+1	AAAAAAAAAGCACTGTCTAAGCCT	ab
+2	AAAAAAAAAGCTAATAATGCCAGT	ab
+1	AAAAAAAAAGTTTCGTGAAGGTCC	ba
+1	AAAAAAAAATAAAGGTCCGAATCT	ab
+1	AAAAAAAAATAAATGAGAGTGTAA	ba
+8	AAAAAAAAATAAGTCTCACATTTA	ab
+1	AAAAAAAAATAATAACCTCTGGCG	ab
+10	AAAAAAAAATAATAACCTTTGGCG	ab
+1	AAAAAAAAATAATCCCCTTTGTCG	ab
+6	AAAAAAAAATACGCAAACGCTGAG	ab
+4	AAAAAAAAATAGATCATAGACTCT	ab
+10	AAAAAAAAATAGATCATAGACTCT	ba
+10	AAAAAAAAATAGTAGGATTTCATG	ba
+7	AAAAAAAAATATGAATACCCTCGT	ba
+1	AAAAAAAAATATGCCACTTGATCC	ba
+1	AAAAAAAAATATTCTGCCACTTGA	ba
+3	AAAAAAAAATCAAACCAAGAGGAC	ba
+1	AAAAAAAAATCAGTACCCCTAAAC	ab
+12	AAAAAAAAATCCTAGTTAATGAAG	ba
+1	AAAAAAAAATCGATTCTTTATGCG	ab
+1	AAAAAAAAATGTCTGAAAATATCT	ab
+4	AAAAAAAAATGTCTGAAAATATCT	ba
+1	AAAAAAAAATTTCCGCAGACCGTT	ba
+8	AAAAAAAAATTTGGGCTACTACAA	ba
+1	AAAAAAAACAAAATTAGAACCCTT	ab
+1	AAAAAAAACAAACCGCTCCTCACA	ba
+5	AAAAAAAACAACGTACGCGGTATT	ab
+4	AAAAAAAACAATATCGTTGATATG	ba
+4	AAAAAAAACAATCACGTTAATAGG	ab
+1	AAAAAAAACAGAATCGTGGTTTGT	ba
+1	AAAAAAAACCAAATCGTTGATATG	ba
+9	AAAAAAAACCAAGTCCAGGCATCT	ba
+2	AAAAAAAACCACGGTGAGACAAGG	ba
+1	AAAAAAAACCGCCCAACTGCCGGT	ab
+5	AAAAAAAACCTCTCAACCCCAAAT	ba
+7	AAAAAAAACCTCTTGCGATGTTGT	ab
+1	AAAAAAAACCTCTTGCGCTGTTGT	ab
+1	AAAAAAAACCTCTTGTGATGTTGT	ab
+12	AAAAAAAACCTGAGCAATGGTTCC	ab
+3	AAAAAAAACCTTGACCCTCACATG	ba
+6	AAAAAAAACCTTGCACTCGTCCTA	ba
+9	AAAAAAAACGAAATAAAAAAACCT	ba
+1	AAAAAAAACGACCGGCCTTAGACA	ba
+4	AAAAAAAACGCCACCACCCCCTTT	ab
+12	AAAAAAAACGCCACGGGCACTATT	ba
+13	AAAAAAAACGTATCAGTAGATCCT	ab
+1	AAAAAAAACTAGTAGGATTTCATG	ba
+3	AAAAAAAACTATAGAAAATCCATT	ba
+1	AAAAAAAACTATTCTATTTCCGAT	ba
+13	AAAAAAAACTGATCTGCTTGGCGG	ba
+8	AAAAAAAACTTGCGAATAGCATCG	ba
+4	AAAAAAAACTTGTTATCAAAACGT	ab
+1	AAAAAAAAGAAAAGTTCAACACGC	ba
+1	AAAAAAAAGAAGTTCGCCCTCCGA	ab
+13	AAAAAAAAGAGAGTTTAGTCATGG	ab
+1	AAAAAAAAGAGAGTTTAGTCATGG	ba
+1	AAAAAAAAGAGAGTTTAGTCCTGG	ab
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fsd_data2.tab	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,112 @@
+1	AAAAAAAAAAAAAACCAAAACTTC	ba
+1	AAAAAAAAAAAAACCAGGCGTCGA	ba
+1	AAAAAAAAAAAAAGCTCCACGTTG	ba
+1	AAAAAAAAAAAAATCGTGGTTTGT	ba
+1	AAAAAAAAAAAAATTCACCCTTGT	ba
+7	AAAAAAAAAAAACACACTTAACTT	ba
+1	AAAAAAAAAAAACAGTGTTGAGAC	ba
+4	AAAAAAAAAAAACCGCTCCTCACA	ba
+1	AAAAAAAAAAAAGGCAACACAGAA	ab
+2	AAAAAAAAAAAATCTTTCTTTGAG	ab
+1	AAAAAAAAAAAATTGGGTTCCTTA	ab
+1	AAAAAAAAAAAGAGTCGCACCCAG	ba
+21	AAAAAAAAAAAGATCGTGGTTTGT	ba
+1	AAAAAAAAAAAGCGCAACACAGAA	ab
+3	AAAAAAAAAAAGGGCAACACAGAA	ab
+1	AAAAAAAAAAAGTAGCCCTAAACG	ab
+1	AAAAAAAAAAAGTCTTTCTTTGAG	ab
+1	AAAAAAAAAAATATCATAGACTCT	ab
+6	AAAAAAAAAAATATTCACCCTTGT	ba
+1	AAAAAAAAAAATATTCGAAAGTTA	ba
+3	AAAAAAAAAAATCACACTTAACTT	ba
+1	AAAAAAAAAAATCCGCTCCTCACA	ba
+1	AAAAAAAAAAATTAACTAAACTTA	ab
+1	AAAAAAAAAACAAATTCTATTATT	ab
+1	AAAAAAAAAACTCCCAGATTTTTT	ab
+1	AAAAAAAAAACTTCTGCTTGGCGG	ba
+11	AAAAAAAAAAGAATCGTGGTTTGT	ba
+5	AAAAAAAAAAGATAGCCCTAAACG	ab
+1	AAAAAAAAAAGCAATAATGCCAGT	ab
+2	AAAAAAAAAAGTACCGCACTCTCA	ba
+1	AAAAAAAAAAGTTCTTTCTTTGAG	ab
+1	AAAAAAAAAATAACTTCAATAATG	ba
+2	AAAAAAAAAATAATCATAGACTCT	ab
+1	AAAAAAAAAATAGTCTCACATTTA	ab
+1	AAAAAAAAAATATAACCTTTGGCG	ab
+3	AAAAAAAAACAAAATTCTATTATT	ab
+1	AAAAAAAAACAAGTACGCGGCATT	ab
+1	AAAAAAAAACAAGTACGCGGTATT	ab
+1	AAAAAAAAACAATATCGAATTAAC	ab
+3	AAAAAAAAACACGGTGAGACAAGG	ba
+1	AAAAAAAAACACGTTTCTCCCCTT	ba
+1	AAAAAAAAACATATCGTCCCGAGC	ba
+1	AAAAAAAAACCTACCTGAGGCCCC	ab
+3	AAAAAAAAACCTTATTACAGCGGA	ab
+1	AAAAAAAAACGATTCTCTGTATCT	ba
+1	AAAAAAAAACGTACCGCACTCTCA	ba
+4	AAAAAAAAACTACCCAGATTTTTT	ba
+1	AAAAAAAAACTAGATGAGACGACC	ba
+4	AAAAAAAAACTGTCTGCTTGGCGG	ba
+1	AAAAAAAAAGAAGTTTAATTTTAA	ab
+1	AAAAAAAAAGAATGCCTAAGACGA	ba
+6	AAAAAAAAAGACCGGCCTTAGACA	ba
+1	AAAAAAAAAGATATCGTGGTTTGT	ba
+1	AAAAAAAAAGCAATACTCAAGCTG	ba
+6	AAAAAAAAAGCAATGTCTAAGCCT	ba
+1	AAAAAAAAAGCACTGTCTAAGCCT	ab
+2	AAAAAAAAAGCTAATAATGCCAGT	ab
+1	AAAAAAAAAGTTTCGTGAAGGTCC	ba
+1	AAAAAAAAATAAAGGTCCGAATCT	ab
+1	AAAAAAAAATAAATGAGAGTGTAA	ba
+8	AAAAAAAAATAAGTCTCACATTTA	ab
+1	AAAAAAAAATAATAACCTCTGGCG	ab
+10	AAAAAAAAATAATAACCTTTGGCG	ab
+1	AAAAAAAAATAATCCCCTTTGTCG	ab
+6	AAAAAAAAATACGCAAACGCTGAG	ab
+4	AAAAAAAAATAGATCATAGACTCT	ab
+10	AAAAAAAAATAGATCATAGACTCT	ba
+10	AAAAAAAAATAGTAGGATTTCATG	ba
+7	AAAAAAAAATATGAATACCCTCGT	ba
+1	AAAAAAAAATATGCCACTTGATCC	ba
+1	AAAAAAAAATATTCTGCCACTTGA	ba
+3	AAAAAAAAATCAAACCAAGAGGAC	ba
+1	AAAAAAAAATCAGTACCCCTAAAC	ab
+12	AAAAAAAAATCCTAGTTAATGAAG	ba
+1	AAAAAAAAATCGATTCTTTATGCG	ab
+1	AAAAAAAAATGTCTGAAAATATCT	ab
+4	AAAAAAAAATGTCTGAAAATATCT	ba
+1	AAAAAAAAATTTCCGCAGACCGTT	ba
+8	AAAAAAAAATTTGGGCTACTACAA	ba
+1	AAAAAAAACAAAATTAGAACCCTT	ab
+1	AAAAAAAACAAACCGCTCCTCACA	ba
+5	AAAAAAAACAACGTACGCGGTATT	ab
+4	AAAAAAAACAATATCGTTGATATG	ba
+4	AAAAAAAACAATCACGTTAATAGG	ab
+1	AAAAAAAACAGAATCGTGGTTTGT	ba
+1	AAAAAAAACCAAATCGTTGATATG	ba
+9	AAAAAAAACCAAGTCCAGGCATCT	ba
+2	AAAAAAAACCACGGTGAGACAAGG	ba
+1	AAAAAAAACCGCCCAACTGCCGGT	ab
+5	AAAAAAAACCTCTCAACCCCAAAT	ba
+7	AAAAAAAACCTCTTGCGATGTTGT	ab
+1	AAAAAAAACCTCTTGCGCTGTTGT	ab
+1	AAAAAAAACCTCTTGTGATGTTGT	ab
+12	AAAAAAAACCTGAGCAATGGTTCC	ab
+3	AAAAAAAACCTTGACCCTCACATG	ba
+6	AAAAAAAACCTTGCACTCGTCCTA	ba
+9	AAAAAAAACGAAATAAAAAAACCT	ba
+1	AAAAAAAACGACCGGCCTTAGACA	ba
+4	AAAAAAAACGCCACCACCCCCTTT	ab
+12	AAAAAAAACGCCACGGGCACTATT	ba
+13	AAAAAAAACGTATCAGTAGATCCT	ab
+1	AAAAAAAACTAGTAGGATTTCATG	ba
+3	AAAAAAAACTATAGAAAATCCATT	ba
+1	AAAAAAAACTATTCTATTTCCGAT	ba
+13	AAAAAAAACTGATCTGCTTGGCGG	ba
+8	AAAAAAAACTTGCGAATAGCATCG	ba
+4	AAAAAAAACTTGTTATCAAAACGT	ab
+1	AAAAAAAAGAAAAGTTCAACACGC	ba
+1	AAAAAAAAGAAGTTCGCCCTCCGA	ab
+13	AAAAAAAAGAGAGTTTAGTCATGG	ab
+1	AAAAAAAAGAGAGTTTAGTCATGG	ba
+1	AAAAAAAAGAGAGTTTAGTCCTGG	ab
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fsd_data3.tab	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,112 @@
+1	AAAAAAAAAAAAAACCAAAACTTC	ba
+1	AAAAAAAAAAAAACCAGGCGTCGA	ba
+1	AAAAAAAAAAAAAGCTCCACGTTG	ba
+1	AAAAAAAAAAAAATCGTGGTTTGT	ba
+1	AAAAAAAAAAAAATTCACCCTTGT	ba
+7	AAAAAAAAAAAACACACTTAACTT	ba
+1	AAAAAAAAAAAACAGTGTTGAGAC	ba
+4	AAAAAAAAAAAACCGCTCCTCACA	ba
+1	AAAAAAAAAAAAGGCAACACAGAA	ab
+2	AAAAAAAAAAAATCTTTCTTTGAG	ab
+1	AAAAAAAAAAAATTGGGTTCCTTA	ab
+1	AAAAAAAAAAAGAGTCGCACCCAG	ba
+21	AAAAAAAAAAAGATCGTGGTTTGT	ba
+1	AAAAAAAAAAAGCGCAACACAGAA	ab
+3	AAAAAAAAAAAGGGCAACACAGAA	ab
+1	AAAAAAAAAAAGTAGCCCTAAACG	ab
+1	AAAAAAAAAAAGTCTTTCTTTGAG	ab
+1	AAAAAAAAAAATATCATAGACTCT	ab
+6	AAAAAAAAAAATATTCACCCTTGT	ba
+1	AAAAAAAAAAATATTCGAAAGTTA	ba
+3	AAAAAAAAAAATCACACTTAACTT	ba
+1	AAAAAAAAAAATCCGCTCCTCACA	ba
+1	AAAAAAAAAAATTAACTAAACTTA	ab
+1	AAAAAAAAAACAAATTCTATTATT	ab
+1	AAAAAAAAAACTCCCAGATTTTTT	ab
+1	AAAAAAAAAACTTCTGCTTGGCGG	ba
+11	AAAAAAAAAAGAATCGTGGTTTGT	ba
+5	AAAAAAAAAAGATAGCCCTAAACG	ab
+1	AAAAAAAAAAGCAATAATGCCAGT	ab
+2	AAAAAAAAAAGTACCGCACTCTCA	ba
+1	AAAAAAAAAAGTTCTTTCTTTGAG	ab
+1	AAAAAAAAAATAACTTCAATAATG	ba
+2	AAAAAAAAAATAATCATAGACTCT	ab
+1	AAAAAAAAAATAGTCTCACATTTA	ab
+1	AAAAAAAAAATATAACCTTTGGCG	ab
+3	AAAAAAAAACAAAATTCTATTATT	ab
+1	AAAAAAAAACAAGTACGCGGCATT	ab
+1	AAAAAAAAACAAGTACGCGGTATT	ab
+1	AAAAAAAAACAATATCGAATTAAC	ab
+3	AAAAAAAAACACGGTGAGACAAGG	ba
+1	AAAAAAAAACACGTTTCTCCCCTT	ba
+1	AAAAAAAAACATATCGTCCCGAGC	ba
+1	AAAAAAAAACCTACCTGAGGCCCC	ab
+3	AAAAAAAAACCTTATTACAGCGGA	ab
+1	AAAAAAAAACGATTCTCTGTATCT	ba
+1	AAAAAAAAACGTACCGCACTCTCA	ba
+4	AAAAAAAAACTACCCAGATTTTTT	ba
+1	AAAAAAAAACTAGATGAGACGACC	ba
+4	AAAAAAAAACTGTCTGCTTGGCGG	ba
+1	AAAAAAAAAGAAGTTTAATTTTAA	ab
+1	AAAAAAAAAGAATGCCTAAGACGA	ba
+6	AAAAAAAAAGACCGGCCTTAGACA	ba
+1	AAAAAAAAAGATATCGTGGTTTGT	ba
+1	AAAAAAAAAGCAATACTCAAGCTG	ba
+6	AAAAAAAAAGCAATGTCTAAGCCT	ba
+1	AAAAAAAAAGCACTGTCTAAGCCT	ab
+2	AAAAAAAAAGCTAATAATGCCAGT	ab
+1	AAAAAAAAAGTTTCGTGAAGGTCC	ba
+1	AAAAAAAAATAAAGGTCCGAATCT	ab
+1	AAAAAAAAATAAATGAGAGTGTAA	ba
+8	AAAAAAAAATAAGTCTCACATTTA	ab
+1	AAAAAAAAATAATAACCTCTGGCG	ab
+10	AAAAAAAAATAATAACCTTTGGCG	ab
+1	AAAAAAAAATAATCCCCTTTGTCG	ab
+6	AAAAAAAAATACGCAAACGCTGAG	ab
+4	AAAAAAAAATAGATCATAGACTCT	ab
+10	AAAAAAAAATAGATCATAGACTCT	ba
+10	AAAAAAAAATAGTAGGATTTCATG	ba
+7	AAAAAAAAATATGAATACCCTCGT	ba
+1	AAAAAAAAATATGCCACTTGATCC	ba
+1	AAAAAAAAATATTCTGCCACTTGA	ba
+3	AAAAAAAAATCAAACCAAGAGGAC	ba
+1	AAAAAAAAATCAGTACCCCTAAAC	ab
+12	AAAAAAAAATCCTAGTTAATGAAG	ba
+1	AAAAAAAAATCGATTCTTTATGCG	ab
+1	AAAAAAAAATGTCTGAAAATATCT	ab
+4	AAAAAAAAATGTCTGAAAATATCT	ba
+1	AAAAAAAAATTTCCGCAGACCGTT	ba
+8	AAAAAAAAATTTGGGCTACTACAA	ba
+1	AAAAAAAACAAAATTAGAACCCTT	ab
+1	AAAAAAAACAAACCGCTCCTCACA	ba
+5	AAAAAAAACAACGTACGCGGTATT	ab
+4	AAAAAAAACAATATCGTTGATATG	ba
+4	AAAAAAAACAATCACGTTAATAGG	ab
+1	AAAAAAAACAGAATCGTGGTTTGT	ba
+1	AAAAAAAACCAAATCGTTGATATG	ba
+9	AAAAAAAACCAAGTCCAGGCATCT	ba
+2	AAAAAAAACCACGGTGAGACAAGG	ba
+1	AAAAAAAACCGCCCAACTGCCGGT	ab
+5	AAAAAAAACCTCTCAACCCCAAAT	ba
+7	AAAAAAAACCTCTTGCGATGTTGT	ab
+1	AAAAAAAACCTCTTGCGCTGTTGT	ab
+1	AAAAAAAACCTCTTGTGATGTTGT	ab
+12	AAAAAAAACCTGAGCAATGGTTCC	ab
+3	AAAAAAAACCTTGACCCTCACATG	ba
+6	AAAAAAAACCTTGCACTCGTCCTA	ba
+9	AAAAAAAACGAAATAAAAAAACCT	ba
+1	AAAAAAAACGACCGGCCTTAGACA	ba
+4	AAAAAAAACGCCACCACCCCCTTT	ab
+12	AAAAAAAACGCCACGGGCACTATT	ba
+13	AAAAAAAACGTATCAGTAGATCCT	ab
+1	AAAAAAAACTAGTAGGATTTCATG	ba
+3	AAAAAAAACTATAGAAAATCCATT	ba
+1	AAAAAAAACTATTCTATTTCCGAT	ba
+13	AAAAAAAACTGATCTGCTTGGCGG	ba
+8	AAAAAAAACTTGCGAATAGCATCG	ba
+4	AAAAAAAACTTGTTATCAAAACGT	ab
+1	AAAAAAAAGAAAAGTTCAACACGC	ba
+1	AAAAAAAAGAAGTTCGCCCTCCGA	ab
+13	AAAAAAAAGAGAGTTTAGTCATGG	ab
+1	AAAAAAAAGAGAGTTTAGTCATGG	ba
+1	AAAAAAAAGAGAGTTTAGTCCTGG	ab
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fsd_data4.tab	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,112 @@
+1	AAAAAAAAAAAAAACCAAAACTTC	ba
+1	AAAAAAAAAAAAACCAGGCGTCGA	ba
+1	AAAAAAAAAAAAAGCTCCACGTTG	ba
+1	AAAAAAAAAAAAATCGTGGTTTGT	ba
+1	AAAAAAAAAAAAATTCACCCTTGT	ba
+7	AAAAAAAAAAAACACACTTAACTT	ba
+1	AAAAAAAAAAAACAGTGTTGAGAC	ba
+4	AAAAAAAAAAAACCGCTCCTCACA	ba
+1	AAAAAAAAAAAAGGCAACACAGAA	ab
+2	AAAAAAAAAAAATCTTTCTTTGAG	ab
+1	AAAAAAAAAAAATTGGGTTCCTTA	ab
+1	AAAAAAAAAAAGAGTCGCACCCAG	ba
+21	AAAAAAAAAAAGATCGTGGTTTGT	ba
+1	AAAAAAAAAAAGCGCAACACAGAA	ab
+3	AAAAAAAAAAAGGGCAACACAGAA	ab
+1	AAAAAAAAAAAGTAGCCCTAAACG	ab
+1	AAAAAAAAAAAGTCTTTCTTTGAG	ab
+1	AAAAAAAAAAATATCATAGACTCT	ab
+6	AAAAAAAAAAATATTCACCCTTGT	ba
+1	AAAAAAAAAAATATTCGAAAGTTA	ba
+3	AAAAAAAAAAATCACACTTAACTT	ba
+1	AAAAAAAAAAATCCGCTCCTCACA	ba
+1	AAAAAAAAAAATTAACTAAACTTA	ab
+1	AAAAAAAAAACAAATTCTATTATT	ab
+1	AAAAAAAAAACTCCCAGATTTTTT	ab
+1	AAAAAAAAAACTTCTGCTTGGCGG	ba
+11	AAAAAAAAAAGAATCGTGGTTTGT	ba
+5	AAAAAAAAAAGATAGCCCTAAACG	ab
+1	AAAAAAAAAAGCAATAATGCCAGT	ab
+2	AAAAAAAAAAGTACCGCACTCTCA	ba
+1	AAAAAAAAAAGTTCTTTCTTTGAG	ab
+1	AAAAAAAAAATAACTTCAATAATG	ba
+2	AAAAAAAAAATAATCATAGACTCT	ab
+1	AAAAAAAAAATAGTCTCACATTTA	ab
+1	AAAAAAAAAATATAACCTTTGGCG	ab
+3	AAAAAAAAACAAAATTCTATTATT	ab
+1	AAAAAAAAACAAGTACGCGGCATT	ab
+1	AAAAAAAAACAAGTACGCGGTATT	ab
+1	AAAAAAAAACAATATCGAATTAAC	ab
+3	AAAAAAAAACACGGTGAGACAAGG	ba
+1	AAAAAAAAACACGTTTCTCCCCTT	ba
+1	AAAAAAAAACATATCGTCCCGAGC	ba
+1	AAAAAAAAACCTACCTGAGGCCCC	ab
+3	AAAAAAAAACCTTATTACAGCGGA	ab
+1	AAAAAAAAACGATTCTCTGTATCT	ba
+1	AAAAAAAAACGTACCGCACTCTCA	ba
+4	AAAAAAAAACTACCCAGATTTTTT	ba
+1	AAAAAAAAACTAGATGAGACGACC	ba
+4	AAAAAAAAACTGTCTGCTTGGCGG	ba
+1	AAAAAAAAAGAAGTTTAATTTTAA	ab
+1	AAAAAAAAAGAATGCCTAAGACGA	ba
+6	AAAAAAAAAGACCGGCCTTAGACA	ba
+1	AAAAAAAAAGATATCGTGGTTTGT	ba
+1	AAAAAAAAAGCAATACTCAAGCTG	ba
+6	AAAAAAAAAGCAATGTCTAAGCCT	ba
+1	AAAAAAAAAGCACTGTCTAAGCCT	ab
+2	AAAAAAAAAGCTAATAATGCCAGT	ab
+1	AAAAAAAAAGTTTCGTGAAGGTCC	ba
+1	AAAAAAAAATAAAGGTCCGAATCT	ab
+1	AAAAAAAAATAAATGAGAGTGTAA	ba
+8	AAAAAAAAATAAGTCTCACATTTA	ab
+1	AAAAAAAAATAATAACCTCTGGCG	ab
+10	AAAAAAAAATAATAACCTTTGGCG	ab
+1	AAAAAAAAATAATCCCCTTTGTCG	ab
+6	AAAAAAAAATACGCAAACGCTGAG	ab
+4	AAAAAAAAATAGATCATAGACTCT	ab
+10	AAAAAAAAATAGATCATAGACTCT	ba
+10	AAAAAAAAATAGTAGGATTTCATG	ba
+7	AAAAAAAAATATGAATACCCTCGT	ba
+1	AAAAAAAAATATGCCACTTGATCC	ba
+1	AAAAAAAAATATTCTGCCACTTGA	ba
+3	AAAAAAAAATCAAACCAAGAGGAC	ba
+1	AAAAAAAAATCAGTACCCCTAAAC	ab
+12	AAAAAAAAATCCTAGTTAATGAAG	ba
+1	AAAAAAAAATCGATTCTTTATGCG	ab
+1	AAAAAAAAATGTCTGAAAATATCT	ab
+4	AAAAAAAAATGTCTGAAAATATCT	ba
+1	AAAAAAAAATTTCCGCAGACCGTT	ba
+8	AAAAAAAAATTTGGGCTACTACAA	ba
+1	AAAAAAAACAAAATTAGAACCCTT	ab
+1	AAAAAAAACAAACCGCTCCTCACA	ba
+5	AAAAAAAACAACGTACGCGGTATT	ab
+4	AAAAAAAACAATATCGTTGATATG	ba
+4	AAAAAAAACAATCACGTTAATAGG	ab
+1	AAAAAAAACAGAATCGTGGTTTGT	ba
+1	AAAAAAAACCAAATCGTTGATATG	ba
+9	AAAAAAAACCAAGTCCAGGCATCT	ba
+2	AAAAAAAACCACGGTGAGACAAGG	ba
+1	AAAAAAAACCGCCCAACTGCCGGT	ab
+5	AAAAAAAACCTCTCAACCCCAAAT	ba
+7	AAAAAAAACCTCTTGCGATGTTGT	ab
+1	AAAAAAAACCTCTTGCGCTGTTGT	ab
+1	AAAAAAAACCTCTTGTGATGTTGT	ab
+12	AAAAAAAACCTGAGCAATGGTTCC	ab
+3	AAAAAAAACCTTGACCCTCACATG	ba
+6	AAAAAAAACCTTGCACTCGTCCTA	ba
+9	AAAAAAAACGAAATAAAAAAACCT	ba
+1	AAAAAAAACGACCGGCCTTAGACA	ba
+4	AAAAAAAACGCCACCACCCCCTTT	ab
+12	AAAAAAAACGCCACGGGCACTATT	ba
+13	AAAAAAAACGTATCAGTAGATCCT	ab
+1	AAAAAAAACTAGTAGGATTTCATG	ba
+3	AAAAAAAACTATAGAAAATCCATT	ba
+1	AAAAAAAACTATTCTATTTCCGAT	ba
+13	AAAAAAAACTGATCTGCTTGGCGG	ba
+8	AAAAAAAACTTGCGAATAGCATCG	ba
+4	AAAAAAAACTTGTTATCAAAACGT	ab
+1	AAAAAAAAGAAAAGTTCAACACGC	ba
+1	AAAAAAAAGAAGTTCGCCCTCCGA	ab
+13	AAAAAAAAGAGAGTTTAGTCATGG	ab
+1	AAAAAAAAGAGAGTTTAGTCATGG	ba
+1	AAAAAAAAGAGAGTTTAGTCCTGG	ab
\ No newline at end of file
Binary file test-data/fsd_output1.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fsd_output1.tab	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,367 @@
+Values from family size distribution with all datasets based on families
+
+Family size	fsd_data1.tab	fsd_data2.tab	fsd_data3.tab	fsd_data4.tab
+FS=1	63	63	63	63	
+FS=2	5	5	5	5	
+FS=3	8	8	8	8	
+FS=4	9	9	9	9	
+FS=5	3	3	3	3	
+FS=6	5	5	5	5	
+FS=7	3	3	3	3	
+FS=8	3	3	3	3	
+FS=9	2	2	2	2	
+FS=10	3	3	3	3	
+FS=11	1	1	1	1	
+FS=12	3	3	3	3	
+FS=13	3	3	3	3	
+FS=14	0	0	0	0	
+FS=15	0	0	0	0	
+FS=16	0	0	0	0	
+FS=17	0	0	0	0	
+FS=18	0	0	0	0	
+FS=19	0	0	0	0	
+FS=20	0	0	0	0	
+FS>20	1	1	1	1	
+sum	112	112	112	112	
+
+Values from family size distribution with all datasets based on PE reads
+
+Family size	fsd_data1.tab	fsd_data2.tab	fsd_data3.tab	fsd_data4.tab
+FS=1	63	63	63	63	
+FS=2	10	10	10	10	
+FS=3	24	24	24	24	
+FS=4	36	36	36	36	
+FS=5	15	15	15	15	
+FS=6	30	30	30	30	
+FS=7	21	21	21	21	
+FS=8	24	24	24	24	
+FS=9	18	18	18	18	
+FS=10	30	30	30	30	
+FS=11	11	11	11	11	
+FS=12	36	36	36	36	
+FS=13	39	39	39	39	
+FS=14	0	0	0	0	
+FS=15	0	0	0	0	
+FS=16	0	0	0	0	
+FS=17	0	0	0	0	
+FS=18	0	0	0	0	
+FS=19	0	0	0	0	
+FS=20	0	0	0	0	
+FS>20	21	21	21	21	
+sum	378	378	378	378	
+
+Dataset:	fsd_data1.tab
+max. family size:	21
+absolute frequency:	1
+relative frequency:	0.009
+
+median family size:	1.0
+mean family size:	3.375
+
+	singletons:			family size > 20:				length of dataset:
+	nr. of tags	rel. freq of tags	rel.freq of PE reads	nr. of tags	rel. freq of tags	nr. of PE reads	rel. freq of PE reads	total nr. of tags	total nr. of PE reads
+fsd_data1.tab	63	0.562	0.167	1	0.009	21	0.056	112	378
+
+The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)
+Whereas the total frequencies were calculated from the whole dataset (=including the DCS).
+
+FS >= 1	nr. of tags	nr. of PE reads	rel. freq of tags		rel. freq of PE reads:
+			unique:	total	unique	total:
+SSCS ab	47	123	0.431	0.420	0.339	0.325
+SSCS ba	59	222	0.541	0.527	0.612	0.587
+DCS (total)	3 (6)	18 (33)	0.028	0.027 (0.054)	0.050	0.048 (0.087)
+total nr. of tags	109	363	109	112	363	378
+
+FS >= 3	nr. of tags	nr. of PE reads	rel. freq of tags		rel. freq of PE reads:
+			unique:	total	unique	total:
+SSCS ab	14	87	0.341	0.259	0.313	0.224
+SSCS ba	26	187	0.491	0.481	0.495	0.482
+DCS (total)	1 (2)	4 (14)	0.024	0.024 (0.048)	0.014	0.014 (0.049)
+total nr. of tags	41	278	41	42	278	288
+
+Values from family size distribution based on families
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	3	2	5
+FS=3	0	3	5	8
+FS=4	2	3	4	9
+FS=5	0	2	1	3
+FS=6	0	1	4	5
+FS=7	0	1	2	3
+FS=8	0	1	2	3
+FS=9	0	0	2	2
+FS=10	1	1	1	3
+FS=11	0	0	1	1
+FS=12	0	1	2	3
+FS=13	1	1	1	3
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	1	1
+sum	6	47	59	112
+
+Values from family size distribution based on PE reads
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	6	4	10
+FS=3	0	9	15	24
+FS=4	8	12	16	36
+FS=5	0	10	5	15
+FS=6	0	6	24	30
+FS=7	0	7	14	21
+FS=8	0	8	16	24
+FS=9	0	0	18	18
+FS=10	10	10	10	30
+FS=11	0	0	11	11
+FS=12	0	12	24	36
+FS=13	13	13	13	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	21	21
+sum	33	123	222	378
+
+Dataset:	fsd_data2.tab
+max. family size:	21
+absolute frequency:	1
+relative frequency:	0.009
+
+median family size:	1.0
+mean family size:	3.375
+
+	singletons:			family size > 20:				length of dataset:
+	nr. of tags	rel. freq of tags	rel.freq of PE reads	nr. of tags	rel. freq of tags	nr. of PE reads	rel. freq of PE reads	total nr. of tags	total nr. of PE reads
+fsd_data2.tab	63	0.562	0.167	1	0.009	21	0.056	112	378
+
+The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)
+Whereas the total frequencies were calculated from the whole dataset (=including the DCS).
+
+FS >= 1	nr. of tags	nr. of PE reads	rel. freq of tags		rel. freq of PE reads:
+			unique:	total	unique	total:
+SSCS ab	47	123	0.431	0.420	0.339	0.325
+SSCS ba	59	222	0.541	0.527	0.612	0.587
+DCS (total)	3 (6)	18 (33)	0.028	0.027 (0.054)	0.050	0.048 (0.087)
+total nr. of tags	109	363	109	112	363	378
+
+FS >= 3	nr. of tags	nr. of PE reads	rel. freq of tags		rel. freq of PE reads:
+			unique:	total	unique	total:
+SSCS ab	14	87	0.341	0.259	0.313	0.224
+SSCS ba	26	187	0.491	0.481	0.495	0.482
+DCS (total)	1 (2)	4 (14)	0.024	0.024 (0.048)	0.014	0.014 (0.049)
+total nr. of tags	41	278	41	42	278	288
+
+Values from family size distribution based on families
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	3	2	5
+FS=3	0	3	5	8
+FS=4	2	3	4	9
+FS=5	0	2	1	3
+FS=6	0	1	4	5
+FS=7	0	1	2	3
+FS=8	0	1	2	3
+FS=9	0	0	2	2
+FS=10	1	1	1	3
+FS=11	0	0	1	1
+FS=12	0	1	2	3
+FS=13	1	1	1	3
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	1	1
+sum	6	47	59	112
+
+Values from family size distribution based on PE reads
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	6	4	10
+FS=3	0	9	15	24
+FS=4	8	12	16	36
+FS=5	0	10	5	15
+FS=6	0	6	24	30
+FS=7	0	7	14	21
+FS=8	0	8	16	24
+FS=9	0	0	18	18
+FS=10	10	10	10	30
+FS=11	0	0	11	11
+FS=12	0	12	24	36
+FS=13	13	13	13	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	21	21
+sum	33	123	222	378
+
+Dataset:	fsd_data3.tab
+max. family size:	21
+absolute frequency:	1
+relative frequency:	0.009
+
+median family size:	1.0
+mean family size:	3.375
+
+	singletons:			family size > 20:				length of dataset:
+	nr. of tags	rel. freq of tags	rel.freq of PE reads	nr. of tags	rel. freq of tags	nr. of PE reads	rel. freq of PE reads	total nr. of tags	total nr. of PE reads
+fsd_data3.tab	63	0.562	0.167	1	0.009	21	0.056	112	378
+
+The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)
+Whereas the total frequencies were calculated from the whole dataset (=including the DCS).
+
+FS >= 1	nr. of tags	nr. of PE reads	rel. freq of tags		rel. freq of PE reads:
+			unique:	total	unique	total:
+SSCS ab	47	123	0.431	0.420	0.339	0.325
+SSCS ba	59	222	0.541	0.527	0.612	0.587
+DCS (total)	3 (6)	18 (33)	0.028	0.027 (0.054)	0.050	0.048 (0.087)
+total nr. of tags	109	363	109	112	363	378
+
+FS >= 3	nr. of tags	nr. of PE reads	rel. freq of tags		rel. freq of PE reads:
+			unique:	total	unique	total:
+SSCS ab	14	87	0.341	0.259	0.313	0.224
+SSCS ba	26	187	0.491	0.481	0.495	0.482
+DCS (total)	1 (2)	4 (14)	0.024	0.024 (0.048)	0.014	0.014 (0.049)
+total nr. of tags	41	278	41	42	278	288
+
+Values from family size distribution based on families
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	3	2	5
+FS=3	0	3	5	8
+FS=4	2	3	4	9
+FS=5	0	2	1	3
+FS=6	0	1	4	5
+FS=7	0	1	2	3
+FS=8	0	1	2	3
+FS=9	0	0	2	2
+FS=10	1	1	1	3
+FS=11	0	0	1	1
+FS=12	0	1	2	3
+FS=13	1	1	1	3
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	1	1
+sum	6	47	59	112
+
+Values from family size distribution based on PE reads
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	6	4	10
+FS=3	0	9	15	24
+FS=4	8	12	16	36
+FS=5	0	10	5	15
+FS=6	0	6	24	30
+FS=7	0	7	14	21
+FS=8	0	8	16	24
+FS=9	0	0	18	18
+FS=10	10	10	10	30
+FS=11	0	0	11	11
+FS=12	0	12	24	36
+FS=13	13	13	13	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	21	21
+sum	33	123	222	378
+
+Dataset:	fsd_data4.tab
+max. family size:	21
+absolute frequency:	1
+relative frequency:	0.009
+
+median family size:	1.0
+mean family size:	3.375
+
+	singletons:			family size > 20:				length of dataset:
+	nr. of tags	rel. freq of tags	rel.freq of PE reads	nr. of tags	rel. freq of tags	nr. of PE reads	rel. freq of PE reads	total nr. of tags	total nr. of PE reads
+fsd_data4.tab	63	0.562	0.167	1	0.009	21	0.056	112	378
+
+The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)
+Whereas the total frequencies were calculated from the whole dataset (=including the DCS).
+
+FS >= 1	nr. of tags	nr. of PE reads	rel. freq of tags		rel. freq of PE reads:
+			unique:	total	unique	total:
+SSCS ab	47	123	0.431	0.420	0.339	0.325
+SSCS ba	59	222	0.541	0.527	0.612	0.587
+DCS (total)	3 (6)	18 (33)	0.028	0.027 (0.054)	0.050	0.048 (0.087)
+total nr. of tags	109	363	109	112	363	378
+
+FS >= 3	nr. of tags	nr. of PE reads	rel. freq of tags		rel. freq of PE reads:
+			unique:	total	unique	total:
+SSCS ab	14	87	0.341	0.259	0.313	0.224
+SSCS ba	26	187	0.491	0.481	0.495	0.482
+DCS (total)	1 (2)	4 (14)	0.024	0.024 (0.048)	0.014	0.014 (0.049)
+total nr. of tags	41	278	41	42	278	288
+
+Values from family size distribution based on families
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	3	2	5
+FS=3	0	3	5	8
+FS=4	2	3	4	9
+FS=5	0	2	1	3
+FS=6	0	1	4	5
+FS=7	0	1	2	3
+FS=8	0	1	2	3
+FS=9	0	0	2	2
+FS=10	1	1	1	3
+FS=11	0	0	1	1
+FS=12	0	1	2	3
+FS=13	1	1	1	3
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	1	1
+sum	6	47	59	112
+
+Values from family size distribution based on PE reads
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	6	4	10
+FS=3	0	9	15	24
+FS=4	8	12	16	36
+FS=5	0	10	5	15
+FS=6	0	6	24	30
+FS=7	0	7	14	21
+FS=8	0	8	16	24
+FS=9	0	0	18	18
+FS=10	10	10	10	30
+FS=11	0	0	11	11
+FS=12	0	12	24	36
+FS=13	13	13	13	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	21	21
+sum	33	123	222	378
Binary file test-data/fsd_output2.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fsd_output2.tab	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,288 @@
+Values from family size distribution with all datasets based on families
+
+Family size	fsd_data1.tab	fsd_data2.tab	fsd_data3.tab
+FS=1	63	63	63	
+FS=2	5	5	5	
+FS=3	8	8	8	
+FS=4	9	9	9	
+FS=5	3	3	3	
+FS=6	5	5	5	
+FS=7	3	3	3	
+FS=8	3	3	3	
+FS=9	2	2	2	
+FS=10	3	3	3	
+FS=11	1	1	1	
+FS=12	3	3	3	
+FS=13	3	3	3	
+FS=14	0	0	0	
+FS=15	0	0	0	
+FS=16	0	0	0	
+FS=17	0	0	0	
+FS=18	0	0	0	
+FS=19	0	0	0	
+FS=20	0	0	0	
+FS>20	1	1	1	
+sum	112	112	112	
+
+Values from family size distribution with all datasets based on PE reads
+
+Family size	fsd_data1.tab	fsd_data2.tab	fsd_data3.tab
+FS=1	63	63	63	
+FS=2	10	10	10	
+FS=3	24	24	24	
+FS=4	36	36	36	
+FS=5	15	15	15	
+FS=6	30	30	30	
+FS=7	21	21	21	
+FS=8	24	24	24	
+FS=9	18	18	18	
+FS=10	30	30	30	
+FS=11	11	11	11	
+FS=12	36	36	36	
+FS=13	39	39	39	
+FS=14	0	0	0	
+FS=15	0	0	0	
+FS=16	0	0	0	
+FS=17	0	0	0	
+FS=18	0	0	0	
+FS=19	0	0	0	
+FS=20	0	0	0	
+FS>20	21	21	21	
+sum	378	378	378	
+
+Dataset:	fsd_data1.tab
+max. family size:	21
+absolute frequency:	1
+relative frequency:	0.009
+
+median family size:	1.0
+mean family size:	3.375
+
+	singletons:			family size > 20:				length of dataset:
+	nr. of tags	rel. freq of tags	rel.freq of PE reads	nr. of tags	rel. freq of tags	nr. of PE reads	rel. freq of PE reads	total nr. of tags	total nr. of PE reads
+fsd_data1.tab	63	0.562	0.167	1	0.009	21	0.056	112	378
+
+The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)
+Whereas the total frequencies were calculated from the whole dataset (=including the DCS).
+
+FS >= 1	nr. of tags	nr. of PE reads	rel. freq of tags		rel. freq of PE reads:
+			unique:	total	unique	total:
+SSCS ab	47	123	0.431	0.420	0.339	0.325
+SSCS ba	59	222	0.541	0.527	0.612	0.587
+DCS (total)	3 (6)	18 (33)	0.028	0.027 (0.054)	0.050	0.048 (0.087)
+total nr. of tags	109	363	109	112	363	378
+
+FS >= 3	nr. of tags	nr. of PE reads	rel. freq of tags		rel. freq of PE reads:
+			unique:	total	unique	total:
+SSCS ab	14	87	0.341	0.259	0.313	0.224
+SSCS ba	26	187	0.491	0.481	0.495	0.482
+DCS (total)	1 (2)	4 (14)	0.024	0.024 (0.048)	0.014	0.014 (0.049)
+total nr. of tags	41	278	41	42	278	288
+
+Values from family size distribution based on families
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	3	2	5
+FS=3	0	3	5	8
+FS=4	2	3	4	9
+FS=5	0	2	1	3
+FS=6	0	1	4	5
+FS=7	0	1	2	3
+FS=8	0	1	2	3
+FS=9	0	0	2	2
+FS=10	1	1	1	3
+FS=11	0	0	1	1
+FS=12	0	1	2	3
+FS=13	1	1	1	3
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	1	1
+sum	6	47	59	112
+
+Values from family size distribution based on PE reads
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	6	4	10
+FS=3	0	9	15	24
+FS=4	8	12	16	36
+FS=5	0	10	5	15
+FS=6	0	6	24	30
+FS=7	0	7	14	21
+FS=8	0	8	16	24
+FS=9	0	0	18	18
+FS=10	10	10	10	30
+FS=11	0	0	11	11
+FS=12	0	12	24	36
+FS=13	13	13	13	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	21	21
+sum	33	123	222	378
+
+Dataset:	fsd_data2.tab
+max. family size:	21
+absolute frequency:	1
+relative frequency:	0.009
+
+median family size:	1.0
+mean family size:	3.375
+
+	singletons:			family size > 20:				length of dataset:
+	nr. of tags	rel. freq of tags	rel.freq of PE reads	nr. of tags	rel. freq of tags	nr. of PE reads	rel. freq of PE reads	total nr. of tags	total nr. of PE reads
+fsd_data2.tab	63	0.562	0.167	1	0.009	21	0.056	112	378
+
+The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)
+Whereas the total frequencies were calculated from the whole dataset (=including the DCS).
+
+FS >= 1	nr. of tags	nr. of PE reads	rel. freq of tags		rel. freq of PE reads:
+			unique:	total	unique	total:
+SSCS ab	47	123	0.431	0.420	0.339	0.325
+SSCS ba	59	222	0.541	0.527	0.612	0.587
+DCS (total)	3 (6)	18 (33)	0.028	0.027 (0.054)	0.050	0.048 (0.087)
+total nr. of tags	109	363	109	112	363	378
+
+FS >= 3	nr. of tags	nr. of PE reads	rel. freq of tags		rel. freq of PE reads:
+			unique:	total	unique	total:
+SSCS ab	14	87	0.341	0.259	0.313	0.224
+SSCS ba	26	187	0.491	0.481	0.495	0.482
+DCS (total)	1 (2)	4 (14)	0.024	0.024 (0.048)	0.014	0.014 (0.049)
+total nr. of tags	41	278	41	42	278	288
+
+Values from family size distribution based on families
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	3	2	5
+FS=3	0	3	5	8
+FS=4	2	3	4	9
+FS=5	0	2	1	3
+FS=6	0	1	4	5
+FS=7	0	1	2	3
+FS=8	0	1	2	3
+FS=9	0	0	2	2
+FS=10	1	1	1	3
+FS=11	0	0	1	1
+FS=12	0	1	2	3
+FS=13	1	1	1	3
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	1	1
+sum	6	47	59	112
+
+Values from family size distribution based on PE reads
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	6	4	10
+FS=3	0	9	15	24
+FS=4	8	12	16	36
+FS=5	0	10	5	15
+FS=6	0	6	24	30
+FS=7	0	7	14	21
+FS=8	0	8	16	24
+FS=9	0	0	18	18
+FS=10	10	10	10	30
+FS=11	0	0	11	11
+FS=12	0	12	24	36
+FS=13	13	13	13	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	21	21
+sum	33	123	222	378
+
+Dataset:	fsd_data3.tab
+max. family size:	21
+absolute frequency:	1
+relative frequency:	0.009
+
+median family size:	1.0
+mean family size:	3.375
+
+	singletons:			family size > 20:				length of dataset:
+	nr. of tags	rel. freq of tags	rel.freq of PE reads	nr. of tags	rel. freq of tags	nr. of PE reads	rel. freq of PE reads	total nr. of tags	total nr. of PE reads
+fsd_data3.tab	63	0.562	0.167	1	0.009	21	0.056	112	378
+
+The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)
+Whereas the total frequencies were calculated from the whole dataset (=including the DCS).
+
+FS >= 1	nr. of tags	nr. of PE reads	rel. freq of tags		rel. freq of PE reads:
+			unique:	total	unique	total:
+SSCS ab	47	123	0.431	0.420	0.339	0.325
+SSCS ba	59	222	0.541	0.527	0.612	0.587
+DCS (total)	3 (6)	18 (33)	0.028	0.027 (0.054)	0.050	0.048 (0.087)
+total nr. of tags	109	363	109	112	363	378
+
+FS >= 3	nr. of tags	nr. of PE reads	rel. freq of tags		rel. freq of PE reads:
+			unique:	total	unique	total:
+SSCS ab	14	87	0.341	0.259	0.313	0.224
+SSCS ba	26	187	0.491	0.481	0.495	0.482
+DCS (total)	1 (2)	4 (14)	0.024	0.024 (0.048)	0.014	0.014 (0.049)
+total nr. of tags	41	278	41	42	278	288
+
+Values from family size distribution based on families
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	3	2	5
+FS=3	0	3	5	8
+FS=4	2	3	4	9
+FS=5	0	2	1	3
+FS=6	0	1	4	5
+FS=7	0	1	2	3
+FS=8	0	1	2	3
+FS=9	0	0	2	2
+FS=10	1	1	1	3
+FS=11	0	0	1	1
+FS=12	0	1	2	3
+FS=13	1	1	1	3
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	1	1
+sum	6	47	59	112
+
+Values from family size distribution based on PE reads
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	6	4	10
+FS=3	0	9	15	24
+FS=4	8	12	16	36
+FS=5	0	10	5	15
+FS=6	0	6	24	30
+FS=7	0	7	14	21
+FS=8	0	8	16	24
+FS=9	0	0	18	18
+FS=10	10	10	10	30
+FS=11	0	0	11	11
+FS=12	0	12	24	36
+FS=13	13	13	13	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	21	21
+sum	33	123	222	378
Binary file test-data/fsd_reg.bam has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fsd_reg.tab	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,32 @@
+10	AAAAAACATCCCAATAAGAAATCA	ab
+9	AAAAAACATCCCAATAAGAAATCA	ba
+4	AAAAAAGTCCTTCGACTCAAGCGG	ab
+5	AAAAAAGTCCTTCGACTCAAGCGG	ba
+5	AAAAAATAGTTAAGCCGACACACT	ab
+7	AAAAAATAGTTAAGCCGACACACT	ba
+7	AAAAAATGTGCCGAACCTTGGCGA	ab
+10	AAAAAATGTGCCGAACCTTGGCGA	ba
+7	AAAAACAACATAGCTTGAAAATTT	ab
+4	AAAAACAACATAGCTTGAAAATTT	ba
+81	ATTCGGATAATTCGACGCAACATT	ab
+11	ATTCGGATAATTCGACGCAACATT	ba
+41	ATTCGTCGACAATACAAAGGGGCC	ab
+226	ATTCGTCGACAATACAAAGGGGCC	ba
+6	ATTGCCAGTGTGGGCTGGTTAGTA	ab
+41	ATTGCCAGTGTGGGCTGGTTAGTA	ba
+50	ATTTCGCGACCATCCGCCACTTTG	ab
+332	ATTTCGCGACCATCCGCCACTTTG	ba
+64	CAAACTTTAGCACAGTGTGTGTCC	ab
+57	CAAACTTTAGCACAGTGTGTGTCC	ba
+85	ATAAACGGCCTTCGACATTGTGAC	ab
+15	ATAAACGGCCTTCGACATTGTGAC	ba
+11	ATAAAGTCACCTGTGAATACGTTG	ab
+35	ATAAAGTCACCTGTGAATACGTTG	ba
+83	ATAAATCGAAACCGTGCCCAACAA	ab
+63	ATAAATCGAAACCGTGCCCAACAA	ba
+9	ATTTAGATATTTTCTTCTTTTTCT	ab
+7	ATTTAGATATTTTCTTCTTTTTCT	ba
+7	ATTTAGTTATCCGTCGGCGACGAA	ab
+3	ATTTAGTTATCCGTCGGCGACGAA	ba
+8	ATTTAGTTTGAATTGCCCTGCGTC	ab
+9	ATTTAGTTTGAATTGCCCTGCGTC	ba
\ No newline at end of file
Binary file test-data/fsd_reg_output.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fsd_reg_output.tab	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,38 @@
+Dataset:	fsd_reg.tab
+	AB	BA
+max. family size:	11	35
+absolute frequency:	22	2
+relative frequency:	0.333	0.167
+
+total nr. of reads	1312
+total nr. of tags	24 (12)
+
+
+Values from family size distribution
+	ACH_TDII_5regions_90_633	ACH_TDII_5regions_659_1140	
+FS=4	4	0	
+FS=5	4	0	
+FS=6	0	0	
+FS=7	6	0	
+FS=8	0	0	
+FS=9	2	0	
+FS=10	4	0	
+FS=11	0	2	
+FS=12	0	0	
+FS=13	0	0	
+FS=14	0	0	
+FS=15	0	0	
+FS=16	0	0	
+FS=17	0	0	
+FS=18	0	0	
+FS=19	0	0	
+FS=20	0	0	
+FS>20	0	2	
+sum	20	4	
+
+
+In the plot, both family sizes of the ab and ba strands were used.
+Whereas the total numbers indicate only the single count of the tags per region.
+Region	total nr. of tags per region
+ACH_TDII_5regions_90_633	10
+ACH_TDII_5regions_659_1140	2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fsd_reg_ranges.bed	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,2 @@
+ACH_TDII_5regions	90	633
+ACH_TDII_5regions	659	1140
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/td_chimeras_output.tab	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,22 @@
+chimera tag	family size, read direction	similar tag with TD=0
+AAAAAAAAAAAA AACCAAAACTTC	1 ba	*AAAAAAAAAAAA* TCTTTCTTTGAG 2 ab
+AAAAAAAAAAAA ACCAGGCGTCGA	1 ba	*AAAAAAAAAAAA* AACCAAAACTTC 1 ba, *AAAAAAAAAAAA* AGCTCCACGTTG 1 ba, *AAAAAAAAAAAA* CAGTGTTGAGAC 1 ba, *AAAAAAAAAAAA* TCTTTCTTTGAG 2 ab, *AAAAAAAAAAAA* TTGGGTTCCTTA 1 ab
+AAAAAAAAAAAA ATCGTGGTTTGT	1 ba	*AAAAAAAAAAAA* CAGTGTTGAGAC 1 ba, AAAAAAAAAAAG *ATCGTGGTTTGT* 4 ba
+AAAAAAAAAAAA ATTCACCCTTGT	1 ba	*AAAAAAAAAAAA* CAGTGTTGAGAC 1 ba, AAAAAAAAAAAT *ATTCACCCTTGT* 6 ba
+AAAAAAAAAAAA CACACTTAACTT	7 ba	*AAAAAAAAAAAA* ATTCACCCTTGT 1 ba, *AAAAAAAAAAAA* CCGCTCCTCACA 4 ba, *AAAAAAAAAAAA* TCTTTCTTTGAG 2 ab
+AAAAAAAAAAAA GGCAACACAGAA	1 ab	*AAAAAAAAAAAA* ATCGTGGTTTGT 1 ba, AAAAAAAAAAAG *GGCAACACAGAA* 3 ab
+AAAAAAAAAAAG AGTCGCACCCAG	1 ba	*AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba
+AAAAAAAAAAAG CGCAACACAGAA	1 ab	*AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba
+AAAAAAAAAAAG TAGCCCTAAACG	1 ab	*AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba
+AAAAAAAAAAAG TCTTTCTTTGAG	1 ab	AAAAAAAAAAAA *TCTTTCTTTGAG* 2 ab, *AAAAAAAAAAAG* ATCGTGGTTTGT 4 ba, *AAAAAAAAAAAG* CGCAACACAGAA 1 ab, *AAAAAAAAAAAG* GGCAACACAGAA 3 ab
+AAAAAAAAAAAT ATCATAGACTCT	1 ab	*AAAAAAAAAAAT* ATTCACCCTTGT 6 ba
+AAAAAAAAAAAT ATTCGAAAGTTA	1 ba	*AAAAAAAAAAAT* ATCATAGACTCT 1 ab, *AAAAAAAAAAAT* ATTCACCCTTGT 6 ba
+This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.
+The tags were separated by an empty space into their halves and the * marks the identical half.
+
+Statistics of nr. of tags that returned max. TD (2nd column)
+minimum	1	tag(s)
+mean	2.08333333333	tag(s)
+median	2.0	tag(s)
+maximum	5	tag(s)
+sum	25	tag(s)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/td_data.tab	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,20 @@
+1	AAAAAAAAAAAAAACCAAAACTTC	ba
+1	AAAAAAAAAAAAACCAGGCGTCGA	ba
+1	AAAAAAAAAAAAAGCTCCACGTTG	ba
+1	AAAAAAAAAAAAATCGTGGTTTGT	ba
+1	AAAAAAAAAAAAATTCACCCTTGT	ba
+7	AAAAAAAAAAAACACACTTAACTT	ba
+1	AAAAAAAAAAAACAGTGTTGAGAC	ba
+4	AAAAAAAAAAAACCGCTCCTCACA	ba
+1	AAAAAAAAAAAAGGCAACACAGAA	ab
+2	AAAAAAAAAAAATCTTTCTTTGAG	ab
+1	AAAAAAAAAAAATTGGGTTCCTTA	ab
+1	AAAAAAAAAAAGAGTCGCACCCAG	ba
+4	AAAAAAAAAAAGATCGTGGTTTGT	ba
+1	AAAAAAAAAAAGCGCAACACAGAA	ab
+3	AAAAAAAAAAAGGGCAACACAGAA	ab
+1	AAAAAAAAAAAGTAGCCCTAAACG	ab
+1	AAAAAAAAAAAGTCTTTCTTTGAG	ab
+1	AAAAAAAAAAATATCATAGACTCT	ab
+6	AAAAAAAAAAATATTCACCCTTGT	ba
+1	AAAAAAAAAAATATTCGAAAGTTA	ba
\ No newline at end of file
Binary file test-data/td_output.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/td_output.tab	Thu Oct 24 09:36:09 2019 -0400
@@ -0,0 +1,92 @@
+td_data.tab
+nr of tags	20
+sample size	20
+
+Tag distance separated by family size
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum	
+TD=1	5	1	1	1	1	0	9	
+TD=6	3	0	0	0	0	0	3	
+TD=7	4	0	0	0	1	0	5	
+TD=8	2	0	0	1	0	0	3	
+sum	14	1	1	2	2	0	20	
+
+Family size distribution separated by Tag distance
+	TD=1	TD=2	TD=3	TD=4	TD=5-8	TD>8	sum	
+FS=1	5	0	0	0	9	0	14	
+FS=2	1	0	0	0	0	0	1	
+FS=3	1	0	0	0	0	0	1	
+FS=4	1	0	0	0	1	0	2	
+FS=6	1	0	0	0	0	0	1	
+FS=7	0	0	0	0	1	0	1	
+sum	9	0	0	0	11	0	20	
+
+
+max. family size in sample:	7
+absolute frequency:	1
+relative frequency:	0.05
+
+Chimera Analysis:
+The tags are splitted into two halves (part a and b) for which the Tag distances (TD) are calculated seperately.
+The tag distance of the first half (part a) is calculated by comparing part a of the tag in the sample against all a parts in the dataset and by selecting the minimum value (TD a.min).
+In the next step, we select those tags that showed the minimum TD and estimate the TD for the second half (part b) of the tag by comparing part b against the previously selected subset.
+The maximum value represents then TD b.max. Finally, these process is repeated but starting with part b instead and TD b.min and TD a.max are calculated.
+Next, the absolute differences between TD a.min & TD b.max and TD b.min & TD a.max are estimated (delta HD).
+These are then divided by the sum of both parts (TD a.min + TD b.max or TD b.min + TD a.max, respectively) which give the relative differences between the partial HDs (rel. delta HD).
+For simplicity, we used the maximum value of the relative differences and the respective delta HD.
+Note that when only tags that can form a DCS are included in the analysis, the family sizes for both directions (ab and ba) of the strand will be included in the plots.
+
+length of one half of the tag	12
+
+Tag distance of each half in the tag
+	TD a.min	TD b.max	TD b.min	TD a.max	TD a.min + b.max, TD a.max + b.min	sum	
+TD=0	20	0	8	1	0	29	
+TD=1	0	0	1	19	8	28	
+TD=2	0	0	0	0	1	1	
+TD=5	0	0	3	0	0	3	
+TD=6	0	0	2	0	3	5	
+TD=7	0	1	6	0	4	11	
+TD=8	0	2	0	0	7	9	
+TD=9	0	1	0	0	1	2	
+TD=10	0	2	0	0	2	4	
+TD=11	0	7	0	0	7	14	
+TD=12	0	7	0	0	7	14	
+sum	20	20	20	20	40	120	
+
+Absolute delta Tag distance within the tag
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum	
+diff=7	1	0	0	0	0	0	1	
+diff=8	1	0	0	0	1	0	2	
+diff=9	1	0	0	0	0	0	1	
+diff=10	2	0	0	0	0	0	2	
+diff=11	4	0	1	1	1	0	7	
+diff=12	5	1	0	1	0	0	7	
+sum	14	1	1	2	2	0	20	
+
+Chimera analysis: relative delta Tag distance
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum	
+diff=1.0	14	1	1	2	2	0	20	
+sum	14	1	1	2	2	0	20	
+
+All tags are filtered and only those tags where one half is identical (TD=0) and therefore, have a relative delta TD of 1, are kept.
+These tags are considered as chimeras.
+Tag distance of chimeric families separated after FS
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum	
+TD=7	1	0	0	0	0	0	1	
+TD=8	1	0	0	0	1	0	2	
+TD=9	1	0	0	0	0	0	1	
+TD=10	2	0	0	0	0	0	2	
+TD=11	4	0	1	1	1	0	7	
+TD=12	5	1	0	1	0	0	7	
+sum	14	1	1	2	2	0	20	
+
+Tag distance of chimeric families separated after DCS and single SSCS (ab, ba)
+	DCS	SSCS ab	SSCS ba	sum	
+TD=7.0	0	0	1	1	
+TD=8.0	0	1	1	2	
+TD=9.0	0	1	0	1	
+TD=10.0	0	1	1	2	
+TD=11.0	0	3	4	7	
+TD=12.0	0	2	5	7	
+sum	0	8	12	20	
+
+