Mercurial > repos > mheinzl > fsd

--- a/fsd.py	Wed Aug 14 13:03:14 2019 -0400
+++ b/fsd.py	Tue Aug 27 07:36:53 2019 -0400
@@ -41,6 +41,7 @@
     parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.')
     parser.add_argument('--inputName4')
     parser.add_argument('--log_axis', action="store_false", help='Transform y axis in log scale.')
+    parser.add_argument('--rel_freq', action="store_false", help='If False, the relative frequencies are displayed.')
     parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.')
     parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.')
     return parser
@@ -61,6 +62,7 @@
     fourthFile = args.inputFile4
     name4 = args.inputName4
     log_axis = args.log_axis
+    rel_freq = args.rel_freq

     title_file = args.output_tabular
     title_file2 = args.output_pdf
@@ -78,7 +80,7 @@
     data_array_list = []
     list_to_plot_original = []
     colors = []
-    bins = numpy.arange(1, 22)
+    bins = numpy.arange(1, 22)
     with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
         fig = plt.figure()
         fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0)
@@ -98,6 +100,8 @@
             # data1[bigFamilies] = 22
             data1 = numpy.clip(integers, bins[0], bins[-1])
             name1 = name1.split(".tabular")[0]
+            if len(name1) > 40:
+                name1 = name1[:40]
             list_to_plot.append(data1)
             label.append(name1)
             data_array_list.append(file1)
@@ -106,21 +110,24 @@
             fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)
             fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)

-            legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / len(data1))
+            legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1],
+                                                                       float(numpy.bincount(data1)[1]) / len(data1))
             fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)
             fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)

-            legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / sum(integers))
+            legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1],
+                                                        float(numpy.bincount(data1)[1]) / sum(integers))
             fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)
             fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)

             legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(len(integers[integers > 20]),
-                                                                             float(sum(integers[integers > 20]))
-                                                                             / sum(integers))
+                                                                             float(len(integers[integers > 20]))
+                                                                             / len(integers))
             fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)
             fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)

-            legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers))
+            legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]),
+                                                       float(sum(integers[integers > 20])) / sum(integers))
             fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)
             fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)

@@ -145,6 +152,8 @@
             data2 = numpy.clip(integers2, bins[0], bins[-1])
             list_to_plot.append(data2)
             name2 = name2.split(".tabular")[0]
+            if len(name2) > 40:
+                name2 = name2[:40]
             label.append(name2)
             data_array_list.append(file2)

@@ -159,11 +168,13 @@
             fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure)
             fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure)

-            legend4 = "{:,} ({:.3f})".format(len(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2))
+            legend4 = "{:,} ({:.3f})".format(len(integers2[integers2 > 20]),
+                                             float(len(integers2[integers2 > 20])) / len(integers2))
             fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)
             fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)

-            legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2))
+            legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]),
+                                             float(sum(integers2[integers2 > 20])) / sum(integers2))
             fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
             fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)

@@ -188,6 +199,8 @@
             data3 = numpy.clip(integers3, bins[0], bins[-1])
             list_to_plot.append(data3)
             name3 = name3.split(".tabular")[0]
+            if len(name3) > 40:
+                name3 = name3[:40]
             label.append(name3)
             data_array_list.append(file3)

@@ -198,11 +211,13 @@
             fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)
             fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)

-            legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / sum(integers3))
+            legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1],
+                                              float(numpy.bincount(data3)[1]) / sum(integers3))
             fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)
             fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)

-            legend4 = "{:,} ({:.3f})".format(len(integers3[integers3 > 20]), float(sum(integers3[integers3 > 20])) / sum(integers3))
+            legend4 = "{:,} ({:.3f})".format(len(integers3[integers3 > 20]),
+                                             float(len(integers3[integers3 > 20])) / len(integers3))
             fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)
             fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)

@@ -231,6 +246,8 @@
             data4 = numpy.clip(integers4, bins[0], bins[-1])
             list_to_plot.append(data4)
             name4 = name4.split(".tabular")[0]
+            if len(name4) > 40:
+                name4 = name4[:40]
             label.append(name4)
             data_array_list.append(file4)

@@ -241,11 +258,13 @@
             fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)
             fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)

-            legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / sum(integers4))
+            legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1],
+                                              float(numpy.bincount(data4)[1]) / sum(integers4))
             fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)
             fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)

-            legend4 = "{:,} ({:.3f})".format(len(integers4[integers4 > 20]), float(sum(integers4[integers4 > 20])) / sum(integers4))
+            legend4 = "{:,} ({:.3f})".format(len(integers4[integers4 > 20]),
+                                             float(len(integers4[integers4 > 20])) / len(integers4))
             fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)
             fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)

@@ -265,137 +284,129 @@
         maximumX = numpy.amax(numpy.concatenate(list_to_plot))
         minimumX = numpy.amin(numpy.concatenate(list_to_plot))
         list_to_plot2 = list_to_plot
-        to_plot = ["Absolute frequencies", "Relative frequencies"]
-        plt.xticks([], [])
-        plt.yticks([], [])
-        fig.suptitle('Family Size Distribution (tags)', fontsize=14)

-        for l in range(len(to_plot)):
-            ax = fig.add_subplot(2, 1, l+1)
-            ticks = numpy.arange(1, 22, 1)
-            ticks1 = map(str, ticks)
-            ticks1[len(ticks1) - 1] = ">20"
+        if rel_freq:
+            ylab = "Relative Frequency"
+        else:
+            ylab = "Absolute Frequency"

-            if to_plot[l] == "Relative frequencies":
-                w = [numpy.zeros_like(data) + 1. / len(data) for data in list_to_plot2]
-                counts_rel = ax.hist(list_to_plot2, weights=w,
-                                     bins=numpy.arange(1, 23), stacked=False, edgecolor="black",
-                                     linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8)
-                ax.set_ylim(0, 1.07)
-            else:
-                counts = ax.hist(list_to_plot2, bins=numpy.arange(1, 23), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8, color=colors)
-                ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
-
-            ax.set_xticks(numpy.array(ticks))
-            ax.set_xticklabels(ticks1)
-
-            ax.set_ylabel(to_plot[l], fontsize=14)
-            ax.set_xlabel("Family size", fontsize=14)
-            if log_axis:
-                ax.set_yscale('log')
-            ax.grid(b=True, which="major", color="#424242", linestyle=":")
-            ax.margins(0.01, None)
-        pdf.savefig(fig)
-        plt.close()
+        # PLOT FSD based on tags
+        fig.suptitle('Family Size Distribution (FSD) based on families', fontsize=14)
+        ax = fig.add_subplot(1, 1, 1)
+        ticks = numpy.arange(1, 22, 1)
+        ticks1 = map(str, ticks)
+        ticks1[len(ticks1) - 1] = ">20"
+        ax.set_xticks([], [])
+        if rel_freq:
+            w = [numpy.zeros_like(data) + 1. / len(data) for data in list_to_plot2]
+            counts = ax.hist(list_to_plot2, weights=w,
+                                 bins=numpy.arange(1, 23), stacked=False, edgecolor="black", color=colors,
+                                 linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8)
+            ax.set_ylim(0, 1.07)
+        else:
+            counts = ax.hist(list_to_plot2, bins=numpy.arange(1, 23), stacked=False, edgecolor="black", linewidth=1,
+                             label=label, align="left", alpha=0.7, rwidth=0.8, color=colors)
+        ax.set_xticks(numpy.array(ticks))
+        ax.set_xticklabels(ticks1)
+        ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))

-        fig2.suptitle('Family Size Distribution (PE reads)', fontsize=14)
-        for l in range(len(to_plot)):
-            ax = fig2.add_subplot(2, 1, l + 1)
-            ticks = numpy.arange(minimumX, maximumX + 1)
-            ticks = numpy.arange(1, 22)
-            ticks1 = map(str, ticks)
-            ticks1[len(ticks1) - 1] = ">20"
-            reads = []
-            reads_rel = []
-
-            barWidth = 0 - (len(list_to_plot)+1)/2 * 1./(len(list_to_plot)+1)
-
-            for i in range(len(list_to_plot2)):
-                x = list(numpy.arange(1, 22).astype(float))
-                unique, c = numpy.unique(list_to_plot2[i], return_counts=True)
-                y = unique * c
-                if sum(list_to_plot_original[i] > 20) > 0:
-                    y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20])
-                y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))]
-                reads.append(y)
-                reads_rel.append(list(numpy.float_(y)) / sum(y))
-                #x = [xi + barWidth for xi in x]
+        ax.set_ylabel(ylab, fontsize=14)
+        ax.set_xlabel("Family size", fontsize=14)
+        if log_axis:
+            ax.set_yscale('log')
+        ax.grid(b=True, which="major", color="#424242", linestyle=":")
+        ax.margins(0.01, None)
+        pdf.savefig(fig)
+        #plt.close()

-                if len(list_to_plot2) == 1:
-                    x = [xi * 0.5 for xi in x]
-                    w = 0.4
-                else:
-                    x = [xi + barWidth for xi in x]
-                    w = 1./(len(list_to_plot) + 1)
-                if to_plot[l] == "Relative frequencies":
-                    counts2_rel = ax.bar(x, list(numpy.float_(y)) / numpy.sum(y), align="edge", width=w,
-                                         edgecolor="black", label=label[i],linewidth=1, alpha=0.7, color=colors[i])
-                    ax.set_ylim(0, 1.07)
-                else:
-                    #y = list(y.reshape((len(y))))
-                    counts2 = ax.bar(x, y, align="edge", width=w, edgecolor="black", label=label[i], linewidth=1,
-                                     alpha=0.7, color=colors[i])
-                if i == len(list_to_plot2)-1:
-                    barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1)
-                else:
-                    barWidth += 1. / (len(list_to_plot) + 1)
-
-            if to_plot[l] == "Absolute frequencies":
-                ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
-            else:
-                ax.set_xlabel("Family size", fontsize=14)
+        # PLOT FSD based on PE reads
+        fig2.suptitle('Family Size Distribution (FSD) based on PE reads', fontsize=14)
+        ax2 = fig2.add_subplot(1, 1, 1)
+        ticks = numpy.arange(1, 22)
+        ticks1 = map(str, ticks)
+        ticks1[len(ticks1) - 1] = ">20"
+        reads = []
+        reads_rel = []
+
+        barWidth = 0 - (len(list_to_plot) + 1) / 2 * 1. / (len(list_to_plot) + 1)
+        ax2.set_xticks([], [])
+
+        for i in range(len(list_to_plot2)):
+            x = list(numpy.arange(1, 22).astype(float))
+            unique, c = numpy.unique(list_to_plot2[i], return_counts=True)
+            y = unique * c
+            if sum(list_to_plot_original[i] > 20) > 0:
+                y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20])
+            y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))]
+            reads.append(y)
+            reads_rel.append(list(numpy.float_(y)) / sum(y))

             if len(list_to_plot2) == 1:
-                ax.set_xticks(numpy.array([xi + 0.2 for xi in x]))
+                x = [xi * 0.5 for xi in x]
+                w = 0.4
+            else:
+                x = [xi + barWidth for xi in x]
+                w = 1. / (len(list_to_plot) + 1)
+            if rel_freq:
+                counts2_rel = ax2.bar(x, list(numpy.float_(y)) / numpy.sum(y), align="edge", width=w,
+                                     edgecolor="black", label=label[i], linewidth=1, alpha=0.7, color=colors[i])
+                ax2.set_ylim(0, 1.07)
+            else:
+                counts2 = ax2.bar(x, y, align="edge", width=w, edgecolor="black", label=label[i], linewidth=1,
+                                 alpha=0.7, color=colors[i])
+
+            if i == len(list_to_plot2) - 1:
+                barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1)
             else:
-                ax.set_xticks(numpy.array(ticks))
-            ax.set_xticklabels(ticks1)
-            ax.set_ylabel(to_plot[l], fontsize=14)
-            if log_axis:
-                ax.set_yscale('log')
-            ax.grid(b=True, which="major", color="#424242", linestyle=":")
-            ax.margins(0.01, None)
+                barWidth += 1. / (len(list_to_plot) + 1)
+
+        ax2.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+
+        if len(list_to_plot2) == 1:
+            ax2.set_xticks(numpy.array([xi + 0.2 for xi in x]))
+        else:
+            ax2.set_xticks(numpy.array(ticks))
+        ax2.set_xticklabels(ticks1)
+        ax2.set_xlabel("Family size", fontsize=14)
+        ax2.set_ylabel(ylab, fontsize=14)
+        if log_axis:
+            ax2.set_yscale('log')
+        ax2.grid(b=True, which="major", color="#424242", linestyle=":")
+        ax2.margins(0.01, None)

         pdf.savefig(fig2)
         plt.close()

         # write data to CSV file tags
-        output_file.write("Values from family size distribution with all datasets (tags)\n")
+        counts = [numpy.bincount(d, minlength=22)[1:] for d in list_to_plot2]  # original counts of family sizes
+        output_file.write("Values from family size distribution with all datasets based on families\n")
         output_file.write("\nFamily size")
         for i in label:
             output_file.write("{}{}".format(sep, i))
-        # output_file.write("{}sum".format(sep))
         output_file.write("\n")
         j = 0
-        for fs in counts[1][0:len(counts[1]) - 1]:
+        for fs in bins:
             if fs == 21:
                 fs = ">20"
             else:
                 fs = "={}".format(fs)
             output_file.write("FS{}{}".format(fs, sep))
-            if len(label) == 1:
-                output_file.write("{}{}".format(int(counts[0][j]), sep))
-            else:
-                for n in range(len(label)):
-                    output_file.write("{}{}".format(int(counts[0][n][j]), sep))
+            for n in range(len(label)):
+                output_file.write("{}{}".format(int(counts[n][j]), sep))
             output_file.write("\n")
             j += 1
         output_file.write("sum{}".format(sep))
-        if len(label) == 1:
-            output_file.write("{}{}".format(int(sum(counts[0])), sep))
-        else:
-            for i in counts[0]:
-                output_file.write("{}{}".format(int(sum(i)), sep))
+        for i in counts:
+            output_file.write("{}{}".format(int(sum(i)), sep))

         # write data to CSV file PE reads
-        output_file.write("\n\nValues from family size distribution with all datasets (PE reads)\n")
+        output_file.write("\n\nValues from family size distribution with all datasets based on PE reads\n")
         output_file.write("\nFamily size")
         for i in label:
             output_file.write("{}{}".format(sep, i))
-        # output_file.write("{}sum".format(sep))
         output_file.write("\n")
         j = 0
-
+
         for fs in bins:
             if fs == 21:
                 fs = ">20"
@@ -410,7 +421,7 @@
             output_file.write("\n")
             j += 1
         output_file.write("sum{}".format(sep))
-        if len(label) == 1:
+        if len(label) == 1:
             output_file.write("{}{}".format(int(sum(numpy.concatenate(reads))), sep))
         else:
             for i in reads:
@@ -461,6 +472,7 @@
             dataBA_o = ba_o[numpy.in1d(baSeq, d, invert=True)]

             list1 = [duplTags_double, dataAB, dataBA]  # list for plotting
+            list1_o = [duplTags_double_o, dataAB_o, dataBA_o]  # list for plotting

             # information for family size >= 3
             dataAB_FS3 = dataAB[dataAB >= 3]
@@ -483,20 +495,30 @@

             fig = plt.figure()
             plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0)
-            counts = plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"],
-                              edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"],
-                              rwidth=0.8)
+
+            if rel_freq:
+                w = [numpy.zeros_like(d) + 1. / len(numpy.concatenate(list1)) for d in list1]
+                counts = plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"], weights=w,
+                                  edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"],
+                                  rwidth=0.8)
+                plt.ylim(0, 1.07)
+            else:
+                counts = plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"],
+                                  edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"],
+                                  rwidth=0.8)
+
             # tick labels of x axis
             ticks = numpy.arange(1, 22, 1)
             ticks1 = map(str, ticks)
             ticks1[len(ticks1) - 1] = ">20"
             plt.xticks(numpy.array(ticks), ticks1)
-            singl = counts[0][2][0]  # singletons
-            last = counts[0][2][len(counts[0][0]) - 1]  # large families
+           # singl = counts[0][2][0]  # singletons
+            singl = len(data_o[data_o == 1])
+            last = len(data_o[data_o > 20])  # large families
             if log_axis:
                 plt.yscale('log')
             plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
-            plt.title(name_file, fontsize=14)
+            plt.title("{}: FSD based on families".format(name_file), fontsize=14)
             plt.xlabel("Family size", fontsize=14)
             plt.ylabel("Absolute Frequency", fontsize=14)
             plt.margins(0.01, None)
@@ -506,31 +528,47 @@
             legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags="
             plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure)

-            legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA), len(duplTags), len(duplTags_double), (len(dataAB) + len(dataBA) + len(duplTags)), (len(ab) + len(ba)))
+            legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA),
+                                                                                  len(duplTags), len(duplTags_double), (
+                                                                                              len(dataAB) + len(
+                                                                                          dataBA) + len(duplTags)),
+                                                                                  (len(ab) + len(ba)))
             plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure)

-            legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o), sum(duplTags_o), sum(duplTags_double_o), (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), (sum(ab_o) + sum(ba_o)))
+            legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o),
+                                                                                sum(duplTags_o), sum(duplTags_double_o),
+                                                                                (sum(dataAB_o) + sum(dataBA_o) + sum(
+                                                                                    duplTags_o)),
+                                                                                (sum(ab_o) + sum(ba_o)))
             plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure)

-            legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), (len(dataAB) + len(dataBA) + len(duplTags)))
+            legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
+                float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                (len(dataAB) + len(dataBA) + len(duplTags)))
             plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure)

-            legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)), float(len(dataBA)) / (len(ab) + len(ba)), float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), (len(ab) + len(ba)))
+            legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)),
+                                                                           float(len(dataBA)) / (len(ab) + len(ba)),
+                                                                           float(len(duplTags)) / (len(ab) + len(ba)),
+                                                                           float(len(duplTags_double)) / (
+                                                                                       len(ab) + len(ba)),
+                                                                           (len(ab) + len(ba)))
             plt.text(0.64, 0.09, legend, size=10, transform=plt.gcf().transFigure)

             legend1 = "\nsingletons:\nfamily size > 20:"
             plt.text(0.1, 0.03, legend1, size=10, transform=plt.gcf().transFigure)

-            legend4 = "{:,}\n{:,}".format(singl.astype(int), last.astype(int))
+            legend4 = "{:,}\n{:,}".format(singl, last)
             plt.text(0.23, 0.03, legend4, size=10, transform=plt.gcf().transFigure)
-
-            legend3 = "{:.3f}\n{:.3f}".format(singl / len(data), last / len(data))
+            legend3 = "{:.3f}\n{:.3f}".format(float(singl) / len(data), float(last) / len(data))
             plt.text(0.64, 0.03, legend3, size=10, transform=plt.gcf().transFigure)

             legend3 = "\n\n{:,}".format(sum(data_o[data_o > 20]))
             plt.text(0.38, 0.03, legend3, size=10, transform=plt.gcf().transFigure)

-            legend3 = "{:.3f}\n{:.3f}".format(float(singl)/sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o))
+            legend3 = "{:.3f}\n{:.3f}".format(float(singl) / sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o))
             plt.text(0.84, 0.03, legend3, size=10, transform=plt.gcf().transFigure)

             legend = "PE reads\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
@@ -550,8 +588,137 @@
             pdf.savefig(fig)
             plt.close()

+            # PLOT FSD based on PE reads
+            fig3 = plt.figure()
+            plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0)
+
+            fig3.suptitle("{}: FSD based on PE reads".format(name_file), fontsize=14)
+            ax2 = fig3.add_subplot(1, 1, 1)
+            ticks = numpy.arange(1, 22)
+            ticks1 = map(str, ticks)
+            ticks1[len(ticks1) - 1] = ">20"
+            reads = []
+            reads_rel = []
+
+            #barWidth = 0 - (len(list_to_plot) + 1) / 2 * 1. / (len(list_to_plot) + 1)
+            ax2.set_xticks([], [])
+
+            list_y = []
+            label = ["duplex", "ab", "ba"]
+            col = ["#FF0000", "#5FB404", "#FFBF00"]
+            for i in range(len(list1)):
+                x = list(numpy.arange(1, 22).astype(float))
+                unique, c = numpy.unique(list1[i], return_counts=True)
+                y = unique * c
+                if sum(list1_o[i] > 20) > 0:
+                    y[len(y) - 1] = sum(list1_o[i][list1_o[i] > 20])
+                y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))]
+                reads.append(y)
+                reads_rel.append(list(numpy.float_(y)) / sum(numpy.concatenate(list1_o)))
+
+                if rel_freq:
+                    y = list(numpy.float_(y)) / sum(numpy.concatenate(list1_o))
+                    ax2.set_ylim(0, 1.07)
+                else:
+                    y = y
+
+                list_y.append(y)
+                if i == 0:
+                    counts2 = ax2.bar(x, y, align="edge", width=0.8,
+                                          edgecolor="black", label=label[0],
+                                          linewidth=1, alpha=1, color=col[0])
+                elif i == 1:
+                    counts2 = ax2.bar(x, y, bottom=list_y[i-1], align="edge", width=0.8,
+                                          edgecolor="black", label=label[1],
+                                          linewidth=1, alpha=1, color=col[1])
+                elif i == 2:
+                    bars = numpy.add(list_y[0], list_y[1]).tolist()
+
+                    counts2 = ax2.bar(x, y, bottom=bars, align="edge", width=0.8,
+                                      edgecolor="black", label=label[2],
+                                      linewidth=1, alpha=1, color=col[2])
+
+            ax2.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+
+            singl = len(data_o[data_o == 1])
+            last = len(data_o[data_o > 20])  # large families
+
+            ax2.set_xticks(numpy.array(ticks))
+            ax2.set_xticklabels(ticks1)
+            ax2.set_xlabel("Family size", fontsize=14)
+            ax2.set_ylabel(ylab, fontsize=14)
+            if log_axis:
+                ax2.set_yscale('log')
+            ax2.grid(b=True, which="major", color="#424242", linestyle=":")
+            ax2.margins(0.01, None)
+
+            # extra information beneath the plot
+            legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags="
+            plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA),
+                                                                                  len(duplTags), len(duplTags_double), (
+                                                                                          len(dataAB) + len(
+                                                                                      dataBA) + len(duplTags)),
+                                                                                  (len(ab) + len(ba)))
+            plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o),
+                                                                                sum(duplTags_o), sum(duplTags_double_o),
+                                                                                (sum(dataAB_o) + sum(dataBA_o) + sum(
+                                                                                    duplTags_o)),
+                                                                                (sum(ab_o) + sum(ba_o)))
+            plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
+
+            legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
+                float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                (len(dataAB) + len(dataBA) + len(duplTags)))
+            plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)),
+                                                                           float(len(dataBA)) / (len(ab) + len(ba)),
+                                                                           float(len(duplTags)) / (len(ab) + len(ba)),
+                                                                           float(len(duplTags_double)) / (
+                                                                                   len(ab) + len(ba)),
+                                                                           (len(ab) + len(ba)))
+            plt.text(0.64, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend1 = "\nsingletons:\nfamily size > 20:"
+            plt.text(0.1, 0.03, legend1, size=10, transform=plt.gcf().transFigure)
+
+            legend4 = "{:,}\n{:,}".format(singl, last)
+            plt.text(0.23, 0.03, legend4, size=10, transform=plt.gcf().transFigure)
+            legend3 = "{:.3f}\n{:.3f}".format(float(singl) / len(data), float(last) / len(data))
+            plt.text(0.64, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+            legend3 = "\n\n{:,}".format(sum(data_o[data_o > 20]))
+            plt.text(0.38, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+            legend3 = "{:.3f}\n{:.3f}".format(float(singl) / sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o))
+            plt.text(0.84, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+            legend = "PE reads\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
+                float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+                float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+                float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+                (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)))
+            plt.text(0.74, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(
+                float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o)),
+                float(sum(dataBA_o)) / (sum(ab_o) + sum(ba_o)),
+                float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)),
+                float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o)), (sum(ab_o) + sum(ba_o)))
+            plt.text(0.84, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+            pdf.savefig(fig3)
+            plt.close()
+
             # write same information to a csv file
             count = numpy.bincount(data_o)  # original counts of family sizes
+
             output_file.write("\nDataset:{}{}\n".format(sep, name_file))
             output_file.write("max. family size:{}{}\n".format(sep, max(data_o)))
             output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
@@ -560,37 +727,58 @@
             output_file.write("median family size:{}{}\n".format(sep, numpy.median(numpy.array(data_o))))
             output_file.write("mean family size:{}{}\n\n".format(sep, numpy.mean(numpy.array(data_o))))

-            output_file.write("{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
-            output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
+            output_file.write(
+                "{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep,
+                                                                                          sep, sep))
+            output_file.write(
+                "{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(
+                    sep, sep, sep, sep, sep, sep, sep, sep, sep))
             output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format(
-                name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep,
-                last.astype(int), sep, last / len(data), sep, sum(data_o[data_o > 20]), sep, float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data), sep, sum(data_o)))
+                name_file, sep, singl, sep, float(singl) / len(data), sep, float(singl) / sum(data_o), sep,
+                last, sep, float(last) / len(data), sep, sum(data_o[data_o > 20]), sep,
+                                                        float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data),
+                sep, sum(data_o)))

             # information for FS >= 1
-            output_file.write("The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)\n"
-                              "Whereas the total frequencies were calculated from the whole dataset (=including the DCS).\n\n")
-            output_file.write("FS >= 1{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep, sep, sep, sep))
+            output_file.write(
+                "The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)\n"
+                "Whereas the total frequencies were calculated from the whole dataset (=including the DCS).\n\n")
+            output_file.write(
+                "FS >= 1{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep,
+                                                                                                              sep, sep,
+                                                                                                              sep))
             output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep))
             output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format(
-                sep, len(dataAB), sep, sum(dataAB_o), sep, float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+                sep, len(dataAB), sep, sum(dataAB_o), sep,
+                float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
                 sep, float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
                 float(len(dataAB)) / (len(ab) + len(ba)), sep, float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o))))
             output_file.write("SSCS ba{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format(
-                sep, len(dataBA), sep, sum(dataBA_o), sep, float(len(dataBA)) / (len(dataBA) + len(dataBA) + len(duplTags)),
-                sep, float(sum(dataBA_o)) / (sum(dataBA_o) + sum(dataBA_o) + sum(duplTags_o)), sep, float(len(dataBA)) / (len(ba) + len(ba)),
+                sep, len(dataBA), sep, sum(dataBA_o), sep,
+                float(len(dataBA)) / (len(dataBA) + len(dataBA) + len(duplTags)),
+                sep, float(sum(dataBA_o)) / (sum(dataBA_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
+                float(len(dataBA)) / (len(ba) + len(ba)),
                 sep, float(sum(dataBA_o)) / (sum(ba_o) + sum(ba_o))))
-            output_file.write("DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
-                sep, len(duplTags), len(duplTags_double), sep, sum(duplTags_o), sum(duplTags_double_o), sep,
-                float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), sep,
-                float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), sep,
-                float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
-                float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)), float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o))))
+            output_file.write(
+                "DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
+                    sep, len(duplTags), len(duplTags_double), sep, sum(duplTags_o), sum(duplTags_double_o), sep,
+                    float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), sep,
+                    float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), sep,
+                    float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
+                    float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)),
+                    float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o))))
             output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format(
-                sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep, (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
+                sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep,
+                (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
                 (len(dataAB) + len(dataBA) + len(duplTags)), sep, (len(ab) + len(ba)), sep,
                 (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, (sum(ab_o) + sum(ba_o))))
             # information for FS >= 3
-            output_file.write("\nFS >= 3{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep, sep, sep, sep))
+            output_file.write(
+                "\nFS >= 3{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep,
+                                                                                                                sep,
+                                                                                                                sep,
+                                                                                                                sep,
+                                                                                                                sep))
             output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep))
             output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format(
                 sep, len(dataAB_FS3), sep, sum(dataAB_FS3_o), sep,
@@ -604,29 +792,61 @@
                 sep, float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
                 sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
                 sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
-            output_file.write("DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
-                sep, len(duplTags_FS3), duplTags_double_FS3,  sep, sum(duplTags_FS3_o), duplTags_double_FS3_o, sep,
-                float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
-                float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
-                float(duplTags_double_FS3) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
-                sep, float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep,
-                float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o),
-                float(duplTags_double_FS3_o) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
+            output_file.write(
+                "DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
+                    sep, len(duplTags_FS3), duplTags_double_FS3, sep, sum(duplTags_FS3_o), duplTags_double_FS3_o, sep,
+                    float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+                    float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+                    float(duplTags_double_FS3) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+                    sep, float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
+                    sep,
+                    float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o),
+                    float(duplTags_double_FS3_o) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
             output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format(
-                sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
-                sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
-                sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
+                sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+                (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
+                sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+                (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+                sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep,
+                (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))

-            output_file.write("\nValues from family size distribution\n")
+            counts = [numpy.bincount(d, minlength=22)[1:] for d in list1]  # original counts of family sizes
+            output_file.write("\nValues from family size distribution based on families\n")
             output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep))
-            for dx, ab, ba, fs in zip(counts[0][0], counts[0][1], counts[0][2], counts[1]):
+
+            j = 0
+            for fs in bins:
                 if fs == 21:
                     fs = ">20"
                 else:
                     fs = "={}".format(fs)
-                ab1 = ab - dx
-                ba1 = ba - ab
-                output_file.write("FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(dx), sep, int(ab1), sep, int(ba1), sep, int(ba)))
+                output_file.write("FS{}{}".format(fs, sep))
+                for n in range(3):
+                    output_file.write("{}{}".format(int(counts[n][j]), sep))
+                output_file.write("{}\n".format(counts[0][j] + counts[1][j] + counts[2][j]))
+                j += 1
+            output_file.write("sum{}".format(sep))
+            for i in counts:
+                output_file.write("{}{}".format(int(sum(i)), sep))
+            output_file.write("{}\n".format(sum(counts[0] + counts[1] + counts[2])))
+
+            output_file.write("\nValues from family size distribution based on PE reads\n")
+            output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep))
+            j = 0
+            for fs in bins:
+                if fs == 21:
+                    fs = ">20"
+                else:
+                    fs = "={}".format(fs)
+                output_file.write("FS{}{}".format(fs, sep))
+                for n in range(3):
+                    output_file.write("{}{}".format(int(reads[n][j]), sep))
+                output_file.write("{}\n".format(reads[0][j] + reads[1][j] + reads[2][j]))
+                j += 1
+            output_file.write("sum{}".format(sep))
+            for i in reads:
+                output_file.write("{}{}".format(int(sum(i)), sep))
+            output_file.write("{}\n".format(sum(reads[0] + reads[1] + reads[2])))

     print("Files successfully created!")
--- a/fsd.xml	Wed Aug 14 13:03:14 2019 -0400
+++ b/fsd.xml	Tue Aug 27 07:36:53 2019 -0400
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!-- galaxy version 16.04 -->
-<tool id="fsd" name="FSD" version="1.0.5">
+<tool id="fsd" name="FSD" version="1.0.6">
     <description>: Family Size Distribution of duplex sequencing tags</description>
     <requirements>
         <requirement type="package" version="2.7">python</requirement>
@@ -10,7 +10,7 @@
     <command>
         python2 '$__tool_directory__/fsd.py' --inputFile1 '${file1}' --inputName1 '${file1.name}'
 --inputFile2 '${file2}' --inputName2 '${file2.name}' --inputFile3 '${file3}' --inputName3 '${file3.name}'
---inputFile4 '${file4}' --inputName4 '${file4.name}' $log_axis --output_pdf $output_pdf --output_tabular $output_tabular
+--inputFile4 '${file4}' --inputName4 '${file4.name}' $log_axis $rel_freq --output_pdf $output_pdf --output_tabular $output_tabular
     </command>
     <inputs>
         <param name="file1" type="data" format="tabular" label="Dataset 1: input tags" optional="false"/>
@@ -18,7 +18,7 @@
         <param name="file3" type="data" format="tabular" label="Dataset 3: input tags" optional="true" />
         <param name="file4" type="data" format="tabular" label="Dataset 4: input tags" optional="true"  help="Input in tabular format with the family size, tags and the direction of the strand ('ab' or 'ba') for each family. Name of the files can have max. 34 charcters!"/>
         <param name="log_axis" type="boolean" label="log scale for y axis?" truevalue="" falsevalue="--log_axis" checked="False" help="Transform y axis in log scale."/>
-
+        <param name="rel_freq" type="boolean" label="relative frequency?" truevalue="" falsevalue="--rel_freq" checked="False" help="If True, the relative frequencies instead of the absolute values are displayed in the plots."/>
     </inputs>
     <outputs>
         <data name="output_pdf" format="pdf" />
Binary file test-data/fsd_output1.pdf has changed
--- a/test-data/fsd_output1.tab	Wed Aug 14 13:03:14 2019 -0400
+++ b/test-data/fsd_output1.tab	Tue Aug 27 07:36:53 2019 -0400
@@ -1,4 +1,4 @@
-Values from family size distribution with all datasets (tags)
+Values from family size distribution with all datasets based on families

 Family size	fsd_data1.tab	fsd_data2.tab	fsd_data3.tab	fsd_data4.tab
 FS=1	63	63	63	63
@@ -24,7 +24,7 @@
 FS>20	1	1	1	1
 sum	112	112	112	112

-Values from family size distribution with all datasets (PE reads)
+Values from family size distribution with all datasets based on PE reads

 Family size	fsd_data1.tab	fsd_data2.tab	fsd_data3.tab	fsd_data4.tab
 FS=1	63	63	63	63
@@ -79,7 +79,7 @@
 DCS (total)	1 (2)	4 (14)	0.024	0.024 (0.048)	0.014	0.014 (0.049)
 total nr. of tags	41	278	41	42	278	288

-Values from family size distribution
+Values from family size distribution based on families
 	duplex	ab	ba	sum
 FS=1	2	30	31	63
 FS=2	0	3	2	5
@@ -102,6 +102,32 @@
 FS=19	0	0	0	0
 FS=20	0	0	0	0
 FS>20	0	0	1	1
+sum	6	47	59	112
+
+Values from family size distribution based on PE reads
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	6	4	10
+FS=3	0	9	15	24
+FS=4	8	12	16	36
+FS=5	0	10	5	15
+FS=6	0	6	24	30
+FS=7	0	7	14	21
+FS=8	0	8	16	24
+FS=9	0	0	18	18
+FS=10	10	10	10	30
+FS=11	0	0	11	11
+FS=12	0	12	24	36
+FS=13	13	13	13	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	21	21
+sum	33	123	222	378

 Dataset:	fsd_data2.tab
 max. family size:	21
@@ -132,7 +158,7 @@
 DCS (total)	1 (2)	4 (14)	0.024	0.024 (0.048)	0.014	0.014 (0.049)
 total nr. of tags	41	278	41	42	278	288

-Values from family size distribution
+Values from family size distribution based on families
 	duplex	ab	ba	sum
 FS=1	2	30	31	63
 FS=2	0	3	2	5
@@ -155,6 +181,32 @@
 FS=19	0	0	0	0
 FS=20	0	0	0	0
 FS>20	0	0	1	1
+sum	6	47	59	112
+
+Values from family size distribution based on PE reads
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	6	4	10
+FS=3	0	9	15	24
+FS=4	8	12	16	36
+FS=5	0	10	5	15
+FS=6	0	6	24	30
+FS=7	0	7	14	21
+FS=8	0	8	16	24
+FS=9	0	0	18	18
+FS=10	10	10	10	30
+FS=11	0	0	11	11
+FS=12	0	12	24	36
+FS=13	13	13	13	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	21	21
+sum	33	123	222	378

 Dataset:	fsd_data3.tab
 max. family size:	21
@@ -185,7 +237,7 @@
 DCS (total)	1 (2)	4 (14)	0.024	0.024 (0.048)	0.014	0.014 (0.049)
 total nr. of tags	41	278	41	42	278	288

-Values from family size distribution
+Values from family size distribution based on families
 	duplex	ab	ba	sum
 FS=1	2	30	31	63
 FS=2	0	3	2	5
@@ -208,6 +260,32 @@
 FS=19	0	0	0	0
 FS=20	0	0	0	0
 FS>20	0	0	1	1
+sum	6	47	59	112
+
+Values from family size distribution based on PE reads
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	6	4	10
+FS=3	0	9	15	24
+FS=4	8	12	16	36
+FS=5	0	10	5	15
+FS=6	0	6	24	30
+FS=7	0	7	14	21
+FS=8	0	8	16	24
+FS=9	0	0	18	18
+FS=10	10	10	10	30
+FS=11	0	0	11	11
+FS=12	0	12	24	36
+FS=13	13	13	13	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	21	21
+sum	33	123	222	378

 Dataset:	fsd_data4.tab
 max. family size:	21
@@ -238,7 +316,7 @@
 DCS (total)	1 (2)	4 (14)	0.024	0.024 (0.048)	0.014	0.014 (0.049)
 total nr. of tags	41	278	41	42	278	288

-Values from family size distribution
+Values from family size distribution based on families
 	duplex	ab	ba	sum
 FS=1	2	30	31	63
 FS=2	0	3	2	5
@@ -261,3 +339,29 @@
 FS=19	0	0	0	0
 FS=20	0	0	0	0
 FS>20	0	0	1	1
+sum	6	47	59	112
+
+Values from family size distribution based on PE reads
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	6	4	10
+FS=3	0	9	15	24
+FS=4	8	12	16	36
+FS=5	0	10	5	15
+FS=6	0	6	24	30
+FS=7	0	7	14	21
+FS=8	0	8	16	24
+FS=9	0	0	18	18
+FS=10	10	10	10	30
+FS=11	0	0	11	11
+FS=12	0	12	24	36
+FS=13	13	13	13	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	21	21
+sum	33	123	222	378
Binary file test-data/fsd_output2.pdf has changed
--- a/test-data/fsd_output2.tab	Wed Aug 14 13:03:14 2019 -0400
+++ b/test-data/fsd_output2.tab	Tue Aug 27 07:36:53 2019 -0400
@@ -1,4 +1,4 @@
-Values from family size distribution with all datasets (tags)
+Values from family size distribution with all datasets based on families

 Family size	fsd_data1.tab	fsd_data2.tab	fsd_data3.tab
 FS=1	63	63	63
@@ -24,7 +24,7 @@
 FS>20	1	1	1
 sum	112	112	112

-Values from family size distribution with all datasets (PE reads)
+Values from family size distribution with all datasets based on PE reads

 Family size	fsd_data1.tab	fsd_data2.tab	fsd_data3.tab
 FS=1	63	63	63
@@ -79,7 +79,7 @@
 DCS (total)	1 (2)	4 (14)	0.024	0.024 (0.048)	0.014	0.014 (0.049)
 total nr. of tags	41	278	41	42	278	288

-Values from family size distribution
+Values from family size distribution based on families
 	duplex	ab	ba	sum
 FS=1	2	30	31	63
 FS=2	0	3	2	5
@@ -102,6 +102,32 @@
 FS=19	0	0	0	0
 FS=20	0	0	0	0
 FS>20	0	0	1	1
+sum	6	47	59	112
+
+Values from family size distribution based on PE reads
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	6	4	10
+FS=3	0	9	15	24
+FS=4	8	12	16	36
+FS=5	0	10	5	15
+FS=6	0	6	24	30
+FS=7	0	7	14	21
+FS=8	0	8	16	24
+FS=9	0	0	18	18
+FS=10	10	10	10	30
+FS=11	0	0	11	11
+FS=12	0	12	24	36
+FS=13	13	13	13	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	21	21
+sum	33	123	222	378

 Dataset:	fsd_data2.tab
 max. family size:	21
@@ -132,7 +158,7 @@
 DCS (total)	1 (2)	4 (14)	0.024	0.024 (0.048)	0.014	0.014 (0.049)
 total nr. of tags	41	278	41	42	278	288

-Values from family size distribution
+Values from family size distribution based on families
 	duplex	ab	ba	sum
 FS=1	2	30	31	63
 FS=2	0	3	2	5
@@ -155,6 +181,32 @@
 FS=19	0	0	0	0
 FS=20	0	0	0	0
 FS>20	0	0	1	1
+sum	6	47	59	112
+
+Values from family size distribution based on PE reads
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	6	4	10
+FS=3	0	9	15	24
+FS=4	8	12	16	36
+FS=5	0	10	5	15
+FS=6	0	6	24	30
+FS=7	0	7	14	21
+FS=8	0	8	16	24
+FS=9	0	0	18	18
+FS=10	10	10	10	30
+FS=11	0	0	11	11
+FS=12	0	12	24	36
+FS=13	13	13	13	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	21	21
+sum	33	123	222	378

 Dataset:	fsd_data3.tab
 max. family size:	21
@@ -185,7 +237,7 @@
 DCS (total)	1 (2)	4 (14)	0.024	0.024 (0.048)	0.014	0.014 (0.049)
 total nr. of tags	41	278	41	42	278	288

-Values from family size distribution
+Values from family size distribution based on families
 	duplex	ab	ba	sum
 FS=1	2	30	31	63
 FS=2	0	3	2	5
@@ -208,3 +260,29 @@
 FS=19	0	0	0	0
 FS=20	0	0	0	0
 FS>20	0	0	1	1
+sum	6	47	59	112
+
+Values from family size distribution based on PE reads
+	duplex	ab	ba	sum
+FS=1	2	30	31	63
+FS=2	0	6	4	10
+FS=3	0	9	15	24
+FS=4	8	12	16	36
+FS=5	0	10	5	15
+FS=6	0	6	24	30
+FS=7	0	7	14	21
+FS=8	0	8	16	24
+FS=9	0	0	18	18
+FS=10	10	10	10	30
+FS=11	0	0	11	11
+FS=12	0	12	24	36
+FS=13	13	13	13	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	0	0	21	21
+sum	33	123	222	378