Mercurial > repos > mheinzl > fsd

--- a/fsd.py	Tue Apr 02 05:10:09 2019 -0400
+++ b/fsd.py	Wed May 08 07:03:39 2019 -0400
@@ -11,10 +11,11 @@
 # If only one file is provided, then a family size distribution, which is separated after SSCSs without a partner and DCSs, is produced.
 # Whereas a family size distribution with multiple data in one plot is produced, when more than one file (up to 4) is given.

-# USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf
+# USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --log_axis --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf

 import argparse
 import sys
+import os

 import matplotlib.pyplot as plt
 import numpy
@@ -39,6 +40,7 @@
     parser.add_argument('--inputName3')
     parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.')
     parser.add_argument('--inputName4')
+    parser.add_argument('--log_axis', action="store_false", help='Transform y axis in log scale.')
     parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.')
     parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.')
     return parser
@@ -48,14 +50,18 @@

     parser = make_argparser()
     args = parser.parse_args(argv[1:])
+
     firstFile = args.inputFile1
     name1 = args.inputName1
+
     secondFile = args.inputFile2
     name2 = args.inputName2
     thirdFile = args.inputFile3
     name3 = args.inputName3
     fourthFile = args.inputFile4
     name4 = args.inputName4
+    log_axis = args.log_axis
+
     title_file = args.output_tabular
     title_file2 = args.output_pdf

@@ -70,185 +76,309 @@
     list_to_plot = []
     label = []
     data_array_list = []
+    list_to_plot_original = []
+    colors = []
+
     with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
         fig = plt.figure()
-        plt.subplots_adjust(bottom=0.25)
+        fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0)
+        fig2 = plt.figure()
+        fig2.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0)
+
+        # plt.subplots_adjust(bottom=0.25)
         if firstFile != str(None):
             file1 = readFileReferenceFree(firstFile)
             integers = numpy.array(file1[:, 0]).astype(int)  # keep original family sizes
+            list_to_plot_original.append(integers)
+            colors.append("#0000FF")

             # for plot: replace all big family sizes by 22
-            data1 = numpy.array(file1[:, 0]).astype(int)
-            bigFamilies = numpy.where(data1 > 20)[0]
-            data1[bigFamilies] = 22
-
+            # data1 = numpy.array(file1[:, 0]).astype(int)
+            # bigFamilies = numpy.where(data1 > 20)[0]
+            # data1[bigFamilies] = 22
+            if numpy.amax(integers) > 20:
+                bins = numpy.arange(numpy.amin(integers), numpy.amax(integers) + 1)
+                data1 = numpy.clip(integers, bins[0], bins[-1])
+            else:
+                data1 = integers
             name1 = name1.split(".tabular")[0]
             list_to_plot.append(data1)
             label.append(name1)
             data_array_list.append(file1)

             legend = "\n\n\n{}".format(name1)
-            plt.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)
-            legend1 = "singletons:\nnr. of tags\n{:,}".format(numpy.bincount(data1)[1])
-            plt.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)

-            legend3 = "freq. of tags\n{:.3f}".format(float(numpy.bincount(data1)[1]) / len(data1))
-            plt.text(0.41, 0.11, legend3, size=10, transform=plt.gcf().transFigure)
+            legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / len(data1))
+            fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)

-            legend3b = "PE reads\n{:.3f}".format(float(numpy.bincount(data1)[1]) / sum(integers))
-            plt.text(0.5, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)
+            legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / sum(integers))
+            fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)

             legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int), float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1))
-            plt.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)

             legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers))
-            plt.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)

             legend6 = "total nr. of\ntags\n{:,}".format(len(data1))
-            plt.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure)

             legend6b = "PE reads\n{:,}".format(sum(integers))
-            plt.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure)

         if secondFile != str(None):
             file2 = readFileReferenceFree(secondFile)
             integers2 = numpy.array(file2[:, 0]).astype(int)  # keep original family sizes
+            list_to_plot_original.append(integers2)
+            colors.append("#298A08")

-            data2 = numpy.asarray(file2[:, 0]).astype(int)
-            bigFamilies2 = numpy.where(data2 > 20)[0]
-            data2[bigFamilies2] = 22
+            # data2 = numpy.asarray(file2[:, 0]).astype(int)
+            # bigFamilies2 = numpy.where(data2 > 20)[0]
+            # data2[bigFamilies2] = 22

+            if numpy.amax(integers) > 20:
+                bins = numpy.arange(numpy.amin(integers2), numpy.amax(integers2) + 1)
+                data2 = numpy.clip(integers2, bins[0], bins[-1])
+            else:
+                data2 = integers2
             list_to_plot.append(data2)
             name2 = name2.split(".tabular")[0]
             label.append(name2)
             data_array_list.append(file2)

-            plt.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure)
-
-            legend1 = "{:,}".format(numpy.bincount(data2)[1])
-            plt.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure)

-            legend3 = "{:.3f}".format(float(numpy.bincount(data2)[1]) / len(data2))
-            plt.text(0.41, 0.09, legend3, size=10, transform=plt.gcf().transFigure)
+            legend1 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / len(data2))
+            fig.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure)

-            legend3b = "{:.3f}".format(float(numpy.bincount(data2)[1]) / sum(integers2))
-            plt.text(0.5, 0.09, legend3b, size=10, transform=plt.gcf().transFigure)
+            legend3 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / sum(integers2))
+            fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure)

             legend4 = "{:,} ({:.3f})".format(
                 numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int),
                 float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2))
-            plt.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)

             legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2))
-            plt.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)

             legend6 = "{:,}".format(len(data2))
-            plt.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure)

             legend6b = "{:,}".format(sum(integers2))
-            plt.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure)

         if thirdFile != str(None):
             file3 = readFileReferenceFree(thirdFile)
             integers3 = numpy.array(file3[:, 0]).astype(int)  # keep original family sizes
+            list_to_plot_original.append(integers3)
+            colors.append("#DF0101")

-            data3 = numpy.asarray(file3[:, 0]).astype(int)
-            bigFamilies3 = numpy.where(data3 > 20)[0]
-            data3[bigFamilies3] = 22
+            # data3 = numpy.asarray(file3[:, 0]).astype(int)
+            # bigFamilies3 = numpy.where(data3 > 20)[0]
+            # data3[bigFamilies3] = 22

+            if numpy.amax(integers3) > 20:
+                bins = numpy.arange(numpy.amin(integers3), numpy.amax(integers3) + 1)
+                data3 = numpy.clip(integers3, bins[0], bins[-1])
+            else:
+                data3 = integers3
             list_to_plot.append(data3)
             name3 = name3.split(".tabular")[0]
             label.append(name3)
             data_array_list.append(file3)

-            plt.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure)
-
-            legend1 = "{:,}".format(numpy.bincount(data3)[1])
-            plt.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure)

-            legend3 = "{:.3f}".format(float(numpy.bincount(data3)[1]) / len(data3))
-            plt.text(0.41, 0.07, legend3, size=10, transform=plt.gcf().transFigure)
+            legend1 = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / len(data3))
+            fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)

-            legend3b = "{:.3f}".format(float(numpy.bincount(data3)[1]) / sum(integers3))
-            plt.text(0.5, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)
+            legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / sum(integers3))
+            fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)

             legend4 = "{:,} ({:.3f})".format(
                 numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int),
                 float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3))
-            plt.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)

             legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]),
                                              float(sum(integers3[integers3 > 20])) / sum(integers3))
-            plt.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure)

             legend6 = "{:,}".format(len(data3))
-            plt.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure)

             legend6b = "{:,}".format(sum(integers3))
-            plt.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure)

         if fourthFile != str(None):
             file4 = readFileReferenceFree(fourthFile)
             integers4 = numpy.array(file4[:, 0]).astype(int)  # keep original family sizes
-
-            data4 = numpy.asarray(file4[:, 0]).astype(int)
+            list_to_plot_original.append(integers4)
+            colors.append("#04cec7")

-            bigFamilies4 = numpy.where(data4 > 20)[0]
-            data4[bigFamilies4] = 22
-
+            # data4 = numpy.asarray(file4[:, 0]).astype(int)
+            # bigFamilies4 = numpy.where(data4 > 20)[0]
+            # data4[bigFamilies4] = 22
+            if numpy.amax(integers4) > 20:
+                bins = numpy.arange(numpy.amin(integers4), numpy.amax(integers4) + 1)
+                data4 = numpy.clip(integers4, bins[0], bins[-1])
+            else:
+                data4 = integers4
             list_to_plot.append(data4)
             name4 = name4.split(".tabular")[0]
             label.append(name4)
             data_array_list.append(file4)

-            plt.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure)
-
-            legend1 = "{:,}".format(numpy.bincount(data4)[1])
-            plt.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure)

-            legend3 = "{:.3f}".format(float(numpy.bincount(data4)[1]) / len(data4))
-            plt.text(0.41, 0.05, legend3, size=10, transform=plt.gcf().transFigure)
+            legend1 = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / len(data4))
+            fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)

-            legend3b = "{:.3f}".format(float(numpy.bincount(data4)[1]) / sum(integers4))
-            plt.text(0.5, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)
+            legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / sum(integers4))
+            fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)

             legend4 = "{:,} ({:.3f})".format(
                 numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int),
                 float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4))
-            plt.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)

             legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]),
                                              float(sum(integers4[integers4 > 20])) / sum(integers4))
-            plt.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure)

             legend6 = "{:,}".format(len(data4))
-            plt.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure)

             legend6b = "{:,}".format(sum(integers4))
-            plt.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure)
+            fig.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure)
+            fig2.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure)

         maximumX = numpy.amax(numpy.concatenate(list_to_plot))
         minimumX = numpy.amin(numpy.concatenate(list_to_plot))
+        bins = numpy.arange(minimumX, maximumX + 1)
+        list_to_plot2 = list_to_plot
+        to_plot = ["Absolute frequencies", "Relative frequencies"]
+        plt.xticks([], [])
+        plt.yticks([], [])
+        fig.suptitle('Family Size Distribution (tags)', fontsize=14)

-        counts = plt.hist(list_to_plot, bins=range(minimumX, maximumX + 1), stacked=False, edgecolor="black",
-                          linewidth=1, label=label, align="left", rwidth=0.8, alpha=0.7)
+        for l in range(len(to_plot)):
+            ax = fig.add_subplot(2, 1, l+1)
+            ticks = numpy.arange(1, 22, 1)
+            ticks1 = map(str, ticks)
+            if maximumX > 20:
+                ticks1[len(ticks1) - 1] = ">20"

-        ticks = numpy.arange(minimumX - 1, maximumX, 1)
-        ticks1 = map(str, ticks)
-        ticks1[len(ticks1) - 1] = ">20"
-        plt.xticks(numpy.array(ticks), ticks1)
+            if to_plot[l] == "Relative frequencies":
+                counts_rel = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8, normed=True)
+            else:
+                counts = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8)
+                ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))

-        plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
-        # plt.title("Family Size Distribution", fontsize=14)
-        plt.xlabel("Family size", fontsize=14)
-        plt.ylabel("Absolute Frequency", fontsize=14)
-        plt.margins(0.01, None)
-        plt.grid(b=True, which="major", color="#424242", linestyle=":")
+            ax.set_xticks(numpy.array(ticks))
+            ax.set_xticklabels(ticks1)
+
+            ax.set_ylabel(to_plot[l], fontsize=14)
+            ax.set_xlabel("Family size", fontsize=14)
+            if log_axis:
+                ax.set_yscale('log')
+            ax.grid(b=True, which="major", color="#424242", linestyle=":")
+            ax.margins(0.01, None)
         pdf.savefig(fig)
         plt.close()

-        # write data to CSV file
-        output_file.write("Values from family size distribution with all datasets\n")
+        fig2.suptitle('Family Size Distribution (PE reads)', fontsize=14)
+        for l in range(len(to_plot)):
+            ax = fig2.add_subplot(2, 1, l + 1)
+            ticks = numpy.arange(minimumX, maximumX + 1)
+            ticks1 = map(str, ticks)
+            if maximumX > 20:
+                ticks1[len(ticks1) - 1] = ">20"
+            reads = []
+            reads_rel = []
+
+            barWidth = 0 - (len(list_to_plot)+1)/2 * 1./(len(list_to_plot) + 1)
+
+            for i in range(len(list_to_plot2)):
+                unique, c = numpy.unique(list_to_plot2[i], return_counts=True)
+                new_c = []
+                new_unique = []
+
+                for t in ticks:
+                    if t not in unique:
+                        new_c.append(0) # add zero count of not occuring
+                        new_unique.append(t)
+                    else:
+                        c_idx = numpy.where(t == unique)[0]
+                        new_c.append(c[c_idx])
+                        new_unique.append(unique[c_idx])
+                y = numpy.array(new_unique) * numpy.array(new_c)
+                if len([list_to_plot_original > 20]) > 0:
+                    y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20])
+                reads.append(y)
+                reads_rel.append(list(numpy.float_(y)) / sum(y))
+
+                x = list(numpy.arange(numpy.amin(unique), numpy.amax(unique) + 1).astype(float))
+                x = [xi + barWidth for xi in x]
+
+                if to_plot[l] == "Relative frequencies":
+                    counts2_rel = ax.bar(x, list(numpy.float_(y)) / sum(y), align="edge", width=1./(len(list_to_plot) + 1),
+                                         edgecolor="black", label=label[i], alpha=1, linewidth=1, color=colors[i])
+                else:
+                    counts2 = ax.bar(x, y, align="edge", width=1./len(list_to_plot), edgecolor="black", label=label[i],
+                                     alpha=1, linewidth=1, color=colors[i])
+                if i == len(list_to_plot2):
+                    barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1)
+                else:
+                    barWidth += 1. / (len(list_to_plot) + 1)
+
+            if to_plot[l] == "Absolute frequencies":
+                ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+            else:
+                ax.set_xlabel("Family size", fontsize=14)
+
+            ax.set_xticks(numpy.array(ticks))
+            ax.set_xticklabels(ticks1)
+            ax.set_ylabel(to_plot[l], fontsize=14)
+            if log_axis:
+                ax.set_yscale('log')
+            ax.grid(b=True, which="major", color="#424242", linestyle=":")
+            ax.margins(0.01, None)
+
+        pdf.savefig(fig2)
+        plt.close()
+
+        # write data to CSV file tags
+        output_file.write("Values from family size distribution with all datasets (tags)\n")
         output_file.write("\nFamily size")
         for i in label:
             output_file.write("{}{}".format(sep, i))
@@ -275,6 +405,35 @@
             for i in counts[0]:
                 output_file.write("{}{}".format(int(sum(i)), sep))

+        # write data to CSV file PE reads
+        output_file.write("\n\nValues from family size distribution with all datasets (PE reads)\n")
+        output_file.write("\nFamily size")
+        for i in label:
+            output_file.write("{}{}".format(sep, i))
+        # output_file.write("{}sum".format(sep))
+        output_file.write("\n")
+        j = 0
+        for fs in bins:
+            if fs == 21:
+                fs = ">20"
+            else:
+                fs = "={}".format(fs)
+            output_file.write("FS{}{}".format(fs, sep))
+            if len(label) == 1:
+                output_file.write("{}{}".format(int(reads[0][j]), sep))
+            else:
+                for n in range(len(label)):
+                    output_file.write("{}{}".format(int(reads[n][j]), sep))
+            output_file.write("\n")
+            j += 1
+        output_file.write("sum{}".format(sep))
+        if len(label) == 1:
+            output_file.write("{}{}".format(int(sum(reads)), sep))
+        else:
+            for i in reads:
+                output_file.write("{}{}".format(int(sum(i)), sep))
+        output_file.write("\n")
+
         # Family size distribution after DCS and SSCS
         for dataset, data_o, name_file in zip(list_to_plot, data_array_list, label):
             maximumX = numpy.amax(dataset)
@@ -298,6 +457,9 @@
             duplTagsBA = duplTags_double[1::2]  # ba of DCS
             duplTagsBA_o = duplTags_double_o[1::2]  # ba of DCS

+            # duplTags_double_tag = tags[numpy.in1d(seq, d)]
+            # duplTags_double_seq = seq[numpy.in1d(seq, d)]
+
             # get family sizes for SSCS with no partner
             ab = numpy.where(tags == "ab")[0]
             abSeq = seq[ab]
@@ -322,10 +484,10 @@
             dataAB_FS3_o = dataAB_o[dataAB_o >= 3]
             dataBA_FS3 = dataBA[dataBA >= 3]
             dataBA_FS3_o = dataBA_o[dataBA_o >= 3]
-            ab_FS3 = ab[ab >= 3]
-            ba_FS3 = ba[ba >= 3]
-            ab_FS3_o = ab_o[ab_o >= 3]
-            ba_FS3_o = ba_o[ba_o >= 3]
+            # ab_FS3 = ab[ab >= 3]
+            # ba_FS3 = ba[ba >= 3]
+            # ab_FS3_o = ab_o[ab_o >= 3]
+            # ba_FS3_o = ba_o[ba_o >= 3]

             duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)]  # ab+ba with FS>=3
             duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)]  # ba+ab with FS>=3
@@ -337,18 +499,20 @@
             duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o)  # both ab and ba strands with FS>=3

             fig = plt.figure()
-            plt.subplots_adjust(bottom=0.3)
-            counts = plt.hist(list1, bins=range(minimumX, maximumX + 1), stacked=True, label=["duplex", "ab", "ba"],
+            plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0)
+            counts = plt.hist(list1, bins=numpy.arange(minimumX, maximumX + 2), stacked=True, label=["duplex", "ab", "ba"],
                               edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"],
                               rwidth=0.8)
             # tick labels of x axis
-            ticks = numpy.arange(minimumX - 1, maximumX, 1)
+            ticks = numpy.arange(1, 22, 1)
             ticks1 = map(str, ticks)
-            ticks1[len(ticks1) - 1] = ">20"
+            if maximumX > 20:
+                ticks1[len(ticks1) - 1] = ">20"
             plt.xticks(numpy.array(ticks), ticks1)
             singl = counts[0][2][0]  # singletons
             last = counts[0][2][len(counts[0][0]) - 1]  # large families
-
+            if log_axis:
+                plt.yscale('log')
             plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
             plt.title(name_file, fontsize=14)
             plt.xlabel("Family size", fontsize=14)
@@ -411,7 +575,7 @@
             output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
             output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))

-            output_file.write("{}singletons:{}{}{}family size > 20:\n".format(sep, sep, sep, sep))
+            output_file.write("{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
             output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
             output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format(
                 name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep,
--- a/fsd.xml	Tue Apr 02 05:10:09 2019 -0400
+++ b/fsd.xml	Wed May 08 07:03:39 2019 -0400
@@ -10,13 +10,15 @@
     <command>
         python2 '$__tool_directory__/fsd.py' --inputFile1 '${file1}' --inputName1 '${file1.name}'
 --inputFile2 '${file2}' --inputName2 '${file2.name}' --inputFile3 '${file3}' --inputName3 '${file3.name}'
---inputFile4 '${file4}' --inputName4 '${file4.name}' --output_pdf $output_pdf --output_tabular $output_tabular
+--inputFile4 '${file4}' --inputName4 '${file4.name}' $log_axis --output_pdf $output_pdf --output_tabular $output_tabular
     </command>
     <inputs>
         <param name="file1" type="data" format="tabular" label="Dataset 1: input tags" optional="false"/>
         <param name="file2" type="data" format="tabular" label="Dataset 2: input tags" optional="true"  />
         <param name="file3" type="data" format="tabular" label="Dataset 3: input tags" optional="true" />
         <param name="file4" type="data" format="tabular" label="Dataset 4: input tags" optional="true"  help="Input in tabular format with the family size, tags and the direction of the strand ('ab' or 'ba') for each family. Name of the files can have max. 34 charcters!"/>
+        <param name="log_axis" type="boolean" label="log scale for y axis?" truevalue="" falsevalue="--log_axis" checked="False" help="Transform y axis in log scale."/>
+
     </inputs>
     <outputs>
         <data name="output_pdf" format="pdf" />
Binary file test-data/fsd_output1.pdf has changed
--- a/test-data/fsd_output1.tab	Tue Apr 02 05:10:09 2019 -0400
+++ b/test-data/fsd_output1.tab	Wed May 08 07:03:39 2019 -0400
@@ -1,4 +1,4 @@
-Values from family size distribution with all datasets
+Values from family size distribution with all datasets (tags)

 Family size	fsd_data1.tab	fsd_data2.tab	fsd_data3.tab	fsd_data4.tab
 FS=1	63	63	63	63
@@ -23,12 +23,39 @@
 FS=20	0	0	0	0
 FS>20	1	1	1	1
 sum	112	112	112	112
+
+Values from family size distribution with all datasets (PE reads)
+
+Family size	fsd_data1.tab	fsd_data2.tab	fsd_data3.tab	fsd_data4.tab
+FS=1	63	63	63	63
+FS=2	10	10	10	10
+FS=3	24	24	24	24
+FS=4	36	36	36	36
+FS=5	15	15	15	15
+FS=6	30	30	30	30
+FS=7	21	21	21	21
+FS=8	24	24	24	24
+FS=9	18	18	18	18
+FS=10	30	30	30	30
+FS=11	11	11	11	11
+FS=12	36	36	36	36
+FS=13	39	39	39	39
+FS=14	0	0	0	0
+FS=15	0	0	0	0
+FS=16	0	0	0	0
+FS=17	0	0	0	0
+FS=18	0	0	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	21	21	21	21
+sum	378	378	378	378
+
 Dataset:	fsd_data1.tab
 max. family size:	21
 absolute frequency:	1
 relative frequency:	0.009

-	singletons:			family size > 20:
+	singletons:			family size > 20:				length of dataset:
 	nr. of tags	rel. freq of tags	rel.freq of PE reads	nr. of tags	rel. freq of tags	nr. of PE reads	rel. freq of PE reads	total nr. of tags	total nr. of PE reads
 fsd_data1.tab	63	0.562	0.167	1	0.009	21	0.056	112	378

@@ -78,7 +105,7 @@
 absolute frequency:	1
 relative frequency:	0.009

-	singletons:			family size > 20:
+	singletons:			family size > 20:				length of dataset:
 	nr. of tags	rel. freq of tags	rel.freq of PE reads	nr. of tags	rel. freq of tags	nr. of PE reads	rel. freq of PE reads	total nr. of tags	total nr. of PE reads
 fsd_data2.tab	63	0.562	0.167	1	0.009	21	0.056	112	378

@@ -128,7 +155,7 @@
 absolute frequency:	1
 relative frequency:	0.009

-	singletons:			family size > 20:
+	singletons:			family size > 20:				length of dataset:
 	nr. of tags	rel. freq of tags	rel.freq of PE reads	nr. of tags	rel. freq of tags	nr. of PE reads	rel. freq of PE reads	total nr. of tags	total nr. of PE reads
 fsd_data3.tab	63	0.562	0.167	1	0.009	21	0.056	112	378

@@ -178,7 +205,7 @@
 absolute frequency:	1
 relative frequency:	0.009

-	singletons:			family size > 20:
+	singletons:			family size > 20:				length of dataset:
 	nr. of tags	rel. freq of tags	rel.freq of PE reads	nr. of tags	rel. freq of tags	nr. of PE reads	rel. freq of PE reads	total nr. of tags	total nr. of PE reads
 fsd_data4.tab	63	0.562	0.167	1	0.009	21	0.056	112	378
Binary file test-data/fsd_output2.pdf has changed
--- a/test-data/fsd_output2.tab	Tue Apr 02 05:10:09 2019 -0400
+++ b/test-data/fsd_output2.tab	Wed May 08 07:03:39 2019 -0400
@@ -1,4 +1,4 @@
-Values from family size distribution with all datasets
+Values from family size distribution with all datasets (tags)

 Family size	fsd_data1.tab	fsd_data2.tab	fsd_data3.tab
 FS=1	63	63	63
@@ -23,12 +23,39 @@
 FS=20	0	0	0
 FS>20	1	1	1
 sum	112	112	112
+
+Values from family size distribution with all datasets (PE reads)
+
+Family size	fsd_data1.tab	fsd_data2.tab	fsd_data3.tab
+FS=1	63	63	63
+FS=2	10	10	10
+FS=3	24	24	24
+FS=4	36	36	36
+FS=5	15	15	15
+FS=6	30	30	30
+FS=7	21	21	21
+FS=8	24	24	24
+FS=9	18	18	18
+FS=10	30	30	30
+FS=11	11	11	11
+FS=12	36	36	36
+FS=13	39	39	39
+FS=14	0	0	0
+FS=15	0	0	0
+FS=16	0	0	0
+FS=17	0	0	0
+FS=18	0	0	0
+FS=19	0	0	0
+FS=20	0	0	0
+FS>20	21	21	21
+sum	378	378	378
+
 Dataset:	fsd_data1.tab
 max. family size:	21
 absolute frequency:	1
 relative frequency:	0.009

-	singletons:			family size > 20:
+	singletons:			family size > 20:				length of dataset:
 	nr. of tags	rel. freq of tags	rel.freq of PE reads	nr. of tags	rel. freq of tags	nr. of PE reads	rel. freq of PE reads	total nr. of tags	total nr. of PE reads
 fsd_data1.tab	63	0.562	0.167	1	0.009	21	0.056	112	378

@@ -78,7 +105,7 @@
 absolute frequency:	1
 relative frequency:	0.009

-	singletons:			family size > 20:
+	singletons:			family size > 20:				length of dataset:
 	nr. of tags	rel. freq of tags	rel.freq of PE reads	nr. of tags	rel. freq of tags	nr. of PE reads	rel. freq of PE reads	total nr. of tags	total nr. of PE reads
 fsd_data2.tab	63	0.562	0.167	1	0.009	21	0.056	112	378

@@ -128,7 +155,7 @@
 absolute frequency:	1
 relative frequency:	0.009

-	singletons:			family size > 20:
+	singletons:			family size > 20:				length of dataset:
 	nr. of tags	rel. freq of tags	rel.freq of PE reads	nr. of tags	rel. freq of tags	nr. of PE reads	rel. freq of PE reads	total nr. of tags	total nr. of PE reads
 fsd_data3.tab	63	0.562	0.167	1	0.009	21	0.056	112	378