# HG changeset patch # User mheinzl # Date 1557313419 14400 # Node ID c825a29a7d9f6079e9983a91175ca23f392e2b88 # Parent 2e517a54eedc7d9dcd7a1be5dbe76502a79e12f0 planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty diff -r 2e517a54eedc -r c825a29a7d9f fsd.py --- a/fsd.py Tue Apr 02 05:10:09 2019 -0400 +++ b/fsd.py Wed May 08 07:03:39 2019 -0400 @@ -11,10 +11,11 @@ # If only one file is provided, then a family size distribution, which is separated after SSCSs without a partner and DCSs, is produced. # Whereas a family size distribution with multiple data in one plot is produced, when more than one file (up to 4) is given. -# USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf +# USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --log_axis --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf import argparse import sys +import os import matplotlib.pyplot as plt import numpy @@ -39,6 +40,7 @@ parser.add_argument('--inputName3') parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') parser.add_argument('--inputName4') + parser.add_argument('--log_axis', action="store_false", help='Transform y axis in log scale.') parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.') parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.') return parser @@ -48,14 +50,18 @@ parser = make_argparser() args = parser.parse_args(argv[1:]) + firstFile = args.inputFile1 name1 = args.inputName1 + secondFile = args.inputFile2 name2 = args.inputName2 thirdFile = args.inputFile3 name3 = args.inputName3 fourthFile = args.inputFile4 name4 = args.inputName4 + log_axis = args.log_axis + title_file = args.output_tabular title_file2 = args.output_pdf @@ -70,185 +76,309 @@ list_to_plot = [] label = [] data_array_list = [] + list_to_plot_original = [] + colors = [] + with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: fig = plt.figure() - plt.subplots_adjust(bottom=0.25) + fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) + fig2 = plt.figure() + fig2.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) + + # plt.subplots_adjust(bottom=0.25) if firstFile != str(None): file1 = readFileReferenceFree(firstFile) integers = numpy.array(file1[:, 0]).astype(int) # keep original family sizes + list_to_plot_original.append(integers) + colors.append("#0000FF") # for plot: replace all big family sizes by 22 - data1 = numpy.array(file1[:, 0]).astype(int) - bigFamilies = numpy.where(data1 > 20)[0] - data1[bigFamilies] = 22 - + # data1 = numpy.array(file1[:, 0]).astype(int) + # bigFamilies = numpy.where(data1 > 20)[0] + # data1[bigFamilies] = 22 + if numpy.amax(integers) > 20: + bins = numpy.arange(numpy.amin(integers), numpy.amax(integers) + 1) + data1 = numpy.clip(integers, bins[0], bins[-1]) + else: + data1 = integers name1 = name1.split(".tabular")[0] list_to_plot.append(data1) label.append(name1) data_array_list.append(file1) legend = "\n\n\n{}".format(name1) - plt.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) - legend1 = "singletons:\nnr. of tags\n{:,}".format(numpy.bincount(data1)[1]) - plt.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) + fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) - legend3 = "freq. of tags\n{:.3f}".format(float(numpy.bincount(data1)[1]) / len(data1)) - plt.text(0.41, 0.11, legend3, size=10, transform=plt.gcf().transFigure) + legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / len(data1)) + fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) - legend3b = "PE reads\n{:.3f}".format(float(numpy.bincount(data1)[1]) / sum(integers)) - plt.text(0.5, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) + legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / sum(integers)) + fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int), float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1)) - plt.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) + fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers)) - plt.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) + fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "total nr. of\ntags\n{:,}".format(len(data1)) - plt.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) + fig.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) legend6b = "PE reads\n{:,}".format(sum(integers)) - plt.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) + fig.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) if secondFile != str(None): file2 = readFileReferenceFree(secondFile) integers2 = numpy.array(file2[:, 0]).astype(int) # keep original family sizes + list_to_plot_original.append(integers2) + colors.append("#298A08") - data2 = numpy.asarray(file2[:, 0]).astype(int) - bigFamilies2 = numpy.where(data2 > 20)[0] - data2[bigFamilies2] = 22 + # data2 = numpy.asarray(file2[:, 0]).astype(int) + # bigFamilies2 = numpy.where(data2 > 20)[0] + # data2[bigFamilies2] = 22 + if numpy.amax(integers) > 20: + bins = numpy.arange(numpy.amin(integers2), numpy.amax(integers2) + 1) + data2 = numpy.clip(integers2, bins[0], bins[-1]) + else: + data2 = integers2 list_to_plot.append(data2) name2 = name2.split(".tabular")[0] label.append(name2) data_array_list.append(file2) - plt.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) - - legend1 = "{:,}".format(numpy.bincount(data2)[1]) - plt.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) + fig.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) - legend3 = "{:.3f}".format(float(numpy.bincount(data2)[1]) / len(data2)) - plt.text(0.41, 0.09, legend3, size=10, transform=plt.gcf().transFigure) + legend1 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / len(data2)) + fig.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) - legend3b = "{:.3f}".format(float(numpy.bincount(data2)[1]) / sum(integers2)) - plt.text(0.5, 0.09, legend3b, size=10, transform=plt.gcf().transFigure) + legend3 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / sum(integers2)) + fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) legend4 = "{:,} ({:.3f})".format( numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int), float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2)) - plt.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) + fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2)) - plt.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) + fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "{:,}".format(len(data2)) - plt.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) + fig.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) legend6b = "{:,}".format(sum(integers2)) - plt.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) + fig.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) if thirdFile != str(None): file3 = readFileReferenceFree(thirdFile) integers3 = numpy.array(file3[:, 0]).astype(int) # keep original family sizes + list_to_plot_original.append(integers3) + colors.append("#DF0101") - data3 = numpy.asarray(file3[:, 0]).astype(int) - bigFamilies3 = numpy.where(data3 > 20)[0] - data3[bigFamilies3] = 22 + # data3 = numpy.asarray(file3[:, 0]).astype(int) + # bigFamilies3 = numpy.where(data3 > 20)[0] + # data3[bigFamilies3] = 22 + if numpy.amax(integers3) > 20: + bins = numpy.arange(numpy.amin(integers3), numpy.amax(integers3) + 1) + data3 = numpy.clip(integers3, bins[0], bins[-1]) + else: + data3 = integers3 list_to_plot.append(data3) name3 = name3.split(".tabular")[0] label.append(name3) data_array_list.append(file3) - plt.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) - - legend1 = "{:,}".format(numpy.bincount(data3)[1]) - plt.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) + fig.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) - legend3 = "{:.3f}".format(float(numpy.bincount(data3)[1]) / len(data3)) - plt.text(0.41, 0.07, legend3, size=10, transform=plt.gcf().transFigure) + legend1 = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / len(data3)) + fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) - legend3b = "{:.3f}".format(float(numpy.bincount(data3)[1]) / sum(integers3)) - plt.text(0.5, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) + legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / sum(integers3)) + fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) legend4 = "{:,} ({:.3f})".format( numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int), float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3)) - plt.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) + fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]), float(sum(integers3[integers3 > 20])) / sum(integers3)) - plt.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) + fig.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "{:,}".format(len(data3)) - plt.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) + fig.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) legend6b = "{:,}".format(sum(integers3)) - plt.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) + fig.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) if fourthFile != str(None): file4 = readFileReferenceFree(fourthFile) integers4 = numpy.array(file4[:, 0]).astype(int) # keep original family sizes - - data4 = numpy.asarray(file4[:, 0]).astype(int) + list_to_plot_original.append(integers4) + colors.append("#04cec7") - bigFamilies4 = numpy.where(data4 > 20)[0] - data4[bigFamilies4] = 22 - + # data4 = numpy.asarray(file4[:, 0]).astype(int) + # bigFamilies4 = numpy.where(data4 > 20)[0] + # data4[bigFamilies4] = 22 + if numpy.amax(integers4) > 20: + bins = numpy.arange(numpy.amin(integers4), numpy.amax(integers4) + 1) + data4 = numpy.clip(integers4, bins[0], bins[-1]) + else: + data4 = integers4 list_to_plot.append(data4) name4 = name4.split(".tabular")[0] label.append(name4) data_array_list.append(file4) - plt.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) - - legend1 = "{:,}".format(numpy.bincount(data4)[1]) - plt.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) + fig.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) - legend3 = "{:.3f}".format(float(numpy.bincount(data4)[1]) / len(data4)) - plt.text(0.41, 0.05, legend3, size=10, transform=plt.gcf().transFigure) + legend1 = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / len(data4)) + fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) - legend3b = "{:.3f}".format(float(numpy.bincount(data4)[1]) / sum(integers4)) - plt.text(0.5, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) + legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / sum(integers4)) + fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) legend4 = "{:,} ({:.3f})".format( numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int), float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4)) - plt.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) + fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]), float(sum(integers4[integers4 > 20])) / sum(integers4)) - plt.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) + fig.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "{:,}".format(len(data4)) - plt.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) + fig.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) legend6b = "{:,}".format(sum(integers4)) - plt.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) + fig.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) maximumX = numpy.amax(numpy.concatenate(list_to_plot)) minimumX = numpy.amin(numpy.concatenate(list_to_plot)) + bins = numpy.arange(minimumX, maximumX + 1) + list_to_plot2 = list_to_plot + to_plot = ["Absolute frequencies", "Relative frequencies"] + plt.xticks([], []) + plt.yticks([], []) + fig.suptitle('Family Size Distribution (tags)', fontsize=14) - counts = plt.hist(list_to_plot, bins=range(minimumX, maximumX + 1), stacked=False, edgecolor="black", - linewidth=1, label=label, align="left", rwidth=0.8, alpha=0.7) + for l in range(len(to_plot)): + ax = fig.add_subplot(2, 1, l+1) + ticks = numpy.arange(1, 22, 1) + ticks1 = map(str, ticks) + if maximumX > 20: + ticks1[len(ticks1) - 1] = ">20" - ticks = numpy.arange(minimumX - 1, maximumX, 1) - ticks1 = map(str, ticks) - ticks1[len(ticks1) - 1] = ">20" - plt.xticks(numpy.array(ticks), ticks1) + if to_plot[l] == "Relative frequencies": + counts_rel = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8, normed=True) + else: + counts = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8) + ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) - plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) - # plt.title("Family Size Distribution", fontsize=14) - plt.xlabel("Family size", fontsize=14) - plt.ylabel("Absolute Frequency", fontsize=14) - plt.margins(0.01, None) - plt.grid(b=True, which="major", color="#424242", linestyle=":") + ax.set_xticks(numpy.array(ticks)) + ax.set_xticklabels(ticks1) + + ax.set_ylabel(to_plot[l], fontsize=14) + ax.set_xlabel("Family size", fontsize=14) + if log_axis: + ax.set_yscale('log') + ax.grid(b=True, which="major", color="#424242", linestyle=":") + ax.margins(0.01, None) pdf.savefig(fig) plt.close() - # write data to CSV file - output_file.write("Values from family size distribution with all datasets\n") + fig2.suptitle('Family Size Distribution (PE reads)', fontsize=14) + for l in range(len(to_plot)): + ax = fig2.add_subplot(2, 1, l + 1) + ticks = numpy.arange(minimumX, maximumX + 1) + ticks1 = map(str, ticks) + if maximumX > 20: + ticks1[len(ticks1) - 1] = ">20" + reads = [] + reads_rel = [] + + barWidth = 0 - (len(list_to_plot)+1)/2 * 1./(len(list_to_plot) + 1) + + for i in range(len(list_to_plot2)): + unique, c = numpy.unique(list_to_plot2[i], return_counts=True) + new_c = [] + new_unique = [] + + for t in ticks: + if t not in unique: + new_c.append(0) # add zero count of not occuring + new_unique.append(t) + else: + c_idx = numpy.where(t == unique)[0] + new_c.append(c[c_idx]) + new_unique.append(unique[c_idx]) + y = numpy.array(new_unique) * numpy.array(new_c) + if len([list_to_plot_original > 20]) > 0: + y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20]) + reads.append(y) + reads_rel.append(list(numpy.float_(y)) / sum(y)) + + x = list(numpy.arange(numpy.amin(unique), numpy.amax(unique) + 1).astype(float)) + x = [xi + barWidth for xi in x] + + if to_plot[l] == "Relative frequencies": + counts2_rel = ax.bar(x, list(numpy.float_(y)) / sum(y), align="edge", width=1./(len(list_to_plot) + 1), + edgecolor="black", label=label[i], alpha=1, linewidth=1, color=colors[i]) + else: + counts2 = ax.bar(x, y, align="edge", width=1./len(list_to_plot), edgecolor="black", label=label[i], + alpha=1, linewidth=1, color=colors[i]) + if i == len(list_to_plot2): + barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1) + else: + barWidth += 1. / (len(list_to_plot) + 1) + + if to_plot[l] == "Absolute frequencies": + ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) + else: + ax.set_xlabel("Family size", fontsize=14) + + ax.set_xticks(numpy.array(ticks)) + ax.set_xticklabels(ticks1) + ax.set_ylabel(to_plot[l], fontsize=14) + if log_axis: + ax.set_yscale('log') + ax.grid(b=True, which="major", color="#424242", linestyle=":") + ax.margins(0.01, None) + + pdf.savefig(fig2) + plt.close() + + # write data to CSV file tags + output_file.write("Values from family size distribution with all datasets (tags)\n") output_file.write("\nFamily size") for i in label: output_file.write("{}{}".format(sep, i)) @@ -275,6 +405,35 @@ for i in counts[0]: output_file.write("{}{}".format(int(sum(i)), sep)) + # write data to CSV file PE reads + output_file.write("\n\nValues from family size distribution with all datasets (PE reads)\n") + output_file.write("\nFamily size") + for i in label: + output_file.write("{}{}".format(sep, i)) + # output_file.write("{}sum".format(sep)) + output_file.write("\n") + j = 0 + for fs in bins: + if fs == 21: + fs = ">20" + else: + fs = "={}".format(fs) + output_file.write("FS{}{}".format(fs, sep)) + if len(label) == 1: + output_file.write("{}{}".format(int(reads[0][j]), sep)) + else: + for n in range(len(label)): + output_file.write("{}{}".format(int(reads[n][j]), sep)) + output_file.write("\n") + j += 1 + output_file.write("sum{}".format(sep)) + if len(label) == 1: + output_file.write("{}{}".format(int(sum(reads)), sep)) + else: + for i in reads: + output_file.write("{}{}".format(int(sum(i)), sep)) + output_file.write("\n") + # Family size distribution after DCS and SSCS for dataset, data_o, name_file in zip(list_to_plot, data_array_list, label): maximumX = numpy.amax(dataset) @@ -298,6 +457,9 @@ duplTagsBA = duplTags_double[1::2] # ba of DCS duplTagsBA_o = duplTags_double_o[1::2] # ba of DCS + # duplTags_double_tag = tags[numpy.in1d(seq, d)] + # duplTags_double_seq = seq[numpy.in1d(seq, d)] + # get family sizes for SSCS with no partner ab = numpy.where(tags == "ab")[0] abSeq = seq[ab] @@ -322,10 +484,10 @@ dataAB_FS3_o = dataAB_o[dataAB_o >= 3] dataBA_FS3 = dataBA[dataBA >= 3] dataBA_FS3_o = dataBA_o[dataBA_o >= 3] - ab_FS3 = ab[ab >= 3] - ba_FS3 = ba[ba >= 3] - ab_FS3_o = ab_o[ab_o >= 3] - ba_FS3_o = ba_o[ba_o >= 3] + # ab_FS3 = ab[ab >= 3] + # ba_FS3 = ba[ba >= 3] + # ab_FS3_o = ab_o[ab_o >= 3] + # ba_FS3_o = ba_o[ba_o >= 3] duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3 duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)] # ba+ab with FS>=3 @@ -337,18 +499,20 @@ duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3 fig = plt.figure() - plt.subplots_adjust(bottom=0.3) - counts = plt.hist(list1, bins=range(minimumX, maximumX + 1), stacked=True, label=["duplex", "ab", "ba"], + plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0) + counts = plt.hist(list1, bins=numpy.arange(minimumX, maximumX + 2), stacked=True, label=["duplex", "ab", "ba"], edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], rwidth=0.8) # tick labels of x axis - ticks = numpy.arange(minimumX - 1, maximumX, 1) + ticks = numpy.arange(1, 22, 1) ticks1 = map(str, ticks) - ticks1[len(ticks1) - 1] = ">20" + if maximumX > 20: + ticks1[len(ticks1) - 1] = ">20" plt.xticks(numpy.array(ticks), ticks1) singl = counts[0][2][0] # singletons last = counts[0][2][len(counts[0][0]) - 1] # large families - + if log_axis: + plt.yscale('log') plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) plt.title(name_file, fontsize=14) plt.xlabel("Family size", fontsize=14) @@ -411,7 +575,7 @@ output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) - output_file.write("{}singletons:{}{}{}family size > 20:\n".format(sep, sep, sep, sep)) + output_file.write("{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep, sep, sep)) output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format( name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep, diff -r 2e517a54eedc -r c825a29a7d9f fsd.xml --- a/fsd.xml Tue Apr 02 05:10:09 2019 -0400 +++ b/fsd.xml Wed May 08 07:03:39 2019 -0400 @@ -10,13 +10,15 @@ python2 '$__tool_directory__/fsd.py' --inputFile1 '${file1}' --inputName1 '${file1.name}' --inputFile2 '${file2}' --inputName2 '${file2.name}' --inputFile3 '${file3}' --inputName3 '${file3.name}' ---inputFile4 '${file4}' --inputName4 '${file4.name}' --output_pdf $output_pdf --output_tabular $output_tabular +--inputFile4 '${file4}' --inputName4 '${file4.name}' $log_axis --output_pdf $output_pdf --output_tabular $output_tabular + + diff -r 2e517a54eedc -r c825a29a7d9f test-data/fsd_output1.pdf Binary file test-data/fsd_output1.pdf has changed diff -r 2e517a54eedc -r c825a29a7d9f test-data/fsd_output1.tab --- a/test-data/fsd_output1.tab Tue Apr 02 05:10:09 2019 -0400 +++ b/test-data/fsd_output1.tab Wed May 08 07:03:39 2019 -0400 @@ -1,4 +1,4 @@ -Values from family size distribution with all datasets +Values from family size distribution with all datasets (tags) Family size fsd_data1.tab fsd_data2.tab fsd_data3.tab fsd_data4.tab FS=1 63 63 63 63 @@ -23,12 +23,39 @@ FS=20 0 0 0 0 FS>20 1 1 1 1 sum 112 112 112 112 + +Values from family size distribution with all datasets (PE reads) + +Family size fsd_data1.tab fsd_data2.tab fsd_data3.tab fsd_data4.tab +FS=1 63 63 63 63 +FS=2 10 10 10 10 +FS=3 24 24 24 24 +FS=4 36 36 36 36 +FS=5 15 15 15 15 +FS=6 30 30 30 30 +FS=7 21 21 21 21 +FS=8 24 24 24 24 +FS=9 18 18 18 18 +FS=10 30 30 30 30 +FS=11 11 11 11 11 +FS=12 36 36 36 36 +FS=13 39 39 39 39 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 21 21 21 21 +sum 378 378 378 378 + Dataset: fsd_data1.tab max. family size: 21 absolute frequency: 1 relative frequency: 0.009 - singletons: family size > 20: + singletons: family size > 20: length of dataset: nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads fsd_data1.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 @@ -78,7 +105,7 @@ absolute frequency: 1 relative frequency: 0.009 - singletons: family size > 20: + singletons: family size > 20: length of dataset: nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads fsd_data2.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 @@ -128,7 +155,7 @@ absolute frequency: 1 relative frequency: 0.009 - singletons: family size > 20: + singletons: family size > 20: length of dataset: nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads fsd_data3.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 @@ -178,7 +205,7 @@ absolute frequency: 1 relative frequency: 0.009 - singletons: family size > 20: + singletons: family size > 20: length of dataset: nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads fsd_data4.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 diff -r 2e517a54eedc -r c825a29a7d9f test-data/fsd_output2.pdf Binary file test-data/fsd_output2.pdf has changed diff -r 2e517a54eedc -r c825a29a7d9f test-data/fsd_output2.tab --- a/test-data/fsd_output2.tab Tue Apr 02 05:10:09 2019 -0400 +++ b/test-data/fsd_output2.tab Wed May 08 07:03:39 2019 -0400 @@ -1,4 +1,4 @@ -Values from family size distribution with all datasets +Values from family size distribution with all datasets (tags) Family size fsd_data1.tab fsd_data2.tab fsd_data3.tab FS=1 63 63 63 @@ -23,12 +23,39 @@ FS=20 0 0 0 FS>20 1 1 1 sum 112 112 112 + +Values from family size distribution with all datasets (PE reads) + +Family size fsd_data1.tab fsd_data2.tab fsd_data3.tab +FS=1 63 63 63 +FS=2 10 10 10 +FS=3 24 24 24 +FS=4 36 36 36 +FS=5 15 15 15 +FS=6 30 30 30 +FS=7 21 21 21 +FS=8 24 24 24 +FS=9 18 18 18 +FS=10 30 30 30 +FS=11 11 11 11 +FS=12 36 36 36 +FS=13 39 39 39 +FS=14 0 0 0 +FS=15 0 0 0 +FS=16 0 0 0 +FS=17 0 0 0 +FS=18 0 0 0 +FS=19 0 0 0 +FS=20 0 0 0 +FS>20 21 21 21 +sum 378 378 378 + Dataset: fsd_data1.tab max. family size: 21 absolute frequency: 1 relative frequency: 0.009 - singletons: family size > 20: + singletons: family size > 20: length of dataset: nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads fsd_data1.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 @@ -78,7 +105,7 @@ absolute frequency: 1 relative frequency: 0.009 - singletons: family size > 20: + singletons: family size > 20: length of dataset: nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads fsd_data2.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 @@ -128,7 +155,7 @@ absolute frequency: 1 relative frequency: 0.009 - singletons: family size > 20: + singletons: family size > 20: length of dataset: nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads fsd_data3.tab 63 0.562 0.167 1 0.009 21 0.056 112 378