Mercurial > repos > mheinzl > fsd
changeset 18:c825a29a7d9f draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty
author | mheinzl |
---|---|
date | Wed, 08 May 2019 07:03:39 -0400 |
parents | 2e517a54eedc |
children | b7bccbbee4a7 |
files | fsd.py fsd.xml test-data/fsd_output1.pdf test-data/fsd_output1.tab test-data/fsd_output2.pdf test-data/fsd_output2.tab |
diffstat | 6 files changed, 318 insertions(+), 98 deletions(-) [+] |
line wrap: on
line diff
--- a/fsd.py Tue Apr 02 05:10:09 2019 -0400 +++ b/fsd.py Wed May 08 07:03:39 2019 -0400 @@ -11,10 +11,11 @@ # If only one file is provided, then a family size distribution, which is separated after SSCSs without a partner and DCSs, is produced. # Whereas a family size distribution with multiple data in one plot is produced, when more than one file (up to 4) is given. -# USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf +# USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --log_axis --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf import argparse import sys +import os import matplotlib.pyplot as plt import numpy @@ -39,6 +40,7 @@ parser.add_argument('--inputName3') parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') parser.add_argument('--inputName4') + parser.add_argument('--log_axis', action="store_false", help='Transform y axis in log scale.') parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.') parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.') return parser @@ -48,14 +50,18 @@ parser = make_argparser() args = parser.parse_args(argv[1:]) + firstFile = args.inputFile1 name1 = args.inputName1 + secondFile = args.inputFile2 name2 = args.inputName2 thirdFile = args.inputFile3 name3 = args.inputName3 fourthFile = args.inputFile4 name4 = args.inputName4 + log_axis = args.log_axis + title_file = args.output_tabular title_file2 = args.output_pdf @@ -70,185 +76,309 @@ list_to_plot = [] label = [] data_array_list = [] + list_to_plot_original = [] + colors = [] + with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: fig = plt.figure() - plt.subplots_adjust(bottom=0.25) + fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) + fig2 = plt.figure() + fig2.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) + + # plt.subplots_adjust(bottom=0.25) if firstFile != str(None): file1 = readFileReferenceFree(firstFile) integers = numpy.array(file1[:, 0]).astype(int) # keep original family sizes + list_to_plot_original.append(integers) + colors.append("#0000FF") # for plot: replace all big family sizes by 22 - data1 = numpy.array(file1[:, 0]).astype(int) - bigFamilies = numpy.where(data1 > 20)[0] - data1[bigFamilies] = 22 - + # data1 = numpy.array(file1[:, 0]).astype(int) + # bigFamilies = numpy.where(data1 > 20)[0] + # data1[bigFamilies] = 22 + if numpy.amax(integers) > 20: + bins = numpy.arange(numpy.amin(integers), numpy.amax(integers) + 1) + data1 = numpy.clip(integers, bins[0], bins[-1]) + else: + data1 = integers name1 = name1.split(".tabular")[0] list_to_plot.append(data1) label.append(name1) data_array_list.append(file1) legend = "\n\n\n{}".format(name1) - plt.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) - legend1 = "singletons:\nnr. of tags\n{:,}".format(numpy.bincount(data1)[1]) - plt.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) + fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) - legend3 = "freq. of tags\n{:.3f}".format(float(numpy.bincount(data1)[1]) / len(data1)) - plt.text(0.41, 0.11, legend3, size=10, transform=plt.gcf().transFigure) + legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / len(data1)) + fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) - legend3b = "PE reads\n{:.3f}".format(float(numpy.bincount(data1)[1]) / sum(integers)) - plt.text(0.5, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) + legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / sum(integers)) + fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int), float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1)) - plt.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) + fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers)) - plt.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) + fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "total nr. of\ntags\n{:,}".format(len(data1)) - plt.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) + fig.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) legend6b = "PE reads\n{:,}".format(sum(integers)) - plt.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) + fig.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) if secondFile != str(None): file2 = readFileReferenceFree(secondFile) integers2 = numpy.array(file2[:, 0]).astype(int) # keep original family sizes + list_to_plot_original.append(integers2) + colors.append("#298A08") - data2 = numpy.asarray(file2[:, 0]).astype(int) - bigFamilies2 = numpy.where(data2 > 20)[0] - data2[bigFamilies2] = 22 + # data2 = numpy.asarray(file2[:, 0]).astype(int) + # bigFamilies2 = numpy.where(data2 > 20)[0] + # data2[bigFamilies2] = 22 + if numpy.amax(integers) > 20: + bins = numpy.arange(numpy.amin(integers2), numpy.amax(integers2) + 1) + data2 = numpy.clip(integers2, bins[0], bins[-1]) + else: + data2 = integers2 list_to_plot.append(data2) name2 = name2.split(".tabular")[0] label.append(name2) data_array_list.append(file2) - plt.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) - - legend1 = "{:,}".format(numpy.bincount(data2)[1]) - plt.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) + fig.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) - legend3 = "{:.3f}".format(float(numpy.bincount(data2)[1]) / len(data2)) - plt.text(0.41, 0.09, legend3, size=10, transform=plt.gcf().transFigure) + legend1 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / len(data2)) + fig.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) - legend3b = "{:.3f}".format(float(numpy.bincount(data2)[1]) / sum(integers2)) - plt.text(0.5, 0.09, legend3b, size=10, transform=plt.gcf().transFigure) + legend3 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / sum(integers2)) + fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) legend4 = "{:,} ({:.3f})".format( numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int), float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2)) - plt.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) + fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2)) - plt.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) + fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "{:,}".format(len(data2)) - plt.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) + fig.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) legend6b = "{:,}".format(sum(integers2)) - plt.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) + fig.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) if thirdFile != str(None): file3 = readFileReferenceFree(thirdFile) integers3 = numpy.array(file3[:, 0]).astype(int) # keep original family sizes + list_to_plot_original.append(integers3) + colors.append("#DF0101") - data3 = numpy.asarray(file3[:, 0]).astype(int) - bigFamilies3 = numpy.where(data3 > 20)[0] - data3[bigFamilies3] = 22 + # data3 = numpy.asarray(file3[:, 0]).astype(int) + # bigFamilies3 = numpy.where(data3 > 20)[0] + # data3[bigFamilies3] = 22 + if numpy.amax(integers3) > 20: + bins = numpy.arange(numpy.amin(integers3), numpy.amax(integers3) + 1) + data3 = numpy.clip(integers3, bins[0], bins[-1]) + else: + data3 = integers3 list_to_plot.append(data3) name3 = name3.split(".tabular")[0] label.append(name3) data_array_list.append(file3) - plt.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) - - legend1 = "{:,}".format(numpy.bincount(data3)[1]) - plt.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) + fig.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) - legend3 = "{:.3f}".format(float(numpy.bincount(data3)[1]) / len(data3)) - plt.text(0.41, 0.07, legend3, size=10, transform=plt.gcf().transFigure) + legend1 = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / len(data3)) + fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) - legend3b = "{:.3f}".format(float(numpy.bincount(data3)[1]) / sum(integers3)) - plt.text(0.5, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) + legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / sum(integers3)) + fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) legend4 = "{:,} ({:.3f})".format( numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int), float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3)) - plt.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) + fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]), float(sum(integers3[integers3 > 20])) / sum(integers3)) - plt.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) + fig.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "{:,}".format(len(data3)) - plt.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) + fig.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) legend6b = "{:,}".format(sum(integers3)) - plt.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) + fig.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) if fourthFile != str(None): file4 = readFileReferenceFree(fourthFile) integers4 = numpy.array(file4[:, 0]).astype(int) # keep original family sizes - - data4 = numpy.asarray(file4[:, 0]).astype(int) + list_to_plot_original.append(integers4) + colors.append("#04cec7") - bigFamilies4 = numpy.where(data4 > 20)[0] - data4[bigFamilies4] = 22 - + # data4 = numpy.asarray(file4[:, 0]).astype(int) + # bigFamilies4 = numpy.where(data4 > 20)[0] + # data4[bigFamilies4] = 22 + if numpy.amax(integers4) > 20: + bins = numpy.arange(numpy.amin(integers4), numpy.amax(integers4) + 1) + data4 = numpy.clip(integers4, bins[0], bins[-1]) + else: + data4 = integers4 list_to_plot.append(data4) name4 = name4.split(".tabular")[0] label.append(name4) data_array_list.append(file4) - plt.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) - - legend1 = "{:,}".format(numpy.bincount(data4)[1]) - plt.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) + fig.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) - legend3 = "{:.3f}".format(float(numpy.bincount(data4)[1]) / len(data4)) - plt.text(0.41, 0.05, legend3, size=10, transform=plt.gcf().transFigure) + legend1 = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / len(data4)) + fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) - legend3b = "{:.3f}".format(float(numpy.bincount(data4)[1]) / sum(integers4)) - plt.text(0.5, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) + legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / sum(integers4)) + fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) legend4 = "{:,} ({:.3f})".format( numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int), float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4)) - plt.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) + fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]), float(sum(integers4[integers4 > 20])) / sum(integers4)) - plt.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) + fig.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "{:,}".format(len(data4)) - plt.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) + fig.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) legend6b = "{:,}".format(sum(integers4)) - plt.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) + fig.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) maximumX = numpy.amax(numpy.concatenate(list_to_plot)) minimumX = numpy.amin(numpy.concatenate(list_to_plot)) + bins = numpy.arange(minimumX, maximumX + 1) + list_to_plot2 = list_to_plot + to_plot = ["Absolute frequencies", "Relative frequencies"] + plt.xticks([], []) + plt.yticks([], []) + fig.suptitle('Family Size Distribution (tags)', fontsize=14) - counts = plt.hist(list_to_plot, bins=range(minimumX, maximumX + 1), stacked=False, edgecolor="black", - linewidth=1, label=label, align="left", rwidth=0.8, alpha=0.7) + for l in range(len(to_plot)): + ax = fig.add_subplot(2, 1, l+1) + ticks = numpy.arange(1, 22, 1) + ticks1 = map(str, ticks) + if maximumX > 20: + ticks1[len(ticks1) - 1] = ">20" - ticks = numpy.arange(minimumX - 1, maximumX, 1) - ticks1 = map(str, ticks) - ticks1[len(ticks1) - 1] = ">20" - plt.xticks(numpy.array(ticks), ticks1) + if to_plot[l] == "Relative frequencies": + counts_rel = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8, normed=True) + else: + counts = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8) + ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) - plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) - # plt.title("Family Size Distribution", fontsize=14) - plt.xlabel("Family size", fontsize=14) - plt.ylabel("Absolute Frequency", fontsize=14) - plt.margins(0.01, None) - plt.grid(b=True, which="major", color="#424242", linestyle=":") + ax.set_xticks(numpy.array(ticks)) + ax.set_xticklabels(ticks1) + + ax.set_ylabel(to_plot[l], fontsize=14) + ax.set_xlabel("Family size", fontsize=14) + if log_axis: + ax.set_yscale('log') + ax.grid(b=True, which="major", color="#424242", linestyle=":") + ax.margins(0.01, None) pdf.savefig(fig) plt.close() - # write data to CSV file - output_file.write("Values from family size distribution with all datasets\n") + fig2.suptitle('Family Size Distribution (PE reads)', fontsize=14) + for l in range(len(to_plot)): + ax = fig2.add_subplot(2, 1, l + 1) + ticks = numpy.arange(minimumX, maximumX + 1) + ticks1 = map(str, ticks) + if maximumX > 20: + ticks1[len(ticks1) - 1] = ">20" + reads = [] + reads_rel = [] + + barWidth = 0 - (len(list_to_plot)+1)/2 * 1./(len(list_to_plot) + 1) + + for i in range(len(list_to_plot2)): + unique, c = numpy.unique(list_to_plot2[i], return_counts=True) + new_c = [] + new_unique = [] + + for t in ticks: + if t not in unique: + new_c.append(0) # add zero count of not occuring + new_unique.append(t) + else: + c_idx = numpy.where(t == unique)[0] + new_c.append(c[c_idx]) + new_unique.append(unique[c_idx]) + y = numpy.array(new_unique) * numpy.array(new_c) + if len([list_to_plot_original > 20]) > 0: + y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20]) + reads.append(y) + reads_rel.append(list(numpy.float_(y)) / sum(y)) + + x = list(numpy.arange(numpy.amin(unique), numpy.amax(unique) + 1).astype(float)) + x = [xi + barWidth for xi in x] + + if to_plot[l] == "Relative frequencies": + counts2_rel = ax.bar(x, list(numpy.float_(y)) / sum(y), align="edge", width=1./(len(list_to_plot) + 1), + edgecolor="black", label=label[i], alpha=1, linewidth=1, color=colors[i]) + else: + counts2 = ax.bar(x, y, align="edge", width=1./len(list_to_plot), edgecolor="black", label=label[i], + alpha=1, linewidth=1, color=colors[i]) + if i == len(list_to_plot2): + barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1) + else: + barWidth += 1. / (len(list_to_plot) + 1) + + if to_plot[l] == "Absolute frequencies": + ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) + else: + ax.set_xlabel("Family size", fontsize=14) + + ax.set_xticks(numpy.array(ticks)) + ax.set_xticklabels(ticks1) + ax.set_ylabel(to_plot[l], fontsize=14) + if log_axis: + ax.set_yscale('log') + ax.grid(b=True, which="major", color="#424242", linestyle=":") + ax.margins(0.01, None) + + pdf.savefig(fig2) + plt.close() + + # write data to CSV file tags + output_file.write("Values from family size distribution with all datasets (tags)\n") output_file.write("\nFamily size") for i in label: output_file.write("{}{}".format(sep, i)) @@ -275,6 +405,35 @@ for i in counts[0]: output_file.write("{}{}".format(int(sum(i)), sep)) + # write data to CSV file PE reads + output_file.write("\n\nValues from family size distribution with all datasets (PE reads)\n") + output_file.write("\nFamily size") + for i in label: + output_file.write("{}{}".format(sep, i)) + # output_file.write("{}sum".format(sep)) + output_file.write("\n") + j = 0 + for fs in bins: + if fs == 21: + fs = ">20" + else: + fs = "={}".format(fs) + output_file.write("FS{}{}".format(fs, sep)) + if len(label) == 1: + output_file.write("{}{}".format(int(reads[0][j]), sep)) + else: + for n in range(len(label)): + output_file.write("{}{}".format(int(reads[n][j]), sep)) + output_file.write("\n") + j += 1 + output_file.write("sum{}".format(sep)) + if len(label) == 1: + output_file.write("{}{}".format(int(sum(reads)), sep)) + else: + for i in reads: + output_file.write("{}{}".format(int(sum(i)), sep)) + output_file.write("\n") + # Family size distribution after DCS and SSCS for dataset, data_o, name_file in zip(list_to_plot, data_array_list, label): maximumX = numpy.amax(dataset) @@ -298,6 +457,9 @@ duplTagsBA = duplTags_double[1::2] # ba of DCS duplTagsBA_o = duplTags_double_o[1::2] # ba of DCS + # duplTags_double_tag = tags[numpy.in1d(seq, d)] + # duplTags_double_seq = seq[numpy.in1d(seq, d)] + # get family sizes for SSCS with no partner ab = numpy.where(tags == "ab")[0] abSeq = seq[ab] @@ -322,10 +484,10 @@ dataAB_FS3_o = dataAB_o[dataAB_o >= 3] dataBA_FS3 = dataBA[dataBA >= 3] dataBA_FS3_o = dataBA_o[dataBA_o >= 3] - ab_FS3 = ab[ab >= 3] - ba_FS3 = ba[ba >= 3] - ab_FS3_o = ab_o[ab_o >= 3] - ba_FS3_o = ba_o[ba_o >= 3] + # ab_FS3 = ab[ab >= 3] + # ba_FS3 = ba[ba >= 3] + # ab_FS3_o = ab_o[ab_o >= 3] + # ba_FS3_o = ba_o[ba_o >= 3] duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3 duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)] # ba+ab with FS>=3 @@ -337,18 +499,20 @@ duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3 fig = plt.figure() - plt.subplots_adjust(bottom=0.3) - counts = plt.hist(list1, bins=range(minimumX, maximumX + 1), stacked=True, label=["duplex", "ab", "ba"], + plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0) + counts = plt.hist(list1, bins=numpy.arange(minimumX, maximumX + 2), stacked=True, label=["duplex", "ab", "ba"], edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], rwidth=0.8) # tick labels of x axis - ticks = numpy.arange(minimumX - 1, maximumX, 1) + ticks = numpy.arange(1, 22, 1) ticks1 = map(str, ticks) - ticks1[len(ticks1) - 1] = ">20" + if maximumX > 20: + ticks1[len(ticks1) - 1] = ">20" plt.xticks(numpy.array(ticks), ticks1) singl = counts[0][2][0] # singletons last = counts[0][2][len(counts[0][0]) - 1] # large families - + if log_axis: + plt.yscale('log') plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) plt.title(name_file, fontsize=14) plt.xlabel("Family size", fontsize=14) @@ -411,7 +575,7 @@ output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) - output_file.write("{}singletons:{}{}{}family size > 20:\n".format(sep, sep, sep, sep)) + output_file.write("{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep, sep, sep)) output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format( name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep,
--- a/fsd.xml Tue Apr 02 05:10:09 2019 -0400 +++ b/fsd.xml Wed May 08 07:03:39 2019 -0400 @@ -10,13 +10,15 @@ <command> python2 '$__tool_directory__/fsd.py' --inputFile1 '${file1}' --inputName1 '${file1.name}' --inputFile2 '${file2}' --inputName2 '${file2.name}' --inputFile3 '${file3}' --inputName3 '${file3.name}' ---inputFile4 '${file4}' --inputName4 '${file4.name}' --output_pdf $output_pdf --output_tabular $output_tabular +--inputFile4 '${file4}' --inputName4 '${file4.name}' $log_axis --output_pdf $output_pdf --output_tabular $output_tabular </command> <inputs> <param name="file1" type="data" format="tabular" label="Dataset 1: input tags" optional="false"/> <param name="file2" type="data" format="tabular" label="Dataset 2: input tags" optional="true" /> <param name="file3" type="data" format="tabular" label="Dataset 3: input tags" optional="true" /> <param name="file4" type="data" format="tabular" label="Dataset 4: input tags" optional="true" help="Input in tabular format with the family size, tags and the direction of the strand ('ab' or 'ba') for each family. Name of the files can have max. 34 charcters!"/> + <param name="log_axis" type="boolean" label="log scale for y axis?" truevalue="" falsevalue="--log_axis" checked="False" help="Transform y axis in log scale."/> + </inputs> <outputs> <data name="output_pdf" format="pdf" />
--- a/test-data/fsd_output1.tab Tue Apr 02 05:10:09 2019 -0400 +++ b/test-data/fsd_output1.tab Wed May 08 07:03:39 2019 -0400 @@ -1,4 +1,4 @@ -Values from family size distribution with all datasets +Values from family size distribution with all datasets (tags) Family size fsd_data1.tab fsd_data2.tab fsd_data3.tab fsd_data4.tab FS=1 63 63 63 63 @@ -23,12 +23,39 @@ FS=20 0 0 0 0 FS>20 1 1 1 1 sum 112 112 112 112 + +Values from family size distribution with all datasets (PE reads) + +Family size fsd_data1.tab fsd_data2.tab fsd_data3.tab fsd_data4.tab +FS=1 63 63 63 63 +FS=2 10 10 10 10 +FS=3 24 24 24 24 +FS=4 36 36 36 36 +FS=5 15 15 15 15 +FS=6 30 30 30 30 +FS=7 21 21 21 21 +FS=8 24 24 24 24 +FS=9 18 18 18 18 +FS=10 30 30 30 30 +FS=11 11 11 11 11 +FS=12 36 36 36 36 +FS=13 39 39 39 39 +FS=14 0 0 0 0 +FS=15 0 0 0 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 21 21 21 21 +sum 378 378 378 378 + Dataset: fsd_data1.tab max. family size: 21 absolute frequency: 1 relative frequency: 0.009 - singletons: family size > 20: + singletons: family size > 20: length of dataset: nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads fsd_data1.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 @@ -78,7 +105,7 @@ absolute frequency: 1 relative frequency: 0.009 - singletons: family size > 20: + singletons: family size > 20: length of dataset: nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads fsd_data2.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 @@ -128,7 +155,7 @@ absolute frequency: 1 relative frequency: 0.009 - singletons: family size > 20: + singletons: family size > 20: length of dataset: nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads fsd_data3.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 @@ -178,7 +205,7 @@ absolute frequency: 1 relative frequency: 0.009 - singletons: family size > 20: + singletons: family size > 20: length of dataset: nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads fsd_data4.tab 63 0.562 0.167 1 0.009 21 0.056 112 378
--- a/test-data/fsd_output2.tab Tue Apr 02 05:10:09 2019 -0400 +++ b/test-data/fsd_output2.tab Wed May 08 07:03:39 2019 -0400 @@ -1,4 +1,4 @@ -Values from family size distribution with all datasets +Values from family size distribution with all datasets (tags) Family size fsd_data1.tab fsd_data2.tab fsd_data3.tab FS=1 63 63 63 @@ -23,12 +23,39 @@ FS=20 0 0 0 FS>20 1 1 1 sum 112 112 112 + +Values from family size distribution with all datasets (PE reads) + +Family size fsd_data1.tab fsd_data2.tab fsd_data3.tab +FS=1 63 63 63 +FS=2 10 10 10 +FS=3 24 24 24 +FS=4 36 36 36 +FS=5 15 15 15 +FS=6 30 30 30 +FS=7 21 21 21 +FS=8 24 24 24 +FS=9 18 18 18 +FS=10 30 30 30 +FS=11 11 11 11 +FS=12 36 36 36 +FS=13 39 39 39 +FS=14 0 0 0 +FS=15 0 0 0 +FS=16 0 0 0 +FS=17 0 0 0 +FS=18 0 0 0 +FS=19 0 0 0 +FS=20 0 0 0 +FS>20 21 21 21 +sum 378 378 378 + Dataset: fsd_data1.tab max. family size: 21 absolute frequency: 1 relative frequency: 0.009 - singletons: family size > 20: + singletons: family size > 20: length of dataset: nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads fsd_data1.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 @@ -78,7 +105,7 @@ absolute frequency: 1 relative frequency: 0.009 - singletons: family size > 20: + singletons: family size > 20: length of dataset: nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads fsd_data2.tab 63 0.562 0.167 1 0.009 21 0.056 112 378 @@ -128,7 +155,7 @@ absolute frequency: 1 relative frequency: 0.009 - singletons: family size > 20: + singletons: family size > 20: length of dataset: nr. of tags rel. freq of tags rel.freq of PE reads nr. of tags rel. freq of tags nr. of PE reads rel. freq of PE reads total nr. of tags total nr. of PE reads fsd_data3.tab 63 0.562 0.167 1 0.009 21 0.056 112 378