Mercurial > repos > mheinzl > fsd
diff fsd.py @ 18:c825a29a7d9f draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty
author | mheinzl |
---|---|
date | Wed, 08 May 2019 07:03:39 -0400 |
parents | 2e517a54eedc |
children | b7bccbbee4a7 |
line wrap: on
line diff
--- a/fsd.py Tue Apr 02 05:10:09 2019 -0400 +++ b/fsd.py Wed May 08 07:03:39 2019 -0400 @@ -11,10 +11,11 @@ # If only one file is provided, then a family size distribution, which is separated after SSCSs without a partner and DCSs, is produced. # Whereas a family size distribution with multiple data in one plot is produced, when more than one file (up to 4) is given. -# USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf +# USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --log_axis --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf import argparse import sys +import os import matplotlib.pyplot as plt import numpy @@ -39,6 +40,7 @@ parser.add_argument('--inputName3') parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') parser.add_argument('--inputName4') + parser.add_argument('--log_axis', action="store_false", help='Transform y axis in log scale.') parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.') parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.') return parser @@ -48,14 +50,18 @@ parser = make_argparser() args = parser.parse_args(argv[1:]) + firstFile = args.inputFile1 name1 = args.inputName1 + secondFile = args.inputFile2 name2 = args.inputName2 thirdFile = args.inputFile3 name3 = args.inputName3 fourthFile = args.inputFile4 name4 = args.inputName4 + log_axis = args.log_axis + title_file = args.output_tabular title_file2 = args.output_pdf @@ -70,185 +76,309 @@ list_to_plot = [] label = [] data_array_list = [] + list_to_plot_original = [] + colors = [] + with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: fig = plt.figure() - plt.subplots_adjust(bottom=0.25) + fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) + fig2 = plt.figure() + fig2.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) + + # plt.subplots_adjust(bottom=0.25) if firstFile != str(None): file1 = readFileReferenceFree(firstFile) integers = numpy.array(file1[:, 0]).astype(int) # keep original family sizes + list_to_plot_original.append(integers) + colors.append("#0000FF") # for plot: replace all big family sizes by 22 - data1 = numpy.array(file1[:, 0]).astype(int) - bigFamilies = numpy.where(data1 > 20)[0] - data1[bigFamilies] = 22 - + # data1 = numpy.array(file1[:, 0]).astype(int) + # bigFamilies = numpy.where(data1 > 20)[0] + # data1[bigFamilies] = 22 + if numpy.amax(integers) > 20: + bins = numpy.arange(numpy.amin(integers), numpy.amax(integers) + 1) + data1 = numpy.clip(integers, bins[0], bins[-1]) + else: + data1 = integers name1 = name1.split(".tabular")[0] list_to_plot.append(data1) label.append(name1) data_array_list.append(file1) legend = "\n\n\n{}".format(name1) - plt.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) - legend1 = "singletons:\nnr. of tags\n{:,}".format(numpy.bincount(data1)[1]) - plt.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) + fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) - legend3 = "freq. of tags\n{:.3f}".format(float(numpy.bincount(data1)[1]) / len(data1)) - plt.text(0.41, 0.11, legend3, size=10, transform=plt.gcf().transFigure) + legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / len(data1)) + fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) - legend3b = "PE reads\n{:.3f}".format(float(numpy.bincount(data1)[1]) / sum(integers)) - plt.text(0.5, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) + legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / sum(integers)) + fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int), float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1)) - plt.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) + fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers)) - plt.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) + fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "total nr. of\ntags\n{:,}".format(len(data1)) - plt.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) + fig.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) legend6b = "PE reads\n{:,}".format(sum(integers)) - plt.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) + fig.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) if secondFile != str(None): file2 = readFileReferenceFree(secondFile) integers2 = numpy.array(file2[:, 0]).astype(int) # keep original family sizes + list_to_plot_original.append(integers2) + colors.append("#298A08") - data2 = numpy.asarray(file2[:, 0]).astype(int) - bigFamilies2 = numpy.where(data2 > 20)[0] - data2[bigFamilies2] = 22 + # data2 = numpy.asarray(file2[:, 0]).astype(int) + # bigFamilies2 = numpy.where(data2 > 20)[0] + # data2[bigFamilies2] = 22 + if numpy.amax(integers) > 20: + bins = numpy.arange(numpy.amin(integers2), numpy.amax(integers2) + 1) + data2 = numpy.clip(integers2, bins[0], bins[-1]) + else: + data2 = integers2 list_to_plot.append(data2) name2 = name2.split(".tabular")[0] label.append(name2) data_array_list.append(file2) - plt.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) - - legend1 = "{:,}".format(numpy.bincount(data2)[1]) - plt.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) + fig.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) - legend3 = "{:.3f}".format(float(numpy.bincount(data2)[1]) / len(data2)) - plt.text(0.41, 0.09, legend3, size=10, transform=plt.gcf().transFigure) + legend1 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / len(data2)) + fig.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) - legend3b = "{:.3f}".format(float(numpy.bincount(data2)[1]) / sum(integers2)) - plt.text(0.5, 0.09, legend3b, size=10, transform=plt.gcf().transFigure) + legend3 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / sum(integers2)) + fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) legend4 = "{:,} ({:.3f})".format( numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int), float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2)) - plt.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) + fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2)) - plt.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) + fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "{:,}".format(len(data2)) - plt.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) + fig.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) legend6b = "{:,}".format(sum(integers2)) - plt.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) + fig.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) if thirdFile != str(None): file3 = readFileReferenceFree(thirdFile) integers3 = numpy.array(file3[:, 0]).astype(int) # keep original family sizes + list_to_plot_original.append(integers3) + colors.append("#DF0101") - data3 = numpy.asarray(file3[:, 0]).astype(int) - bigFamilies3 = numpy.where(data3 > 20)[0] - data3[bigFamilies3] = 22 + # data3 = numpy.asarray(file3[:, 0]).astype(int) + # bigFamilies3 = numpy.where(data3 > 20)[0] + # data3[bigFamilies3] = 22 + if numpy.amax(integers3) > 20: + bins = numpy.arange(numpy.amin(integers3), numpy.amax(integers3) + 1) + data3 = numpy.clip(integers3, bins[0], bins[-1]) + else: + data3 = integers3 list_to_plot.append(data3) name3 = name3.split(".tabular")[0] label.append(name3) data_array_list.append(file3) - plt.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) - - legend1 = "{:,}".format(numpy.bincount(data3)[1]) - plt.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) + fig.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) - legend3 = "{:.3f}".format(float(numpy.bincount(data3)[1]) / len(data3)) - plt.text(0.41, 0.07, legend3, size=10, transform=plt.gcf().transFigure) + legend1 = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / len(data3)) + fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) - legend3b = "{:.3f}".format(float(numpy.bincount(data3)[1]) / sum(integers3)) - plt.text(0.5, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) + legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / sum(integers3)) + fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) legend4 = "{:,} ({:.3f})".format( numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int), float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3)) - plt.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) + fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]), float(sum(integers3[integers3 > 20])) / sum(integers3)) - plt.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) + fig.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "{:,}".format(len(data3)) - plt.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) + fig.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) legend6b = "{:,}".format(sum(integers3)) - plt.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) + fig.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) if fourthFile != str(None): file4 = readFileReferenceFree(fourthFile) integers4 = numpy.array(file4[:, 0]).astype(int) # keep original family sizes - - data4 = numpy.asarray(file4[:, 0]).astype(int) + list_to_plot_original.append(integers4) + colors.append("#04cec7") - bigFamilies4 = numpy.where(data4 > 20)[0] - data4[bigFamilies4] = 22 - + # data4 = numpy.asarray(file4[:, 0]).astype(int) + # bigFamilies4 = numpy.where(data4 > 20)[0] + # data4[bigFamilies4] = 22 + if numpy.amax(integers4) > 20: + bins = numpy.arange(numpy.amin(integers4), numpy.amax(integers4) + 1) + data4 = numpy.clip(integers4, bins[0], bins[-1]) + else: + data4 = integers4 list_to_plot.append(data4) name4 = name4.split(".tabular")[0] label.append(name4) data_array_list.append(file4) - plt.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) - - legend1 = "{:,}".format(numpy.bincount(data4)[1]) - plt.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) + fig.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) - legend3 = "{:.3f}".format(float(numpy.bincount(data4)[1]) / len(data4)) - plt.text(0.41, 0.05, legend3, size=10, transform=plt.gcf().transFigure) + legend1 = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / len(data4)) + fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) + fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) - legend3b = "{:.3f}".format(float(numpy.bincount(data4)[1]) / sum(integers4)) - plt.text(0.5, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) + legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / sum(integers4)) + fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) legend4 = "{:,} ({:.3f})".format( numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int), float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4)) - plt.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) + fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) + fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]), float(sum(integers4[integers4 > 20])) / sum(integers4)) - plt.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) + fig.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) + fig2.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) legend6 = "{:,}".format(len(data4)) - plt.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) + fig.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) + fig2.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) legend6b = "{:,}".format(sum(integers4)) - plt.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) + fig.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) + fig2.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) maximumX = numpy.amax(numpy.concatenate(list_to_plot)) minimumX = numpy.amin(numpy.concatenate(list_to_plot)) + bins = numpy.arange(minimumX, maximumX + 1) + list_to_plot2 = list_to_plot + to_plot = ["Absolute frequencies", "Relative frequencies"] + plt.xticks([], []) + plt.yticks([], []) + fig.suptitle('Family Size Distribution (tags)', fontsize=14) - counts = plt.hist(list_to_plot, bins=range(minimumX, maximumX + 1), stacked=False, edgecolor="black", - linewidth=1, label=label, align="left", rwidth=0.8, alpha=0.7) + for l in range(len(to_plot)): + ax = fig.add_subplot(2, 1, l+1) + ticks = numpy.arange(1, 22, 1) + ticks1 = map(str, ticks) + if maximumX > 20: + ticks1[len(ticks1) - 1] = ">20" - ticks = numpy.arange(minimumX - 1, maximumX, 1) - ticks1 = map(str, ticks) - ticks1[len(ticks1) - 1] = ">20" - plt.xticks(numpy.array(ticks), ticks1) + if to_plot[l] == "Relative frequencies": + counts_rel = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8, normed=True) + else: + counts = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8) + ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) - plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) - # plt.title("Family Size Distribution", fontsize=14) - plt.xlabel("Family size", fontsize=14) - plt.ylabel("Absolute Frequency", fontsize=14) - plt.margins(0.01, None) - plt.grid(b=True, which="major", color="#424242", linestyle=":") + ax.set_xticks(numpy.array(ticks)) + ax.set_xticklabels(ticks1) + + ax.set_ylabel(to_plot[l], fontsize=14) + ax.set_xlabel("Family size", fontsize=14) + if log_axis: + ax.set_yscale('log') + ax.grid(b=True, which="major", color="#424242", linestyle=":") + ax.margins(0.01, None) pdf.savefig(fig) plt.close() - # write data to CSV file - output_file.write("Values from family size distribution with all datasets\n") + fig2.suptitle('Family Size Distribution (PE reads)', fontsize=14) + for l in range(len(to_plot)): + ax = fig2.add_subplot(2, 1, l + 1) + ticks = numpy.arange(minimumX, maximumX + 1) + ticks1 = map(str, ticks) + if maximumX > 20: + ticks1[len(ticks1) - 1] = ">20" + reads = [] + reads_rel = [] + + barWidth = 0 - (len(list_to_plot)+1)/2 * 1./(len(list_to_plot) + 1) + + for i in range(len(list_to_plot2)): + unique, c = numpy.unique(list_to_plot2[i], return_counts=True) + new_c = [] + new_unique = [] + + for t in ticks: + if t not in unique: + new_c.append(0) # add zero count of not occuring + new_unique.append(t) + else: + c_idx = numpy.where(t == unique)[0] + new_c.append(c[c_idx]) + new_unique.append(unique[c_idx]) + y = numpy.array(new_unique) * numpy.array(new_c) + if len([list_to_plot_original > 20]) > 0: + y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20]) + reads.append(y) + reads_rel.append(list(numpy.float_(y)) / sum(y)) + + x = list(numpy.arange(numpy.amin(unique), numpy.amax(unique) + 1).astype(float)) + x = [xi + barWidth for xi in x] + + if to_plot[l] == "Relative frequencies": + counts2_rel = ax.bar(x, list(numpy.float_(y)) / sum(y), align="edge", width=1./(len(list_to_plot) + 1), + edgecolor="black", label=label[i], alpha=1, linewidth=1, color=colors[i]) + else: + counts2 = ax.bar(x, y, align="edge", width=1./len(list_to_plot), edgecolor="black", label=label[i], + alpha=1, linewidth=1, color=colors[i]) + if i == len(list_to_plot2): + barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1) + else: + barWidth += 1. / (len(list_to_plot) + 1) + + if to_plot[l] == "Absolute frequencies": + ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) + else: + ax.set_xlabel("Family size", fontsize=14) + + ax.set_xticks(numpy.array(ticks)) + ax.set_xticklabels(ticks1) + ax.set_ylabel(to_plot[l], fontsize=14) + if log_axis: + ax.set_yscale('log') + ax.grid(b=True, which="major", color="#424242", linestyle=":") + ax.margins(0.01, None) + + pdf.savefig(fig2) + plt.close() + + # write data to CSV file tags + output_file.write("Values from family size distribution with all datasets (tags)\n") output_file.write("\nFamily size") for i in label: output_file.write("{}{}".format(sep, i)) @@ -275,6 +405,35 @@ for i in counts[0]: output_file.write("{}{}".format(int(sum(i)), sep)) + # write data to CSV file PE reads + output_file.write("\n\nValues from family size distribution with all datasets (PE reads)\n") + output_file.write("\nFamily size") + for i in label: + output_file.write("{}{}".format(sep, i)) + # output_file.write("{}sum".format(sep)) + output_file.write("\n") + j = 0 + for fs in bins: + if fs == 21: + fs = ">20" + else: + fs = "={}".format(fs) + output_file.write("FS{}{}".format(fs, sep)) + if len(label) == 1: + output_file.write("{}{}".format(int(reads[0][j]), sep)) + else: + for n in range(len(label)): + output_file.write("{}{}".format(int(reads[n][j]), sep)) + output_file.write("\n") + j += 1 + output_file.write("sum{}".format(sep)) + if len(label) == 1: + output_file.write("{}{}".format(int(sum(reads)), sep)) + else: + for i in reads: + output_file.write("{}{}".format(int(sum(i)), sep)) + output_file.write("\n") + # Family size distribution after DCS and SSCS for dataset, data_o, name_file in zip(list_to_plot, data_array_list, label): maximumX = numpy.amax(dataset) @@ -298,6 +457,9 @@ duplTagsBA = duplTags_double[1::2] # ba of DCS duplTagsBA_o = duplTags_double_o[1::2] # ba of DCS + # duplTags_double_tag = tags[numpy.in1d(seq, d)] + # duplTags_double_seq = seq[numpy.in1d(seq, d)] + # get family sizes for SSCS with no partner ab = numpy.where(tags == "ab")[0] abSeq = seq[ab] @@ -322,10 +484,10 @@ dataAB_FS3_o = dataAB_o[dataAB_o >= 3] dataBA_FS3 = dataBA[dataBA >= 3] dataBA_FS3_o = dataBA_o[dataBA_o >= 3] - ab_FS3 = ab[ab >= 3] - ba_FS3 = ba[ba >= 3] - ab_FS3_o = ab_o[ab_o >= 3] - ba_FS3_o = ba_o[ba_o >= 3] + # ab_FS3 = ab[ab >= 3] + # ba_FS3 = ba[ba >= 3] + # ab_FS3_o = ab_o[ab_o >= 3] + # ba_FS3_o = ba_o[ba_o >= 3] duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3 duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)] # ba+ab with FS>=3 @@ -337,18 +499,20 @@ duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3 fig = plt.figure() - plt.subplots_adjust(bottom=0.3) - counts = plt.hist(list1, bins=range(minimumX, maximumX + 1), stacked=True, label=["duplex", "ab", "ba"], + plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0) + counts = plt.hist(list1, bins=numpy.arange(minimumX, maximumX + 2), stacked=True, label=["duplex", "ab", "ba"], edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], rwidth=0.8) # tick labels of x axis - ticks = numpy.arange(minimumX - 1, maximumX, 1) + ticks = numpy.arange(1, 22, 1) ticks1 = map(str, ticks) - ticks1[len(ticks1) - 1] = ">20" + if maximumX > 20: + ticks1[len(ticks1) - 1] = ">20" plt.xticks(numpy.array(ticks), ticks1) singl = counts[0][2][0] # singletons last = counts[0][2][len(counts[0][0]) - 1] # large families - + if log_axis: + plt.yscale('log') plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) plt.title(name_file, fontsize=14) plt.xlabel("Family size", fontsize=14) @@ -411,7 +575,7 @@ output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) - output_file.write("{}singletons:{}{}{}family size > 20:\n".format(sep, sep, sep, sep)) + output_file.write("{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep, sep, sep)) output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format( name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep,