fsd: fsd.py comparison

comparison fsd.py @ 18:c825a29a7d9f draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty

author	mheinzl
date	Wed, 08 May 2019 07:03:39 -0400
parents	2e517a54eedc
children	b7bccbbee4a7

comparison

equal deleted inserted replaced

-:2e517a54eedc
+:c825a29a7d9f
 # The program produces a plot which shows the distribution of family sizes of all SSCSs from the input files and
 # a tabular file with the data of the plot, as well as a TXT file with all tags of the DCS and their family sizes.
 # If only one file is provided, then a family size distribution, which is separated into SSCSs without a partner and DCSs, is produced.
 # When more than one file (up to 4) is given, a family size distribution with the data of all files in one plot is produced instead.
-# USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf
+# USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --log_axis --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf
 import argparse
 import sys
+import os
 import matplotlib.pyplot as plt
 import numpy
 from matplotlib.backends.backend_pdf import PdfPages
 parser.add_argument('--inputName2')
 parser.add_argument('--inputFile3', default=None, help='Tabular File with three columns: ab or ba, tag and family size.')
 parser.add_argument('--inputName3')
 parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.')
 parser.add_argument('--inputName4')
+parser.add_argument('--log_axis', action="store_false", help='Transform y axis in log scale.')
 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.')
 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.')
 return parser
 def compare_read_families(argv):
 parser = make_argparser()
 args = parser.parse_args(argv[1:])
 firstFile = args.inputFile1
 name1 = args.inputName1
 secondFile = args.inputFile2
 name2 = args.inputName2
 thirdFile = args.inputFile3
 name3 = args.inputName3
 fourthFile = args.inputFile4
 name4 = args.inputName4
+log_axis = args.log_axis
 title_file = args.output_tabular
 title_file2 = args.output_pdf
 sep = "\t"
 plt.rcParams['ytick.labelsize'] = 14
 list_to_plot = []
 label = []
 data_array_list = []
+list_to_plot_original = []
+colors = []
 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
 fig = plt.figure()
-plt.subplots_adjust(bottom=0.25)
+fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0)
+fig2 = plt.figure()
+fig2.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0)
+# plt.subplots_adjust(bottom=0.25)
 if firstFile != str(None):
 file1 = readFileReferenceFree(firstFile)
 integers = numpy.array(file1[:, 0]).astype(int)  # keep original family sizes
+list_to_plot_original.append(integers)
+colors.append("#0000FF")
 # for plot: replace all big family sizes by 22
-data1 = numpy.array(file1[:, 0]).astype(int)
+# data1 = numpy.array(file1[:, 0]).astype(int)
-bigFamilies = numpy.where(data1 > 20)[0]
+# bigFamilies = numpy.where(data1 > 20)[0]
-data1[bigFamilies] = 22
+# data1[bigFamilies] = 22
+if numpy.amax(integers) > 20:
+bins = numpy.arange(numpy.amin(integers), numpy.amax(integers) + 1)
+data1 = numpy.clip(integers, bins[0], bins[-1])
+else:
+data1 = integers
 name1 = name1.split(".tabular")[0]
 list_to_plot.append(data1)
 label.append(name1)
 data_array_list.append(file1)
 legend = "\n\n\n{}".format(name1)
-plt.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)
+fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)
-legend1 = "singletons:\nnr. of tags\n{:,}".format(numpy.bincount(data1)[1])
+fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)
-plt.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)
+legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / len(data1))
-legend3 = "freq. of tags\n{:.3f}".format(float(numpy.bincount(data1)[1]) / len(data1))
+fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)
-plt.text(0.41, 0.11, legend3, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)
-legend3b = "PE reads\n{:.3f}".format(float(numpy.bincount(data1)[1]) / sum(integers))
+legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / sum(integers))
-plt.text(0.5, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)
+fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)
 legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int), float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1))
-plt.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)
+fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)
 legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers))
-plt.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)
+fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)
 legend6 = "total nr. of\ntags\n{:,}".format(len(data1))
-plt.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure)
+fig.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure)
 legend6b = "PE reads\n{:,}".format(sum(integers))
-plt.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure)
+fig.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure)
 if secondFile != str(None):
 file2 = readFileReferenceFree(secondFile)
 integers2 = numpy.array(file2[:, 0]).astype(int)  # keep original family sizes
+list_to_plot_original.append(integers2)
-data2 = numpy.asarray(file2[:, 0]).astype(int)
+colors.append("#298A08")
-bigFamilies2 = numpy.where(data2 > 20)[0]
-data2[bigFamilies2] = 22
+# data2 = numpy.asarray(file2[:, 0]).astype(int)
+# bigFamilies2 = numpy.where(data2 > 20)[0]
+# data2[bigFamilies2] = 22
+if numpy.amax(integers) > 20:
+bins = numpy.arange(numpy.amin(integers2), numpy.amax(integers2) + 1)
+data2 = numpy.clip(integers2, bins[0], bins[-1])
+else:
+data2 = integers2
 list_to_plot.append(data2)
 name2 = name2.split(".tabular")[0]
 label.append(name2)
 data_array_list.append(file2)
-plt.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure)
+fig.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure)
-legend1 = "{:,}".format(numpy.bincount(data2)[1])
-plt.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure)
+legend1 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / len(data2))
+fig.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure)
-legend3 = "{:.3f}".format(float(numpy.bincount(data2)[1]) / len(data2))
+fig2.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure)
-plt.text(0.41, 0.09, legend3, size=10, transform=plt.gcf().transFigure)
+legend3 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / sum(integers2))
-legend3b = "{:.3f}".format(float(numpy.bincount(data2)[1]) / sum(integers2))
+fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure)
-plt.text(0.5, 0.09, legend3b, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure)
 legend4 = "{:,} ({:.3f})".format(
 numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int),
 float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2))
-plt.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)
+fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)
 legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2))
-plt.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
+fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
 legend6 = "{:,}".format(len(data2))
-plt.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure)
+fig.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure)
 legend6b = "{:,}".format(sum(integers2))
-plt.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure)
+fig.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure)
 if thirdFile != str(None):
 file3 = readFileReferenceFree(thirdFile)
 integers3 = numpy.array(file3[:, 0]).astype(int)  # keep original family sizes
+list_to_plot_original.append(integers3)
-data3 = numpy.asarray(file3[:, 0]).astype(int)
+colors.append("#DF0101")
-bigFamilies3 = numpy.where(data3 > 20)[0]
-data3[bigFamilies3] = 22
+# data3 = numpy.asarray(file3[:, 0]).astype(int)
+# bigFamilies3 = numpy.where(data3 > 20)[0]
+# data3[bigFamilies3] = 22
+if numpy.amax(integers3) > 20:
+bins = numpy.arange(numpy.amin(integers3), numpy.amax(integers3) + 1)
+data3 = numpy.clip(integers3, bins[0], bins[-1])
+else:
+data3 = integers3
 list_to_plot.append(data3)
 name3 = name3.split(".tabular")[0]
 label.append(name3)
 data_array_list.append(file3)
-plt.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure)
+fig.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure)
-legend1 = "{:,}".format(numpy.bincount(data3)[1])
-plt.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)
+legend1 = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / len(data3))
+fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)
-legend3 = "{:.3f}".format(float(numpy.bincount(data3)[1]) / len(data3))
+fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)
-plt.text(0.41, 0.07, legend3, size=10, transform=plt.gcf().transFigure)
+legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / sum(integers3))
-legend3b = "{:.3f}".format(float(numpy.bincount(data3)[1]) / sum(integers3))
+fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)
-plt.text(0.5, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)
 legend4 = "{:,} ({:.3f})".format(
 numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int),
 float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3))
-plt.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)
+fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)
 legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]),
 float(sum(integers3[integers3 > 20])) / sum(integers3))
-plt.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure)
+fig.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure)
 legend6 = "{:,}".format(len(data3))
-plt.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure)
+fig.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure)
 legend6b = "{:,}".format(sum(integers3))
-plt.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure)
+fig.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure)
 if fourthFile != str(None):
 file4 = readFileReferenceFree(fourthFile)
 integers4 = numpy.array(file4[:, 0]).astype(int)  # keep original family sizes
+list_to_plot_original.append(integers4)
-data4 = numpy.asarray(file4[:, 0]).astype(int)
+colors.append("#04cec7")
-bigFamilies4 = numpy.where(data4 > 20)[0]
+# data4 = numpy.asarray(file4[:, 0]).astype(int)
-data4[bigFamilies4] = 22
+# bigFamilies4 = numpy.where(data4 > 20)[0]
+# data4[bigFamilies4] = 22
+if numpy.amax(integers4) > 20:
+bins = numpy.arange(numpy.amin(integers4), numpy.amax(integers4) + 1)
+data4 = numpy.clip(integers4, bins[0], bins[-1])
+else:
+data4 = integers4
 list_to_plot.append(data4)
 name4 = name4.split(".tabular")[0]
 label.append(name4)
 data_array_list.append(file4)
-plt.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure)
+fig.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure)
-legend1 = "{:,}".format(numpy.bincount(data4)[1])
-plt.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)
+legend1 = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / len(data4))
+fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)
-legend3 = "{:.3f}".format(float(numpy.bincount(data4)[1]) / len(data4))
+fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)
-plt.text(0.41, 0.05, legend3, size=10, transform=plt.gcf().transFigure)
+legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / sum(integers4))
-legend3b = "{:.3f}".format(float(numpy.bincount(data4)[1]) / sum(integers4))
+fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)
-plt.text(0.5, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)
 legend4 = "{:,} ({:.3f})".format(
 numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int),
 float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4))
-plt.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)
+fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)
 legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]),
 float(sum(integers4[integers4 > 20])) / sum(integers4))
-plt.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure)
+fig.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure)
 legend6 = "{:,}".format(len(data4))
-plt.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure)
+fig.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure)
 legend6b = "{:,}".format(sum(integers4))
-plt.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure)
+fig.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure)
+fig2.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure)
 maximumX = numpy.amax(numpy.concatenate(list_to_plot))
 minimumX = numpy.amin(numpy.concatenate(list_to_plot))
+bins = numpy.arange(minimumX, maximumX + 1)
-counts = plt.hist(list_to_plot, bins=range(minimumX, maximumX + 1), stacked=False, edgecolor="black",
+list_to_plot2 = list_to_plot
-linewidth=1, label=label, align="left", rwidth=0.8, alpha=0.7)
+to_plot = ["Absolute frequencies", "Relative frequencies"]
+plt.xticks([], [])
-ticks = numpy.arange(minimumX - 1, maximumX, 1)
+plt.yticks([], [])
-ticks1 = map(str, ticks)
+fig.suptitle('Family Size Distribution (tags)', fontsize=14)
-ticks1[len(ticks1) - 1] = ">20"
-plt.xticks(numpy.array(ticks), ticks1)
+for l in range(len(to_plot)):
+ax = fig.add_subplot(2, 1, l+1)
-plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+ticks = numpy.arange(1, 22, 1)
-# plt.title("Family Size Distribution", fontsize=14)
+ticks1 = map(str, ticks)
-plt.xlabel("Family size", fontsize=14)
+if maximumX > 20:
-plt.ylabel("Absolute Frequency", fontsize=14)
+ticks1[len(ticks1) - 1] = ">20"
-plt.margins(0.01, None)
-plt.grid(b=True, which="major", color="#424242", linestyle=":")
+if to_plot[l] == "Relative frequencies":
+counts_rel = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8, normed=True)
+else:
+counts = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8)
+ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+ax.set_xticks(numpy.array(ticks))
+ax.set_xticklabels(ticks1)
+ax.set_ylabel(to_plot[l], fontsize=14)
+ax.set_xlabel("Family size", fontsize=14)
+if log_axis:
+ax.set_yscale('log')
+ax.grid(b=True, which="major", color="#424242", linestyle=":")
+ax.margins(0.01, None)
 pdf.savefig(fig)
 plt.close()
-# write data to CSV file
+fig2.suptitle('Family Size Distribution (PE reads)', fontsize=14)
-output_file.write("Values from family size distribution with all datasets\n")
+for l in range(len(to_plot)):
+ax = fig2.add_subplot(2, 1, l + 1)
+ticks = numpy.arange(minimumX, maximumX + 1)
+ticks1 = map(str, ticks)
+if maximumX > 20:
+ticks1[len(ticks1) - 1] = ">20"
+reads = []
+reads_rel = []
+barWidth = 0 - (len(list_to_plot)+1)/2 * 1./(len(list_to_plot) + 1)
+for i in range(len(list_to_plot2)):
+unique, c = numpy.unique(list_to_plot2[i], return_counts=True)
+new_c = []
+new_unique = []
+for t in ticks:
+if t not in unique:
+new_c.append(0) # add zero count of not occuring
+new_unique.append(t)
+else:
+c_idx = numpy.where(t == unique)[0]
+new_c.append(c[c_idx])
+new_unique.append(unique[c_idx])
+y = numpy.array(new_unique) * numpy.array(new_c)
+if len([list_to_plot_original > 20]) > 0:
+y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20])
+reads.append(y)
+reads_rel.append(list(numpy.float_(y)) / sum(y))
+x = list(numpy.arange(numpy.amin(unique), numpy.amax(unique) + 1).astype(float))
+x = [xi + barWidth for xi in x]
+if to_plot[l] == "Relative frequencies":
+counts2_rel = ax.bar(x, list(numpy.float_(y)) / sum(y), align="edge", width=1./(len(list_to_plot) + 1),
+edgecolor="black", label=label[i], alpha=1, linewidth=1, color=colors[i])
+else:
+counts2 = ax.bar(x, y, align="edge", width=1./len(list_to_plot), edgecolor="black", label=label[i],
+alpha=1, linewidth=1, color=colors[i])
+if i == len(list_to_plot2):
+barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1)
+else:
+barWidth += 1. / (len(list_to_plot) + 1)
+if to_plot[l] == "Absolute frequencies":
+ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+else:
+ax.set_xlabel("Family size", fontsize=14)
+ax.set_xticks(numpy.array(ticks))
+ax.set_xticklabels(ticks1)
+ax.set_ylabel(to_plot[l], fontsize=14)
+if log_axis:
+ax.set_yscale('log')
+ax.grid(b=True, which="major", color="#424242", linestyle=":")
+ax.margins(0.01, None)
+pdf.savefig(fig2)
+plt.close()
+# write data to CSV file tags
+output_file.write("Values from family size distribution with all datasets (tags)\n")
 output_file.write("\nFamily size")
 for i in label:
 output_file.write("{}{}".format(sep, i))
 # output_file.write("{}sum".format(sep))
 output_file.write("\n")
 output_file.write("{}{}".format(int(sum(counts[0])), sep))
 else:
 for i in counts[0]:
 output_file.write("{}{}".format(int(sum(i)), sep))
+# write data to CSV file PE reads
+output_file.write("\n\nValues from family size distribution with all datasets (PE reads)\n")
+output_file.write("\nFamily size")
+for i in label:
+output_file.write("{}{}".format(sep, i))
+# output_file.write("{}sum".format(sep))
+output_file.write("\n")
+j = 0
+for fs in bins:
+if fs == 21:
+fs = ">20"
+else:
+fs = "={}".format(fs)
+output_file.write("FS{}{}".format(fs, sep))
+if len(label) == 1:
+output_file.write("{}{}".format(int(reads[0][j]), sep))
+else:
+for n in range(len(label)):
+output_file.write("{}{}".format(int(reads[n][j]), sep))
+output_file.write("\n")
+j += 1
+output_file.write("sum{}".format(sep))
+if len(label) == 1:
+output_file.write("{}{}".format(int(sum(reads)), sep))
+else:
+for i in reads:
+output_file.write("{}{}".format(int(sum(i)), sep))
+output_file.write("\n")
 # Family size distribution after DCS and SSCS
 for dataset, data_o, name_file in zip(list_to_plot, data_array_list, label):
 maximumX = numpy.amax(dataset)
 minimumX = numpy.amin(dataset)
 duplTags_o = duplTags_double_o[0::2]  # ab of DCS
 duplTagsBA = duplTags_double[1::2]  # ba of DCS
 duplTagsBA_o = duplTags_double_o[1::2]  # ba of DCS
+# duplTags_double_tag = tags[numpy.in1d(seq, d)]
+# duplTags_double_seq = seq[numpy.in1d(seq, d)]
 # get family sizes for SSCS with no partner
 ab = numpy.where(tags == "ab")[0]
 abSeq = seq[ab]
 ab_o = data_o[ab]
 ab = data[ab]
 # information for family size >= 3
 dataAB_FS3 = dataAB[dataAB >= 3]
 dataAB_FS3_o = dataAB_o[dataAB_o >= 3]
 dataBA_FS3 = dataBA[dataBA >= 3]
 dataBA_FS3_o = dataBA_o[dataBA_o >= 3]
-ab_FS3 = ab[ab >= 3]
+# ab_FS3 = ab[ab >= 3]
-ba_FS3 = ba[ba >= 3]
+# ba_FS3 = ba[ba >= 3]
-ab_FS3_o = ab_o[ab_o >= 3]
+# ab_FS3_o = ab_o[ab_o >= 3]
-ba_FS3_o = ba_o[ba_o >= 3]
+# ba_FS3_o = ba_o[ba_o >= 3]
 duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)]  # ab+ba with FS>=3
 duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)]  # ba+ab with FS>=3
 duplTags_double_FS3 = len(duplTags_FS3) + len(duplTags_FS3_BA)  # both ab and ba strands with FS>=3
 duplTags_FS3_o = duplTags_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)]  # ab+ba with FS>=3
 duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)]  # ba+ab with FS>=3
 duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o)  # both ab and ba strands with FS>=3
 fig = plt.figure()
-plt.subplots_adjust(bottom=0.3)
+plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0)
-counts = plt.hist(list1, bins=range(minimumX, maximumX + 1), stacked=True, label=["duplex", "ab", "ba"],
+counts = plt.hist(list1, bins=numpy.arange(minimumX, maximumX + 2), stacked=True, label=["duplex", "ab", "ba"],
 edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"],
 rwidth=0.8)
 # tick labels of x axis
-ticks = numpy.arange(minimumX - 1, maximumX, 1)
+ticks = numpy.arange(1, 22, 1)
 ticks1 = map(str, ticks)
-ticks1[len(ticks1) - 1] = ">20"
+if maximumX > 20:
+ticks1[len(ticks1) - 1] = ">20"
 plt.xticks(numpy.array(ticks), ticks1)
 singl = counts[0][2][0]  # singletons
 last = counts[0][2][len(counts[0][0]) - 1]  # large families
+if log_axis:
+plt.yscale('log')
 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
 plt.title(name_file, fontsize=14)
 plt.xlabel("Family size", fontsize=14)
 plt.ylabel("Absolute Frequency", fontsize=14)
 plt.margins(0.01, None)
 output_file.write("\nDataset:{}{}\n".format(sep, name_file))
 output_file.write("max. family size:{}{}\n".format(sep, max(integers)))
 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
 output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))
-output_file.write("{}singletons:{}{}{}family size > 20:\n".format(sep, sep, sep, sep))
+output_file.write("{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
 output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
 output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format(
 name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep,
 last.astype(int), sep, last / len(data), sep, sum(data_o[data_o > 20]), sep, float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data), sep, sum(data_o)))

Mercurial > repos > mheinzl > fsd

comparison fsd.py @ 18:c825a29a7d9f draft