comparison fsd.py @ 18:c825a29a7d9f draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty
author mheinzl
date Wed, 08 May 2019 07:03:39 -0400
parents 2e517a54eedc
children b7bccbbee4a7
comparison
equal deleted inserted replaced
17:2e517a54eedc 18:c825a29a7d9f
9 # The program produces a plot which shows the distribution of family sizes of the all SSCSs from the input files and 9 # The program produces a plot which shows the distribution of family sizes of the all SSCSs from the input files and
10 # a tabular file with the data of the plot, as well as a TXT file with all tags of the DCS and their family sizes. 10 # a tabular file with the data of the plot, as well as a TXT file with all tags of the DCS and their family sizes.
11 # If only one file is provided, then a family size distribution, which is separated after SSCSs without a partner and DCSs, is produced. 11 # If only one file is provided, then a family size distribution, which is separated after SSCSs without a partner and DCSs, is produced.
12 # Whereas a family size distribution with multiple data in one plot is produced, when more than one file (up to 4) is given. 12 # Whereas a family size distribution with multiple data in one plot is produced, when more than one file (up to 4) is given.
13 13
14 # USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf 14 # USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --log_axis --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf
15 15
16 import argparse 16 import argparse
17 import sys 17 import sys
18 import os
18 19
19 import matplotlib.pyplot as plt 20 import matplotlib.pyplot as plt
20 import numpy 21 import numpy
21 from matplotlib.backends.backend_pdf import PdfPages 22 from matplotlib.backends.backend_pdf import PdfPages
22 23
37 parser.add_argument('--inputName2') 38 parser.add_argument('--inputName2')
38 parser.add_argument('--inputFile3', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') 39 parser.add_argument('--inputFile3', default=None, help='Tabular File with three columns: ab or ba, tag and family size.')
39 parser.add_argument('--inputName3') 40 parser.add_argument('--inputName3')
40 parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') 41 parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.')
41 parser.add_argument('--inputName4') 42 parser.add_argument('--inputName4')
43 parser.add_argument('--log_axis', action="store_false", help='Transform y axis in log scale.')
42 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.') 44 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.')
43 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.') 45 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.')
44 return parser 46 return parser
45 47
46 48
47 def compare_read_families(argv): 49 def compare_read_families(argv):
48 50
49 parser = make_argparser() 51 parser = make_argparser()
50 args = parser.parse_args(argv[1:]) 52 args = parser.parse_args(argv[1:])
53
51 firstFile = args.inputFile1 54 firstFile = args.inputFile1
52 name1 = args.inputName1 55 name1 = args.inputName1
56
53 secondFile = args.inputFile2 57 secondFile = args.inputFile2
54 name2 = args.inputName2 58 name2 = args.inputName2
55 thirdFile = args.inputFile3 59 thirdFile = args.inputFile3
56 name3 = args.inputName3 60 name3 = args.inputName3
57 fourthFile = args.inputFile4 61 fourthFile = args.inputFile4
58 name4 = args.inputName4 62 name4 = args.inputName4
63 log_axis = args.log_axis
64
59 title_file = args.output_tabular 65 title_file = args.output_tabular
60 title_file2 = args.output_pdf 66 title_file2 = args.output_pdf
61 67
62 sep = "\t" 68 sep = "\t"
63 69
68 plt.rcParams['ytick.labelsize'] = 14 74 plt.rcParams['ytick.labelsize'] = 14
69 75
70 list_to_plot = [] 76 list_to_plot = []
71 label = [] 77 label = []
72 data_array_list = [] 78 data_array_list = []
79 list_to_plot_original = []
80 colors = []
81
73 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: 82 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
74 fig = plt.figure() 83 fig = plt.figure()
75 plt.subplots_adjust(bottom=0.25) 84 fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0)
85 fig2 = plt.figure()
86 fig2.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0)
87
88 # plt.subplots_adjust(bottom=0.25)
76 if firstFile != str(None): 89 if firstFile != str(None):
77 file1 = readFileReferenceFree(firstFile) 90 file1 = readFileReferenceFree(firstFile)
78 integers = numpy.array(file1[:, 0]).astype(int) # keep original family sizes 91 integers = numpy.array(file1[:, 0]).astype(int) # keep original family sizes
92 list_to_plot_original.append(integers)
93 colors.append("#0000FF")
79 94
80 # for plot: replace all big family sizes by 22 95 # for plot: replace all big family sizes by 22
81 data1 = numpy.array(file1[:, 0]).astype(int) 96 # data1 = numpy.array(file1[:, 0]).astype(int)
82 bigFamilies = numpy.where(data1 > 20)[0] 97 # bigFamilies = numpy.where(data1 > 20)[0]
83 data1[bigFamilies] = 22 98 # data1[bigFamilies] = 22
84 99 if numpy.amax(integers) > 20:
100 bins = numpy.arange(numpy.amin(integers), numpy.amax(integers) + 1)
101 data1 = numpy.clip(integers, bins[0], bins[-1])
102 else:
103 data1 = integers
85 name1 = name1.split(".tabular")[0] 104 name1 = name1.split(".tabular")[0]
86 list_to_plot.append(data1) 105 list_to_plot.append(data1)
87 label.append(name1) 106 label.append(name1)
88 data_array_list.append(file1) 107 data_array_list.append(file1)
89 108
90 legend = "\n\n\n{}".format(name1) 109 legend = "\n\n\n{}".format(name1)
91 plt.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) 110 fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)
92 legend1 = "singletons:\nnr. of tags\n{:,}".format(numpy.bincount(data1)[1]) 111 fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)
93 plt.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) 112
94 113 legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / len(data1))
95 legend3 = "freq. of tags\n{:.3f}".format(float(numpy.bincount(data1)[1]) / len(data1)) 114 fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)
96 plt.text(0.41, 0.11, legend3, size=10, transform=plt.gcf().transFigure) 115 fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)
97 116
98 legend3b = "PE reads\n{:.3f}".format(float(numpy.bincount(data1)[1]) / sum(integers)) 117 legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / sum(integers))
99 plt.text(0.5, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) 118 fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)
119 fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)
100 120
101 legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int), float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1)) 121 legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int), float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1))
102 plt.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) 122 fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)
123 fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)
103 124
104 legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers)) 125 legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers))
105 plt.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) 126 fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)
127 fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)
106 128
107 legend6 = "total nr. of\ntags\n{:,}".format(len(data1)) 129 legend6 = "total nr. of\ntags\n{:,}".format(len(data1))
108 plt.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) 130 fig.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure)
131 fig2.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure)
109 132
110 legend6b = "PE reads\n{:,}".format(sum(integers)) 133 legend6b = "PE reads\n{:,}".format(sum(integers))
111 plt.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) 134 fig.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure)
135 fig2.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure)
112 136
113 if secondFile != str(None): 137 if secondFile != str(None):
114 file2 = readFileReferenceFree(secondFile) 138 file2 = readFileReferenceFree(secondFile)
115 integers2 = numpy.array(file2[:, 0]).astype(int) # keep original family sizes 139 integers2 = numpy.array(file2[:, 0]).astype(int) # keep original family sizes
116 140 list_to_plot_original.append(integers2)
117 data2 = numpy.asarray(file2[:, 0]).astype(int) 141 colors.append("#298A08")
118 bigFamilies2 = numpy.where(data2 > 20)[0] 142
119 data2[bigFamilies2] = 22 143 # data2 = numpy.asarray(file2[:, 0]).astype(int)
120 144 # bigFamilies2 = numpy.where(data2 > 20)[0]
145 # data2[bigFamilies2] = 22
146
147 if numpy.amax(integers) > 20:
148 bins = numpy.arange(numpy.amin(integers2), numpy.amax(integers2) + 1)
149 data2 = numpy.clip(integers2, bins[0], bins[-1])
150 else:
151 data2 = integers2
121 list_to_plot.append(data2) 152 list_to_plot.append(data2)
122 name2 = name2.split(".tabular")[0] 153 name2 = name2.split(".tabular")[0]
123 label.append(name2) 154 label.append(name2)
124 data_array_list.append(file2) 155 data_array_list.append(file2)
125 156
126 plt.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) 157 fig.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure)
127 158 fig2.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure)
128 legend1 = "{:,}".format(numpy.bincount(data2)[1]) 159
129 plt.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) 160 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / len(data2))
130 161 fig.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure)
131 legend3 = "{:.3f}".format(float(numpy.bincount(data2)[1]) / len(data2)) 162 fig2.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure)
132 plt.text(0.41, 0.09, legend3, size=10, transform=plt.gcf().transFigure) 163
133 164 legend3 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / sum(integers2))
134 legend3b = "{:.3f}".format(float(numpy.bincount(data2)[1]) / sum(integers2)) 165 fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure)
135 plt.text(0.5, 0.09, legend3b, size=10, transform=plt.gcf().transFigure) 166 fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure)
136 167
137 legend4 = "{:,} ({:.3f})".format( 168 legend4 = "{:,} ({:.3f})".format(
138 numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int), 169 numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int),
139 float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2)) 170 float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2))
140 plt.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) 171 fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)
172 fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)
141 173
142 legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2)) 174 legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2))
143 plt.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) 175 fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
176 fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
144 177
145 legend6 = "{:,}".format(len(data2)) 178 legend6 = "{:,}".format(len(data2))
146 plt.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) 179 fig.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure)
180 fig2.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure)
147 181
148 legend6b = "{:,}".format(sum(integers2)) 182 legend6b = "{:,}".format(sum(integers2))
149 plt.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) 183 fig.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure)
184 fig2.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure)
150 185
151 if thirdFile != str(None): 186 if thirdFile != str(None):
152 file3 = readFileReferenceFree(thirdFile) 187 file3 = readFileReferenceFree(thirdFile)
153 integers3 = numpy.array(file3[:, 0]).astype(int) # keep original family sizes 188 integers3 = numpy.array(file3[:, 0]).astype(int) # keep original family sizes
154 189 list_to_plot_original.append(integers3)
155 data3 = numpy.asarray(file3[:, 0]).astype(int) 190 colors.append("#DF0101")
156 bigFamilies3 = numpy.where(data3 > 20)[0] 191
157 data3[bigFamilies3] = 22 192 # data3 = numpy.asarray(file3[:, 0]).astype(int)
158 193 # bigFamilies3 = numpy.where(data3 > 20)[0]
194 # data3[bigFamilies3] = 22
195
196 if numpy.amax(integers3) > 20:
197 bins = numpy.arange(numpy.amin(integers3), numpy.amax(integers3) + 1)
198 data3 = numpy.clip(integers3, bins[0], bins[-1])
199 else:
200 data3 = integers3
159 list_to_plot.append(data3) 201 list_to_plot.append(data3)
160 name3 = name3.split(".tabular")[0] 202 name3 = name3.split(".tabular")[0]
161 label.append(name3) 203 label.append(name3)
162 data_array_list.append(file3) 204 data_array_list.append(file3)
163 205
164 plt.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) 206 fig.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure)
165 207 fig2.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure)
166 legend1 = "{:,}".format(numpy.bincount(data3)[1]) 208
167 plt.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) 209 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / len(data3))
168 210 fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)
169 legend3 = "{:.3f}".format(float(numpy.bincount(data3)[1]) / len(data3)) 211 fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)
170 plt.text(0.41, 0.07, legend3, size=10, transform=plt.gcf().transFigure) 212
171 213 legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / sum(integers3))
172 legend3b = "{:.3f}".format(float(numpy.bincount(data3)[1]) / sum(integers3)) 214 fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)
173 plt.text(0.5, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) 215 fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)
174 216
175 legend4 = "{:,} ({:.3f})".format( 217 legend4 = "{:,} ({:.3f})".format(
176 numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int), 218 numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int),
177 float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3)) 219 float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3))
178 plt.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) 220 fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)
221 fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)
179 222
180 legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]), 223 legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]),
181 float(sum(integers3[integers3 > 20])) / sum(integers3)) 224 float(sum(integers3[integers3 > 20])) / sum(integers3))
182 plt.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) 225 fig.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure)
226 fig2.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure)
183 227
184 legend6 = "{:,}".format(len(data3)) 228 legend6 = "{:,}".format(len(data3))
185 plt.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) 229 fig.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure)
230 fig2.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure)
186 231
187 legend6b = "{:,}".format(sum(integers3)) 232 legend6b = "{:,}".format(sum(integers3))
188 plt.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) 233 fig.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure)
234 fig2.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure)
189 235
190 if fourthFile != str(None): 236 if fourthFile != str(None):
191 file4 = readFileReferenceFree(fourthFile) 237 file4 = readFileReferenceFree(fourthFile)
192 integers4 = numpy.array(file4[:, 0]).astype(int) # keep original family sizes 238 integers4 = numpy.array(file4[:, 0]).astype(int) # keep original family sizes
193 239 list_to_plot_original.append(integers4)
194 data4 = numpy.asarray(file4[:, 0]).astype(int) 240 colors.append("#04cec7")
195 241
196 bigFamilies4 = numpy.where(data4 > 20)[0] 242 # data4 = numpy.asarray(file4[:, 0]).astype(int)
197 data4[bigFamilies4] = 22 243 # bigFamilies4 = numpy.where(data4 > 20)[0]
198 244 # data4[bigFamilies4] = 22
245 if numpy.amax(integers4) > 20:
246 bins = numpy.arange(numpy.amin(integers4), numpy.amax(integers4) + 1)
247 data4 = numpy.clip(integers4, bins[0], bins[-1])
248 else:
249 data4 = integers4
199 list_to_plot.append(data4) 250 list_to_plot.append(data4)
200 name4 = name4.split(".tabular")[0] 251 name4 = name4.split(".tabular")[0]
201 label.append(name4) 252 label.append(name4)
202 data_array_list.append(file4) 253 data_array_list.append(file4)
203 254
204 plt.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) 255 fig.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure)
205 256 fig2.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure)
206 legend1 = "{:,}".format(numpy.bincount(data4)[1]) 257
207 plt.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) 258 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / len(data4))
208 259 fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)
209 legend3 = "{:.3f}".format(float(numpy.bincount(data4)[1]) / len(data4)) 260 fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)
210 plt.text(0.41, 0.05, legend3, size=10, transform=plt.gcf().transFigure) 261
211 262 legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / sum(integers4))
212 legend3b = "{:.3f}".format(float(numpy.bincount(data4)[1]) / sum(integers4)) 263 fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)
213 plt.text(0.5, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) 264 fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)
214 265
215 legend4 = "{:,} ({:.3f})".format( 266 legend4 = "{:,} ({:.3f})".format(
216 numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int), 267 numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int),
217 float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4)) 268 float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4))
218 plt.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) 269 fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)
270 fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)
219 271
220 legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]), 272 legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]),
221 float(sum(integers4[integers4 > 20])) / sum(integers4)) 273 float(sum(integers4[integers4 > 20])) / sum(integers4))
222 plt.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) 274 fig.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure)
275 fig2.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure)
223 276
224 legend6 = "{:,}".format(len(data4)) 277 legend6 = "{:,}".format(len(data4))
225 plt.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) 278 fig.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure)
279 fig2.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure)
226 280
227 legend6b = "{:,}".format(sum(integers4)) 281 legend6b = "{:,}".format(sum(integers4))
228 plt.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) 282 fig.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure)
283 fig2.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure)
229 284
230 maximumX = numpy.amax(numpy.concatenate(list_to_plot)) 285 maximumX = numpy.amax(numpy.concatenate(list_to_plot))
231 minimumX = numpy.amin(numpy.concatenate(list_to_plot)) 286 minimumX = numpy.amin(numpy.concatenate(list_to_plot))
232 287 bins = numpy.arange(minimumX, maximumX + 1)
233 counts = plt.hist(list_to_plot, bins=range(minimumX, maximumX + 1), stacked=False, edgecolor="black", 288 list_to_plot2 = list_to_plot
234 linewidth=1, label=label, align="left", rwidth=0.8, alpha=0.7) 289 to_plot = ["Absolute frequencies", "Relative frequencies"]
235 290 plt.xticks([], [])
236 ticks = numpy.arange(minimumX - 1, maximumX, 1) 291 plt.yticks([], [])
237 ticks1 = map(str, ticks) 292 fig.suptitle('Family Size Distribution (tags)', fontsize=14)
238 ticks1[len(ticks1) - 1] = ">20" 293
239 plt.xticks(numpy.array(ticks), ticks1) 294 for l in range(len(to_plot)):
240 295 ax = fig.add_subplot(2, 1, l+1)
241 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) 296 ticks = numpy.arange(1, 22, 1)
242 # plt.title("Family Size Distribution", fontsize=14) 297 ticks1 = map(str, ticks)
243 plt.xlabel("Family size", fontsize=14) 298 if maximumX > 20:
244 plt.ylabel("Absolute Frequency", fontsize=14) 299 ticks1[len(ticks1) - 1] = ">20"
245 plt.margins(0.01, None) 300
246 plt.grid(b=True, which="major", color="#424242", linestyle=":") 301 if to_plot[l] == "Relative frequencies":
302 counts_rel = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8, normed=True)
303 else:
304 counts = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8)
305 ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
306
307 ax.set_xticks(numpy.array(ticks))
308 ax.set_xticklabels(ticks1)
309
310 ax.set_ylabel(to_plot[l], fontsize=14)
311 ax.set_xlabel("Family size", fontsize=14)
312 if log_axis:
313 ax.set_yscale('log')
314 ax.grid(b=True, which="major", color="#424242", linestyle=":")
315 ax.margins(0.01, None)
247 pdf.savefig(fig) 316 pdf.savefig(fig)
248 plt.close() 317 plt.close()
249 318
250 # write data to CSV file 319 fig2.suptitle('Family Size Distribution (PE reads)', fontsize=14)
251 output_file.write("Values from family size distribution with all datasets\n") 320 for l in range(len(to_plot)):
321 ax = fig2.add_subplot(2, 1, l + 1)
322 ticks = numpy.arange(minimumX, maximumX + 1)
323 ticks1 = map(str, ticks)
324 if maximumX > 20:
325 ticks1[len(ticks1) - 1] = ">20"
326 reads = []
327 reads_rel = []
328
329 barWidth = 0 - (len(list_to_plot)+1)/2 * 1./(len(list_to_plot) + 1)
330
331 for i in range(len(list_to_plot2)):
332 unique, c = numpy.unique(list_to_plot2[i], return_counts=True)
333 new_c = []
334 new_unique = []
335
336 for t in ticks:
337 if t not in unique:
338 new_c.append(0) # add zero count of not occuring
339 new_unique.append(t)
340 else:
341 c_idx = numpy.where(t == unique)[0]
342 new_c.append(c[c_idx])
343 new_unique.append(unique[c_idx])
344 y = numpy.array(new_unique) * numpy.array(new_c)
345 if len([list_to_plot_original > 20]) > 0:
346 y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20])
347 reads.append(y)
348 reads_rel.append(list(numpy.float_(y)) / sum(y))
349
350 x = list(numpy.arange(numpy.amin(unique), numpy.amax(unique) + 1).astype(float))
351 x = [xi + barWidth for xi in x]
352
353 if to_plot[l] == "Relative frequencies":
354 counts2_rel = ax.bar(x, list(numpy.float_(y)) / sum(y), align="edge", width=1./(len(list_to_plot) + 1),
355 edgecolor="black", label=label[i], alpha=1, linewidth=1, color=colors[i])
356 else:
357 counts2 = ax.bar(x, y, align="edge", width=1./len(list_to_plot), edgecolor="black", label=label[i],
358 alpha=1, linewidth=1, color=colors[i])
359 if i == len(list_to_plot2):
360 barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1)
361 else:
362 barWidth += 1. / (len(list_to_plot) + 1)
363
364 if to_plot[l] == "Absolute frequencies":
365 ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
366 else:
367 ax.set_xlabel("Family size", fontsize=14)
368
369 ax.set_xticks(numpy.array(ticks))
370 ax.set_xticklabels(ticks1)
371 ax.set_ylabel(to_plot[l], fontsize=14)
372 if log_axis:
373 ax.set_yscale('log')
374 ax.grid(b=True, which="major", color="#424242", linestyle=":")
375 ax.margins(0.01, None)
376
377 pdf.savefig(fig2)
378 plt.close()
379
380 # write data to CSV file tags
381 output_file.write("Values from family size distribution with all datasets (tags)\n")
252 output_file.write("\nFamily size") 382 output_file.write("\nFamily size")
253 for i in label: 383 for i in label:
254 output_file.write("{}{}".format(sep, i)) 384 output_file.write("{}{}".format(sep, i))
255 # output_file.write("{}sum".format(sep)) 385 # output_file.write("{}sum".format(sep))
256 output_file.write("\n") 386 output_file.write("\n")
273 output_file.write("{}{}".format(int(sum(counts[0])), sep)) 403 output_file.write("{}{}".format(int(sum(counts[0])), sep))
274 else: 404 else:
275 for i in counts[0]: 405 for i in counts[0]:
276 output_file.write("{}{}".format(int(sum(i)), sep)) 406 output_file.write("{}{}".format(int(sum(i)), sep))
277 407
408 # write data to CSV file PE reads
409 output_file.write("\n\nValues from family size distribution with all datasets (PE reads)\n")
410 output_file.write("\nFamily size")
411 for i in label:
412 output_file.write("{}{}".format(sep, i))
413 # output_file.write("{}sum".format(sep))
414 output_file.write("\n")
415 j = 0
416 for fs in bins:
417 if fs == 21:
418 fs = ">20"
419 else:
420 fs = "={}".format(fs)
421 output_file.write("FS{}{}".format(fs, sep))
422 if len(label) == 1:
423 output_file.write("{}{}".format(int(reads[0][j]), sep))
424 else:
425 for n in range(len(label)):
426 output_file.write("{}{}".format(int(reads[n][j]), sep))
427 output_file.write("\n")
428 j += 1
429 output_file.write("sum{}".format(sep))
430 if len(label) == 1:
431 output_file.write("{}{}".format(int(sum(reads)), sep))
432 else:
433 for i in reads:
434 output_file.write("{}{}".format(int(sum(i)), sep))
435 output_file.write("\n")
436
278 # Family size distribution after DCS and SSCS 437 # Family size distribution after DCS and SSCS
279 for dataset, data_o, name_file in zip(list_to_plot, data_array_list, label): 438 for dataset, data_o, name_file in zip(list_to_plot, data_array_list, label):
280 maximumX = numpy.amax(dataset) 439 maximumX = numpy.amax(dataset)
281 minimumX = numpy.amin(dataset) 440 minimumX = numpy.amin(dataset)
282 441
296 duplTags_o = duplTags_double_o[0::2] # ab of DCS 455 duplTags_o = duplTags_double_o[0::2] # ab of DCS
297 456
298 duplTagsBA = duplTags_double[1::2] # ba of DCS 457 duplTagsBA = duplTags_double[1::2] # ba of DCS
299 duplTagsBA_o = duplTags_double_o[1::2] # ba of DCS 458 duplTagsBA_o = duplTags_double_o[1::2] # ba of DCS
300 459
460 # duplTags_double_tag = tags[numpy.in1d(seq, d)]
461 # duplTags_double_seq = seq[numpy.in1d(seq, d)]
462
301 # get family sizes for SSCS with no partner 463 # get family sizes for SSCS with no partner
302 ab = numpy.where(tags == "ab")[0] 464 ab = numpy.where(tags == "ab")[0]
303 abSeq = seq[ab] 465 abSeq = seq[ab]
304 ab_o = data_o[ab] 466 ab_o = data_o[ab]
305 ab = data[ab] 467 ab = data[ab]
320 # information for family size >= 3 482 # information for family size >= 3
321 dataAB_FS3 = dataAB[dataAB >= 3] 483 dataAB_FS3 = dataAB[dataAB >= 3]
322 dataAB_FS3_o = dataAB_o[dataAB_o >= 3] 484 dataAB_FS3_o = dataAB_o[dataAB_o >= 3]
323 dataBA_FS3 = dataBA[dataBA >= 3] 485 dataBA_FS3 = dataBA[dataBA >= 3]
324 dataBA_FS3_o = dataBA_o[dataBA_o >= 3] 486 dataBA_FS3_o = dataBA_o[dataBA_o >= 3]
325 ab_FS3 = ab[ab >= 3] 487 # ab_FS3 = ab[ab >= 3]
326 ba_FS3 = ba[ba >= 3] 488 # ba_FS3 = ba[ba >= 3]
327 ab_FS3_o = ab_o[ab_o >= 3] 489 # ab_FS3_o = ab_o[ab_o >= 3]
328 ba_FS3_o = ba_o[ba_o >= 3] 490 # ba_FS3_o = ba_o[ba_o >= 3]
329 491
330 duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3 492 duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3
331 duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)] # ba+ab with FS>=3 493 duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)] # ba+ab with FS>=3
332 duplTags_double_FS3 = len(duplTags_FS3) + len(duplTags_FS3_BA) # both ab and ba strands with FS>=3 494 duplTags_double_FS3 = len(duplTags_FS3) + len(duplTags_FS3_BA) # both ab and ba strands with FS>=3
333 495
335 duplTags_FS3_o = duplTags_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ab+ba with FS>=3 497 duplTags_FS3_o = duplTags_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ab+ba with FS>=3
336 duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ba+ab with FS>=3 498 duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ba+ab with FS>=3
337 duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3 499 duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3
338 500
339 fig = plt.figure() 501 fig = plt.figure()
340 plt.subplots_adjust(bottom=0.3) 502 plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0)
341 counts = plt.hist(list1, bins=range(minimumX, maximumX + 1), stacked=True, label=["duplex", "ab", "ba"], 503 counts = plt.hist(list1, bins=numpy.arange(minimumX, maximumX + 2), stacked=True, label=["duplex", "ab", "ba"],
342 edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], 504 edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"],
343 rwidth=0.8) 505 rwidth=0.8)
344 # tick labels of x axis 506 # tick labels of x axis
345 ticks = numpy.arange(minimumX - 1, maximumX, 1) 507 ticks = numpy.arange(1, 22, 1)
346 ticks1 = map(str, ticks) 508 ticks1 = map(str, ticks)
347 ticks1[len(ticks1) - 1] = ">20" 509 if maximumX > 20:
510 ticks1[len(ticks1) - 1] = ">20"
348 plt.xticks(numpy.array(ticks), ticks1) 511 plt.xticks(numpy.array(ticks), ticks1)
349 singl = counts[0][2][0] # singletons 512 singl = counts[0][2][0] # singletons
350 last = counts[0][2][len(counts[0][0]) - 1] # large families 513 last = counts[0][2][len(counts[0][0]) - 1] # large families
351 514 if log_axis:
515 plt.yscale('log')
352 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) 516 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
353 plt.title(name_file, fontsize=14) 517 plt.title(name_file, fontsize=14)
354 plt.xlabel("Family size", fontsize=14) 518 plt.xlabel("Family size", fontsize=14)
355 plt.ylabel("Absolute Frequency", fontsize=14) 519 plt.ylabel("Absolute Frequency", fontsize=14)
356 plt.margins(0.01, None) 520 plt.margins(0.01, None)
409 output_file.write("\nDataset:{}{}\n".format(sep, name_file)) 573 output_file.write("\nDataset:{}{}\n".format(sep, name_file))
410 output_file.write("max. family size:{}{}\n".format(sep, max(integers))) 574 output_file.write("max. family size:{}{}\n".format(sep, max(integers)))
411 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) 575 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
412 output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) 576 output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))
413 577
414 output_file.write("{}singletons:{}{}{}family size > 20:\n".format(sep, sep, sep, sep)) 578 output_file.write("{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
415 output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) 579 output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
416 output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format( 580 output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format(
417 name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep, 581 name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep,
418 last.astype(int), sep, last / len(data), sep, sum(data_o[data_o > 20]), sep, float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data), sep, sum(data_o))) 582 last.astype(int), sep, last / len(data), sep, sum(data_o[data_o > 20]), sep, float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data), sep, sum(data_o)))
419 583