Mercurial > repos > mheinzl > hd
comparison hd.py @ 1:7414792e1cb8 draft
planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/hd commit 90ee23e393deb06fa5c15e3778fa23c39a25f7ce
author | mheinzl |
---|---|
date | Sat, 12 May 2018 04:52:34 -0400 |
parents | 239c4448a163 |
children | 316fbf91dd12 |
comparison
equal
deleted
inserted
replaced
0:239c4448a163 | 1:7414792e1cb8 |
---|---|
22 import matplotlib.pyplot as plt | 22 import matplotlib.pyplot as plt |
23 import os.path | 23 import os.path |
24 import cPickle as pickle | 24 import cPickle as pickle |
25 from multiprocessing.pool import Pool | 25 from multiprocessing.pool import Pool |
26 from functools import partial | 26 from functools import partial |
27 from HDAnalysis_plots.plot_HDwithFSD import plotHDwithFSD | 27 #from HDAnalysis_plots.plot_HDwithFSD import plotHDwithFSD |
28 from HDAnalysis_plots.plot_FSDwithHD2 import plotFSDwithHD2 | 28 #from HDAnalysis_plots.plot_FSDwithHD2 import plotFSDwithHD2 |
29 from HDAnalysis_plots.plot_HDwithinSeq_Sum2 import plotHDwithinSeq_Sum2 | 29 #from HDAnalysis_plots.plot_HDwithinSeq_Sum2 import plotHDwithinSeq_Sum2 |
30 from HDAnalysis_plots.table_HD import createTableHD, createFileHD, createTableHDwithTags, createFileHDwithinTag | 30 #from HDAnalysis_plots.table_HD import createTableHD, createFileHD, createTableHDwithTags, createFileHDwithinTag |
31 from HDAnalysis_plots.table_FSD import createTableFSD2, createFileFSD2 | 31 #from HDAnalysis_plots.table_FSD import createTableFSD2, createFileFSD2 |
32 import argparse | 32 import argparse |
33 import sys | 33 import sys |
34 import os | 34 import os |
35 from matplotlib.backends.backend_pdf import PdfPages | 35 from matplotlib.backends.backend_pdf import PdfPages |
36 | 36 from collections import Counter |
37 | |
38 def plotFSDwithHD2(familySizeList1,maximumXFS,minimumXFS, quant, | |
39 title_file1, subtitle, pdf, relative=False, diff = True): | |
40 if diff is False: | |
41 colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"] | |
42 labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8","HD>8"] | |
43 else: | |
44 colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"] | |
45 if relative is True: | |
46 labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"] | |
47 else: | |
48 labels = ["d=0","d=1", "d=2", "d=3", "d=4", "d=5-8","d>8"] | |
49 | |
50 fig = plt.figure(figsize=(6, 7)) | |
51 ax = fig.add_subplot(111) | |
52 plt.subplots_adjust(bottom=0.1) | |
53 p1 = numpy.bincount(numpy.concatenate((familySizeList1))) | |
54 maximumY = numpy.amax(p1) | |
55 | |
56 if len(range(minimumXFS, maximumXFS)) == 0: | |
57 range1 = range(minimumXFS - 1, minimumXFS + 2) | |
58 else: | |
59 range1 = range(0, maximumXFS + 2) | |
60 counts = plt.hist(familySizeList1, label=labels, | |
61 color=colors, stacked=True, | |
62 rwidth=0.8,alpha=1, align="left", | |
63 edgecolor="None",bins=range1) | |
64 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) | |
65 | |
66 plt.title(title_file1, fontsize=12) | |
67 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) | |
68 plt.xlabel("No. of Family Members", fontsize=12) | |
69 plt.ylabel("Absolute Frequency", fontsize=12) | |
70 | |
71 ticks = numpy.arange(0, maximumXFS + 1, 1) | |
72 ticks1 = map(str, ticks) | |
73 if maximumXFS >= 20: | |
74 ticks1[len(ticks1) - 1] = ">=20" | |
75 plt.xticks(numpy.array(ticks), ticks1) | |
76 [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0] | |
77 | |
78 plt.xlim((0, maximumXFS + 1)) | |
79 if len(numpy.concatenate(familySizeList1)) != 0: | |
80 plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1)) | |
81 | |
82 plt.ylim((0, maximumY * 1.2)) | |
83 legend = "\nmax. family size: \nabsolute frequency: \nrelative frequency: " | |
84 plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure) | |
85 | |
86 count = numpy.bincount(quant) # original counts | |
87 legend1 = "{}\n{}\n{:.5f}" \ | |
88 .format(max(quant), count[len(count) - 1], float(count[len(count) - 1]) / sum(count)) | |
89 plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure) | |
90 legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]), | |
91 float(counts[0][len(counts[0]) - 1][1]) / sum( | |
92 counts[0][len(counts[0]) - 1])) | |
93 plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12) | |
94 plt.grid(b=True, which='major', color='#424242', linestyle=':') | |
95 | |
96 pdf.savefig(fig, bbox_inches="tight") | |
97 plt.close("all") | |
98 | |
99 def plotHDwithFSD(list1,maximumX,minimumX, subtitle, lenTags, title_file1,pdf, | |
100 xlabel,relative=False): | |
101 if relative is True: | |
102 step = 0.1 | |
103 else: | |
104 step = 1 | |
105 | |
106 fig = plt.figure(figsize=(6, 8)) | |
107 plt.subplots_adjust(bottom=0.1) | |
108 con_list1 = numpy.concatenate(list1) | |
109 p1 = numpy.array([v for k, v in sorted(Counter(con_list1).iteritems())]) | |
110 maximumY = numpy.amax(p1) | |
111 | |
112 if relative is True: # relative difference | |
113 bin1 = numpy.arange(-1, maximumX + 0.2, 0.1) | |
114 else: | |
115 bin1 = maximumX + 1 | |
116 | |
117 counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, | |
118 label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", | |
119 "FS>10"], rwidth=0.8, | |
120 color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"], | |
121 stacked=True, alpha=1, | |
122 align="left", | |
123 range=(0, maximumX + 1)) | |
124 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) | |
125 bins = counts[1] # width of bins | |
126 counts = numpy.array(map(int, counts[0][5])) | |
127 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) | |
128 plt.title(title_file1, fontsize=12) | |
129 plt.xlabel(xlabel, fontsize=12) | |
130 plt.ylabel("Absolute Frequency", fontsize=12) | |
131 | |
132 plt.grid(b=True, which='major', color='#424242', linestyle=':') | |
133 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) | |
134 plt.xticks(numpy.arange(0, maximumX + step, step)) | |
135 | |
136 plt.ylim((0, maximumY * 1.2)) | |
137 | |
138 bin_centers = -0.4 * numpy.diff(bins) + bins[:-1] | |
139 for x_label, label in zip(counts, bin_centers): # labels for values | |
140 if x_label == 0: | |
141 continue | |
142 else: | |
143 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1), | |
144 xy=(label, x_label + len(con_list1) * 0.01), | |
145 xycoords="data", color="#000066",fontsize=10) | |
146 | |
147 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags) | |
148 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) | |
149 | |
150 pdf.savefig(fig, bbox_inches="tight") | |
151 plt.close("all") | |
152 plt.clf() | |
153 | |
154 def plotHDwithinSeq_Sum2(sum1, sum2,min_value, lenTags, title_file1, pdf): | |
155 fig = plt.figure(figsize=(6, 8)) | |
156 plt.subplots_adjust(bottom=0.1) | |
157 | |
158 ham = [numpy.array(min_value), sum1, sum2] # new hd within tags | |
159 | |
160 maximumX = numpy.amax(numpy.concatenate(ham)) | |
161 minimumX = numpy.amin(numpy.concatenate(ham)) | |
162 maximumY = numpy.amax(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham))) | |
163 | |
164 if len(range(minimumX, maximumX)) == 0: | |
165 range1 = minimumX | |
166 else: | |
167 range1 = range(minimumX, maximumX + 2) | |
168 | |
169 counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False, | |
170 label=["HD of whole tag", "tag1 - a\nvs. tag2 - a", "tag1 - b\nvs. tag2 - b"], | |
171 bins=range1, color=["#585858", "#58ACFA", "#FA5858"], edgecolor='black', linewidth=1) | |
172 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) | |
173 plt.suptitle('Hamming distances within tags', fontsize=14) | |
174 plt.title(title_file1, fontsize=12) | |
175 plt.xlabel("Hamming Distance", fontsize=12) | |
176 plt.ylabel("Absolute Frequency", fontsize=12) | |
177 plt.grid(b=True, which='major', color='#424242', linestyle=':') | |
178 | |
179 | |
180 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.1)) | |
181 plt.xticks(numpy.arange(minimumX - 1, maximumX + 1, 1.0)) | |
182 plt.ylim((0, maximumY * 1.1)) | |
183 | |
184 legend = "sample size= {:,} against {:,}".format(len(ham[0]), lenTags, lenTags) | |
185 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) | |
186 pdf.savefig(fig, bbox_inches="tight") | |
187 plt.close("all") | |
188 plt.clf() | |
189 | |
190 def createTableFSD2(list1, diff=True): | |
191 selfAB = numpy.concatenate(list1) | |
192 uniqueFS = numpy.unique(selfAB) | |
193 nr = numpy.arange(0, len(uniqueFS), 1) | |
194 if diff is False: | |
195 count = numpy.zeros((len(uniqueFS), 6)) | |
196 else: | |
197 count = numpy.zeros((len(uniqueFS), 7)) | |
198 | |
199 state = 1 | |
200 for i in list1: | |
201 counts = list(Counter(i).items()) | |
202 hd = [item[0] for item in counts] | |
203 c = [item[1] for item in counts] | |
204 table = numpy.column_stack((hd, c)) | |
205 if len(table) == 0: | |
206 state = state + 1 | |
207 continue | |
208 else: | |
209 if state == 1: | |
210 for i, l in zip(uniqueFS, nr): | |
211 for j in table: | |
212 if j[0] == uniqueFS[l]: | |
213 count[l, 0] = j[1] | |
214 if state == 2: | |
215 for i, l in zip(uniqueFS, nr): | |
216 for j in table: | |
217 if j[0] == uniqueFS[l]: | |
218 count[l, 1] = j[1] | |
219 | |
220 if state == 3: | |
221 for i, l in zip(uniqueFS, nr): | |
222 for j in table: | |
223 if j[0] == uniqueFS[l]: | |
224 count[l, 2] = j[1] | |
225 | |
226 if state == 4: | |
227 for i, l in zip(uniqueFS, nr): | |
228 for j in table: | |
229 if j[0] == uniqueFS[l]: | |
230 count[l, 3] = j[1] | |
231 | |
232 if state == 5: | |
233 for i, l in zip(uniqueFS, nr): | |
234 for j in table: | |
235 if j[0] == uniqueFS[l]: | |
236 count[l, 4] = j[1] | |
237 | |
238 if state == 6: | |
239 for i, l in zip(uniqueFS, nr): | |
240 for j in table: | |
241 if j[0] == uniqueFS[l]: | |
242 count[l, 5] = j[1] | |
243 | |
244 if state == 7: | |
245 for i, l in zip(uniqueFS, nr): | |
246 for j in table: | |
247 if j[0] == uniqueFS[l]: | |
248 count[l, 6] = j[1] | |
249 state = state + 1 | |
250 | |
251 sumRow = count.sum(axis=1) | |
252 sumCol = count.sum(axis=0) | |
253 | |
254 uniqueFS = uniqueFS.astype(str) | |
255 if uniqueFS[len(uniqueFS) - 1] == "20": | |
256 uniqueFS[len(uniqueFS) - 1] = ">20" | |
257 | |
258 first = ["FS={}".format(i) for i in uniqueFS] | |
259 final = numpy.column_stack((first, count, sumRow)) | |
260 | |
261 return (final, sumCol) | |
262 | |
263 def createFileFSD2(summary, sumCol, overallSum, output_file, name,sep, rel=False, diff=True): | |
264 output_file.write(name) | |
265 output_file.write("\n") | |
266 if diff is False: | |
267 output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep)) | |
268 else: | |
269 if rel is False: | |
270 output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep,sep)) | |
271 else: | |
272 output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep,sep)) | |
273 | |
274 for item in summary: | |
275 for nr in item: | |
276 if "FS" not in nr and "diff" not in nr: | |
277 nr = nr.astype(float) | |
278 nr = nr.astype(int) | |
279 output_file.write("{}{}".format(nr,sep)) | |
280 output_file.write("\n") | |
281 output_file.write("sum{}".format(sep)) | |
282 sumCol = map(int, sumCol) | |
283 for el in sumCol: | |
284 output_file.write("{}{}".format(el,sep)) | |
285 output_file.write("{}{}".format(overallSum.astype(int),sep)) | |
286 output_file.write("\n\n") | |
287 | |
288 def createTableHD(list1, row_label): | |
289 selfAB = numpy.concatenate(list1) | |
290 uniqueHD = numpy.unique(selfAB) | |
291 nr = numpy.arange(0, len(uniqueHD), 1) | |
292 count = numpy.zeros((len(uniqueHD), 6)) | |
293 state = 1 | |
294 for i in list1: | |
295 counts = list(Counter(i).items()) | |
296 hd = [item[0] for item in counts] | |
297 c = [item[1] for item in counts] | |
298 table = numpy.column_stack((hd, c)) | |
299 if len(table) == 0: | |
300 state = state + 1 | |
301 continue | |
302 else: | |
303 if state == 1: | |
304 for i, l in zip(uniqueHD, nr): | |
305 for j in table: | |
306 if j[0] == uniqueHD[l]: | |
307 count[l, 0] = j[1] | |
308 if state == 2: | |
309 for i, l in zip(uniqueHD, nr): | |
310 for j in table: | |
311 if j[0] == uniqueHD[l]: | |
312 count[l, 1] = j[1] | |
313 | |
314 if state == 3: | |
315 for i, l in zip(uniqueHD, nr): | |
316 for j in table: | |
317 if j[0] == uniqueHD[l]: | |
318 count[l, 2] = j[1] | |
319 | |
320 if state == 4: | |
321 for i, l in zip(uniqueHD, nr): | |
322 for j in table: | |
323 if j[0] == uniqueHD[l]: | |
324 count[l, 3] = j[1] | |
325 | |
326 if state == 5: | |
327 for i, l in zip(uniqueHD, nr): | |
328 for j in table: | |
329 if j[0] == uniqueHD[l]: | |
330 count[l, 4] = j[1] | |
331 | |
332 if state == 6: | |
333 for i, l in zip(uniqueHD, nr): | |
334 for j in table: | |
335 if j[0] == uniqueHD[l]: | |
336 count[l, 5] = j[1] | |
337 state = state + 1 | |
338 | |
339 sumRow = count.sum(axis=1) | |
340 sumCol = count.sum(axis=0) | |
341 first = ["{}{}".format(row_label,i) for i in uniqueHD] | |
342 final = numpy.column_stack((first, count, sumRow)) | |
343 | |
344 return (final, sumCol) | |
345 | |
346 def createTableHDwithTags(list1): | |
347 selfAB = numpy.concatenate(list1) | |
348 uniqueHD = numpy.unique(selfAB) | |
349 nr = numpy.arange(0, len(uniqueHD), 1) | |
350 count = numpy.zeros((len(uniqueHD), 3)) | |
351 | |
352 state = 1 | |
353 for i in list1: | |
354 counts = list(Counter(i).items()) | |
355 hd = [item[0] for item in counts] | |
356 c = [item[1] for item in counts] | |
357 table = numpy.column_stack((hd, c)) | |
358 if len(table) == 0: | |
359 state = state + 1 | |
360 continue | |
361 else: | |
362 if state == 1: | |
363 for i, l in zip(uniqueHD, nr): | |
364 for j in table: | |
365 if j[0] == uniqueHD[l]: | |
366 count[l, 0] = j[1] | |
367 if state == 2: | |
368 for i, l in zip(uniqueHD, nr): | |
369 for j in table: | |
370 if j[0] == uniqueHD[l]: | |
371 count[l, 1] = j[1] | |
372 | |
373 if state == 3: | |
374 for i, l in zip(uniqueHD, nr): | |
375 for j in table: | |
376 if j[0] == uniqueHD[l]: | |
377 count[l, 2] = j[1] | |
378 state = state + 1 | |
379 | |
380 sumRow = count.sum(axis=1) | |
381 sumCol = count.sum(axis=0) | |
382 first = ["HD={}".format(i) for i in uniqueHD] | |
383 final = numpy.column_stack((first, count, sumRow)) | |
384 | |
385 return (final, sumCol) | |
386 | |
387 def createFileHD(summary, sumCol, overallSum, output_file, name,sep): | |
388 output_file.write(name) | |
389 output_file.write("\n") | |
390 output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep)) | |
391 for item in summary: | |
392 for nr in item: | |
393 if "HD" not in nr and "diff" not in nr: | |
394 nr = nr.astype(float) | |
395 nr = nr.astype(int) | |
396 output_file.write("{}{}".format(nr,sep)) | |
397 output_file.write("\n") | |
398 output_file.write("sum{}".format(sep)) | |
399 sumCol = map(int, sumCol) | |
400 for el in sumCol: | |
401 output_file.write("{}{}".format(el,sep)) | |
402 output_file.write("{}{}".format(overallSum.astype(int),sep)) | |
403 output_file.write("\n\n") | |
404 | |
405 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name,sep): | |
406 output_file.write(name) | |
407 output_file.write("\n") | |
408 output_file.write("{}HD of whole tag;tag1-half1 vs. tag2-half1{}tag1-half2 vs. tag2-half2{}sum{}\n".format(sep,sep,sep,sep)) | |
409 for item in summary: | |
410 for nr in item: | |
411 if "HD" not in nr: | |
412 nr = nr.astype(float) | |
413 nr = nr.astype(int) | |
414 output_file.write("{}{}".format(nr,sep)) | |
415 output_file.write("\n") | |
416 output_file.write("sum{}".format(sep)) | |
417 sumCol = map(int, sumCol) | |
418 for el in sumCol: | |
419 output_file.write("{}{}".format(el,sep)) | |
420 output_file.write("{}{}".format(overallSum.astype(int),sep)) | |
421 output_file.write("\n\n") | |
422 | |
423 | |
424 | |
37 def hamming(array1, array2): | 425 def hamming(array1, array2): |
38 res = 99 * numpy.ones(len(array1)) | 426 res = 99 * numpy.ones(len(array1)) |
39 i = 0 | 427 i = 0 |
40 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time | 428 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time |
41 for a in array1: | 429 for a in array1: |