comparison hd.py @ 1:7414792e1cb8 draft

planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/hd commit 90ee23e393deb06fa5c15e3778fa23c39a25f7ce
author mheinzl
date Sat, 12 May 2018 04:52:34 -0400
parents 239c4448a163
children 316fbf91dd12
comparison
equal deleted inserted replaced
0:239c4448a163 1:7414792e1cb8
22 import matplotlib.pyplot as plt 22 import matplotlib.pyplot as plt
23 import os.path 23 import os.path
24 import cPickle as pickle 24 import cPickle as pickle
25 from multiprocessing.pool import Pool 25 from multiprocessing.pool import Pool
26 from functools import partial 26 from functools import partial
27 from HDAnalysis_plots.plot_HDwithFSD import plotHDwithFSD 27 #from HDAnalysis_plots.plot_HDwithFSD import plotHDwithFSD
28 from HDAnalysis_plots.plot_FSDwithHD2 import plotFSDwithHD2 28 #from HDAnalysis_plots.plot_FSDwithHD2 import plotFSDwithHD2
29 from HDAnalysis_plots.plot_HDwithinSeq_Sum2 import plotHDwithinSeq_Sum2 29 #from HDAnalysis_plots.plot_HDwithinSeq_Sum2 import plotHDwithinSeq_Sum2
30 from HDAnalysis_plots.table_HD import createTableHD, createFileHD, createTableHDwithTags, createFileHDwithinTag 30 #from HDAnalysis_plots.table_HD import createTableHD, createFileHD, createTableHDwithTags, createFileHDwithinTag
31 from HDAnalysis_plots.table_FSD import createTableFSD2, createFileFSD2 31 #from HDAnalysis_plots.table_FSD import createTableFSD2, createFileFSD2
32 import argparse 32 import argparse
33 import sys 33 import sys
34 import os 34 import os
35 from matplotlib.backends.backend_pdf import PdfPages 35 from matplotlib.backends.backend_pdf import PdfPages
36 36 from collections import Counter
37
38 def plotFSDwithHD2(familySizeList1,maximumXFS,minimumXFS, quant,
39 title_file1, subtitle, pdf, relative=False, diff = True):
40 if diff is False:
41 colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
42 labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8","HD>8"]
43 else:
44 colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"]
45 if relative is True:
46 labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"]
47 else:
48 labels = ["d=0","d=1", "d=2", "d=3", "d=4", "d=5-8","d>8"]
49
50 fig = plt.figure(figsize=(6, 7))
51 ax = fig.add_subplot(111)
52 plt.subplots_adjust(bottom=0.1)
53 p1 = numpy.bincount(numpy.concatenate((familySizeList1)))
54 maximumY = numpy.amax(p1)
55
56 if len(range(minimumXFS, maximumXFS)) == 0:
57 range1 = range(minimumXFS - 1, minimumXFS + 2)
58 else:
59 range1 = range(0, maximumXFS + 2)
60 counts = plt.hist(familySizeList1, label=labels,
61 color=colors, stacked=True,
62 rwidth=0.8,alpha=1, align="left",
63 edgecolor="None",bins=range1)
64 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
65
66 plt.title(title_file1, fontsize=12)
67 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
68 plt.xlabel("No. of Family Members", fontsize=12)
69 plt.ylabel("Absolute Frequency", fontsize=12)
70
71 ticks = numpy.arange(0, maximumXFS + 1, 1)
72 ticks1 = map(str, ticks)
73 if maximumXFS >= 20:
74 ticks1[len(ticks1) - 1] = ">=20"
75 plt.xticks(numpy.array(ticks), ticks1)
76 [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0]
77
78 plt.xlim((0, maximumXFS + 1))
79 if len(numpy.concatenate(familySizeList1)) != 0:
80 plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1))
81
82 plt.ylim((0, maximumY * 1.2))
83 legend = "\nmax. family size: \nabsolute frequency: \nrelative frequency: "
84 plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)
85
86 count = numpy.bincount(quant) # original counts
87 legend1 = "{}\n{}\n{:.5f}" \
88 .format(max(quant), count[len(count) - 1], float(count[len(count) - 1]) / sum(count))
89 plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)
90 legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]),
91 float(counts[0][len(counts[0]) - 1][1]) / sum(
92 counts[0][len(counts[0]) - 1]))
93 plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12)
94 plt.grid(b=True, which='major', color='#424242', linestyle=':')
95
96 pdf.savefig(fig, bbox_inches="tight")
97 plt.close("all")
98
99 def plotHDwithFSD(list1,maximumX,minimumX, subtitle, lenTags, title_file1,pdf,
100 xlabel,relative=False):
101 if relative is True:
102 step = 0.1
103 else:
104 step = 1
105
106 fig = plt.figure(figsize=(6, 8))
107 plt.subplots_adjust(bottom=0.1)
108 con_list1 = numpy.concatenate(list1)
109 p1 = numpy.array([v for k, v in sorted(Counter(con_list1).iteritems())])
110 maximumY = numpy.amax(p1)
111
112 if relative is True: # relative difference
113 bin1 = numpy.arange(-1, maximumX + 0.2, 0.1)
114 else:
115 bin1 = maximumX + 1
116
117 counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
118 label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10",
119 "FS>10"], rwidth=0.8,
120 color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
121 stacked=True, alpha=1,
122 align="left",
123 range=(0, maximumX + 1))
124 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
125 bins = counts[1] # width of bins
126 counts = numpy.array(map(int, counts[0][5]))
127 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
128 plt.title(title_file1, fontsize=12)
129 plt.xlabel(xlabel, fontsize=12)
130 plt.ylabel("Absolute Frequency", fontsize=12)
131
132 plt.grid(b=True, which='major', color='#424242', linestyle=':')
133 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
134 plt.xticks(numpy.arange(0, maximumX + step, step))
135
136 plt.ylim((0, maximumY * 1.2))
137
138 bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
139 for x_label, label in zip(counts, bin_centers): # labels for values
140 if x_label == 0:
141 continue
142 else:
143 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1),
144 xy=(label, x_label + len(con_list1) * 0.01),
145 xycoords="data", color="#000066",fontsize=10)
146
147 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags)
148 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
149
150 pdf.savefig(fig, bbox_inches="tight")
151 plt.close("all")
152 plt.clf()
153
154 def plotHDwithinSeq_Sum2(sum1, sum2,min_value, lenTags, title_file1, pdf):
155 fig = plt.figure(figsize=(6, 8))
156 plt.subplots_adjust(bottom=0.1)
157
158 ham = [numpy.array(min_value), sum1, sum2] # new hd within tags
159
160 maximumX = numpy.amax(numpy.concatenate(ham))
161 minimumX = numpy.amin(numpy.concatenate(ham))
162 maximumY = numpy.amax(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham)))
163
164 if len(range(minimumX, maximumX)) == 0:
165 range1 = minimumX
166 else:
167 range1 = range(minimumX, maximumX + 2)
168
169 counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False,
170 label=["HD of whole tag", "tag1 - a\nvs. tag2 - a", "tag1 - b\nvs. tag2 - b"],
171 bins=range1, color=["#585858", "#58ACFA", "#FA5858"], edgecolor='black', linewidth=1)
172 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1))
173 plt.suptitle('Hamming distances within tags', fontsize=14)
174 plt.title(title_file1, fontsize=12)
175 plt.xlabel("Hamming Distance", fontsize=12)
176 plt.ylabel("Absolute Frequency", fontsize=12)
177 plt.grid(b=True, which='major', color='#424242', linestyle=':')
178
179
180 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.1))
181 plt.xticks(numpy.arange(minimumX - 1, maximumX + 1, 1.0))
182 plt.ylim((0, maximumY * 1.1))
183
184 legend = "sample size= {:,} against {:,}".format(len(ham[0]), lenTags, lenTags)
185 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
186 pdf.savefig(fig, bbox_inches="tight")
187 plt.close("all")
188 plt.clf()
189
190 def createTableFSD2(list1, diff=True):
191 selfAB = numpy.concatenate(list1)
192 uniqueFS = numpy.unique(selfAB)
193 nr = numpy.arange(0, len(uniqueFS), 1)
194 if diff is False:
195 count = numpy.zeros((len(uniqueFS), 6))
196 else:
197 count = numpy.zeros((len(uniqueFS), 7))
198
199 state = 1
200 for i in list1:
201 counts = list(Counter(i).items())
202 hd = [item[0] for item in counts]
203 c = [item[1] for item in counts]
204 table = numpy.column_stack((hd, c))
205 if len(table) == 0:
206 state = state + 1
207 continue
208 else:
209 if state == 1:
210 for i, l in zip(uniqueFS, nr):
211 for j in table:
212 if j[0] == uniqueFS[l]:
213 count[l, 0] = j[1]
214 if state == 2:
215 for i, l in zip(uniqueFS, nr):
216 for j in table:
217 if j[0] == uniqueFS[l]:
218 count[l, 1] = j[1]
219
220 if state == 3:
221 for i, l in zip(uniqueFS, nr):
222 for j in table:
223 if j[0] == uniqueFS[l]:
224 count[l, 2] = j[1]
225
226 if state == 4:
227 for i, l in zip(uniqueFS, nr):
228 for j in table:
229 if j[0] == uniqueFS[l]:
230 count[l, 3] = j[1]
231
232 if state == 5:
233 for i, l in zip(uniqueFS, nr):
234 for j in table:
235 if j[0] == uniqueFS[l]:
236 count[l, 4] = j[1]
237
238 if state == 6:
239 for i, l in zip(uniqueFS, nr):
240 for j in table:
241 if j[0] == uniqueFS[l]:
242 count[l, 5] = j[1]
243
244 if state == 7:
245 for i, l in zip(uniqueFS, nr):
246 for j in table:
247 if j[0] == uniqueFS[l]:
248 count[l, 6] = j[1]
249 state = state + 1
250
251 sumRow = count.sum(axis=1)
252 sumCol = count.sum(axis=0)
253
254 uniqueFS = uniqueFS.astype(str)
255 if uniqueFS[len(uniqueFS) - 1] == "20":
256 uniqueFS[len(uniqueFS) - 1] = ">20"
257
258 first = ["FS={}".format(i) for i in uniqueFS]
259 final = numpy.column_stack((first, count, sumRow))
260
261 return (final, sumCol)
262
263 def createFileFSD2(summary, sumCol, overallSum, output_file, name,sep, rel=False, diff=True):
264 output_file.write(name)
265 output_file.write("\n")
266 if diff is False:
267 output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep))
268 else:
269 if rel is False:
270 output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep,sep))
271 else:
272 output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep,sep))
273
274 for item in summary:
275 for nr in item:
276 if "FS" not in nr and "diff" not in nr:
277 nr = nr.astype(float)
278 nr = nr.astype(int)
279 output_file.write("{}{}".format(nr,sep))
280 output_file.write("\n")
281 output_file.write("sum{}".format(sep))
282 sumCol = map(int, sumCol)
283 for el in sumCol:
284 output_file.write("{}{}".format(el,sep))
285 output_file.write("{}{}".format(overallSum.astype(int),sep))
286 output_file.write("\n\n")
287
288 def createTableHD(list1, row_label):
289 selfAB = numpy.concatenate(list1)
290 uniqueHD = numpy.unique(selfAB)
291 nr = numpy.arange(0, len(uniqueHD), 1)
292 count = numpy.zeros((len(uniqueHD), 6))
293 state = 1
294 for i in list1:
295 counts = list(Counter(i).items())
296 hd = [item[0] for item in counts]
297 c = [item[1] for item in counts]
298 table = numpy.column_stack((hd, c))
299 if len(table) == 0:
300 state = state + 1
301 continue
302 else:
303 if state == 1:
304 for i, l in zip(uniqueHD, nr):
305 for j in table:
306 if j[0] == uniqueHD[l]:
307 count[l, 0] = j[1]
308 if state == 2:
309 for i, l in zip(uniqueHD, nr):
310 for j in table:
311 if j[0] == uniqueHD[l]:
312 count[l, 1] = j[1]
313
314 if state == 3:
315 for i, l in zip(uniqueHD, nr):
316 for j in table:
317 if j[0] == uniqueHD[l]:
318 count[l, 2] = j[1]
319
320 if state == 4:
321 for i, l in zip(uniqueHD, nr):
322 for j in table:
323 if j[0] == uniqueHD[l]:
324 count[l, 3] = j[1]
325
326 if state == 5:
327 for i, l in zip(uniqueHD, nr):
328 for j in table:
329 if j[0] == uniqueHD[l]:
330 count[l, 4] = j[1]
331
332 if state == 6:
333 for i, l in zip(uniqueHD, nr):
334 for j in table:
335 if j[0] == uniqueHD[l]:
336 count[l, 5] = j[1]
337 state = state + 1
338
339 sumRow = count.sum(axis=1)
340 sumCol = count.sum(axis=0)
341 first = ["{}{}".format(row_label,i) for i in uniqueHD]
342 final = numpy.column_stack((first, count, sumRow))
343
344 return (final, sumCol)
345
346 def createTableHDwithTags(list1):
347 selfAB = numpy.concatenate(list1)
348 uniqueHD = numpy.unique(selfAB)
349 nr = numpy.arange(0, len(uniqueHD), 1)
350 count = numpy.zeros((len(uniqueHD), 3))
351
352 state = 1
353 for i in list1:
354 counts = list(Counter(i).items())
355 hd = [item[0] for item in counts]
356 c = [item[1] for item in counts]
357 table = numpy.column_stack((hd, c))
358 if len(table) == 0:
359 state = state + 1
360 continue
361 else:
362 if state == 1:
363 for i, l in zip(uniqueHD, nr):
364 for j in table:
365 if j[0] == uniqueHD[l]:
366 count[l, 0] = j[1]
367 if state == 2:
368 for i, l in zip(uniqueHD, nr):
369 for j in table:
370 if j[0] == uniqueHD[l]:
371 count[l, 1] = j[1]
372
373 if state == 3:
374 for i, l in zip(uniqueHD, nr):
375 for j in table:
376 if j[0] == uniqueHD[l]:
377 count[l, 2] = j[1]
378 state = state + 1
379
380 sumRow = count.sum(axis=1)
381 sumCol = count.sum(axis=0)
382 first = ["HD={}".format(i) for i in uniqueHD]
383 final = numpy.column_stack((first, count, sumRow))
384
385 return (final, sumCol)
386
387 def createFileHD(summary, sumCol, overallSum, output_file, name,sep):
388 output_file.write(name)
389 output_file.write("\n")
390 output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep))
391 for item in summary:
392 for nr in item:
393 if "HD" not in nr and "diff" not in nr:
394 nr = nr.astype(float)
395 nr = nr.astype(int)
396 output_file.write("{}{}".format(nr,sep))
397 output_file.write("\n")
398 output_file.write("sum{}".format(sep))
399 sumCol = map(int, sumCol)
400 for el in sumCol:
401 output_file.write("{}{}".format(el,sep))
402 output_file.write("{}{}".format(overallSum.astype(int),sep))
403 output_file.write("\n\n")
404
405 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name,sep):
406 output_file.write(name)
407 output_file.write("\n")
408 output_file.write("{}HD of whole tag;tag1-half1 vs. tag2-half1{}tag1-half2 vs. tag2-half2{}sum{}\n".format(sep,sep,sep,sep))
409 for item in summary:
410 for nr in item:
411 if "HD" not in nr:
412 nr = nr.astype(float)
413 nr = nr.astype(int)
414 output_file.write("{}{}".format(nr,sep))
415 output_file.write("\n")
416 output_file.write("sum{}".format(sep))
417 sumCol = map(int, sumCol)
418 for el in sumCol:
419 output_file.write("{}{}".format(el,sep))
420 output_file.write("{}{}".format(overallSum.astype(int),sep))
421 output_file.write("\n\n")
422
423
424
37 def hamming(array1, array2): 425 def hamming(array1, array2):
38 res = 99 * numpy.ones(len(array1)) 426 res = 99 * numpy.ones(len(array1))
39 i = 0 427 i = 0
40 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time 428 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time
41 for a in array1: 429 for a in array1: