Mercurial > repos > mheinzl > td
comparison td.py @ 0:3e56058d9552 draft default tip
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit 9bae9043a53f1e07b502acd1082450adcb6d9e31-dirty
author | mheinzl |
---|---|
date | Wed, 16 Oct 2019 04:17:59 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3e56058d9552 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Tag distance analysis of SSCSs | |
4 # | |
5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) | |
6 # Contact: monika.heinzl@edumail.at | |
7 # | |
8 # Takes at least one TABULAR file with tags before the alignment to the SSCS and | |
9 # optionally a second TABULAR file as input. The program produces a plot which shows a histogram of Hamming distances | |
10 # separated after family sizes, a family size distribution separated after Hamming distances for all (sample_size=0) | |
# or a given sample of SSCSs or SSCSs, which form a DCS. In addition, the tool produces HD and FSD plots for the
12 # difference between the HDs of both parts of the tags and for the chimeric reads and finally a CSV file with the | |
13 # data of the plots. It is also possible to perform the HD analysis with shortened tags with given sizes as input. | |
14 # The tool can run on a certain number of processors, which can be defined by the user. | |
15 | |
16 # USAGE: python td.py --inputFile filename --inputName1 filename --sample_size int / | |
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int | |
# --nr_above_bars True/False --output_tabular outputfile_name_tabular
19 | |
20 import argparse | |
21 import itertools | |
22 import operator | |
23 import sys | |
24 from collections import Counter, defaultdict | |
25 from functools import partial | |
26 from multiprocessing.pool import Pool | |
27 import random | |
28 import os | |
29 | |
30 import matplotlib.pyplot as plt | |
31 import numpy | |
32 from matplotlib.backends.backend_pdf import PdfPages | |
33 | |
# render figures without a display (headless Galaxy server); must run before plotting
plt.switch_backend('agg')
35 | |
36 | |
def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts,
                   subtitle, pdf, relative=False, diff=True, rel_freq=False):
    """Append a stacked family-size distribution (FSD) histogram to *pdf*.

    familySizeList1 -- list of family-size arrays, one per TD class
        (diff=False) or per TD-difference class (diff=True).
    maximumXFS / minimumXFS -- largest/smallest family size (bin range).
    originalCounts -- uncapped family sizes; used only for the text block
        under the plot.
    subtitle -- figure title; pdf -- open PdfPages object written to.
    relative -- with diff=True, label classes as relative differences.
    rel_freq -- plot relative instead of absolute frequencies.
    """
    # one color/label per stacked class; order must match familySizeList1
    if diff is False:
        colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
        labels = ["TD=1", "TD=2", "TD=3", "TD=4", "TD=5-8", "TD>8"]
    else:
        colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"]
        if relative is True:
            labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"]
        else:
            labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"]

    fig = plt.figure(figsize=(6, 7))
    ax = fig.add_subplot(111)
    plt.subplots_adjust(bottom=0.1)
    # per-family-size totals over all classes; also drives the y limit
    p1 = numpy.bincount(numpy.concatenate(familySizeList1))
    maximumY = numpy.amax(p1)

    # degenerate case (all family sizes identical): widen the bin range by one on each side
    if len(range(minimumXFS, maximumXFS)) == 0:
        range1 = range(minimumXFS - 1, minimumXFS + 2)
    else:
        range1 = range(0, maximumXFS + 2)

    if rel_freq:
        # weight every observation by 1/N so the stacked bars sum to 1
        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(familySizeList1)) for data in familySizeList1]
        counts = plt.hist(familySizeList1, label=labels, weights=w, color=colors, stacked=True,
                          rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
        plt.ylabel("Relative Frequency", fontsize=14)
        plt.ylim((0, 1.07))
    else:
        counts = plt.hist(familySizeList1, label=labels, color=colors, stacked=True,
                          rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1)
        if len(numpy.concatenate(familySizeList1)) != 0:
            plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1))
        plt.ylabel("Absolute Frequency", fontsize=14)
        # NOTE(review): this overrides the ylim set two lines above
        plt.ylim((0, maximumY * 1.2))
    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel("Family size", fontsize=14)
    ticks = numpy.arange(0, maximumXFS + 1, 1)
    ticks1 = map(str, ticks)  # Python 2 only: map() returns a list, indexed below
    if maximumXFS >= 20:
        # family sizes are capped at 20 upstream, so relabel the last tick
        ticks1[len(ticks1) - 1] = ">=20"
    plt.xticks(numpy.array(ticks), ticks1)
    # thin the tick labels: keep every 5th, hide the rest
    [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0]
    plt.xlim((0, maximumXFS + 1))
    # column headers of the text block under the figure
    legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: "
    plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)

    count = numpy.bincount(originalCounts)  # original counts (currently unused)
    if max(originalCounts) >= 20:
        max_count = ">= 20"
    else:
        max_count = max(originalCounts)
    # stats of the largest family-size bin: label, absolute and relative frequency
    legend1 = "{}\n{}\n{:.5f}".format(max_count, p1[len(p1) - 1], float(p1[len(p1) - 1]) / sum(p1))
    plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)
    # same stats for singletons (family size 1)
    legend3 = "singletons\n{:,}\n{:.5f}".format(int(p1[1]), float(p1[1]) / sum(p1))
    plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
98 | |
99 | |
def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
    """Append a TD histogram, stacked by family-size class, to *pdf*.

    list1 -- list of TD arrays, one per family-size class
        (FS=1, FS=2, FS=3, FS=4, FS=5-10, FS>10 -- order matters).
    maximumX / minimumX -- x-axis range of the TDs.
    subtitle -- figure title; lenTags -- total number of tags (legend only).
    pdf -- open PdfPages object; xlabel -- x-axis caption.
    relative -- TDs are relative values in 0.1 steps instead of integers.
    nr_above_bars -- annotate each bar with absolute and relative counts.
    nr_unique_chimeras -- number of chimeric families (legend only).
    len_sample -- sample size (legend only); rel_freq -- relative y axis.
    """
    if relative is True:
        step = 0.1
    else:
        step = 1

    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)
    # per-TD totals across all FS classes (Python 2 only: dict.iteritems)
    p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())])
    maximumY = numpy.amax(p1)
    if relative is True:  # relative difference
        bin1 = numpy.arange(-1, maximumX + 0.2, 0.1)
    else:
        bin1 = maximumX + 1

    if rel_freq:
        # weight each observation by 1/N so bar heights are relative frequencies
        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, 1.07))
        plt.ylabel("Relative Frequency", fontsize=14)
        bins = counts[1]  # width of bins
        # counts[0][5]: top of the stacked bars = total per bin over all FS classes
        counts = numpy.array(map(float, counts[0][5]))

    else:
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        maximumY = numpy.amax(p1)
        plt.ylim((0, maximumY * 1.2))
        plt.ylabel("Absolute Frequency", fontsize=14)
        bins = counts[1]  # width of bins
        counts = numpy.array(map(int, counts[0][5]))

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - step, maximumX + step))
    # plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
    plt.xticks(numpy.arange(0, maximumX + step, step))

    if nr_above_bars:
        # annotate every non-empty bar with its absolute count and relative frequency
        bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
        for x_label, label in zip(counts, bin_centers):  # labels for values
            if x_label == 0:
                continue
            else:
                if rel_freq:
                    # bar height is a fraction; recover the absolute count by * N
                    plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
                                                       float(x_label)),
                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
                                 xycoords="data", color="#000066", fontsize=10)
                else:
                    plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
                                 xycoords="data", color="#000066", fontsize=10)

    if nr_unique_chimeras != 0:
        # presumably each chimeric family contributes two data points when the
        # ratio is exactly 2, hence the doubled number in brackets -- TODO confirm
        if (relative and ((counts[len(counts)-1] / nr_unique_chimeras) == 2)) or \
                (sum(counts) / nr_unique_chimeras) == 2:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})"\
                .format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
        else:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
                lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
    else:
        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
            lenTags, len_sample, len(numpy.concatenate(list1)))

    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
178 | |
179 | |
def plotHDwithDCS(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
    """Append a TD histogram stacked by read type (DCS / SSCS ab / SSCS ba) to *pdf*.

    list1 -- list of three TD arrays in the order [DCS, ab, ba].
    maximumX / minimumX -- x-axis range; subtitle -- figure title.
    lenTags -- total number of tags (legend only); pdf -- open PdfPages.
    xlabel -- x-axis caption; relative -- kept for signature parity with
        plotHDwithFSD (step is always 1 here).
    nr_above_bars -- annotate bars with counts; nr_unique_chimeras -- number
        of chimeric families; len_sample -- sample size (legend only).
    rel_freq -- plot relative instead of absolute frequencies.
    """
    step = 1
    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)
    # per-TD totals across the three read types (Python 2 only: dict.iteritems)
    p1 = numpy.array([v for k, v in sorted(Counter(numpy.concatenate(list1)).iteritems())])
    maximumY = numpy.amax(p1)
    bin1 = maximumX + 1
    if rel_freq:
        # weight each observation by 1/N so bar heights are relative frequencies
        w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(list1)) for data in list1]
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, 1.07))
        plt.ylabel("Relative Frequency", fontsize=14)
        bins = counts[1]  # width of bins
        # counts[0][2]: top of the stacked bars = total per bin over the 3 types
        counts = numpy.array(map(float, counts[0][2]))

    else:
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, maximumY * 1.2))
        plt.ylabel("Absolute Frequency", fontsize=14)
        bins = counts[1]  # width of bins
        counts = numpy.array(map(int, counts[0][2]))

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - step, maximumX + step))
    # plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
    plt.xticks(numpy.arange(0, maximumX + step, step))

    if nr_above_bars:
        # annotate every non-empty bar with its absolute count and relative frequency
        bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
        for x_label, label in zip(counts, bin_centers):  # labels for values
            if x_label == 0:
                continue
            else:
                if rel_freq:
                    plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(numpy.concatenate(list1)))),
                                                       float(x_label)),
                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.0001),
                                 xycoords="data", color="#000066", fontsize=10)
                else:
                    plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
                                 xy=(label, x_label + len(numpy.concatenate(list1)) * 0.01),
                                 xycoords="data", color="#000066", fontsize=10)

    if nr_unique_chimeras != 0:
        # presumably each chimeric family contributes two data points when the
        # ratio is exactly 2, hence the doubled number in brackets -- TODO confirm
        if (sum(counts) / nr_unique_chimeras) == 2:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})".\
                format(lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras, nr_unique_chimeras * 2)
        else:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
                lenTags, len_sample, len(numpy.concatenate(list1)), nr_unique_chimeras)
    else:
        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
            lenTags, len_sample, len(numpy.concatenate(list1)))
    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)

    # NOTE(review): this divides by nr_unique_chimeras unconditionally -- raises
    # ZeroDivisionError when nr_unique_chimeras == 0; confirm callers guarantee > 0
    legend2 = "SSCS ab = {:,} ({:.5f})\nSSCS ba = {:,} ({:.5f})\nDCS = {:,} ({:.5f})".format(
        len(list1[1]), len(list1[1]) / float(nr_unique_chimeras),
        len(list1[2]), len(list1[2]) / float(nr_unique_chimeras),
        len(list1[0]), len(list1[0]) / float(nr_unique_chimeras))
    plt.text(0.6, -0.047, legend2, size=12, transform=plt.gcf().transFigure)

    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
252 | |
253 | |
def plotHDwithinSeq(sum1, sum1min, sum2, sum2min, min_value, lenTags, pdf, len_sample, rel_freq=False):
    """Append a (non-stacked) histogram of partial TDs within tags to *pdf*.

    sum1, sum1min, sum2, sum2min -- per-half TD arrays; min_value -- the
    combined TDs (a + b). The five series are plotted side by side.
    lenTags -- total number of tags; pdf -- open PdfPages; len_sample --
    sample size (legend only); rel_freq -- relative y axis.
    """
    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)

    # NOTE(review): legend labels below are ["TD a.min", "TD b.max", "TD b.min",
    # "TD a.max", ...] -- confirm this ordering matches [sum1, sum1min, sum2, sum2min]
    ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)]  # new hd within tags
    maximumX = numpy.amax(numpy.concatenate(ham_partial))
    minimumX = numpy.amin(numpy.concatenate(ham_partial))
    maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda x: numpy.bincount(x), ham_partial))))

    # degenerate case (all TDs identical): fall back to a single bin value
    if len(range(minimumX, maximumX)) == 0:
        range1 = minimumX
    else:
        range1 = range(minimumX, maximumX + 2)

    if rel_freq:
        # weight each series by 1/len(series) so every series sums to 1
        w = [numpy.zeros_like(data) + 1. / len(data) for data in ham_partial]
        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, weights=w,
                 label=["TD a.min", "TD b.max", "TD b.min", "TD a.max", "TD a.min + b.max,\nTD a.max + b.min"],
                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
                 edgecolor='black', linewidth=1)
        plt.ylabel("Relative Frequency", fontsize=14)
        plt.ylim(0, 1.07)
    else:
        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False,
                 label=["TD a.min", "TD b.max", "TD b.min", "TD a.max", "TD a.min + b.max,\nTD a.max + b.min"],
                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
                 edgecolor='black', linewidth=1)
        plt.ylabel("Absolute Frequency", fontsize=14)

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.6, 1))
    plt.suptitle('Tag distances within tags', fontsize=14)
    plt.xlabel("TD", fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - 1, maximumX + 1))
    # plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2))
    plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
    legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
        lenTags, len_sample, len(numpy.concatenate(ham_partial)))
    plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
296 | |
297 | |
def createTableFSD2(list1, diff=True):
    """Build a family-size cross-table from per-class family-size lists.

    list1 -- sequence of arrays, one per TD class (diff=False, 6 columns) or
        per TD-difference class (diff=True, 7 columns); each array holds the
        family sizes observed in that class.

    Returns (final, sumCol): final is a string matrix with a "FS=x" row
    label, the per-class counts and a row sum; sumCol is the per-class
    column sum. The last family size is relabelled ">20" because sizes are
    capped at 20 upstream.
    """
    selfAB = numpy.concatenate(list1)
    uniqueFS = numpy.unique(selfAB)
    n_columns = 6 if diff is False else 7
    count = numpy.zeros((len(uniqueFS), n_columns))
    # One column per input class (replaces the original copy-pasted
    # state-machine, one duplicated block per column). Classes beyond the
    # table width are ignored, mirroring the original behaviour.
    for col, category in enumerate(list1):
        if col >= n_columns:
            break
        freq = Counter(category)
        if not freq:
            continue
        for row, fs in enumerate(uniqueFS):
            if fs in freq:
                count[row, col] = freq[fs]
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    uniqueFS = uniqueFS.astype(str)
    # family sizes >= 20 are capped at 20 upstream, so relabel the last row
    if uniqueFS[len(uniqueFS) - 1] == "20":
        uniqueFS[len(uniqueFS) - 1] = ">20"
    first = ["FS={}".format(i) for i in uniqueFS]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
360 | |
361 | |
def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True):
    """Write an FSD summary table to *output_file*.

    summary -- string matrix from createTableFSD2 (row label + counts + row
    sum); sumCol -- per-column totals; overallSum -- grand total (numpy
    scalar); name -- table caption; sep -- column separator; rel/diff --
    select the header matching the table layout.
    """
    output_file.write(name)
    output_file.write("\n")
    # header labels matching the table's column layout
    if diff is False:
        header = ["TD=1", "TD=2", "TD=3", "TD=4", "TD=5-8", "TD>8", "sum"]
    elif rel is False:
        header = ["diff=0", "diff=1", "diff=2", "diff=3", "diff=4", "diff=5-8", "diff>8", "sum"]
    else:
        header = ["diff=0", "diff=0.1", "diff=0.2", "diff=0.3", "diff=0.4", "diff=0.5-0.8", "diff>0.8", "sum"]
    output_file.write("{0}{1}{0}\n".format(sep, sep.join(header)))
    for row in summary:
        for cell in row:
            # numeric cells are stored as float strings; print them as ints
            if "FS" not in cell and "diff" not in cell:
                cell = cell.astype(float).astype(int)
            output_file.write("{}{}".format(cell, sep))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for column_total in map(int, sumCol):
        output_file.write("{}{}".format(column_total, sep))
    output_file.write("{}{}".format(overallSum.astype(int), sep))
    output_file.write("\n\n")
388 | |
389 | |
def createTableHD(list1, row_label):
    """Build a TD cross-table from per-family-size TD lists.

    list1 -- sequence of arrays, one per family-size class (FS=1..FS>10,
        6 columns); each array holds the TDs observed in that class.
    row_label -- prefix for the row labels (e.g. "TD=").

    Returns (final, sumCol): final is a string matrix with a row label, the
    per-class counts and a row sum; sumCol is the per-class column sum.
    """
    selfAB = numpy.concatenate(list1)
    uniqueHD = numpy.unique(selfAB)
    count = numpy.zeros((len(uniqueHD), 6))
    # One column per input class (replaces the original copy-pasted
    # state-machine, one duplicated block per column). Classes beyond the
    # table width are ignored, mirroring the original behaviour.
    for col, category in enumerate(list1):
        if col >= 6:
            break
        freq = Counter(category)
        if not freq:
            continue
        for row, hd in enumerate(uniqueHD):
            if hd in freq:
                count[row, col] = freq[hd]
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["{}{}".format(row_label, i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
441 | |
442 | |
def createTableHDwithTags(list1):
    """Build a TD-within-tags cross-table from five partial-TD lists.

    list1 -- sequence of arrays (5 columns: TD a.min, TD b.max, TD b.min,
        TD a.max, combined); each array holds the TDs observed in that class.

    Returns (final, sumCol): final is a string matrix with a "TD=x" row
    label, the per-class counts and a row sum; sumCol is the per-class
    column sum.
    """
    selfAB = numpy.concatenate(list1)
    uniqueHD = numpy.unique(selfAB)
    count = numpy.zeros((len(uniqueHD), 5))
    # One column per input class (replaces the original copy-pasted
    # state-machine, one duplicated block per column). Classes beyond the
    # table width are ignored, mirroring the original behaviour.
    for col, category in enumerate(list1):
        if col >= 5:
            break
        freq = Counter(category)
        if not freq:
            continue
        for row, hd in enumerate(uniqueHD):
            if hd in freq:
                count[row, col] = freq[hd]
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["TD={}".format(i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
489 | |
490 | |
def createTableHDwithDCS(list1):
    """Build a TD cross-table for chimeric reads split by read type.

    list1 -- sequence of TD arrays, one per read type (in practice 3:
        [DCS, SSCS ab, SSCS ba]); each array holds the TDs of that type.

    Returns (final, sumCol): final is a string matrix with a "TD=x" row
    label, the per-type counts and a row sum; sumCol is the per-type
    column sum.
    """
    selfAB = numpy.concatenate(list1)
    uniqueHD = numpy.unique(selfAB)
    count = numpy.zeros((len(uniqueHD), len(list1)))
    # One column per input list (replaces the original copy-pasted
    # state-machine). Generalized: the original allocated len(list1) columns
    # but only ever filled the first three; with the actual 3-list input the
    # result is identical.
    for col, category in enumerate(list1):
        freq = Counter(category)
        if not freq:
            continue
        for row, hd in enumerate(uniqueHD):
            if hd in freq:
                count[row, col] = freq[hd]
    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["TD={}".format(i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
527 | |
528 | |
def createFileHD(summary, sumCol, overallSum, output_file, name, sep):
    """Write a TD-by-family-size table to *output_file*.

    summary -- string matrix from createTableHD; sumCol -- per-column
    totals; overallSum -- grand total (numpy scalar); name -- caption;
    sep -- column separator.
    """
    output_file.write(name)
    output_file.write("\n")
    header = ["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10", "sum"]
    output_file.write("{0}{1}{0}\n".format(sep, sep.join(header)))
    for row in summary:
        for cell in row:
            # numeric cells are stored as float strings; print them as ints
            if "TD" not in cell and "diff" not in cell:
                cell = cell.astype(float).astype(int)
            output_file.write("{}{}".format(cell, sep))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for column_total in map(int, sumCol):
        output_file.write("{}{}".format(column_total, sep))
    output_file.write("{}{}".format(overallSum.astype(int), sep))
    output_file.write("\n\n")
547 | |
548 | |
def createFileHDwithDCS(summary, sumCol, overallSum, output_file, name, sep):
    """Write a TD-by-read-type (DCS / SSCS ab / SSCS ba) table to *output_file*.

    summary -- string matrix from createTableHDwithDCS; sumCol -- per-column
    totals; overallSum -- grand total (numpy scalar); name -- caption;
    sep -- column separator.
    """
    output_file.write(name)
    output_file.write("\n")
    header = ["DCS", "SSCS ab", "SSCS ba", "sum"]
    output_file.write("{0}{1}{0}\n".format(sep, sep.join(header)))
    for row in summary:
        for cell in row:
            # numeric cells are stored as float strings; print them as ints
            if "TD" not in cell:
                cell = cell.astype(float).astype(int)
            output_file.write("{}{}".format(cell, sep))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for column_total in map(int, sumCol):
        output_file.write("{}{}".format(column_total, sep))
    output_file.write("{}{}".format(overallSum.astype(int), sep))
    output_file.write("\n\n")
566 | |
567 | |
def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep):
    """Write a TD-within-tags table (five partial-TD columns) to *output_file*.

    summary -- string matrix from createTableHDwithTags; sumCol -- per-column
    totals; overallSum -- grand total (numpy scalar); name -- caption;
    sep -- column separator.
    """
    output_file.write(name)
    output_file.write("\n")
    header = ["TD a.min", "TD b.max", "TD b.min", "TD a.max",
              "TD a.min + b.max, TD a.max + b.min", "sum"]
    output_file.write("{0}{1}{0}\n".format(sep, sep.join(header)))
    for row in summary:
        for cell in row:
            # numeric cells are stored as float strings; print them as ints
            if "TD" not in cell:
                cell = cell.astype(float).astype(int)
            output_file.write("{}{}".format(cell, sep))
        output_file.write("\n")
    output_file.write("sum{}".format(sep))
    for column_total in map(int, sumCol):
        output_file.write("{}{}".format(column_total, sep))
    output_file.write("{}{}".format(overallSum.astype(int), sep))
    output_file.write("\n\n")
585 | |
586 | |
def hamming(array1, array2):
    """Return, for each tag in array1, its minimum nonzero Hamming distance
    to the tags in array2.

    array1 -- iterable of equal-length tag strings to analyse.
    array2 -- iterable of tag strings to compare against (deduplicated
        internally to reduce comparisons).

    Returns a float array of length len(array1). Raises if a tag has no
    positive distance to any tag in array2 (i.e. array2 deduplicates to
    that single identical tag) -- same as the original behaviour.
    """
    res = 99 * numpy.ones(len(array1))
    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
    for i, a in enumerate(array1):
        # character-wise mismatch count; generator works on Python 2 and 3
        # (the original itertools.imap is Python 2 only)
        dist = numpy.array([sum(c1 != c2 for c1, c2 in zip(a, b)) for b in array2])
        res[i] = numpy.amin(dist[dist > 0])  # pick min distance greater than zero
    return res
596 | |
597 | |
def hamming_difference(array1, array2, mate_b):
    """Tag-distance analysis of the two halves of each tag.

    Every tag consists of two halves (mate a + mate b). For each tag in
    array1 the minimum TD of the anchor half against all other tags in
    array2 is computed; among the tags achieving that minimum, the maximum
    TD of the complementary half is taken.

    array1 -- iterable of tag strings to analyse.
    array2 -- iterable of tag strings to compare against (deduplicated).
    mate_b -- False: anchor on the first half (a); True: anchor on the
        second half (b).

    Returns a list of 11 parallel per-tag lists:
    [abs diff of the two TDs, TD a (mate_b=False only), TD b (mate_b=True
    only), TD a + TD b, tag, relative diff, diff for tags with one
    identical half, those tags, max TD for a, max TD for b, partner tags]
    (the last "zeros" entries are None for tags with no identical half).
    """
    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
    # split every tag into its two halves; floor division keeps this exact
    # on Python 3 as well (the original used Python 2 integer "/")
    array1_half = numpy.array([i[0:len(i) // 2] for i in array1])  # mate1 part1
    array1_half2 = numpy.array([i[len(i) // 2:len(i)] for i in array1])  # mate1 part 2
    array2_half = numpy.array([i[0:len(i) // 2] for i in array2])  # mate2 part1
    array2_half2 = numpy.array([i[len(i) // 2:len(i)] for i in array2])  # mate2 part2

    diff11 = []
    relativeDiffList = []
    ham1 = []
    ham2 = []
    ham1min = []
    ham2min = []
    min_valueList = []
    min_tagsList = []
    diff11_zeros = []
    min_tagsList_zeros = []
    max_tag_list = []
    # select which half is the anchor (minimized) and which is complementary
    if mate_b is False:  # HD calculation for all a's
        half1_mate1 = array1_half
        half2_mate1 = array1_half2
        half1_mate2 = array2_half
        half2_mate2 = array2_half2
    elif mate_b is True:  # HD calculation for all b's
        half1_mate1 = array1_half2
        half2_mate1 = array1_half
        half1_mate2 = array2_half2
        half2_mate2 = array2_half

    for a, b, tag in zip(half1_mate1, half2_mate1, array1):
        # exclude identical tag from array2, to prevent comparison to itself
        sameTag = numpy.where(array2 == tag)[0]
        indexArray2 = numpy.arange(0, len(array2), 1)
        index_withoutSame = numpy.delete(indexArray2, sameTag)  # delete identical tag from the data

        # all tags without identical tag
        array2_half_withoutSame = half1_mate2[index_withoutSame]
        array2_half2_withoutSame = half2_mate2[index_withoutSame]
        array2_withoutSame = array2[index_withoutSame]  # whole tag (=not splitted into 2 halfs)
        # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
        # (generator mismatch count; the original itertools.imap is Python 2 only)
        dist = numpy.array([sum(x != y for x, y in zip(a, c)) for c in
                            array2_half_withoutSame])
        min_index = numpy.where(dist == dist.min())[0]  # get index of min HD
        min_value = dist.min()
        # get all "b's" of the tag or all "a's" of the tag with minimum HD
        min_tag_half2 = array2_half2_withoutSame[min_index]
        min_tag_array2 = array2_withoutSame[min_index]  # get whole tag with min HD

        # calculate HD of "b" to all "b's" or "a" to all "a's"
        dist_second_half = numpy.array([sum(x != y for x, y in zip(b, e)) for e in
                                        min_tag_half2])
        max_value = dist_second_half.max()
        max_index = numpy.where(dist_second_half == dist_second_half.max())[0]  # get index of max HD
        max_tag = min_tag_array2[max_index]

        if mate_b is True:  # half2, corrects the variable of the HD from both halfs if it is a or b
            ham2.append(min_value)
            ham2min.append(max_value)
        else:  # half1, corrects the variable of the HD from both halfs if it is a or b
            ham1.append(min_value)
            ham1min.append(max_value)

        min_valueList.append(min_value + max_value)
        min_tagsList.append(tag)
        difference1 = abs(min_value - max_value)
        diff11.append(difference1)
        rel_difference = round(float(difference1) / (min_value + max_value), 1)
        relativeDiffList.append(rel_difference)

        # tags which have identical parts:
        if min_value == 0 or max_value == 0:
            min_tagsList_zeros.append(numpy.array(tag))
            difference1_zeros = abs(min_value - max_value)  # td of non-identical part
            diff11_zeros.append(difference1_zeros)
            max_tag_list.append(numpy.array(max_tag))
        else:
            min_tagsList_zeros.append(None)
            diff11_zeros.append(None)
            max_tag_list.append(None)

    return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros,
             min_tagsList_zeros, ham1min, ham2min, max_tag_list])
706 | |
707 | |
def readFileReferenceFree(file):
    """Read a tab-separated tag file (family size in column 0).

    file -- path to a TABULAR file; lines starting with '#' are skipped.

    Returns (integers, data_array): the first column as an int array and
    the full file as a string matrix.
    """
    with open(file, 'r') as dest_f:
        # dtype=str instead of the Python-2-only 'string' alias (removed in
        # newer numpy); under Python 2 both mean the 'S' bytes dtype
        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype=str)
        integers = numpy.array(data_array[:, 0]).astype(int)
        return(integers, data_array)
713 | |
714 | |
def hammingDistanceWithFS(fs, ham):
    """Split the TD values in *ham* into six groups by family size.

    fs -- family size per tag; ham -- TD per tag (parallel sequences).

    Returns (list1, maximum, minimum): the TD arrays for the classes
    [FS=1, FS=2, FS=3, FS=4, FS=5-10, FS>10] plus the overall max/min TD.
    """
    fs = numpy.asarray(fs)
    maximum = max(ham)
    minimum = min(ham)
    ham = numpy.asarray(ham)

    # boolean masks for the six family-size classes, in plot order
    masks = [fs == 1, fs == 2, fs == 3, fs == 4,
             (fs >= 5) & (fs <= 10), fs > 10]
    list1 = [ham[numpy.where(mask)[0]] for mask in masks]
    return(list1, maximum, minimum)
741 | |
742 | |
def familySizeDistributionWithHD(fs, ham, diff=False, rel=True):
    """Group family sizes into bins of tag distance.

    Family sizes above 19 are capped at 20 (the last FS bin of the plots).

    Parameters
    ----------
    fs : sequence of int
        Family size of each tag (parallel to ham).
    ham : sequence of int or float
        Tag distance: absolute values, or relative deltas when rel=True.
    diff : bool
        If True, prepend an extra bin for ham == 0 (delta TD analysis).
    rel : bool
        If True, bin on relative deltas (0.1, 0.2, ...) instead of
        absolute distances (1, 2, ...).

    Returns
    -------
    tuple
        (list of FS arrays, one per TD bin; unique TDs; max(fs); min(fs))
    """
    hammingDistances = numpy.unique(ham)
    # numpy.array (not asarray): make a copy so that the capping below cannot
    # mutate the caller's array. The previous asarray aliased the input and
    # silently clipped the caller's family sizes at 20.
    fs = numpy.array(fs)
    ham = numpy.asarray(ham)
    bigFamilies2 = numpy.where(fs > 19)[0]
    if len(bigFamilies2) != 0:
        fs[bigFamilies2] = 20
    maximum = max(fs)
    minimum = min(fs)

    # boolean masks for the TD bins, in plotting order
    if rel is True:
        masks = [ham == 0.1, ham == 0.2, ham == 0.3, ham == 0.4,
                 (ham >= 0.5) & (ham <= 0.8), ham > 0.8]
    else:
        masks = [ham == 1, ham == 2, ham == 3, ham == 4,
                 (ham >= 5) & (ham <= 8), ham > 8]
    if diff is True:
        masks.insert(0, ham == 0)  # extra leading bin for delta TD == 0

    list1 = [fs[numpy.where(m)[0]] for m in masks]
    return(list1, hammingDistances, maximum, minimum)
798 | |
799 | |
def hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array):
    """Split the TDs of chimera-candidate tags by read type (DCS vs. single SSCS).

    Parameters
    ----------
    minHD_tags_zeros : sequence of str
        Tags for which one tag half had TD=0 (chimera candidates). May contain
        a tag more than once (one tag can have several minima).
    diff_zeros : sequence of int
        TD of the non-identical half, parallel to minHD_tags_zeros.
    data_array : numpy.ndarray
        Full dataset: column 0 = family size, column 1 = tag,
        column 2 = read direction ('ab'/'ba').

    Returns
    -------
    tuple
        ([TDs of DCS-forming tags, TDs of ab-only SSCS, TDs of ba-only SSCS],
         max(diff_zeros), min(diff_zeros))
    """
    diff_zeros = numpy.array(diff_zeros)
    maximum = numpy.amax(diff_zeros)
    minimum = numpy.amin(diff_zeros)
    minHD_tags_zeros = numpy.array(minHD_tags_zeros)

    # rows of the dataset belonging to the chimera-candidate tags
    idx = numpy.concatenate([numpy.where(data_array[:, 1] == i)[0] for i in minHD_tags_zeros])
    subset_data = data_array[idx, :]

    seq = numpy.array(subset_data[:, 1])

    # find all unique tags and get the indices for ALL tags, but only once
    # a tag seen twice has both an ab and a ba SSCS and therefore forms a DCS
    u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
    DCS_tags = u[c == 2]
    rest_tags = u[c == 1]

    dcs = numpy.repeat("DCS", len(DCS_tags))
    idx_sscs = numpy.concatenate([numpy.where(subset_data[:, 1] == i)[0] for i in rest_tags])
    sscs = subset_data[idx_sscs, 2]  # read direction ('ab'/'ba') of the single SSCS

    # one (tag, label) row per candidate; label is "DCS", "ab" or "ba"
    all_tags = numpy.column_stack((numpy.concatenate((DCS_tags, subset_data[idx_sscs, 1])),
                                   numpy.concatenate((dcs, sscs))))
    hd_DCS = []
    ab_SSCS = []
    ba_SSCS = []

    for i in range(len(all_tags)):
        tag = all_tags[i, :]
        # all TDs recorded for this tag (duplicates in minHD_tags_zeros give several)
        hd = diff_zeros[numpy.where(minHD_tags_zeros == tag[0])[0]]

        if tag[1] == "DCS":
            hd_DCS.append(hd)
        elif tag[1] == "ab":
            ab_SSCS.append(hd)
        elif tag[1] == "ba":
            ba_SSCS.append(hd)

    # flatten the per-tag arrays; empty categories stay plain lists
    if len(hd_DCS) != 0:
        hd_DCS = numpy.concatenate(hd_DCS)
    if len(ab_SSCS) != 0:
        ab_SSCS = numpy.concatenate(ab_SSCS)
    if len(ba_SSCS) != 0:
        ba_SSCS = numpy.concatenate(ba_SSCS)
    list1 = [hd_DCS, ab_SSCS, ba_SSCS]  # list for plotting
    return(list1, maximum, minimum)
845 | |
846 | |
def make_argparser():
    """Create the command-line parser for the tag distance analysis tool.

    NOTE: --only_DCS and --rel_freq use action="store_false": they default
    to True and passing the flag switches the behavior OFF. --nr_above_bars
    is a regular store_true flag (default False).
    """
    p = argparse.ArgumentParser(description='Tag distance analysis of duplex sequencing data')

    # input data
    p.add_argument('--inputFile',
                   help='Tabular File with three columns: ab or ba, tag and family size.')
    p.add_argument('--inputName1')

    # analysis options
    p.add_argument('--sample_size', default=1000, type=int,
                   help='Sample size of Tag distance analysis.')
    p.add_argument('--subset_tag', default=0, type=int,
                   help='The tag is shortened to the given number.')
    p.add_argument('--nproc', default=4, type=int,
                   help='The tool runs with the given number of processors.')
    p.add_argument('--only_DCS', action="store_false",
                   help='Only tags of the DCSs are included in the HD analysis')

    # family-size filtering
    p.add_argument('--minFS', default=1, type=int,
                   help='Only tags, which have a family size greater or equal than specified, '
                        'are included in the HD analysis')
    p.add_argument('--maxFS', default=0, type=int,
                   help='Only tags, which have a family size smaller or equal than specified, '
                        'are included in the HD analysis')

    # plot options
    p.add_argument('--nr_above_bars', action="store_true",
                   help='If False, values above bars in the histograms are removed')
    p.add_argument('--rel_freq', action="store_false",
                   help='If True, the relative frequencies are displayed.')

    # output files
    p.add_argument('--output_tabular', default="data.tabular", type=str,
                   help='Name of the tabular file.')
    p.add_argument('--output_pdf', default="data.pdf", type=str,
                   help='Name of the pdf file.')
    p.add_argument('--output_chimeras_tabular', default="data.tabular", type=str,
                   help='Name of the tabular file with all chimeric tags.')

    return p
880 | |
881 | |
def Hamming_Distance_Analysis(argv):
    """Run the full tag distance (TD) analysis of SSCS tags.

    Reads a tabular file (family size, tag, ab/ba direction), optionally
    restricts it to DCS-forming tags and/or a family-size range, samples
    tags, computes Hamming distances of whole tags and of both tag halves
    in parallel worker pools, identifies chimeric families (one identical
    half, i.e. TD=0), and writes a PDF with plots, a tabular summary and a
    tabular list of chimeric tags.

    NOTE(review): this module relies on Python 2 semantics — string slice
    bounds use integer '/' division and numpy dtype handling elsewhere is
    Python-2 style; under Python 3 the '/ 2' slices below would need '// 2'.

    Parameters
    ----------
    argv : list of str
        Command-line arguments (argv[0] is the program name); parsed with
        make_argparser().
    """
    parser = make_argparser()
    args = parser.parse_args(argv[1:])
    file1 = args.inputFile
    name1 = args.inputName1
    index_size = args.sample_size
    title_savedFile_pdf = args.output_pdf
    title_savedFile_csv = args.output_tabular
    output_chimeras_tabular = args.output_chimeras_tabular
    onlyDuplicates = args.only_DCS  # store_false flag: True unless --only_DCS given
    rel_freq = args.rel_freq  # store_false flag: True unless --rel_freq given
    minFS = args.minFS
    maxFS = args.maxFS
    nr_above_bars = args.nr_above_bars
    subset = args.subset_tag
    nproc = args.nproc
    sep = "\t"

    # input checks
    if index_size < 0:
        print("index_size is a negative integer.")
        exit(2)
    if nproc <= 0:
        print("nproc is smaller or equal zero")
        exit(3)
    if subset < 0:
        print("subset_tag is smaller or equal zero.")
        exit(5)

    # PLOT
    plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
    plt.rcParams['xtick.labelsize'] = 14
    plt.rcParams['ytick.labelsize'] = 14
    plt.rcParams['patch.edgecolor'] = "#000000"
    plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
    name1 = name1.split(".tabular")[0]

    with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf:
        print("dataset: ", name1)
        integers, data_array = readFileReferenceFree(file1)
        data_array = numpy.array(data_array)
        print("total nr of tags:", len(data_array))

        # filter tags out which contain any other character than ATCG
        valid_bases = ["A", "T", "G", "C"]
        tagsToDelete = []
        for idx, t in enumerate(data_array[:, 1]):
            for char in t:
                if char not in valid_bases:
                    tagsToDelete.append(idx)
                    break

        if len(tagsToDelete) != 0:  # delete tags with N in the tag from data
            print("nr of tags with any other character than A, T, C, G:", len(tagsToDelete),
                  float(len(tagsToDelete)) / len(data_array))
            index_whole_array = numpy.arange(0, len(data_array), 1)
            index_withoutN_inTag = numpy.delete(index_whole_array, tagsToDelete)
            data_array = data_array[index_withoutN_inTag, :]
            integers = integers[index_withoutN_inTag]
            print("total nr of filtered tags:", len(data_array))

        # keep only tags whose family size is at least minFS
        int_f = numpy.array(data_array[:, 0]).astype(int)
        data_array = data_array[numpy.where(int_f >= minFS)]
        integers = integers[integers >= minFS]

        # select family size for tags
        if maxFS > 0:
            int_f2 = numpy.array(data_array[:, 0]).astype(int)
            data_array = data_array[numpy.where(int_f2 <= maxFS)]
            integers = integers[integers <= maxFS]

        if onlyDuplicates is True:
            tags = data_array[:, 2]
            seq = data_array[:, 1]

            # find all unique tags and get the indices for ALL tags, but only once
            u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
            d = u[c == 2]

            # get family sizes, tag for duplicates
            duplTags_double = integers[numpy.in1d(seq, d)]
            duplTags = duplTags_double[0::2]  # ab of DCS
            duplTagsBA = duplTags_double[1::2]  # ba of DCS

            duplTags_tag = tags[numpy.in1d(seq, d)][0::2]  # ab
            duplTags_seq = seq[numpy.in1d(seq, d)][0::2]  # ab - tags

            if minFS > 1:
                # require BOTH strands (ab and ba) to pass the minFS filter
                duplTags_tag = duplTags_tag[(duplTags >= minFS) & (duplTagsBA >= minFS)]
                duplTags_seq = duplTags_seq[(duplTags >= minFS) & (duplTagsBA >= minFS)]
                duplTags = duplTags[(duplTags >= minFS) & (duplTagsBA >= minFS)]  # ab+ba with FS>=3

            data_array = numpy.column_stack((duplTags, duplTags_seq))
            data_array = numpy.column_stack((data_array, duplTags_tag))
            integers = numpy.array(data_array[:, 0]).astype(int)
            print("DCS in whole dataset", len(data_array))

        print("min FS", min(integers))
        print("max FS", max(integers))

        # HD analysis for a subset of the tag
        if subset > 0:
            # Python 2 integer division: splits the tag into its two halves
            tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]])
            tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]])

            # trim an equal flanking region from both ends so the kept center
            # of each half has length `subset`
            flanking_region_float = float((len(tag1[0]) - subset)) / 2
            flanking_region = int(flanking_region_float)
            if flanking_region_float % 2 == 0:
                tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1])
                tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2])
            else:
                # odd leftover: take one base more from the end side
                flanking_region_rounded = int(round(flanking_region, 1))
                flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded
                tag1_shorten = numpy.array(
                    [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1])
                tag2_shorten = numpy.array(
                    [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2])

            data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
            data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))

        print("length of tag= ", len(data_array[0, 1]))
        # select sample: if no size given --> all vs. all comparison
        if index_size == 0:
            result = numpy.arange(0, len(data_array), 1)
        else:
            numpy.random.shuffle(data_array)
            unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True)  # get only unique tags
            result = numpy.random.choice(unique_indices, size=index_size,
                                         replace=False)  # array of random sequences of size=index.size

        # result = numpy.random.choice(len(integers), size=index_size,
        #                              replace=False)  # array of random sequences of size=index.size
        # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0]

        # with open("index_result.pkl", "wb") as o:
        #     pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)

        # save counts
        # with open(data_folder + "index_sampleTags1000_Barcode3_DCS.pkl", "wb") as f:
        #     pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)
        # with open(data_folder + "dataArray_sampleTags1000_Barcode3_DCS.pkl", "wb") as f1:
        #     pickle.dump(data_array, f1, pickle.HIGHEST_PROTOCOL)
        #
        # with open(data_folder + "index_sampleTags100.pkl", "rb") as f:
        #     result = pickle.load(f)
        #
        # with open(data_folder + "dataArray_sampleTags100.pkl", "rb") as f1:
        #     data_array = pickle.load(f1)

        # with open(data_folder + "index_result.txt", "w") as t:
        #     for text in result:
        #         t.write("{}\n".format(text))

        # comparison random tags to whole dataset
        result1 = data_array[result, 1]  # random tags
        result2 = data_array[:, 1]  # all tags
        print("sample size= ", len(result1))

        # HD analysis of whole tag: sample chunks are distributed over nproc
        # workers; `hamming` is a helper defined earlier in this file
        proc_pool = Pool(nproc)
        chunks_sample = numpy.array_split(result1, nproc)
        ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample)
        proc_pool.close()
        proc_pool.join()
        ham = numpy.concatenate(ham).astype(int)
        # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1:
        #     for h, tag in zip(ham, result1):
        #         output_file1.write("{}\t{}\n".format(tag, h))

        # # HD analysis for chimeric reads
        # result2 = data_array_whole_dataset[:,1]

        # per-half TD analysis, once starting from half a (mate_b=False) and
        # once from half b (mate_b=True); see hamming_difference for the
        # meaning of the 11 result slots indexed below
        proc_pool_b = Pool(nproc)
        diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
        diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
        proc_pool_b.close()
        proc_pool_b.join()
        HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]),
                                     numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int)
        HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]),
                                     numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int)
        minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]),
                                    numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int)
        HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]),
                                        numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
        HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]),
                                        numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int)

        rel_Diff1 = numpy.concatenate([item[5] for item in diff_list_a])
        rel_Diff2 = numpy.concatenate([item[5] for item in diff_list_b])
        diff1 = numpy.concatenate([item[0] for item in diff_list_a])
        diff2 = numpy.concatenate([item[0] for item in diff_list_b])

        diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a])
        diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b])
        minHD_tags = numpy.concatenate([item[4] for item in diff_list_a])
        minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a])
        minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b])

        # sum(..., []) flattens the per-chunk lists of chimera tags
        chimera_tags1 = sum([item[10] for item in diff_list_a], [])
        chimera_tags2 = sum([item[10] for item in diff_list_b], [])

        # merge the a-first and b-first results: per tag keep the direction
        # with the larger relative difference; collect chimera candidates
        # (entries are None where no half had TD=0)
        rel_Diff = []
        diff_zeros = []
        minHD_tags_zeros = []
        diff = []
        chimera_tags = []
        for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \
                zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2,
                    chimera_tags1, chimera_tags2):
            relatives = numpy.array([rel1, rel2])
            absolutes = numpy.array([d1, d2])
            max_idx = numpy.argmax(relatives)
            rel_Diff.append(relatives[max_idx])
            diff.append(absolutes[max_idx])

            if all(i is not None for i in [zeros1, zeros2]):
                diff_zeros.append(max(zeros1, zeros2))
                minHD_tags_zeros.append(str(tag1))
                tags = [ctag1, ctag2]
                chimera_tags.append(tags)
            elif zeros1 is not None and zeros2 is None:
                diff_zeros.append(zeros1)
                minHD_tags_zeros.append(str(tag1))
                chimera_tags.append(ctag1)
            elif zeros1 is None and zeros2 is not None:
                diff_zeros.append(zeros2)
                minHD_tags_zeros.append(str(tag2))
                chimera_tags.append(ctag2)

        chimera_tags_new = chimera_tags
        data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new))
        # chimeras_dic = defaultdict(list)
        #
        # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new):
        #     if len(t2) >1 and type(t2) is not numpy.ndarray:
        #         t2 = numpy.concatenate(t2)
        #     chimeras_dic[t1].append(t2)

        # write one line per chimera tag; partner tags already reported are
        # skipped via checked_tags so each pair appears only once
        checked_tags = []
        stat_maxTags = []

        with open(output_chimeras_tabular, "w") as output_file1:
            output_file1.write("chimera tag\tfamily size, read direction\tsimilar tag with TD=0\n")
            for i in range(len(data_chimeraAnalysis)):
                tag1 = data_chimeraAnalysis[i, 0]

                info_tag1 = data_array[data_array[:, 1] == tag1, :]
                fs_tag1 = ["{} {}".format(t[0], t[2]) for t in info_tag1]

                if tag1 in checked_tags:  # skip tag if already written to file
                    continue

                # Python 2 integer division splits the tag into its halves
                sample_half_a = tag1[0:(len(tag1)) / 2]
                sample_half_b = tag1[len(tag1) / 2:len(tag1)]

                max_tags = data_chimeraAnalysis[i, 1]
                if len(max_tags) > 1 and len(max_tags) != len(data_array[0, 1]) and type(max_tags) is not numpy.ndarray:
                    max_tags = numpy.concatenate(max_tags)
                max_tags = numpy.unique(max_tags)
                stat_maxTags.append(len(max_tags))

                info_maxTags = [data_array[data_array[:, 1] == t, :] for t in max_tags]

                chimera_half_a = numpy.array([t[0:(len(t)) / 2] for t in max_tags])  # mate1 part1
                chimera_half_b = numpy.array([t[len(t) / 2:len(t)] for t in max_tags])  # mate1 part 2

                new_format = []
                for j in range(len(max_tags)):
                    fs_maxTags = ["{} {}".format(t[0], t[2]) for t in info_maxTags[j]]

                    # the * marks the half that is identical to the sample tag
                    if sample_half_a == chimera_half_a[j]:
                        max_tag = "*{}* {} {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
                        new_format.append(max_tag)

                    elif sample_half_b == chimera_half_b[j]:
                        max_tag = "{} *{}* {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags))
                        new_format.append(max_tag)
                    checked_tags.append(max_tags[j])

                sample_tag = "{} {}\t{}".format(sample_half_a, sample_half_b, ", ".join(fs_tag1))
                output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))

                checked_tags.append(tag1)

            output_file1.write(
                "This file contains all tags that were identified as chimeras as the first column and the "
                "corresponding tags which returned a Hamming distance of zero in either the first or the second "
                "half of the sample tag as the second column.\n"
                "The tags were separated by an empty space into their halves and the * marks the identical half.")
            output_file1.write("\n\nStatistics of nr. of tags that returned max. TD (2nd column)\n")
            output_file1.write("minimum\t{}\ttag(s)\n".format(numpy.amin(numpy.array(stat_maxTags))))
            output_file1.write("mean\t{}\ttag(s)\n".format(numpy.mean(numpy.array(stat_maxTags))))
            output_file1.write("median\t{}\ttag(s)\n".format(numpy.median(numpy.array(stat_maxTags))))
            output_file1.write("maximum\t{}\ttag(s)\n".format(numpy.amax(numpy.array(stat_maxTags))))
            output_file1.write("sum\t{}\ttag(s)\n".format(numpy.sum(numpy.array(stat_maxTags))))

        lenTags = len(data_array)
        len_sample = len(result1)

        quant = numpy.array(data_array[result, 0]).astype(int)  # family size for sample of tags
        seq = numpy.array(data_array[result, 1])  # tags of sample
        ham = numpy.asarray(ham)  # HD for sample of tags

        if onlyDuplicates is True:  # ab and ba strands of DCSs
            quant = numpy.concatenate((quant, duplTagsBA[result]))
            seq = numpy.tile(seq, 2)
            ham = numpy.tile(ham, 2)
            diff = numpy.tile(diff, 2)
            rel_Diff = numpy.tile(rel_Diff, 2)
            diff_zeros = numpy.tile(diff_zeros, 2)

        nr_chimeric_tags = len(data_chimeraAnalysis)
        print("nr of chimeras", nr_chimeric_tags)

        # prepare data for different kinds of plots
        # distribution of FSs separated after HD
        familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False)
        list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham)  # histogram of HDs separated after FS

        # get FS for all tags with min HD of analysis of chimeric reads
        # there are more tags than sample size in the plot, because one tag can have multiple minimas
        if onlyDuplicates:
            seqDic = defaultdict(list)
            for s, q in zip(seq, quant):
                seqDic[s].append(q)
        else:
            seqDic = dict(zip(seq, quant))

        lst_minHD_tags = []
        for i in minHD_tags:
            lst_minHD_tags.append(seqDic.get(i))

        if onlyDuplicates:
            # split each [ab, ba] pair back into two parallel FS arrays
            lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags],
                                                [item_b[1] for item_b in lst_minHD_tags])).astype(int)
        # histogram with absolute and relative difference between HDs of both parts of the tag
        listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
        listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
                                                                                                rel_Diff)
        # chimeric read analysis: tags which have TD=0 in one of the halfs
        if len(minHD_tags_zeros) != 0:
            lst_minHD_tags_zeros = []
            for i in minHD_tags_zeros:
                lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads
            if onlyDuplicates:
                lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros],
                                                          [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)

            # histogram with HD of non-identical half
            listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
                lst_minHD_tags_zeros, diff_zeros)

            if onlyDuplicates is False:
                listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros = hammingDistanceWithDCS(minHD_tags_zeros,
                                                                                             diff_zeros, data_array)

        # plot Hamming Distance with Family size distribution
        plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, rel_freq=rel_freq,
                      subtitle="Tag distance separated by family size", lenTags=lenTags,
                      xlabel="TD", nr_above_bars=nr_above_bars, len_sample=len_sample)

        # Plot FSD with separation after
        plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, rel_freq=rel_freq,
                       originalCounts=quant, subtitle="Family size distribution separated by Tag distance",
                       pdf=pdf, relative=False, diff=False)

        # Plot HD within tags
        plotHDwithinSeq(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
                        rel_freq=rel_freq, len_sample=len_sample)

        # Plot difference between HD's separated after FSD
        plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
                      subtitle="Delta Tag distance within tags", lenTags=lenTags, rel_freq=rel_freq,
                      xlabel="absolute delta TD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample)

        plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
                      subtitle="Chimera Analysis: relative delta Tag distance", lenTags=lenTags, rel_freq=rel_freq,
                      xlabel="relative delta TD", relative=True, nr_above_bars=nr_above_bars,
                      nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)

        # plots for chimeric reads
        if len(minHD_tags_zeros) != 0:
            # HD
            plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
                          subtitle="Tag distance of chimeric families (CF)", rel_freq=rel_freq,
                          lenTags=lenTags, xlabel="TD", relative=False,
                          nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)

            if onlyDuplicates is False:
                plotHDwithDCS(listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros, pdf=pdf,
                              subtitle="Tag distance of chimeric families (CF)", rel_freq=rel_freq,
                              lenTags=lenTags, xlabel="TD", relative=False,
                              nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)

        # print all data to a CSV file
        # HD
        summary, sumCol = createTableHD(list1, "TD=")
        overallSum = sum(sumCol)  # sum of columns in table

        # FSD
        summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False)
        overallSum5 = sum(sumCol5)

        # HD of both parts of the tag
        summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)])
        overallSum9 = sum(sumCol9)

        # HD
        # absolute difference
        summary11, sumCol11 = createTableHD(listDifference1, "diff=")
        overallSum11 = sum(sumCol11)
        # relative difference and all tags
        summary13, sumCol13 = createTableHD(listRelDifference1, "diff=")
        overallSum13 = sum(sumCol13)

        # chimeric reads
        if len(minHD_tags_zeros) != 0:
            # absolute difference and tags where at least one half has HD=0
            summary15, sumCol15 = createTableHD(listDifference1_zeros, "TD=")
            overallSum15 = sum(sumCol15)

            if onlyDuplicates is False:
                summary16, sumCol16 = createTableHDwithDCS(listDCS_zeros)
                overallSum16 = sum(sumCol16)

        output_file.write("{}\n".format(name1))
        output_file.write("nr of tags{}{:,}\nsample size{}{:,}\n\n".format(sep, lenTags, sep, len_sample))

        # HD
        createFileHD(summary, sumCol, overallSum, output_file,
                     "Tag distance separated by family size", sep)
        # FSD
        createFileFSD2(summary5, sumCol5, overallSum5, output_file,
                       "Family size distribution separated by Tag distance", sep,
                       diff=False)

        # output_file.write("{}{}\n".format(sep, name1))
        output_file.write("\n")
        max_fs = numpy.bincount(integers[result])
        output_file.write("max. family size in sample:{}{}\n".format(sep, max(integers[result])))
        output_file.write("absolute frequency:{}{}\n".format(sep, max_fs[len(max_fs) - 1]))
        output_file.write(
            "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs)))

        # HD within tags
        output_file.write(
            "Chimera Analysis:\nThe tags are splitted into two halves (part a and b) for which the Tag distances (TD) are calculated seperately.\n"
            "The tag distance of the first half (part a) is calculated by comparing part a of the tag in the sample against all a parts in the dataset and by selecting the minimum value (TD a.min).\n"
            "In the next step, we select those tags that showed the minimum TD and estimate the TD for the second half (part b) of the tag by comparing part b against the previously selected subset.\n"
            "The maximum value represents then TD b.max. Finally, these process is repeated but starting with part b instead and TD b.min and TD a.max are calculated.\n"
            "Next, the absolute differences between TD a.min & TD b.max and TD b.min & TD a.max are estimated (delta HD).\n"
            "These are then divided by the sum of both parts (TD a.min + TD b.max or TD b.min + TD a.max, respectively) which give the relative differences between the partial HDs (rel. delta HD).\n"
            "For simplicity, we used the maximum value of the relative differences and the respective delta HD.\n"
            "Note that when only tags that can form a DCS are included in the analysis, the family sizes for both directions (ab and ba) of the strand will be included in the plots.\n")

        output_file.write("\nlength of one half of the tag{}{}\n\n".format(sep, len(data_array[0, 1]) / 2))

        createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
                              "Tag distance of each half in the tag", sep)
        createFileHD(summary11, sumCol11, overallSum11, output_file,
                     "Absolute delta Tag distance within the tag", sep)

        createFileHD(summary13, sumCol13, overallSum13, output_file,
                     "Chimera analysis: relative delta Tag distance", sep)

        if len(minHD_tags_zeros) != 0:
            output_file.write(
                "All tags are filtered and only those tags where one half is identical (TD=0) and therefore, have a relative delta TD of 1, are kept.\n"
                "These tags are considered as chimeras.\n")
            createFileHD(summary15, sumCol15, overallSum15, output_file,
                         "Tag distance of chimeric families separated after FS", sep)

            if onlyDuplicates is False:
                createFileHDwithDCS(summary16, sumCol16, overallSum16, output_file,
                                    "Tag distance of chimeric families separated after DCS and single SSCS (ab, ba)", sep)

        output_file.write("\n")
1362 | |
if __name__ == '__main__':
    # exit with the analysis result as the process return code
    # (the function returns None on success, which maps to exit status 0)
    sys.exit(Hamming_Distance_Analysis(sys.argv))