Mercurial > repos > mheinzl > hd
comparison hd.py @ 29:6b15b3b6405c draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit 5b3ab8c6467fe3a52e89f5a7d175bd8a0189018a-dirty
author | mheinzl |
---|---|
date | Wed, 24 Jul 2019 05:58:15 -0400 |
parents | 1fa7342a140d |
children | 46bfbec0f9e6 |
comparison
equal
deleted
inserted
replaced
28:1fa7342a140d | 29:6b15b3b6405c |
---|---|
3 # Hamming distance analysis of SSCSs | 3 # Hamming distance analysis of SSCSs |
4 # | 4 # |
5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) | 5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) |
6 # Contact: monika.heinzl@edumail.at | 6 # Contact: monika.heinzl@edumail.at |
7 # | 7 # |
8 # Takes at least one TABULAR file with tags before the alignment to the SSCS and optionally a second TABULAR file as input. | 8 # Takes at least one TABULAR file with tags before the alignment to the SSCS and |
9 # The program produces a plot which shows a histogram of Hamming distances separated after family sizes, | 9 # optionally a second TABULAR file as input. The program produces a plot which shows a histogram of Hamming distances |
10 # a family size distribution separated after Hamming distances for all (sample_size=0) or a given sample of SSCSs or SSCSs, which form a DCS. | 10 # separated after family sizes, a family size distribution separated after Hamming distances for all (sample_size=0) |
11 # In addition, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads | 11 # or a given sample of SSCSs or SSCSs, which form a DCS. In addition, the tool produces HD and FSD plots for the |
12 # and finally a CSV file with the data of the plots. | 12 # difference between the HDs of both parts of the tags and for the chimeric reads and finally a CSV file with the |
13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. | 13 # data of the plots. It is also possible to perform the HD analysis with shortened tags with given sizes as input. |
14 # The tool can run on a certain number of processors, which can be defined by the user. | 14 # The tool can run on a certain number of processors, which can be defined by the user. |
15 | 15 |
16 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int / | 16 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int / |
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outputfile_name_tabular | 17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int |
18 # --nr_above_bars True/False --output_tabular outputfile_name_tabular | |
18 | 19 |
19 import argparse | 20 import argparse |
20 import itertools | 21 import itertools |
21 import operator | 22 import operator |
22 import sys | 23 import sys |
32 | 33 |
33 plt.switch_backend('agg') | 34 plt.switch_backend('agg') |
34 | 35 |
35 | 36 |
36 def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts, | 37 def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts, |
37 title_file1, subtitle, pdf, relative=False, diff=True): | 38 subtitle, pdf, relative=False, diff=True, rel_freq=False): |
38 if diff is False: | 39 if diff is False: |
39 colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"] | 40 colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"] |
40 labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8", "HD>8"] | 41 labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8", "HD>8"] |
41 else: | 42 else: |
42 colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"] | 43 colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"] |
46 labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"] | 47 labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"] |
47 | 48 |
48 fig = plt.figure(figsize=(6, 7)) | 49 fig = plt.figure(figsize=(6, 7)) |
49 ax = fig.add_subplot(111) | 50 ax = fig.add_subplot(111) |
50 plt.subplots_adjust(bottom=0.1) | 51 plt.subplots_adjust(bottom=0.1) |
51 p1 = numpy.bincount(numpy.concatenate((familySizeList1))) | 52 p1 = numpy.bincount(numpy.concatenate(familySizeList1)) |
52 maximumY = numpy.amax(p1) | 53 maximumY = numpy.amax(p1) |
53 | 54 |
54 if len(range(minimumXFS, maximumXFS)) == 0: | 55 if len(range(minimumXFS, maximumXFS)) == 0: |
55 range1 = range(minimumXFS - 1, minimumXFS + 2) | 56 range1 = range(minimumXFS - 1, minimumXFS + 2) |
56 else: | 57 else: |
57 range1 = range(0, maximumXFS + 2) | 58 range1 = range(0, maximumXFS + 2) |
58 counts = plt.hist(familySizeList1, label=labels, | 59 |
59 color=colors, stacked=True, | 60 if rel_freq: |
60 rwidth=0.8, alpha=1, align="left", | 61 w = [numpy.zeros_like(data) + 1. / len(numpy.concatenate(familySizeList1)) for data in familySizeList1] |
61 edgecolor="None", bins=range1) | 62 counts = plt.hist(familySizeList1, label=labels, weights=w, color=colors, stacked=True, |
63 rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1) | |
64 plt.ylabel("Relative Frequency", fontsize=14) | |
65 plt.ylim((0, (float(maximumY) / sum(p1)) * 1.1)) | |
66 else: | |
67 counts = plt.hist(familySizeList1, label=labels, color=colors, stacked=True, | |
68 rwidth=0.8, alpha=1, align="left", edgecolor="None", bins=range1) | |
69 if len(numpy.concatenate(familySizeList1)) != 0: | |
70 plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1)) | |
71 plt.ylabel("Absolute Frequency", fontsize=14) | |
72 plt.ylim((0, maximumY * 1.2)) | |
62 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) | 73 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) |
63 | |
64 # plt.title(title_file1, fontsize=12) | |
65 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) | 74 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) |
66 plt.xlabel("Family size", fontsize=14) | 75 plt.xlabel("Family size", fontsize=14) |
67 plt.ylabel("Absolute Frequency", fontsize=14) | |
68 | |
69 ticks = numpy.arange(0, maximumXFS + 1, 1) | 76 ticks = numpy.arange(0, maximumXFS + 1, 1) |
70 ticks1 = map(str, ticks) | 77 ticks1 = map(str, ticks) |
71 if maximumXFS >= 20: | 78 if maximumXFS >= 20: |
72 ticks1[len(ticks1) - 1] = ">=20" | 79 ticks1[len(ticks1) - 1] = ">=20" |
73 plt.xticks(numpy.array(ticks), ticks1) | 80 plt.xticks(numpy.array(ticks), ticks1) |
74 [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0] | 81 [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0] |
75 | |
76 plt.xlim((0, maximumXFS + 1)) | 82 plt.xlim((0, maximumXFS + 1)) |
77 if len(numpy.concatenate(familySizeList1)) != 0: | |
78 plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1)) | |
79 | |
80 plt.ylim((0, maximumY * 1.2)) | |
81 legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: " | 83 legend = "\nfamily size: \nabsolute frequency: \nrelative frequency: " |
82 plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure) | 84 plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure) |
83 | 85 |
84 count = numpy.bincount(originalCounts) # original counts | 86 count = numpy.bincount(originalCounts) # original counts |
85 if max(originalCounts) >= 20: | 87 if max(originalCounts) >= 20: |
86 max_count = ">= 20" | 88 max_count = ">= 20" |
87 else: | 89 else: |
88 max_count = max(originalCounts) | 90 max_count = max(originalCounts) |
89 legend1 = "{}\n{}\n{:.5f}".format(max_count, count[len(count) - 1], float(count[len(count) - 1]) / sum(count)) | 91 legend1 = "{}\n{}\n{:.5f}".format(max_count, p1[len(p1) - 1], float(p1[len(p1) - 1]) / sum(p1)) |
90 plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure) | 92 plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure) |
91 legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]), float(counts[0][len(counts[0]) - 1][1]) / sum(counts[0][len(counts[0]) - 1])) | 93 legend3 = "singletons\n{:,}\n{:.5f}".format(int(p1[1]), float(p1[1]) / sum(p1)) |
92 plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12) | 94 plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12) |
93 plt.grid(b=True, which='major', color='#424242', linestyle=':') | 95 plt.grid(b=True, which='major', color='#424242', linestyle=':') |
94 | |
95 pdf.savefig(fig, bbox_inches="tight") | 96 pdf.savefig(fig, bbox_inches="tight") |
96 plt.close("all") | 97 plt.close("all") |
97 | 98 |
98 | 99 |
def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
    """Plot a histogram of Hamming distances (HD) stacked by family size (FS).

    list1: list of arrays of HD values, one per FS category (FS=1 .. FS>10).
    maximumX/minimumX: x-axis range; subtitle: figure title; lenTags: total nr.
    of tags; pdf: PdfPages object the figure is appended to; xlabel: x-axis label.
    relative=True uses 0.1-wide bins (relative differences); rel_freq=True plots
    relative instead of absolute frequencies; nr_above_bars annotates each bar
    with its count and fraction; nr_unique_chimeras/len_sample extend the legend.
    """
    if relative is True:
        step = 0.1
    else:
        step = 1

    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)
    # Hoisted: the concatenated data is needed repeatedly below (counts, weights,
    # annotation offsets, legend), so compute it exactly once.
    con_list1 = numpy.concatenate(list1)
    p1 = numpy.array([v for k, v in sorted(Counter(con_list1).iteritems())])
    maximumY = numpy.amax(p1)
    if relative is True:  # relative difference -> fine-grained bins
        bin1 = numpy.arange(-1, maximumX + 0.2, 0.1)
    else:
        bin1 = maximumX + 1

    if rel_freq:
        # Weight every observation by 1/N so the stacked bars sum to 1.
        w = [numpy.zeros_like(data) + 1. / len(con_list1) for data in list1]
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, (float(maximumY) / sum(p1)) * 1.2))
        plt.ylabel("Relative Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        # index 5 = topmost stacked layer = cumulative totals per bin
        counts = numpy.array(map(float, counts[0][5]))
    else:
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
                          label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10", "FS>10"], rwidth=0.8,
                          color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, maximumY * 1.2))
        plt.ylabel("Absolute Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        counts = numpy.array(map(int, counts[0][5]))

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - step, maximumX + step))
    plt.xticks(numpy.arange(0, maximumX + step, step))

    if nr_above_bars:
        bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
        for x_label, label in zip(counts, bin_centers):  # labels for values
            if x_label == 0:
                continue
            if rel_freq:
                plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(con_list1))),
                                                   float(x_label)),
                             xy=(label, x_label + len(con_list1) * 0.0001),
                             xycoords="data", color="#000066", fontsize=10)
            else:
                plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
                             xy=(label, x_label + len(con_list1) * 0.01),
                             xycoords="data", color="#000066", fontsize=10)

    if nr_unique_chimeras != 0:
        # When each chimeric family (CF) contributed exactly two data points,
        # report both the CF count and the doubled data-point count.
        if (relative and ((counts[len(counts) - 1] / nr_unique_chimeras) == 2)) or \
                (sum(counts) / nr_unique_chimeras) == 2:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})"\
                .format(lenTags, len_sample, len(con_list1), nr_unique_chimeras, nr_unique_chimeras * 2)
        else:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
                lenTags, len_sample, len(con_list1), nr_unique_chimeras)
    else:
        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
            lenTags, len_sample, len(con_list1))

    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
163 | 178 |
164 | 179 |
def plotHDwithDCS(list1, maximumX, minimumX, subtitle, lenTags, pdf, xlabel, relative=False,
                  nr_above_bars=True, nr_unique_chimeras=0, len_sample=0, rel_freq=False):
    """Plot a histogram of Hamming distances stacked by read type (DCS/ab/ba).

    list1: [DCS, ab, ba] arrays of HD values. maximumX/minimumX: x-axis range;
    subtitle: figure title; lenTags: total nr. of tags; pdf: PdfPages object;
    xlabel: x-axis label. rel_freq=True plots relative instead of absolute
    frequencies; nr_above_bars annotates bars; nr_unique_chimeras/len_sample
    extend the legend. `relative` is accepted for signature parity with
    plotHDwithFSD but is not used here.
    """
    step = 1
    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)
    # Hoisted: the concatenated data is needed repeatedly below.
    con_list1 = numpy.concatenate(list1)
    p1 = numpy.array([v for k, v in sorted(Counter(con_list1).iteritems())])
    maximumY = numpy.amax(p1)
    bin1 = maximumX + 1
    if rel_freq:
        # Weight every observation by 1/N so the stacked bars sum to 1.
        w = [numpy.zeros_like(data) + 1. / len(con_list1) for data in list1]
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1, weights=w,
                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, (float(maximumY) / sum(p1)) * 1.2))
        plt.ylabel("Relative Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        # index 2 = topmost stacked layer = cumulative totals per bin
        counts = numpy.array(map(float, counts[0][2]))
    else:
        counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
                          label=["DCS", "ab", "ba"], rwidth=0.8, color=["#FF0000", "#5FB404", "#FFBF00"],
                          stacked=True, alpha=1, align="left", range=(0, maximumX + 1))
        plt.ylim((0, maximumY * 1.2))
        plt.ylabel("Absolute Frequency", fontsize=14)
        bins = counts[1]  # bin edges
        counts = numpy.array(map(int, counts[0][2]))

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - step, maximumX + step))
    plt.xticks(numpy.arange(0, maximumX + step, step))

    if nr_above_bars:
        bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
        for x_label, label in zip(counts, bin_centers):  # labels for values
            if x_label == 0:
                continue
            if rel_freq:
                plt.annotate("{:,}\n{:.3f}".format(int(round(x_label * len(con_list1))),
                                                   float(x_label)),
                             xy=(label, x_label + len(con_list1) * 0.0001),
                             xycoords="data", color="#000066", fontsize=10)
            else:
                plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts)),
                             xy=(label, x_label + len(con_list1) * 0.01),
                             xycoords="data", color="#000066", fontsize=10)

    if nr_unique_chimeras != 0:
        # When each chimeric family (CF) contributed exactly two data points,
        # report both the CF count and the doubled data-point count.
        if (sum(counts) / nr_unique_chimeras) == 2:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,} ({:,})".\
                format(lenTags, len_sample, len(con_list1), nr_unique_chimeras, nr_unique_chimeras * 2)
        else:
            legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}\nnr. of CF = {:,}".format(
                lenTags, len_sample, len(con_list1), nr_unique_chimeras)
    else:
        legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
            lenTags, len_sample, len(con_list1))
    plt.text(0.14, -0.07, legend, size=12, transform=plt.gcf().transFigure)

    legend2 = "SSCS ab = {:,}\nSSCS ba = {:,}\nDCS = {:,}".format(len(list1[1]), len(list1[2]), len(list1[0]))
    plt.text(0.6, -0.047, legend2, size=12, transform=plt.gcf().transFigure)

    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
249 | |
250 | |
def plotHDwithinSeq(sum1, sum1min, sum2, sum2min, min_value, lenTags, pdf, len_sample, rel_freq=False):
    """Plot Hamming distances within tags (parts a/b of each tag) side by side.

    sum1/sum1min/sum2/sum2min: arrays of HDs for tag halves (a, b', b, a');
    min_value: combined minimum HDs (a+b', a'+b); lenTags: total nr. of tags;
    pdf: PdfPages object the figure is appended to; len_sample: sample size for
    the legend; rel_freq=True plots relative instead of absolute frequencies.
    """
    fig = plt.figure(figsize=(6, 8))
    plt.subplots_adjust(bottom=0.1)

    ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)]  # new hd within tags
    # Hoisted: the concatenated data is reused for the axis limits and legend.
    # (The former per-bin maximum was dead code: its only consumer was a
    # commented-out plt.axis call, so the extra bincount pass is dropped.)
    con_ham = numpy.concatenate(ham_partial)
    maximumX = numpy.amax(con_ham)
    minimumX = numpy.amin(con_ham)

    # degenerate case: all values identical -> let hist pick a single bin
    if len(range(minimumX, maximumX)) == 0:
        range1 = minimumX
    else:
        range1 = range(minimumX, maximumX + 2)

    if rel_freq:
        # normalise each series independently so each one's bars sum to 1
        w = [numpy.zeros_like(data) + 1. / len(data) for data in ham_partial]
        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, weights=w,
                 label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b', a'+b"],
                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
                 edgecolor='black', linewidth=1)
        plt.ylabel("Relative Frequency", fontsize=14)
    else:
        plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False,
                 label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b', a'+b"],
                 bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"],
                 edgecolor='black', linewidth=1)
        plt.ylabel("Absolute Frequency", fontsize=14)

    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1))
    plt.suptitle('Hamming distances within tags', fontsize=14)
    plt.xlabel("HD", fontsize=14)
    plt.grid(b=True, which='major', color='#424242', linestyle=':')
    plt.xlim((minimumX - 1, maximumX + 1))
    plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
    legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(
        lenTags, len_sample, len(con_ham))
    plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
    pdf.savefig(fig, bbox_inches="tight")
    plt.close("all")
    plt.clf()
199 | 293 |
204 nr = numpy.arange(0, len(uniqueFS), 1) | 298 nr = numpy.arange(0, len(uniqueFS), 1) |
205 if diff is False: | 299 if diff is False: |
206 count = numpy.zeros((len(uniqueFS), 6)) | 300 count = numpy.zeros((len(uniqueFS), 6)) |
207 else: | 301 else: |
208 count = numpy.zeros((len(uniqueFS), 7)) | 302 count = numpy.zeros((len(uniqueFS), 7)) |
209 | |
210 state = 1 | 303 state = 1 |
211 for i in list1: | 304 for i in list1: |
212 counts = list(Counter(i).items()) | 305 counts = list(Counter(i).items()) |
213 hd = [item[0] for item in counts] | 306 hd = [item[0] for item in counts] |
214 c = [item[1] for item in counts] | 307 c = [item[1] for item in counts] |
216 if len(table) == 0: | 309 if len(table) == 0: |
217 state = state + 1 | 310 state = state + 1 |
218 continue | 311 continue |
219 else: | 312 else: |
220 if state == 1: | 313 if state == 1: |
221 for i, l in zip(uniqueFS, nr): | 314 for k, l in zip(uniqueFS, nr): |
222 for j in table: | 315 for j in table: |
223 if j[0] == uniqueFS[l]: | 316 if j[0] == uniqueFS[l]: |
224 count[l, 0] = j[1] | 317 count[l, 0] = j[1] |
225 if state == 2: | 318 if state == 2: |
226 for i, l in zip(uniqueFS, nr): | 319 for k, l in zip(uniqueFS, nr): |
227 for j in table: | 320 for j in table: |
228 if j[0] == uniqueFS[l]: | 321 if j[0] == uniqueFS[l]: |
229 count[l, 1] = j[1] | 322 count[l, 1] = j[1] |
230 | |
231 if state == 3: | 323 if state == 3: |
232 for i, l in zip(uniqueFS, nr): | 324 for k, l in zip(uniqueFS, nr): |
233 for j in table: | 325 for j in table: |
234 if j[0] == uniqueFS[l]: | 326 if j[0] == uniqueFS[l]: |
235 count[l, 2] = j[1] | 327 count[l, 2] = j[1] |
236 | |
237 if state == 4: | 328 if state == 4: |
238 for i, l in zip(uniqueFS, nr): | 329 for k, l in zip(uniqueFS, nr): |
239 for j in table: | 330 for j in table: |
240 if j[0] == uniqueFS[l]: | 331 if j[0] == uniqueFS[l]: |
241 count[l, 3] = j[1] | 332 count[l, 3] = j[1] |
242 | |
243 if state == 5: | 333 if state == 5: |
244 for i, l in zip(uniqueFS, nr): | 334 for k, l in zip(uniqueFS, nr): |
245 for j in table: | 335 for j in table: |
246 if j[0] == uniqueFS[l]: | 336 if j[0] == uniqueFS[l]: |
247 count[l, 4] = j[1] | 337 count[l, 4] = j[1] |
248 | |
249 if state == 6: | 338 if state == 6: |
250 for i, l in zip(uniqueFS, nr): | 339 for k, l in zip(uniqueFS, nr): |
251 for j in table: | 340 for j in table: |
252 if j[0] == uniqueFS[l]: | 341 if j[0] == uniqueFS[l]: |
253 count[l, 5] = j[1] | 342 count[l, 5] = j[1] |
254 | |
255 if state == 7: | 343 if state == 7: |
256 for i, l in zip(uniqueFS, nr): | 344 for k, l in zip(uniqueFS, nr): |
257 for j in table: | 345 for j in table: |
258 if j[0] == uniqueFS[l]: | 346 if j[0] == uniqueFS[l]: |
259 count[l, 6] = j[1] | 347 count[l, 6] = j[1] |
260 state = state + 1 | 348 state = state + 1 |
261 | |
262 sumRow = count.sum(axis=1) | 349 sumRow = count.sum(axis=1) |
263 sumCol = count.sum(axis=0) | 350 sumCol = count.sum(axis=0) |
264 | |
265 uniqueFS = uniqueFS.astype(str) | 351 uniqueFS = uniqueFS.astype(str) |
266 if uniqueFS[len(uniqueFS) - 1] == "20": | 352 if uniqueFS[len(uniqueFS) - 1] == "20": |
267 uniqueFS[len(uniqueFS) - 1] = ">20" | 353 uniqueFS[len(uniqueFS) - 1] = ">20" |
268 | |
269 first = ["FS={}".format(i) for i in uniqueFS] | 354 first = ["FS={}".format(i) for i in uniqueFS] |
270 final = numpy.column_stack((first, count, sumRow)) | 355 final = numpy.column_stack((first, count, sumRow)) |
271 | |
272 return (final, sumCol) | 356 return (final, sumCol) |
273 | 357 |
274 | 358 |
275 def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True): | 359 def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True): |
276 output_file.write(name) | 360 output_file.write(name) |
277 output_file.write("\n") | 361 output_file.write("\n") |
278 if diff is False: | 362 if diff is False: |
279 output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep)) | 363 output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format( |
364 sep, sep, sep, sep, sep, sep, sep, sep)) | |
280 else: | 365 else: |
281 if rel is False: | 366 if rel is False: |
282 output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) | 367 output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format( |
368 sep, sep, sep, sep, sep, sep, sep, sep, sep)) | |
283 else: | 369 else: |
284 output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) | 370 output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n". |
285 | 371 format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) |
286 for item in summary: | 372 for item in summary: |
287 for nr in item: | 373 for nr in item: |
288 if "FS" not in nr and "diff" not in nr: | 374 if "FS" not in nr and "diff" not in nr: |
289 nr = nr.astype(float) | 375 nr = nr.astype(float) |
290 nr = nr.astype(int) | 376 nr = nr.astype(int) |
312 if len(table) == 0: | 398 if len(table) == 0: |
313 state = state + 1 | 399 state = state + 1 |
314 continue | 400 continue |
315 else: | 401 else: |
316 if state == 1: | 402 if state == 1: |
317 for i, l in zip(uniqueHD, nr): | 403 for k, l in zip(uniqueHD, nr): |
318 for j in table: | 404 for j in table: |
319 if j[0] == uniqueHD[l]: | 405 if j[0] == uniqueHD[l]: |
320 count[l, 0] = j[1] | 406 count[l, 0] = j[1] |
321 if state == 2: | 407 if state == 2: |
322 for i, l in zip(uniqueHD, nr): | 408 for k, l in zip(uniqueHD, nr): |
323 for j in table: | 409 for j in table: |
324 if j[0] == uniqueHD[l]: | 410 if j[0] == uniqueHD[l]: |
325 count[l, 1] = j[1] | 411 count[l, 1] = j[1] |
326 | |
327 if state == 3: | 412 if state == 3: |
328 for i, l in zip(uniqueHD, nr): | 413 for k, l in zip(uniqueHD, nr): |
329 for j in table: | 414 for j in table: |
330 if j[0] == uniqueHD[l]: | 415 if j[0] == uniqueHD[l]: |
331 count[l, 2] = j[1] | 416 count[l, 2] = j[1] |
332 | |
333 if state == 4: | 417 if state == 4: |
334 for i, l in zip(uniqueHD, nr): | 418 for k, l in zip(uniqueHD, nr): |
335 for j in table: | 419 for j in table: |
336 if j[0] == uniqueHD[l]: | 420 if j[0] == uniqueHD[l]: |
337 count[l, 3] = j[1] | 421 count[l, 3] = j[1] |
338 | |
339 if state == 5: | 422 if state == 5: |
340 for i, l in zip(uniqueHD, nr): | 423 for k, l in zip(uniqueHD, nr): |
341 for j in table: | 424 for j in table: |
342 if j[0] == uniqueHD[l]: | 425 if j[0] == uniqueHD[l]: |
343 count[l, 4] = j[1] | 426 count[l, 4] = j[1] |
344 | |
345 if state == 6: | 427 if state == 6: |
346 for i, l in zip(uniqueHD, nr): | 428 for k, l in zip(uniqueHD, nr): |
347 for j in table: | 429 for j in table: |
348 if j[0] == uniqueHD[l]: | 430 if j[0] == uniqueHD[l]: |
349 count[l, 5] = j[1] | 431 count[l, 5] = j[1] |
350 state = state + 1 | 432 state = state + 1 |
351 | |
352 sumRow = count.sum(axis=1) | 433 sumRow = count.sum(axis=1) |
353 sumCol = count.sum(axis=0) | 434 sumCol = count.sum(axis=0) |
354 first = ["{}{}".format(row_label, i) for i in uniqueHD] | 435 first = ["{}{}".format(row_label, i) for i in uniqueHD] |
355 final = numpy.column_stack((first, count, sumRow)) | 436 final = numpy.column_stack((first, count, sumRow)) |
356 | |
357 return (final, sumCol) | 437 return (final, sumCol) |
358 | 438 |
359 | 439 |
def createTableHDwithTags(list1):
    """Tabulate Hamming-distance counts per category.

    list1: list of (up to five) arrays of HD values, one array per category.
    Returns (final, sumCol): `final` has one row per unique HD value across all
    categories -- an "HD=x" label, one count column per category, and the row
    sum -- and `sumCol` holds the per-category column totals.
    """
    selfAB = numpy.concatenate(list1)
    uniqueHD = numpy.unique(selfAB)
    count = numpy.zeros((len(uniqueHD), 5))

    # One column per input category. The original spelled this out as five
    # copy-pasted state branches differing only in the column index; a single
    # loop fills the same cells. Categories beyond the 5th are ignored,
    # matching the fixed width of `count`. An empty category simply
    # contributes no counts (the original skipped it via `len(table) == 0`).
    for col, data in enumerate(list1[:5]):
        freq = Counter(data)
        for row, hd in enumerate(uniqueHD):
            if hd in freq:
                count[row, col] = freq[hd]

    sumRow = count.sum(axis=1)
    sumCol = count.sum(axis=0)
    first = ["HD={}".format(i) for i in uniqueHD]
    final = numpy.column_stack((first, count, sumRow))
    return (final, sumCol)
486 | |
487 | |
488 def createTableHDwithDCS(list1): | |
489 selfAB = numpy.concatenate(list1) | |
490 uniqueHD = numpy.unique(selfAB) | |
491 nr = numpy.arange(0, len(uniqueHD), 1) | |
492 count = numpy.zeros((len(uniqueHD), len(list1))) | |
493 state = 1 | |
494 for i in list1: | |
495 counts = list(Counter(i).items()) | |
496 hd = [item[0] for item in counts] | |
497 c = [item[1] for item in counts] | |
498 table = numpy.column_stack((hd, c)) | |
499 if len(table) == 0: | |
500 state = state + 1 | |
501 continue | |
502 else: | |
503 if state == 1: | |
504 for k, l in zip(uniqueHD, nr): | |
505 for j in table: | |
506 if j[0] == uniqueHD[l]: | |
507 count[l, 0] = j[1] | |
508 if state == 2: | |
509 for k, l in zip(uniqueHD, nr): | |
510 for j in table: | |
511 if j[0] == uniqueHD[l]: | |
512 count[l, 1] = j[1] | |
513 if state == 3: | |
514 for k, l in zip(uniqueHD, nr): | |
515 for j in table: | |
516 if j[0] == uniqueHD[l]: | |
517 count[l, 2] = j[1] | |
518 state = state + 1 | |
519 sumRow = count.sum(axis=1) | |
520 sumCol = count.sum(axis=0) | |
521 first = ["HD={}".format(i) for i in uniqueHD] | |
522 final = numpy.column_stack((first, count, sumRow)) | |
409 return (final, sumCol) | 523 return (final, sumCol) |
410 | 524 |
411 | 525 |
412 def createFileHD(summary, sumCol, overallSum, output_file, name, sep): | 526 def createFileHD(summary, sumCol, overallSum, output_file, name, sep): |
413 output_file.write(name) | 527 output_file.write(name) |
414 output_file.write("\n") | 528 output_file.write("\n") |
415 output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep)) | 529 output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format( |
530 sep, sep, sep, sep, sep, sep, sep, sep)) | |
416 for item in summary: | 531 for item in summary: |
417 for nr in item: | 532 for nr in item: |
418 if "HD" not in nr and "diff" not in nr: | 533 if "HD" not in nr and "diff" not in nr: |
419 nr = nr.astype(float) | 534 nr = nr.astype(float) |
420 nr = nr.astype(int) | 535 nr = nr.astype(int) |
426 output_file.write("{}{}".format(el, sep)) | 541 output_file.write("{}{}".format(el, sep)) |
427 output_file.write("{}{}".format(overallSum.astype(int), sep)) | 542 output_file.write("{}{}".format(overallSum.astype(int), sep)) |
428 output_file.write("\n\n") | 543 output_file.write("\n\n") |
429 | 544 |
430 | 545 |
431 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep): | 546 def createFileHDwithDCS(summary, sumCol, overallSum, output_file, name, sep): |
432 output_file.write(name) | 547 output_file.write(name) |
433 output_file.write("\n") | 548 output_file.write("\n") |
434 output_file.write("{}HD a{}HD b'{}HD b{}HD a'{}HD a+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep)) | 549 output_file.write("{}DCS{}SSCS ab{}SSCS ba{}sum{}\n".format(sep, sep, sep, sep, sep)) |
435 for item in summary: | 550 for item in summary: |
436 for nr in item: | 551 for nr in item: |
437 if "HD" not in nr: | 552 if "HD" not in nr: |
438 nr = nr.astype(float) | 553 nr = nr.astype(float) |
439 nr = nr.astype(int) | 554 nr = nr.astype(int) |
445 output_file.write("{}{}".format(el, sep)) | 560 output_file.write("{}{}".format(el, sep)) |
446 output_file.write("{}{}".format(overallSum.astype(int), sep)) | 561 output_file.write("{}{}".format(overallSum.astype(int), sep)) |
447 output_file.write("\n\n") | 562 output_file.write("\n\n") |
448 | 563 |
449 | 564 |
565 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep): | |
566 output_file.write(name) | |
567 output_file.write("\n") | |
568 output_file.write("{}HD DCS{}HD b'{}HD b{}HD a'{}HD a+b', a'+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep)) | |
569 for item in summary: | |
570 for nr in item: | |
571 if "HD" not in nr: | |
572 nr = nr.astype(float) | |
573 nr = nr.astype(int) | |
574 output_file.write("{}{}".format(nr, sep)) | |
575 output_file.write("\n") | |
576 output_file.write("sum{}".format(sep)) | |
577 sumCol = map(int, sumCol) | |
578 for el in sumCol: | |
579 output_file.write("{}{}".format(el, sep)) | |
580 output_file.write("{}{}".format(overallSum.astype(int), sep)) | |
581 output_file.write("\n\n") | |
582 | |
583 | |
450 def hamming(array1, array2): | 584 def hamming(array1, array2): |
451 res = 99 * numpy.ones(len(array1)) | 585 res = 99 * numpy.ones(len(array1)) |
452 i = 0 | 586 i = 0 |
453 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time | 587 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time |
454 for a in array1: | 588 for a in array1: |
455 dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2]) # fastest | 589 dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2]) # fastest |
456 res[i] = numpy.amin(dist[dist > 0]) # pick min distance greater than zero | 590 res[i] = numpy.amin(dist[dist > 0]) # pick min distance greater than zero |
457 # print(i) | |
458 i += 1 | 591 i += 1 |
459 return res | 592 return res |
460 | 593 |
461 | 594 |
462 def hamming_difference(array1, array2, mate_b): | 595 def hamming_difference(array1, array2, mate_b): |
463 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time | 596 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time |
464 | |
465 array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1 | 597 array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1 |
466 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 | 598 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 |
467 | |
468 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 | 599 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 |
469 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 | 600 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 |
470 | 601 |
471 # diff11 = 999 * numpy.ones(len(array2)) | 602 # diff11 = 999 * numpy.ones(len(array2)) |
472 # relativeDiffList = 999 * numpy.ones(len(array2)) | 603 # relativeDiffList = 999 * numpy.ones(len(array2)) |
497 elif mate_b is True: # HD calculation for all b's | 628 elif mate_b is True: # HD calculation for all b's |
498 half1_mate1 = array1_half2 | 629 half1_mate1 = array1_half2 |
499 half2_mate1 = array1_half | 630 half2_mate1 = array1_half |
500 half1_mate2 = array2_half2 | 631 half1_mate2 = array2_half2 |
501 half2_mate2 = array2_half | 632 half2_mate2 = array2_half |
633 | |
502 # half1_mate1, index_halves = numpy.unique(half1_mate1, return_index=True) | 634 # half1_mate1, index_halves = numpy.unique(half1_mate1, return_index=True) |
503 # print(len(half1_mate1)) | 635 # print(len(half1_mate1)) |
504 # half2_mate1 = half2_mate1[index_halves] | 636 # half2_mate1 = half2_mate1[index_halves] |
505 # array1 = array1[index_halves] | 637 # array1 = array1[index_halves] |
506 | 638 |
512 | 644 |
513 # all tags without identical tag | 645 # all tags without identical tag |
514 array2_half_withoutSame = half1_mate2[index_withoutSame] | 646 array2_half_withoutSame = half1_mate2[index_withoutSame] |
515 array2_half2_withoutSame = half2_mate2[index_withoutSame] | 647 array2_half2_withoutSame = half2_mate2[index_withoutSame] |
516 array2_withoutSame = array2[index_withoutSame] # whole tag (=not split into 2 halves) | 648 array2_withoutSame = array2[index_withoutSame] # whole tag (=not split into 2 halves) |
517 | 649 # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" |
518 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in | 650 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in |
519 array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" | 651 array2_half_withoutSame]) |
520 min_index = numpy.where(dist == dist.min())[0] # get index of min HD | 652 min_index = numpy.where(dist == dist.min())[0] # get index of min HD |
521 min_value = dist.min() | 653 min_value = dist.min() |
522 # min_value = dist[min_index] # get minimum HDs | 654 # min_value = dist[min_index] # get minimum HDs |
523 min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD | 655 # get all "b's" of the tag or all "a's" of the tag with minimum HD |
656 min_tag_half2 = array2_half2_withoutSame[min_index] | |
524 min_tag_array2 = array2_withoutSame[min_index] # get whole tag with min HD | 657 min_tag_array2 = array2_withoutSame[min_index] # get whole tag with min HD |
525 | 658 |
526 dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" | 659 dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in |
660 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" | |
527 max_value = dist_second_half.max() | 661 max_value = dist_second_half.max() |
528 max_index = numpy.where(dist_second_half == dist_second_half.max())[0] # get index of max HD | 662 max_index = numpy.where(dist_second_half == dist_second_half.max())[0] # get index of max HD |
529 max_tag = min_tag_array2[max_index] | 663 max_tag = min_tag_array2[max_index] |
530 | 664 |
531 # for d, d2 in zip(min_value, max_value): | 665 # for d, d2 in zip(min_value, max_value): |
546 # tags which have identical parts: | 680 # tags which have identical parts: |
547 if min_value == 0 or max_value == 0: | 681 if min_value == 0 or max_value == 0: |
548 min_tagsList_zeros.append(numpy.array(tag)) | 682 min_tagsList_zeros.append(numpy.array(tag)) |
549 difference1_zeros = abs(min_value - max_value) # hd of non-identical part | 683 difference1_zeros = abs(min_value - max_value) # hd of non-identical part |
550 diff11_zeros.append(difference1_zeros) | 684 diff11_zeros.append(difference1_zeros) |
551 max_tag_list.append(max_tag) | 685 max_tag_list.append(numpy.array(max_tag)) |
552 else: | 686 else: |
553 min_tagsList_zeros.append(None) | 687 min_tagsList_zeros.append(None) |
554 diff11_zeros.append(None) | 688 diff11_zeros.append(None) |
555 max_tag_list.append(numpy.array(["None"])) | 689 max_tag_list.append(None) |
556 | |
557 # max_tag_list.append(numpy.array(max_tag)) | |
558 | |
559 i += 1 | 690 i += 1 |
560 | 691 |
561 # print(i) | 692 # print(i) |
562 # diff11 = [st for st in diff11 if st != 999] | 693 # diff11 = [st for st in diff11 if st != 999] |
563 # ham1 = [st for st in ham1 if st != 999] | 694 # ham1 = [st for st in ham1 if st != 999] |
565 # min_valueList = [st for st in min_valueList if st != 999] | 696 # min_valueList = [st for st in min_valueList if st != 999] |
566 # min_tagsList = [st for st in min_tagsList if st != 999] | 697 # min_tagsList = [st for st in min_tagsList if st != 999] |
567 # relativeDiffList = [st for st in relativeDiffList if st != 999] | 698 # relativeDiffList = [st for st in relativeDiffList if st != 999] |
568 # diff11_zeros = [st for st in diff11_zeros if st != 999] | 699 # diff11_zeros = [st for st in diff11_zeros if st != 999] |
569 # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] | 700 # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] |
570 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min, max_tag_list]) | 701 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, |
702 min_tagsList_zeros, ham1min, ham2min, max_tag_list]) | |
571 | 703 |
572 | 704 |
573 def readFileReferenceFree(file): | 705 def readFileReferenceFree(file): |
574 with open(file, 'r') as dest_f: | 706 with open(file, 'r') as dest_f: |
575 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') | 707 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') |
606 | 738 |
607 | 739 |
608 def familySizeDistributionWithHD(fs, ham, diff=False, rel=True): | 740 def familySizeDistributionWithHD(fs, ham, diff=False, rel=True): |
609 hammingDistances = numpy.unique(ham) | 741 hammingDistances = numpy.unique(ham) |
610 fs = numpy.asarray(fs) | 742 fs = numpy.asarray(fs) |
611 | |
612 ham = numpy.asarray(ham) | 743 ham = numpy.asarray(ham) |
613 bigFamilies2 = numpy.where(fs > 19)[0] | 744 bigFamilies2 = numpy.where(fs > 19)[0] |
614 if len(bigFamilies2) != 0: | 745 if len(bigFamilies2) != 0: |
615 fs[bigFamilies2] = 20 | 746 fs[bigFamilies2] = 20 |
616 maximum = max(fs) | 747 maximum = max(fs) |
659 list1 = [data0, data, data2, data3, data4, data5, data6] | 790 list1 = [data0, data, data2, data3, data4, data5, data6] |
660 else: | 791 else: |
661 list1 = [data, data2, data3, data4, data5, data6] | 792 list1 = [data, data2, data3, data4, data5, data6] |
662 | 793 |
663 return(list1, hammingDistances, maximum, minimum) | 794 return(list1, hammingDistances, maximum, minimum) |
795 | |
796 | |
797 def hammingDistanceWithDCS(minHD_tags_zeros, diff_zeros, data_array): | |
798 diff_zeros = numpy.array(diff_zeros) | |
799 maximum = numpy.amax(diff_zeros) | |
800 minimum = numpy.amin(diff_zeros) | |
801 minHD_tags_zeros = numpy.array(minHD_tags_zeros) | |
802 | |
803 idx = numpy.concatenate([numpy.where(data_array[:, 1] == i)[0] for i in minHD_tags_zeros]) | |
804 subset_data = data_array[idx, :] | |
805 | |
806 seq = numpy.array(subset_data[:, 1]) | |
807 | |
808 # find all unique tags and get the indices for ALL tags, but only once | |
809 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) | |
810 DCS_tags = u[c == 2] | |
811 rest_tags = u[c == 1] | |
812 | |
813 dcs = numpy.repeat("DCS", len(DCS_tags)) | |
814 idx_sscs = numpy.concatenate([numpy.where(subset_data[:, 1] == i)[0] for i in rest_tags]) | |
815 sscs = subset_data[idx_sscs, 2] | |
816 | |
817 all_tags = numpy.column_stack((numpy.concatenate((DCS_tags, subset_data[idx_sscs, 1])), | |
818 numpy.concatenate((dcs, sscs)))) | |
819 hd_DCS = [] | |
820 ab_SSCS = [] | |
821 ba_SSCS = [] | |
822 | |
823 for i in range(len(all_tags)): | |
824 tag = all_tags[i, :] | |
825 hd = diff_zeros[numpy.where(minHD_tags_zeros == tag[0])[0]] | |
826 | |
827 if tag[1] == "DCS": | |
828 hd_DCS.append(hd) | |
829 elif tag[1] == "ab": | |
830 ab_SSCS.append(hd) | |
831 elif tag[1] == "ba": | |
832 ba_SSCS.append(hd) | |
833 | |
834 if len(hd_DCS) != 0: | |
835 hd_DCS = numpy.concatenate(hd_DCS) | |
836 if len(ab_SSCS) != 0: | |
837 ab_SSCS = numpy.concatenate(ab_SSCS) | |
838 if len(ba_SSCS) != 0: | |
839 ba_SSCS = numpy.concatenate(ba_SSCS) | |
840 list1 = [hd_DCS, ab_SSCS, ba_SSCS] # list for plotting | |
841 return(list1, maximum, minimum) | |
664 | 842 |
665 | 843 |
666 def make_argparser(): | 844 def make_argparser(): |
667 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data') | 845 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data') |
668 parser.add_argument('--inputFile', | 846 parser.add_argument('--inputFile', |
676 help='The tool runs with the given number of processors.') | 854 help='The tool runs with the given number of processors.') |
677 parser.add_argument('--only_DCS', action="store_false", | 855 parser.add_argument('--only_DCS', action="store_false", |
678 help='Only tags of the DCSs are included in the HD analysis') | 856 help='Only tags of the DCSs are included in the HD analysis') |
679 | 857 |
680 parser.add_argument('--minFS', default=1, type=int, | 858 parser.add_argument('--minFS', default=1, type=int, |
681 help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis') | 859 help='Only tags, which have a family size greater or equal than specified, ' |
860 'are included in the HD analysis') | |
682 parser.add_argument('--maxFS', default=0, type=int, | 861 parser.add_argument('--maxFS', default=0, type=int, |
683 help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis') | 862 help='Only tags, which have a family size smaller or equal than specified, ' |
863 'are included in the HD analysis') | |
684 parser.add_argument('--nr_above_bars', action="store_true", | 864 parser.add_argument('--nr_above_bars', action="store_true", |
685 help='If no, values above bars in the histograms are removed') | 865 help='If False, values above bars in the histograms are removed') |
866 parser.add_argument('--rel_freq', action="store_false", | |
867 help='If True, the relative frequencies are displayed.') | |
686 | 868 |
687 parser.add_argument('--output_tabular', default="data.tabular", type=str, | 869 parser.add_argument('--output_tabular', default="data.tabular", type=str, |
688 help='Name of the tabular file.') | 870 help='Name of the tabular file.') |
689 parser.add_argument('--output_pdf', default="data.pdf", type=str, | 871 parser.add_argument('--output_pdf', default="data.pdf", type=str, |
690 help='Name of the pdf file.') | 872 help='Name of the pdf file.') |
693 | 875 |
694 return parser | 876 return parser |
695 | 877 |
696 | 878 |
697 def Hamming_Distance_Analysis(argv): | 879 def Hamming_Distance_Analysis(argv): |
698 | 880 # def Hamming_Distance_Analysis(file1, name1, index_size, title_savedFile_pdf, title_savedFile_csv, |
881 # output_chimeras_tabular, onlyDuplicates, minFS=1, maxFS=0, nr_above_bars=True, | |
882 # subset=False, nproc=12, rel_freq=False): | |
699 parser = make_argparser() | 883 parser = make_argparser() |
700 args = parser.parse_args(argv[1:]) | 884 args = parser.parse_args(argv[1:]) |
701 | |
702 file1 = args.inputFile | 885 file1 = args.inputFile |
703 name1 = args.inputName1 | 886 name1 = args.inputName1 |
704 | |
705 index_size = args.sample_size | 887 index_size = args.sample_size |
706 title_savedFile_pdf = args.output_pdf | 888 title_savedFile_pdf = args.output_pdf |
707 title_savedFile_csv = args.output_tabular | 889 title_savedFile_csv = args.output_tabular |
708 output_chimeras_tabular = args.output_chimeras_tabular | 890 output_chimeras_tabular = args.output_chimeras_tabular |
709 | |
710 sep = "\t" | |
711 onlyDuplicates = args.only_DCS | 891 onlyDuplicates = args.only_DCS |
892 rel_freq = args.rel_freq | |
712 minFS = args.minFS | 893 minFS = args.minFS |
713 maxFS = args.maxFS | 894 maxFS = args.maxFS |
714 nr_above_bars = args.nr_above_bars | 895 nr_above_bars = args.nr_above_bars |
715 | |
716 subset = args.subset_tag | 896 subset = args.subset_tag |
717 nproc = args.nproc | 897 nproc = args.nproc |
898 sep = "\t" | |
718 | 899 |
719 # input checks | 900 # input checks |
720 if index_size < 0: | 901 if index_size < 0: |
721 print("index_size is a negative integer.") | 902 print("index_size is a negative integer.") |
722 exit(2) | 903 exit(2) |
723 | |
724 if nproc <= 0: | 904 if nproc <= 0: |
725 print("nproc is smaller or equal zero") | 905 print("nproc is smaller or equal zero") |
726 exit(3) | 906 exit(3) |
727 | |
728 if subset < 0: | 907 if subset < 0: |
729 print("subset_tag is a negative integer.") | 908 print("subset_tag is a negative integer.") |
730 exit(5) | 909 exit(5) |
731 | 910 |
732 # PLOT | 911 # PLOT |
733 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color | 912 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color |
734 plt.rcParams['xtick.labelsize'] = 14 | 913 plt.rcParams['xtick.labelsize'] = 14 |
735 plt.rcParams['ytick.labelsize'] = 14 | 914 plt.rcParams['ytick.labelsize'] = 14 |
736 plt.rcParams['patch.edgecolor'] = "#000000" | 915 plt.rcParams['patch.edgecolor'] = "#000000" |
737 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format | 916 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format |
738 | |
739 name1 = name1.split(".tabular")[0] | 917 name1 = name1.split(".tabular")[0] |
740 | 918 |
741 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: | 919 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: |
742 print("dataset: ", name1) | 920 print("dataset: ", name1) |
743 integers, data_array = readFileReferenceFree(file1) | 921 integers, data_array = readFileReferenceFree(file1) |
744 data_array = numpy.array(data_array) | 922 data_array = numpy.array(data_array) |
745 print("total nr of tags with Ns:", len(data_array)) | 923 print("total nr of tags:", len(data_array)) |
746 n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x] | 924 |
747 if len(n) != 0: # delete tags with N in the tag from data | 925 # filter tags out which contain any other character than ATCG |
748 print("nr of tags with N's within tag:", len(n), float(len(n)) / len(data_array)) | 926 valid_bases = ["A", "T", "G", "C"] |
927 tagsToDelete = [] | |
928 for idx, t in enumerate(data_array[:, 1]): | |
929 for char in t: | |
930 if char not in valid_bases: | |
931 tagsToDelete.append(idx) | |
932 break | |
933 | |
934 if len(tagsToDelete) != 0: # delete tags with non-ATCG characters from data | 
935 print("nr of tags with any other character than A, T, C, G:", len(tagsToDelete), | |
936 float(len(tagsToDelete)) / len(data_array)) | |
749 index_whole_array = numpy.arange(0, len(data_array), 1) | 937 index_whole_array = numpy.arange(0, len(data_array), 1) |
750 index_withoutN_inTag = numpy.delete(index_whole_array, n) | 938 index_withoutN_inTag = numpy.delete(index_whole_array, tagsToDelete) |
751 data_array = data_array[index_withoutN_inTag, :] | 939 data_array = data_array[index_withoutN_inTag, :] |
752 integers = integers[index_withoutN_inTag] | 940 integers = integers[index_withoutN_inTag] |
753 print("total nr of tags without Ns:", len(data_array)) | 941 print("total nr of filtered tags:", len(data_array)) |
754 | 942 |
755 int_f = numpy.array(data_array[:, 0]).astype(int) | 943 int_f = numpy.array(data_array[:, 0]).astype(int) |
756 data_array = data_array[numpy.where(int_f >= minFS)] | 944 data_array = data_array[numpy.where(int_f >= minFS)] |
757 integers = integers[integers >= minFS] | 945 integers = integers[integers >= minFS] |
758 | 946 |
766 tags = data_array[:, 2] | 954 tags = data_array[:, 2] |
767 seq = data_array[:, 1] | 955 seq = data_array[:, 1] |
768 | 956 |
769 # find all unique tags and get the indices for ALL tags, but only once | 957 # find all unique tags and get the indices for ALL tags, but only once |
770 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) | 958 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) |
771 d = u[c > 1] | 959 d = u[c == 2] |
772 | 960 |
773 # get family sizes, tag for duplicates | 961 # get family sizes, tag for duplicates |
774 duplTags_double = integers[numpy.in1d(seq, d)] | 962 duplTags_double = integers[numpy.in1d(seq, d)] |
775 duplTags = duplTags_double[0::2] # ab of DCS | 963 duplTags = duplTags_double[0::2] # ab of DCS |
776 duplTagsBA = duplTags_double[1::2] # ba of DCS | 964 duplTagsBA = duplTags_double[1::2] # ba of DCS |
777 | 965 |
778 duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab | 966 duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab |
779 duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags | 967 duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags |
780 | 968 |
781 if minFS > 1: | 969 if minFS > 1: |
782 duplTags_tag = duplTags_tag[(duplTags >= 3) & (duplTagsBA >= 3)] | 970 duplTags_tag = duplTags_tag[(duplTags >= minFS) & (duplTagsBA >= minFS)] |
783 duplTags_seq = duplTags_seq[(duplTags >= 3) & (duplTagsBA >= 3)] | 971 duplTags_seq = duplTags_seq[(duplTags >= minFS) & (duplTagsBA >= minFS)] |
784 duplTags = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3 | 972 duplTags = duplTags[(duplTags >= minFS) & (duplTagsBA >= minFS)] # ab+ba with FS>=minFS |
785 | 973 |
786 data_array = numpy.column_stack((duplTags, duplTags_seq)) | 974 data_array = numpy.column_stack((duplTags, duplTags_seq)) |
787 data_array = numpy.column_stack((data_array, duplTags_tag)) | 975 data_array = numpy.column_stack((data_array, duplTags_tag)) |
788 integers = numpy.array(data_array[:, 0]).astype(int) | 976 integers = numpy.array(data_array[:, 0]).astype(int) |
789 print("DCS in whole dataset", len(data_array)) | 977 print("DCS in whole dataset", len(data_array)) |
817 if index_size == 0: | 1005 if index_size == 0: |
818 result = numpy.arange(0, len(data_array), 1) | 1006 result = numpy.arange(0, len(data_array), 1) |
819 else: | 1007 else: |
820 numpy.random.shuffle(data_array) | 1008 numpy.random.shuffle(data_array) |
821 unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags | 1009 unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags |
822 result = numpy.random.choice(unique_indices, size=index_size, replace=False) # array of random sequences of size=index.size | 1010 result = numpy.random.choice(unique_indices, size=index_size, |
1011 replace=False) # array of random sequences of size=index.size | |
823 | 1012 |
824 # result = numpy.random.choice(len(integers), size=index_size, | 1013 # result = numpy.random.choice(len(integers), size=index_size, |
825 # replace=False) # array of random sequences of size=index.size | 1014 # replace=False) # array of random sequences of size=index.size |
826 # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0] | 1015 # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0] |
827 | 1016 |
828 # with open("index_result1_{}.pkl".format(app_f), "wb") as o: | 1017 # with open("index_result.pkl", "wb") as o: |
829 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) | 1018 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) |
1019 | |
1020 # save counts | |
1021 # with open(data_folder + "index_sampleTags1000_Barcode3_DCS.pkl", "wb") as f: | |
1022 # pickle.dump(result, f, pickle.HIGHEST_PROTOCOL) | |
1023 # with open(data_folder + "dataArray_sampleTags1000_Barcode3_DCS.pkl", "wb") as f1: | |
1024 # pickle.dump(data_array, f1, pickle.HIGHEST_PROTOCOL) | |
1025 # | |
1026 # with open(data_folder + "index_sampleTags100.pkl", "rb") as f: | |
1027 # result = pickle.load(f) | |
1028 # | |
1029 # with open(data_folder + "dataArray_sampleTags100.pkl", "rb") as f1: | |
1030 # data_array = pickle.load(f1) | |
1031 | |
1032 # with open(data_folder + "index_result.txt", "w") as t: | |
1033 # for text in result: | |
1034 # t.write("{}\n".format(text)) | |
830 | 1035 |
831 # comparison random tags to whole dataset | 1036 # comparison random tags to whole dataset |
832 result1 = data_array[result, 1] # random tags | 1037 result1 = data_array[result, 1] # random tags |
833 result2 = data_array[:, 1] # all tags | 1038 result2 = data_array[:, 1] # all tags |
834 print("sample size= ", len(result1)) | 1039 print("sample size= ", len(result1)) |
871 diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a]) | 1076 diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a]) |
872 diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b]) | 1077 diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b]) |
873 minHD_tags = numpy.concatenate([item[4] for item in diff_list_a]) | 1078 minHD_tags = numpy.concatenate([item[4] for item in diff_list_a]) |
874 minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a]) | 1079 minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a]) |
875 minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b]) | 1080 minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b]) |
876 chim_tags = [item[10] for item in diff_list_a] | 1081 |
877 chim_tags2 = [item[10] for item in diff_list_b] | 1082 chimera_tags1 = sum([item[10] for item in diff_list_a], []) |
878 chimera_tags1 = [ii if isinstance(i, list) else i for i in chim_tags for ii in i] | 1083 chimera_tags2 = numpy.concatenate([item[10] for item in diff_list_b]) |
879 chimera_tags2 = [ii if isinstance(i, list) else i for i in chim_tags2 for ii in i] | |
880 | 1084 |
881 rel_Diff = [] | 1085 rel_Diff = [] |
882 diff_zeros = [] | 1086 diff_zeros = [] |
883 minHD_tags_zeros = [] | 1087 minHD_tags_zeros = [] |
884 diff = [] | 1088 diff = [] |
885 chimera_tags = [] | 1089 chimera_tags = [] |
886 for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \ | 1090 for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \ |
887 zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2, chimera_tags1, chimera_tags2): | 1091 zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2, |
1092 chimera_tags1, chimera_tags2): | |
888 rel_Diff.append(max(rel1, rel2)) | 1093 rel_Diff.append(max(rel1, rel2)) |
889 diff.append(max(d1, d2)) | 1094 diff.append(max(d1, d2)) |
890 | 1095 |
891 if all(i is not None for i in [zeros1, zeros2]): | 1096 if all(i is not None for i in [zeros1, zeros2]): |
892 diff_zeros.append(max(zeros1, zeros2)) | 1097 diff_zeros.append(max(zeros1, zeros2)) |
901 diff_zeros.append(zeros2) | 1106 diff_zeros.append(zeros2) |
902 minHD_tags_zeros.append(str(tag2)) | 1107 minHD_tags_zeros.append(str(tag2)) |
903 chimera_tags.append(ctag2) | 1108 chimera_tags.append(ctag2) |
904 | 1109 |
905 chimera_tags_new = chimera_tags | 1110 chimera_tags_new = chimera_tags |
906 #data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new)) | 1111 data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new)) |
907 # chimeras_dic = defaultdict(list) | 1112 # chimeras_dic = defaultdict(list) |
908 # | 1113 # |
909 # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new): | 1114 # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new): |
910 # if len(t2) >1 and type(t2) is not numpy.ndarray: | 1115 # if len(t2) >1 and type(t2) is not numpy.ndarray: |
911 # t2 = numpy.concatenate(t2) | 1116 # t2 = numpy.concatenate(t2) |
912 # chimeras_dic[t1].append(t2) | 1117 # chimeras_dic[t1].append(t2) |
913 | 1118 |
1119 checked_tags = [] | |
1120 stat_maxTags = [] | |
1121 | |
914 with open(output_chimeras_tabular, "w") as output_file1: | 1122 with open(output_chimeras_tabular, "w") as output_file1: |
915 output_file1.write("chimera tag\tsimilar tag with HD=0\n") | 1123 output_file1.write("chimera tag\tfamily size, read direction\tsimilar tag with HD=0\n") |
916 for i in range(len(minHD_tags_zeros)): | 1124 for i in range(len(data_chimeraAnalysis)): |
917 tag1 = minHD_tags_zeros[i] | 1125 tag1 = data_chimeraAnalysis[i, 0] |
1126 | |
1127 info_tag1 = data_array[data_array[:, 1] == tag1, :] | |
1128 fs_tag1 = ["{} {}".format(t[0], t[2]) for t in info_tag1] | |
1129 | |
1130 if tag1 in checked_tags: # skip tag if already written to file | |
1131 continue | |
1132 | |
918 sample_half_a = tag1[0:(len(tag1)) / 2] | 1133 sample_half_a = tag1[0:(len(tag1)) / 2] |
919 sample_half_b = tag1[len(tag1) / 2:len(tag1)] | 1134 sample_half_b = tag1[len(tag1) / 2:len(tag1)] |
920 | 1135 |
921 max_tags = chimera_tags_new[i] | 1136 max_tags = data_chimeraAnalysis[i, 1] |
922 if isinstance(max_tags, list) and len(max_tags) > 1: | 1137 if len(max_tags) > 1 and type(max_tags) is not numpy.ndarray: |
923 max_tags = numpy.concatenate(max_tags) | 1138 max_tags = numpy.concatenate(max_tags) |
924 #if isinstance(max_tags, list): #and type(max_tags) is not numpy.ndarray: | |
925 # print(max_tags) | |
926 # max_tags = numpy.concatenate(max_tags) | |
927 max_tags = numpy.unique(max_tags) | 1139 max_tags = numpy.unique(max_tags) |
928 | 1140 stat_maxTags.append(len(max_tags)) |
929 chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags]) # mate1 part1 | 1141 |
930 chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags]) # mate1 part 2 | 1142 info_maxTags = [data_array[data_array[:, 1] == t, :] for t in max_tags] |
1143 | |
1144 chimera_half_a = numpy.array([t[0:(len(t)) / 2] for t in max_tags]) # mate1 part1 | |
1145 chimera_half_b = numpy.array([t[len(t) / 2:len(t)] for t in max_tags]) # mate1 part 2 | |
931 | 1146 |
932 new_format = [] | 1147 new_format = [] |
933 for j in range(len(max_tags)): | 1148 for j in range(len(max_tags)): |
1149 fs_maxTags = ["{} {}".format(t[0], t[2]) for t in info_maxTags[j]] | |
1150 | |
934 if sample_half_a == chimera_half_a[j]: | 1151 if sample_half_a == chimera_half_a[j]: |
935 max_tag = "*{}* {}".format(chimera_half_a[j], chimera_half_b[j]) | 1152 max_tag = "*{}* {} {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags)) |
936 new_format.append(max_tag) | 1153 new_format.append(max_tag) |
937 | 1154 |
938 elif sample_half_b == chimera_half_b[j]: | 1155 elif sample_half_b == chimera_half_b[j]: |
939 max_tag = "{} *{}*".format(chimera_half_a[j], chimera_half_b[j]) | 1156 max_tag = "{} *{}* {}".format(chimera_half_a[j], chimera_half_b[j], ", ".join(fs_maxTags)) |
940 new_format.append(max_tag) | 1157 new_format.append(max_tag) |
941 | 1158 checked_tags.append(max_tags[j]) |
942 sample_tag = "{} {}".format(sample_half_a, sample_half_b) | 1159 |
1160 sample_tag = "{} {}\t{}".format(sample_half_a, sample_half_b, ", ".join(fs_tag1)) | |
943 output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format))) | 1161 output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format))) |
1162 | |
1163 checked_tags.append(tag1) | |
1164 | |
944 output_file1.write( | 1165 output_file1.write( |
945 "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n " | 1166 "This file contains all tags that were identified as chimeras as the first column and the " |
1167 "corresponding tags which returned a Hamming distance of zero in either the first or the second " | |
1168 "half of the sample tag as the second column.\n" | |
946 "The tags were separated by an empty space into their halves and the * marks the identical half.") | 1169 "The tags were separated by an empty space into their halves and the * marks the identical half.") |
947 | 1170 output_file1.write("\n\nStatistics of nr. of tags that returned max. HD (2nd column)\n") |
948 # unique_chimeras = numpy.array(minHD_tags_zeros) | 1171 output_file1.write("minimum\t{}\ttag(s)\n".format(numpy.amin(numpy.array(stat_maxTags)))) |
949 # | 1172 output_file1.write("mean\t{}\ttag(s)\n".format(numpy.mean(numpy.array(stat_maxTags)))) |
950 # sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras]) # mate1 part1 | 1173 output_file1.write("median\t{}\ttag(s)\n".format(numpy.median(numpy.array(stat_maxTags)))) |
951 # sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras]) # mate1 part 2 | 1174 output_file1.write("maximum\t{}\ttag(s)\n".format(numpy.amax(numpy.array(stat_maxTags)))) |
952 # | 1175 output_file1.write("sum\t{}\ttag(s)\n".format(numpy.sum(numpy.array(stat_maxTags)))) |
953 # output_file1.write("sample tag\tsimilar tag\n") | |
954 # for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b): | |
955 # max_tags = numpy.concatenate(chimeras_dic.get(tag1)) | |
956 # max_tags = numpy.unique(max_tags) | |
957 # | |
958 # chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags]) # mate1 part1 | |
959 # chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags]) # mate1 part 2 | |
960 # | |
961 # new_format = [] | |
962 # for i in range(len(max_tags)): | |
963 # if a == chimera_half_a[i]: | |
964 # max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i]) | |
965 # new_format.append(max_tag) | |
966 # | |
967 # elif b == chimera_half_b[i]: | |
968 # max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i]) | |
969 # new_format.append(max_tag) | |
970 # | |
971 # sample_tag = "{} {}".format(a, b) | |
972 # output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format))) | |
973 # output_file1.write( | |
974 # "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n " | |
975 # "The tags were separated by an empty space into their halves and the * marks the identical half.") | |
976 | |
977 nr_chimeric_tags = len(minHD_tags_zeros) | |
978 print("nr of unique chimeras", nr_chimeric_tags) | |
979 | 1176 |
980 lenTags = len(data_array) | 1177 lenTags = len(data_array) |
981 len_sample = len(result1) | 1178 len_sample = len(result1) |
982 | 1179 |
983 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags | 1180 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags |
989 seq = numpy.tile(seq, 2) | 1186 seq = numpy.tile(seq, 2) |
990 ham = numpy.tile(ham, 2) | 1187 ham = numpy.tile(ham, 2) |
991 diff = numpy.tile(diff, 2) | 1188 diff = numpy.tile(diff, 2) |
992 rel_Diff = numpy.tile(rel_Diff, 2) | 1189 rel_Diff = numpy.tile(rel_Diff, 2) |
993 diff_zeros = numpy.tile(diff_zeros, 2) | 1190 diff_zeros = numpy.tile(diff_zeros, 2) |
1191 | |
1192 nr_chimeric_tags = len(data_chimeraAnalysis) | |
1193 print("nr of chimeras", nr_chimeric_tags) | |
994 | 1194 |
995 # prepare data for different kinds of plots | 1195 # prepare data for different kinds of plots |
996 # distribution of FSs separated after HD | 1196 # distribution of FSs separated after HD |
997 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) | 1197 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) |
998 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS | 1198 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS |
1009 lst_minHD_tags = [] | 1209 lst_minHD_tags = [] |
1010 for i in minHD_tags: | 1210 for i in minHD_tags: |
1011 lst_minHD_tags.append(seqDic.get(i)) | 1211 lst_minHD_tags.append(seqDic.get(i)) |
1012 | 1212 |
1013 if onlyDuplicates: | 1213 if onlyDuplicates: |
1014 lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], [item_b[1] for item_b in lst_minHD_tags])).astype(int) | 1214 lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], |
1015 | 1215 [item_b[1] for item_b in lst_minHD_tags])).astype(int) |
1016 # histogram with absolute and relative difference between HDs of both parts of the tag | 1216 # histogram with absolute and relative difference between HDs of both parts of the tag |
1017 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) | 1217 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) |
1018 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, | 1218 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, |
1019 rel_Diff) | 1219 rel_Diff) |
1020 # chimeric read analysis: tags which have HD=0 in one of the halfs | 1220 # chimeric read analysis: tags which have HD=0 in one of the halfs |
1021 if len(minHD_tags_zeros) != 0: | 1221 if len(minHD_tags_zeros) != 0: |
1022 lst_minHD_tags_zeros = [] | 1222 lst_minHD_tags_zeros = [] |
1023 for i in minHD_tags_zeros: | 1223 for i in minHD_tags_zeros: |
1024 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads | 1224 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads |
1025 if onlyDuplicates: | 1225 if onlyDuplicates: |
1026 lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int) | 1226 lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], |
1227 [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int) | |
1027 | 1228 |
1028 # histogram with HD of non-identical half | 1229 # histogram with HD of non-identical half |
1029 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros) | 1230 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( |
1231 lst_minHD_tags_zeros, diff_zeros) | |
1232 | |
1233 if onlyDuplicates is False: | |
1234 listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros = hammingDistanceWithDCS(minHD_tags_zeros, | |
1235 diff_zeros, data_array) | |
1030 | 1236 |
1031 # plot Hamming Distance with Family size distribution | 1237 # plot Hamming Distance with Family size distribution |
1032 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, | 1238 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, rel_freq=rel_freq, |
1033 subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags, | 1239 subtitle="Hamming distance separated by family size", lenTags=lenTags, |
1034 xlabel="HD", nr_above_bars=nr_above_bars, len_sample=len_sample) | 1240 xlabel="HD", nr_above_bars=nr_above_bars, len_sample=len_sample) |
1035 | 1241 |
1036 # Plot FSD with separation after | 1242 # Plot FSD with separation after |
1037 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, | 1243 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, rel_freq=rel_freq, |
1038 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", | 1244 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", |
1039 pdf=pdf, relative=False, title_file1=name1, diff=False) | 1245 pdf=pdf, relative=False, diff=False) |
1040 | 1246 |
1041 # Plot HD within tags | 1247 # Plot HD within tags |
1042 plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, | 1248 plotHDwithinSeq(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, |
1043 title_file1=name1, len_sample=len_sample) | 1249 rel_freq=rel_freq, len_sample=len_sample) |
1044 | 1250 |
1045 # Plot difference between HD's separated after FSD | 1251 # Plot difference between HD's separated after FSD |
1046 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, | 1252 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, |
1047 subtitle="Delta Hamming distance within tags", | 1253 subtitle="Delta Hamming distance within tags", lenTags=lenTags, rel_freq=rel_freq, |
1048 title_file1=name1, lenTags=lenTags, | |
1049 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample) | 1254 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample) |
1050 | 1255 |
1051 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, | 1256 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, |
1052 subtitle="Chimera Analysis: relative delta Hamming distances", | 1257 subtitle="Chimera Analysis: relative delta Hamming distance", lenTags=lenTags, rel_freq=rel_freq, |
1053 title_file1=name1, lenTags=lenTags, | 1258 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, |
1054 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) | 1259 nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) |
1055 | 1260 |
1056 # plots for chimeric reads | 1261 # plots for chimeric reads |
1057 if len(minHD_tags_zeros) != 0: | 1262 if len(minHD_tags_zeros) != 0: |
1058 # HD | 1263 # HD |
1059 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, subtitle="Hamming distance of chimeras", | 1264 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, |
1060 title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False, | 1265 subtitle="Hamming distance of chimeric families (CF)", rel_freq=rel_freq, |
1266 lenTags=lenTags, xlabel="HD", relative=False, | |
1061 nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) | 1267 nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) |
1268 | |
1269 if onlyDuplicates is False: | |
1270 plotHDwithDCS(listDCS_zeros, maximumXDCS_zeros, minimumXDCS_zeros, pdf=pdf, | |
1271 subtitle="Hamming distance of chimeric families (CF)", rel_freq=rel_freq, | |
1272 lenTags=lenTags, xlabel="HD", relative=False, | |
1273 nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) | |
1062 | 1274 |
1063 # print all data to a CSV file | 1275 # print all data to a CSV file |
1064 # HD | 1276 # HD |
1065 summary, sumCol = createTableHD(list1, "HD=") | 1277 summary, sumCol = createTableHD(list1, "HD=") |
1066 overallSum = sum(sumCol) # sum of columns in table | 1278 overallSum = sum(sumCol) # sum of columns in table |
1085 if len(minHD_tags_zeros) != 0: | 1297 if len(minHD_tags_zeros) != 0: |
1086 # absolute difference and tags where at least one half has HD=0 | 1298 # absolute difference and tags where at least one half has HD=0 |
1087 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=") | 1299 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=") |
1088 overallSum15 = sum(sumCol15) | 1300 overallSum15 = sum(sumCol15) |
1089 | 1301 |
1302 if onlyDuplicates is False: | |
1303 summary16, sumCol16 = createTableHDwithDCS(listDCS_zeros) | |
1304 overallSum16 = sum(sumCol16) | |
1305 | |
1090 output_file.write("{}\n".format(name1)) | 1306 output_file.write("{}\n".format(name1)) |
1091 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( | 1307 output_file.write("nr of tags{}{:,}\nsample size{}{:,}\n\n".format(sep, lenTags, sep, len_sample)) |
1092 numpy.concatenate(list1)), lenTags, lenTags)) | |
1093 | 1308 |
1094 # HD | 1309 # HD |
1095 createFileHD(summary, sumCol, overallSum, output_file, | 1310 createFileHD(summary, sumCol, overallSum, output_file, |
1096 "Hamming distance separated by family size", sep) | 1311 "Hamming distance separated by family size", sep) |
1097 # FSD | 1312 # FSD |
1107 output_file.write( | 1322 output_file.write( |
1108 "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs))) | 1323 "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs))) |
1109 | 1324 |
1110 # HD within tags | 1325 # HD within tags |
1111 output_file.write( | 1326 output_file.write( |
1112 "The Hamming distances were calculated by comparing the first halve against all halves and selected the minimum value (HD a).\n" | 1327 "The Hamming distances were calculated by comparing the first halve against all halves and selected the " |
1113 "For the second half of the tag, we compared them against all tags which resulted in the minimum HD of the previous step and selected the maximum value (HD b').\n" | 1328 "minimum value (HD a).\nFor the second half of the tag, we compared them against all tags which resulted " |
1114 "Finally, it was possible to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).\n" | 1329 "in the minimum HD of the previous step and selected the maximum value (HD b').\nFinally, it was possible " |
1115 "These calculations were repeated, but starting with the second half in the first step to find all possible chimeras in the data (HD b and HD For simplicity we used the maximum value between the delta values in the end.\n" | 1330 "to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).\n" |
1116 "When only tags that can form DCS were allowed in the analysis, family sizes for the forward and reverse (ab and ba) will be included in the plots.\n") | 1331 "These calculations were repeated, but starting with the second half in the first step to find all " |
1117 | 1332 "possible chimeras in the data (HD b and HD For simplicity we used the maximum value between the delta " |
1118 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) | 1333 "values in the end.\nWhen only tags that can form DCS were allowed in the analysis, family sizes for the " |
1334 "forward and reverse (ab and ba) will be included in the plots.\n") | |
1335 | |
1336 output_file.write("\nlength of one half of the tag{}{}\n\n".format(sep, len(data_array[0, 1]) / 2)) | |
1119 | 1337 |
1120 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, | 1338 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, |
1121 "Hamming distance of each half in the tag", sep) | 1339 "Hamming distance of each half in the tag", sep) |
1122 createFileHD(summary11, sumCol11, overallSum11, output_file, | 1340 createFileHD(summary11, sumCol11, overallSum11, output_file, |
1123 "Absolute delta Hamming distances within the tag", sep) | 1341 "Absolute delta Hamming distance within the tag", sep) |
1124 | 1342 |
1125 createFileHD(summary13, sumCol13, overallSum13, output_file, | 1343 createFileHD(summary13, sumCol13, overallSum13, output_file, |
1126 "Chimera analysis: relative delta Hamming distances", sep) | 1344 "Chimera analysis: relative delta Hamming distance", sep) |
1127 | 1345 |
1128 if len(minHD_tags_zeros) != 0: | 1346 if len(minHD_tags_zeros) != 0: |
1129 output_file.write( | 1347 output_file.write( |
1130 "Chimeras:\nAll tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, had a relative delta of 1 were kept. These tags are considered as chimeric.\nSo the Hamming distances of the chimeric tags are shown.\n") | 1348 "All tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, " |
1349 "had a relative delta of 1 were kept. These tags are considered as chimeric.\nSo the Hamming distances " | |
1350 "of the chimeric tags are shown.\n") | |
1131 createFileHD(summary15, sumCol15, overallSum15, output_file, | 1351 createFileHD(summary15, sumCol15, overallSum15, output_file, |
1132 "Hamming distances of chimeras", sep) | 1352 "Hamming distance of chimeric families separated after FS", sep) |
1353 | |
1354 if onlyDuplicates is False: | |
1355 createFileHDwithDCS(summary16, sumCol16, overallSum16, output_file, | |
1356 "Hamming distance of chimeric families separated after DCS and single SSCS", sep) | |
1133 | 1357 |
1134 output_file.write("\n") | 1358 output_file.write("\n") |
1135 | 1359 |
1136 | 1360 |
1137 if __name__ == '__main__': | 1361 if __name__ == '__main__': |