comparison hd.py @ 19:2e9f7ea7ae93 draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit dfaab79252a858e8df16bbea3607ebf1b6962e5a-dirty
author | mheinzl |
---|---|
date | Mon, 08 Oct 2018 05:56:04 -0400 |
parents | a8581bf627fd |
children | b084b6a8e3ac |
18:a8581bf627fd | 19:2e9f7ea7ae93 |
---|---|
11 # In addition, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads | 11 # In addition, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads |
12 # and finally a CSV file with the data of the plots. | 12 # and finally a CSV file with the data of the plots. |
13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. | 13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. |
14 # The tool can run on a certain number of processors, which can be defined by the user. | 14 # The tool can run on a certain number of processors, which can be defined by the user. |
15 | 15 |
16 # USAGE: python HDnew6_1Plot_FINAL.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" / | 16 # USAGE: python hd.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" / |
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_csv outputfile_name_csv --output_pdf outputfile_name_pdf | 17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf |
18 | 18 |
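For orientation while reading this diff: a concrete call matching the usage line above could look as follows. The file names are placeholders, and the options are taken from the argparse definitions in the new revision of the script (sample_size 1000 and nproc 4 are the defaults).

python hd.py --inputFile tags_ab.tabular --inputName1 tags_ab --sample_size 1000 --nproc 4 --output_tabular hd_summary.tabular --output_pdf hd_plots.pdf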
19 import numpy | 19 import argparse |
20 import itertools | 20 import itertools |
21 import operator | 21 import operator |
22 import sys | |
23 from collections import Counter | |
24 from functools import partial | |
25 from multiprocessing.pool import Pool | |
26 | |
22 import matplotlib.pyplot as plt | 27 import matplotlib.pyplot as plt |
23 import os.path | 28 import numpy |
24 import cPickle as pickle | |
25 from multiprocessing.pool import Pool | |
26 from functools import partial | |
27 import argparse | |
28 import sys | |
29 import os | |
30 from matplotlib.backends.backend_pdf import PdfPages | 29 from matplotlib.backends.backend_pdf import PdfPages |
31 from collections import Counter | 30 |
32 | 31 plt.switch_backend('agg') |
33 def plotFSDwithHD2(familySizeList1,maximumXFS,minimumXFS, originalCounts, | 32 |
34 title_file1, subtitle, pdf, relative=False, diff = True): | 33 |
34 def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts, | |
35 title_file1, subtitle, pdf, relative=False, diff=True): | |
35 if diff is False: | 36 if diff is False: |
36 colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"] | 37 colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"] |
37 labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8","HD>8"] | 38 labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8", "HD>8"] |
38 else: | 39 else: |
39 colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"] | 40 colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"] |
40 if relative is True: | 41 if relative is True: |
41 labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"] | 42 labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"] |
42 else: | 43 else: |
43 labels = ["d=0","d=1", "d=2", "d=3", "d=4", "d=5-8","d>8"] | 44 labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"] |
44 | 45 |
45 fig = plt.figure(figsize=(6, 7)) | 46 fig = plt.figure(figsize=(6, 7)) |
46 ax = fig.add_subplot(111) | 47 ax = fig.add_subplot(111) |
47 plt.subplots_adjust(bottom=0.1) | 48 plt.subplots_adjust(bottom=0.1) |
48 p1 = numpy.bincount(numpy.concatenate((familySizeList1))) | 49 p1 = numpy.bincount(numpy.concatenate((familySizeList1))) |
52 range1 = range(minimumXFS - 1, minimumXFS + 2) | 53 range1 = range(minimumXFS - 1, minimumXFS + 2) |
53 else: | 54 else: |
54 range1 = range(0, maximumXFS + 2) | 55 range1 = range(0, maximumXFS + 2) |
55 counts = plt.hist(familySizeList1, label=labels, | 56 counts = plt.hist(familySizeList1, label=labels, |
56 color=colors, stacked=True, | 57 color=colors, stacked=True, |
57 rwidth=0.8,alpha=1, align="left", | 58 rwidth=0.8, alpha=1, align="left", |
58 edgecolor="None",bins=range1) | 59 edgecolor="None", bins=range1) |
59 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) | 60 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) |
60 | 61 |
61 #plt.title(title_file1, fontsize=12) | 62 # plt.title(title_file1, fontsize=12) |
62 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) | 63 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) |
63 plt.xlabel("Family size", fontsize=14) | 64 plt.xlabel("Family size", fontsize=14) |
64 plt.ylabel("Absolute Frequency", fontsize=14) | 65 plt.ylabel("Absolute Frequency", fontsize=14) |
65 | 66 |
66 ticks = numpy.arange(0, maximumXFS + 1, 1) | 67 ticks = numpy.arange(0, maximumXFS + 1, 1) |
77 plt.ylim((0, maximumY * 1.2)) | 78 plt.ylim((0, maximumY * 1.2)) |
78 legend = "\nmax. family size: \nabsolute frequency: \nrelative frequency: " | 79 legend = "\nmax. family size: \nabsolute frequency: \nrelative frequency: " |
79 plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure) | 80 plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure) |
80 | 81 |
81 count = numpy.bincount(originalCounts) # original counts | 82 count = numpy.bincount(originalCounts) # original counts |
82 legend1 = "{}\n{}\n{:.5f}" \ | 83 legend1 = "{}\n{}\n{:.5f}".format(max(originalCounts), count[len(count) - 1], float(count[len(count) - 1]) / sum(count)) |
83 .format(max(originalCounts), count[len(count) - 1], float(count[len(count) - 1]) / sum(count)) | |
84 plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure) | 84 plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure) |
85 legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]), | 85 legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]), float(counts[0][len(counts[0]) - 1][1]) / sum(counts[0][len(counts[0]) - 1])) |
86 float(counts[0][len(counts[0]) - 1][1]) / sum( | |
87 counts[0][len(counts[0]) - 1])) | |
88 plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12) | 86 plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12) |
89 plt.grid(b=True, which='major', color='#424242', linestyle=':') | 87 plt.grid(b=True, which='major', color='#424242', linestyle=':') |
90 | 88 |
91 pdf.savefig(fig, bbox_inches="tight") | 89 pdf.savefig(fig, bbox_inches="tight") |
92 plt.close("all") | 90 plt.close("all") |
93 | 91 |
94 def plotHDwithFSD(list1,maximumX,minimumX, subtitle, lenTags, title_file1,pdf, | 92 |
95 xlabel,relative=False, nr_above_bars = True): | 93 def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, title_file1, pdf, xlabel, relative=False, nr_above_bars=True): |
96 if relative is True: | 94 if relative is True: |
97 step = 0.1 | 95 step = 0.1 |
98 else: | 96 else: |
99 step = 1 | 97 step = 1 |
100 | 98 |
118 range=(0, maximumX + 1)) | 116 range=(0, maximumX + 1)) |
119 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) | 117 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) |
120 bins = counts[1] # width of bins | 118 bins = counts[1] # width of bins |
121 counts = numpy.array(map(int, counts[0][5])) | 119 counts = numpy.array(map(int, counts[0][5])) |
122 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) | 120 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) |
123 # plt.title(title_file1, fontsize=12) | 121 # plt.title(title_file1, fontsize=12) |
124 plt.xlabel(xlabel, fontsize=14) | 122 plt.xlabel(xlabel, fontsize=14) |
125 plt.ylabel("Absolute Frequency", fontsize=14) | 123 plt.ylabel("Absolute Frequency", fontsize=14) |
126 | 124 |
127 plt.grid(b=True, which='major', color='#424242', linestyle=':') | 125 plt.grid(b=True, which='major', color='#424242', linestyle=':') |
128 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) | 126 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) |
136 if x_label == 0: | 134 if x_label == 0: |
137 continue | 135 continue |
138 else: | 136 else: |
139 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1), | 137 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1), |
140 xy=(label, x_label + len(con_list1) * 0.01), | 138 xy=(label, x_label + len(con_list1) * 0.01), |
141 xycoords="data", color="#000066",fontsize=10) | 139 xycoords="data", color="#000066", fontsize=10) |
142 | 140 |
143 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags) | 141 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags) |
144 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) | 142 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) |
145 | 143 |
146 pdf.savefig(fig, bbox_inches="tight") | 144 pdf.savefig(fig, bbox_inches="tight") |
147 plt.close("all") | 145 plt.close("all") |
148 plt.clf() | 146 plt.clf() |
149 | 147 |
150 def plotHDwithinSeq_Sum2(sum1, sum2,sum1min, sum2min, min_value, lenTags, title_file1, pdf): | 148 |
149 def plotHDwithinSeq_Sum2(sum1, sum1min, sum2, sum2min, min_value, lenTags, title_file1, pdf): | |
151 fig = plt.figure(figsize=(6, 8)) | 150 fig = plt.figure(figsize=(6, 8)) |
152 plt.subplots_adjust(bottom=0.1) | 151 plt.subplots_adjust(bottom=0.1) |
153 | 152 |
154 #ham = [sum1, sum2,numpy.array(min_value)] # new hd within tags | 153 ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags |
155 ham = [sum1, sum2, sum1min, sum2min, numpy.array(min_value)] # new hd within tags | 154 |
156 | 155 maximumX = numpy.amax(numpy.concatenate(ham_partial)) |
157 | 156 minimumX = numpy.amin(numpy.concatenate(ham_partial)) |
158 maximumX = numpy.amax(numpy.concatenate(ham)) | 157 maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham_partial)))) |
159 minimumX = numpy.amin(numpy.concatenate(ham)) | |
160 maximumY = numpy.amax(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham))) | |
161 | 158 |
162 if len(range(minimumX, maximumX)) == 0: | 159 if len(range(minimumX, maximumX)) == 0: |
163 range1 = minimumX | 160 range1 = minimumX |
164 else: | 161 else: |
165 range1 = range(minimumX, maximumX + 2) | 162 range1 = range(minimumX, maximumX + 2) |
166 | 163 |
167 counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False, | 164 plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, label=[ "HD a", "HD b'", "HD b", "HD a'", "HD a+b"], bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], edgecolor='black', linewidth=1) |
168 # label=[ "HD a", "HD b","HD a+b"], | 165 |
169 label=[ "HD a","HD b'", "HD b", "HD a'", "HD a+b"], | |
170 #bins=range1, color=[ "#58ACFA", "#FA5858","#585858"], | |
171 color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], | |
172 edgecolor='black', linewidth=1) | |
173 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) | 166 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) |
174 plt.suptitle('Hamming distances within tags', fontsize=14) | 167 plt.suptitle('Hamming distances within tags', fontsize=14) |
175 #plt.title(title_file1, fontsize=12) | 168 # plt.title(title_file1, fontsize=12) |
176 plt.xlabel("HD", fontsize=14) | 169 plt.xlabel("HD", fontsize=14) |
177 plt.ylabel("Absolute Frequency", fontsize=14) | 170 plt.ylabel("Absolute Frequency", fontsize=14) |
178 plt.grid(b=True, which='major', color='#424242', linestyle=':') | 171 plt.grid(b=True, which='major', color='#424242', linestyle=':') |
179 | 172 |
180 | 173 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2)) |
181 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.1)) | |
182 plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) | 174 plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) |
183 plt.ylim((0, maximumY * 1.2)) | 175 # plt.ylim(0, maximumY * 1.2) |
184 | 176 |
185 legend = "sample size= {:,} against {:,}".format(len(ham[0]), lenTags, lenTags) | 177 legend = "sample size= {:,} against {:,}".format(sum(ham_partial[4]), lenTags) |
186 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) | 178 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) |
187 pdf.savefig(fig, bbox_inches="tight") | 179 pdf.savefig(fig, bbox_inches="tight") |
188 plt.close("all") | 180 plt.close("all") |
189 plt.clf() | 181 plt.clf() |
182 | |
190 | 183 |
191 def createTableFSD2(list1, diff=True): | 184 def createTableFSD2(list1, diff=True): |
192 selfAB = numpy.concatenate(list1) | 185 selfAB = numpy.concatenate(list1) |
193 uniqueFS = numpy.unique(selfAB) | 186 uniqueFS = numpy.unique(selfAB) |
194 nr = numpy.arange(0, len(uniqueFS), 1) | 187 nr = numpy.arange(0, len(uniqueFS), 1) |
206 if len(table) == 0: | 199 if len(table) == 0: |
207 state = state + 1 | 200 state = state + 1 |
208 continue | 201 continue |
209 else: | 202 else: |
210 if state == 1: | 203 if state == 1: |
211 for i, l in zip(uniqueFS, nr): | 204 for i, l in zip(uniqueFS, nr): |
212 for j in table: | 205 for j in table: |
213 if j[0] == uniqueFS[l]: | 206 if j[0] == uniqueFS[l]: |
214 count[l, 0] = j[1] | 207 count[l, 0] = j[1] |
215 if state == 2: | 208 if state == 2: |
216 for i, l in zip(uniqueFS, nr): | 209 for i, l in zip(uniqueFS, nr): |
259 first = ["FS={}".format(i) for i in uniqueFS] | 252 first = ["FS={}".format(i) for i in uniqueFS] |
260 final = numpy.column_stack((first, count, sumRow)) | 253 final = numpy.column_stack((first, count, sumRow)) |
261 | 254 |
262 return (final, sumCol) | 255 return (final, sumCol) |
263 | 256 |
264 def createFileFSD2(summary, sumCol, overallSum, output_file, name,sep, rel=False, diff=True): | 257 |
258 def createFileFSD2(summary, sumCol, overallSum, output_file, name, sep, rel=False, diff=True): | |
265 output_file.write(name) | 259 output_file.write(name) |
266 output_file.write("\n") | 260 output_file.write("\n") |
267 if diff is False: | 261 if diff is False: |
268 output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep)) | 262 output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep)) |
269 else: | 263 else: |
270 if rel is False: | 264 if rel is False: |
271 output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep,sep)) | 265 output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) |
272 else: | 266 else: |
273 output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep,sep)) | 267 output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) |
274 | 268 |
275 for item in summary: | 269 for item in summary: |
276 for nr in item: | 270 for nr in item: |
277 if "FS" not in nr and "diff" not in nr: | 271 if "FS" not in nr and "diff" not in nr: |
278 nr = nr.astype(float) | 272 nr = nr.astype(float) |
279 nr = nr.astype(int) | 273 nr = nr.astype(int) |
280 output_file.write("{}{}".format(nr,sep)) | 274 output_file.write("{}{}".format(nr, sep)) |
281 output_file.write("\n") | 275 output_file.write("\n") |
282 output_file.write("sum{}".format(sep)) | 276 output_file.write("sum{}".format(sep)) |
283 sumCol = map(int, sumCol) | 277 sumCol = map(int, sumCol) |
284 for el in sumCol: | 278 for el in sumCol: |
285 output_file.write("{}{}".format(el,sep)) | 279 output_file.write("{}{}".format(el, sep)) |
286 output_file.write("{}{}".format(overallSum.astype(int),sep)) | 280 output_file.write("{}{}".format(overallSum.astype(int), sep)) |
287 output_file.write("\n\n") | 281 output_file.write("\n\n") |
282 | |
288 | 283 |
289 def createTableHD(list1, row_label): | 284 def createTableHD(list1, row_label): |
290 selfAB = numpy.concatenate(list1) | 285 selfAB = numpy.concatenate(list1) |
291 uniqueHD = numpy.unique(selfAB) | 286 uniqueHD = numpy.unique(selfAB) |
292 nr = numpy.arange(0, len(uniqueHD), 1) | 287 nr = numpy.arange(0, len(uniqueHD), 1) |
300 if len(table) == 0: | 295 if len(table) == 0: |
301 state = state + 1 | 296 state = state + 1 |
302 continue | 297 continue |
303 else: | 298 else: |
304 if state == 1: | 299 if state == 1: |
305 for i, l in zip(uniqueHD, nr): | 300 for i, l in zip(uniqueHD, nr): |
306 for j in table: | 301 for j in table: |
307 if j[0] == uniqueHD[l]: | 302 if j[0] == uniqueHD[l]: |
308 count[l, 0] = j[1] | 303 count[l, 0] = j[1] |
309 if state == 2: | 304 if state == 2: |
310 for i, l in zip(uniqueHD, nr): | 305 for i, l in zip(uniqueHD, nr): |
337 count[l, 5] = j[1] | 332 count[l, 5] = j[1] |
338 state = state + 1 | 333 state = state + 1 |
339 | 334 |
340 sumRow = count.sum(axis=1) | 335 sumRow = count.sum(axis=1) |
341 sumCol = count.sum(axis=0) | 336 sumCol = count.sum(axis=0) |
342 first = ["{}{}".format(row_label,i) for i in uniqueHD] | 337 first = ["{}{}".format(row_label, i) for i in uniqueHD] |
343 final = numpy.column_stack((first, count, sumRow)) | 338 final = numpy.column_stack((first, count, sumRow)) |
344 | 339 |
345 return (final, sumCol) | 340 return (final, sumCol) |
341 | |
346 | 342 |
347 def createTableHDwithTags(list1): | 343 def createTableHDwithTags(list1): |
348 selfAB = numpy.concatenate(list1) | 344 selfAB = numpy.concatenate(list1) |
349 uniqueHD = numpy.unique(selfAB) | 345 uniqueHD = numpy.unique(selfAB) |
350 nr = numpy.arange(0, len(uniqueHD), 1) | 346 nr = numpy.arange(0, len(uniqueHD), 1) |
351 count = numpy.zeros((len(uniqueHD), 3)) | 347 count = numpy.zeros((len(uniqueHD), 5)) |
352 | 348 |
353 state = 1 | 349 state = 1 |
354 for i in list1: | 350 for i in list1: |
355 counts = list(Counter(i).items()) | 351 counts = list(Counter(i).items()) |
356 hd = [item[0] for item in counts] | 352 hd = [item[0] for item in counts] |
359 if len(table) == 0: | 355 if len(table) == 0: |
360 state = state + 1 | 356 state = state + 1 |
361 continue | 357 continue |
362 else: | 358 else: |
363 if state == 1: | 359 if state == 1: |
364 for i, l in zip(uniqueHD, nr): | 360 for i, l in zip(uniqueHD, nr): |
365 for j in table: | 361 for j in table: |
366 if j[0] == uniqueHD[l]: | 362 if j[0] == uniqueHD[l]: |
367 count[l, 0] = j[1] | 363 count[l, 0] = j[1] |
368 if state == 2: | 364 if state == 2: |
369 for i, l in zip(uniqueHD, nr): | 365 for i, l in zip(uniqueHD, nr): |
370 for j in table: | 366 for j in table: |
371 if j[0] == uniqueHD[l]: | 367 if j[0] == uniqueHD[l]: |
372 count[l, 1] = j[1] | 368 count[l, 1] = j[1] |
373 | |
374 if state == 3: | 369 if state == 3: |
375 for i, l in zip(uniqueHD, nr): | 370 for i, l in zip(uniqueHD, nr): |
376 for j in table: | 371 for j in table: |
377 if j[0] == uniqueHD[l]: | 372 if j[0] == uniqueHD[l]: |
378 count[l, 2] = j[1] | 373 count[l, 2] = j[1] |
374 if state == 4: | |
375 for i, l in zip(uniqueHD, nr): | |
376 for j in table: | |
377 if j[0] == uniqueHD[l]: | |
378 count[l, 3] = j[1] | |
379 if state == 5: | |
380 for i, l in zip(uniqueHD, nr): | |
381 for j in table: | |
382 if j[0] == uniqueHD[l]: | |
383 count[l, 4] = j[1] | |
384 | |
379 state = state + 1 | 385 state = state + 1 |
380 | 386 |
381 sumRow = count.sum(axis=1) | 387 sumRow = count.sum(axis=1) |
382 sumCol = count.sum(axis=0) | 388 sumCol = count.sum(axis=0) |
383 first = ["HD={}".format(i) for i in uniqueHD] | 389 first = ["HD={}".format(i) for i in uniqueHD] |
384 final = numpy.column_stack((first, count, sumRow)) | 390 final = numpy.column_stack((first, count, sumRow)) |
385 | 391 |
386 return (final, sumCol) | 392 return (final, sumCol) |
387 | 393 |
388 def createFileHD(summary, sumCol, overallSum, output_file, name,sep): | 394 |
395 def createFileHD(summary, sumCol, overallSum, output_file, name, sep): | |
389 output_file.write(name) | 396 output_file.write(name) |
390 output_file.write("\n") | 397 output_file.write("\n") |
391 output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep)) | 398 output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep, sep)) |
392 for item in summary: | 399 for item in summary: |
393 for nr in item: | 400 for nr in item: |
394 if "HD" not in nr and "diff" not in nr: | 401 if "HD" not in nr and "diff" not in nr: |
395 nr = nr.astype(float) | 402 nr = nr.astype(float) |
396 nr = nr.astype(int) | 403 nr = nr.astype(int) |
397 output_file.write("{}{}".format(nr,sep)) | 404 output_file.write("{}{}".format(nr, sep)) |
398 output_file.write("\n") | 405 output_file.write("\n") |
399 output_file.write("sum{}".format(sep)) | 406 output_file.write("sum{}".format(sep)) |
400 sumCol = map(int, sumCol) | 407 sumCol = map(int, sumCol) |
401 for el in sumCol: | 408 for el in sumCol: |
402 output_file.write("{}{}".format(el,sep)) | 409 output_file.write("{}{}".format(el, sep)) |
403 output_file.write("{}{}".format(overallSum.astype(int),sep)) | 410 output_file.write("{}{}".format(overallSum.astype(int), sep)) |
404 output_file.write("\n\n") | 411 output_file.write("\n\n") |
405 | 412 |
406 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name,sep): | 413 |
414 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name, sep): | |
407 output_file.write(name) | 415 output_file.write(name) |
408 output_file.write("\n") | 416 output_file.write("\n") |
409 output_file.write("{}HD a+b;HD a{}HD b{}sum{}\n".format(sep,sep,sep,sep)) | 417 output_file.write("{}HD a{}HD b'{}HD b{}HD a'{}HD a+b{}sum{}\n".format(sep, sep, sep, sep, sep, sep, sep)) |
410 for item in summary: | 418 for item in summary: |
411 for nr in item: | 419 for nr in item: |
412 if "HD" not in nr: | 420 if "HD" not in nr: |
413 nr = nr.astype(float) | 421 nr = nr.astype(float) |
414 nr = nr.astype(int) | 422 nr = nr.astype(int) |
415 output_file.write("{}{}".format(nr,sep)) | 423 output_file.write("{}{}".format(nr, sep)) |
416 output_file.write("\n") | 424 output_file.write("\n") |
417 output_file.write("sum{}".format(sep)) | 425 output_file.write("sum{}".format(sep)) |
418 sumCol = map(int, sumCol) | 426 sumCol = map(int, sumCol) |
419 for el in sumCol: | 427 for el in sumCol: |
420 output_file.write("{}{}".format(el,sep)) | 428 output_file.write("{}{}".format(el, sep)) |
421 output_file.write("{}{}".format(overallSum.astype(int),sep)) | 429 output_file.write("{}{}".format(overallSum.astype(int), sep)) |
422 output_file.write("\n\n") | 430 output_file.write("\n\n") |
423 | 431 |
432 | |
424 def hamming(array1, array2): | 433 def hamming(array1, array2): |
425 res = 99 * numpy.ones(len(array1)) | 434 res = 99 * numpy.ones(len(array1)) |
426 i = 0 | 435 i = 0 |
427 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time | 436 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time |
428 for a in array1: | 437 for a in array1: |
429 dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2]) # fastest | 438 dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2]) # fastest |
430 res[i] = numpy.amin(dist[dist > 0]) # pick min distance greater than zero | 439 res[i] = numpy.amin(dist[dist > 0]) # pick min distance greater than zero |
431 #print(i) | 440 # print(i) |
432 i += 1 | 441 i += 1 |
433 return res | 442 return res |
434 | 443 |
444 | |
435 def hamming_difference(array1, array2, mate_b): | 445 def hamming_difference(array1, array2, mate_b): |
436 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time | 446 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time |
437 array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1 | 447 array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1 |
438 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 | 448 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 |
439 | 449 |
440 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 | 450 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 |
441 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 | 451 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 |
442 | 452 |
443 #diff11 = 999 * numpy.ones(len(array2)) | 453 # diff11 = 999 * numpy.ones(len(array2)) |
444 #relativeDiffList = 999 * numpy.ones(len(array2)) | 454 # relativeDiffList = 999 * numpy.ones(len(array2)) |
445 #ham1 = 999 * numpy.ones(len(array2)) | 455 # ham1 = 999 * numpy.ones(len(array2)) |
446 #ham2 = 999 * numpy.ones(len(array2)) | 456 # ham2 = 999 * numpy.ones(len(array2)) |
447 #min_valueList = 999 * numpy.ones(len(array2)) | 457 # min_valueList = 999 * numpy.ones(len(array2)) |
448 #min_tagsList = 999 * numpy.ones(len(array2)) | 458 # min_tagsList = 999 * numpy.ones(len(array2)) |
449 #diff11_zeros = 999 * numpy.ones(len(array2)) | 459 # diff11_zeros = 999 * numpy.ones(len(array2)) |
450 #min_tagsList_zeros = 999 * numpy.ones(len(array2)) | 460 # min_tagsList_zeros = 999 * numpy.ones(len(array2)) |
451 | 461 |
452 | |
453 diff11 = [] | 462 diff11 = [] |
454 relativeDiffList = [] | 463 relativeDiffList = [] |
455 ham1 = [] | 464 ham1 = [] |
456 ham2 = [] | 465 ham2 = [] |
457 ham1min = [] | 466 ham1min = [] |
458 ham2min = [] | 467 ham2min = [] |
459 min_valueList = [] | 468 min_valueList = [] |
460 min_tagsList = [] | 469 min_tagsList = [] |
461 diff11_zeros = [] | 470 diff11_zeros = [] |
462 min_tagsList_zeros = [] | 471 min_tagsList_zeros = [] |
463 i = 0 # counter, only used to see how many HDs of tags were already calculated | 472 i = 0 # counter, only used to see how many HDs of tags were already calculated |
464 if mate_b is False: # HD calculation for all a's | 473 if mate_b is False: # HD calculation for all a's |
465 half1_mate1 = array1_half | 474 half1_mate1 = array1_half |
466 half2_mate1 = array1_half2 | 475 half2_mate1 = array1_half2 |
467 half1_mate2 = array2_half | 476 half1_mate2 = array2_half |
468 half2_mate2 = array2_half2 | 477 half2_mate2 = array2_half2 |
469 elif mate_b is True: # HD calculation for all b's | 478 elif mate_b is True: # HD calculation for all b's |
470 half1_mate1 = array1_half2 | 479 half1_mate1 = array1_half2 |
471 half2_mate1 = array1_half | 480 half2_mate1 = array1_half |
472 half1_mate2 = array2_half2 | 481 half1_mate2 = array2_half2 |
473 half2_mate2 = array2_half | 482 half2_mate2 = array2_half |
474 | 483 |
475 for a, b, tag in zip(half1_mate1, half2_mate1, array1): | 484 for a, b, tag in zip(half1_mate1, half2_mate1, array1): |
476 ## exclude identical tag from array2, to prevent comparison to itself | 485 # exclude identical tag from array2, to prevent comparison to itself |
477 sameTag = numpy.where(array2 == tag) | 486 sameTag = numpy.where(array2 == tag) |
478 indexArray2 = numpy.arange(0, len(array2), 1) | 487 indexArray2 = numpy.arange(0, len(array2), 1) |
479 index_withoutSame = numpy.delete(indexArray2, sameTag) # delete identical tag from the data | 488 index_withoutSame = numpy.delete(indexArray2, sameTag) # delete identical tag from the data |
480 | 489 |
481 # all tags without identical tag | 490 # all tags without identical tag |
482 array2_half_withoutSame = half1_mate2[index_withoutSame] | 491 array2_half_withoutSame = half1_mate2[index_withoutSame] |
483 array2_half2_withoutSame = half2_mate2[index_withoutSame] | 492 array2_half2_withoutSame = half2_mate2[index_withoutSame] |
484 #array2_withoutSame = array2[index_withoutSame] # whole tag (=not split into 2 halves) | 493 # array2_withoutSame = array2[index_withoutSame] # whole tag (=not split into 2 halves) |
485 | 494 |
486 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in | 495 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in |
487 array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" | 496 array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" |
488 min_index = numpy.where(dist == dist.min()) # get index of min HD | 497 min_index = numpy.where(dist == dist.min()) # get index of min HD |
489 min_value = dist[min_index] # get minimum HDs | 498 min_value = dist[min_index] # get minimum HDs |
490 min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD | 499 min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD |
491 #min_tag = array2_withoutSame[min_index] # get whole tag with min HD | 500 # min_tag = array2_withoutSame[min_index] # get whole tag with min HD |
492 | 501 |
493 dist2 = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in | 502 dist2 = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in |
494 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" | 503 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" |
495 for d_1, d_2 in zip(min_value, dist2): | 504 for d, d2 in zip(min_value, dist2): |
496 if mate_b is True: # half2, corrects the variable of the HD from both halves if it is a or b | 505 if mate_b is True: # half2, corrects the variable of the HD from both halves if it is a or b |
497 d = d_2 | |
498 d2 = d_1 | |
499 ham2.append(d) | 506 ham2.append(d) |
500 ham2min.append(d2) | 507 ham2min.append(d2) |
501 else: # half1, corrects the variable of the HD from both halves if it is a or b | 508 else: # half1, corrects the variable of the HD from both halves if it is a or b |
502 d = d_1 | |
503 d2 = d_2 | |
504 ham1.append(d) | 509 ham1.append(d) |
505 ham1min.append(d2) | 510 ham1min.append(d2) |
506 | 511 |
507 min_valueList.append(d + d2) | 512 min_valueList.append(d + d2) |
508 min_tagsList.append(tag) | 513 min_tagsList.append(tag) |
509 # ham1.append(d) | |
510 # ham2.append(d2) | |
511 difference1 = abs(d - d2) | 514 difference1 = abs(d - d2) |
512 diff11.append(difference1) | 515 diff11.append(difference1) |
513 rel_difference = round(float(difference1) / (d + d2), 1) | 516 rel_difference = round(float(difference1) / (d + d2), 1) |
514 relativeDiffList.append(rel_difference) | 517 relativeDiffList.append(rel_difference) |
515 | 518 |
516 #### tags which have identical parts: | 519 # tags which have identical parts: |
517 if d == 0 or d2 == 0: | 520 if d == 0 or d2 == 0: |
518 min_tagsList_zeros.append(tag) | 521 min_tagsList_zeros.append(tag) |
519 difference1_zeros = abs(d - d2) | 522 difference1_zeros = abs(d - d2) |
520 diff11_zeros.append(difference1_zeros) | 523 diff11_zeros.append(difference1_zeros) |
521 i += 1 | 524 i += 1 |
522 | 525 |
523 #print(i) | 526 # print(i) |
524 #diff11 = [st for st in diff11 if st != 999] | 527 # diff11 = [st for st in diff11 if st != 999] |
525 #ham1 = [st for st in ham1 if st != 999] | 528 # ham1 = [st for st in ham1 if st != 999] |
526 #ham2 = [st for st in ham2 if st != 999] | 529 # ham2 = [st for st in ham2 if st != 999] |
527 #min_valueList = [st for st in min_valueList if st != 999] | 530 # min_valueList = [st for st in min_valueList if st != 999] |
528 #min_tagsList = [st for st in min_tagsList if st != 999] | 531 # min_tagsList = [st for st in min_tagsList if st != 999] |
529 #relativeDiffList = [st for st in relativeDiffList if st != 999] | 532 # relativeDiffList = [st for st in relativeDiffList if st != 999] |
530 #diff11_zeros = [st for st in diff11_zeros if st != 999] | 533 # diff11_zeros = [st for st in diff11_zeros if st != 999] |
531 #min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] | 534 # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] |
532 | 535 |
533 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min]) | 536 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min]) |
537 | |
534 | 538 |
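To summarise the chimera logic of hamming_difference() above: each tag is split into halves a and b, the closest partner for one half is looked up, and the HD of the other half to that partner is measured; the absolute and relative delta of the two HDs is what ends up in the plots and tables. A small sketch of that per-tag arithmetic (the tag and the distances are invented for the example):

def split_tag(tag):
    # split a tag into its a and b halves, as hamming_difference() does
    half = len(tag) // 2
    return tag[:half], tag[half:]

def delta_hd(hd_a, hd_b):
    # absolute and relative delta HD of the two halves
    delta = abs(hd_a - hd_b)
    return delta, round(float(delta) / (hd_a + hd_b), 1)

print(split_tag("AAACCCGGGTTT"))  # ('AAACCC', 'GGGTTT')
print(delta_hd(0, 5))             # (5, 1.0): one half identical, the other far away -> chimera candidate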
535 def readFileReferenceFree(file): | 539 def readFileReferenceFree(file): |
536 with open(file, 'r') as dest_f: | 540 with open(file, 'r') as dest_f: |
537 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') | 541 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') |
538 integers = numpy.array(data_array[:, 0]).astype(int) | 542 integers = numpy.array(data_array[:, 0]).astype(int) |
539 return(integers, data_array) | 543 return(integers, data_array) |
540 | 544 |
545 | |
541 def hammingDistanceWithFS(fs, ham): | 546 def hammingDistanceWithFS(fs, ham): |
542 fs = numpy.asarray(fs) | 547 fs = numpy.asarray(fs) |
543 maximum = max(ham) | 548 maximum = max(ham) |
544 minimum = min(ham) | 549 minimum = min(ham) |
545 ham = numpy.asarray(ham) | 550 ham = numpy.asarray(ham) |
563 data6 = ham[hd6] | 568 data6 = ham[hd6] |
564 | 569 |
565 list1 = [data, data2, data3, data4, data5, data6] | 570 list1 = [data, data2, data3, data4, data5, data6] |
566 return(list1, maximum, minimum) | 571 return(list1, maximum, minimum) |
567 | 572 |
568 def familySizeDistributionWithHD(fs, ham, diff=False, rel = True): | 573 |
574 def familySizeDistributionWithHD(fs, ham, diff=False, rel=True): | |
569 hammingDistances = numpy.unique(ham) | 575 hammingDistances = numpy.unique(ham) |
570 fs = numpy.asarray(fs) | 576 fs = numpy.asarray(fs) |
571 | 577 |
572 ham = numpy.asarray(ham) | 578 ham = numpy.asarray(ham) |
573 bigFamilies2 = numpy.where(fs > 19)[0] | 579 bigFamilies2 = numpy.where(fs > 19)[0] |
614 else: | 620 else: |
615 hd6 = numpy.where(ham > 8)[0] | 621 hd6 = numpy.where(ham > 8)[0] |
616 data6 = fs[hd6] | 622 data6 = fs[hd6] |
617 | 623 |
618 if diff is True: | 624 if diff is True: |
619 list1 = [data0,data, data2, data3, data4, data5, data6] | 625 list1 = [data0, data, data2, data3, data4, data5, data6] |
620 else: | 626 else: |
621 list1 = [data, data2, data3, data4, data5, data6] | 627 list1 = [data, data2, data3, data4, data5, data6] |
622 | 628 |
623 return(list1, hammingDistances, maximum, minimum) | 629 return(list1, hammingDistances, maximum, minimum) |
630 | |
624 | 631 |
625 def make_argparser(): | 632 def make_argparser(): |
626 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data') | 633 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data') |
627 parser.add_argument('--inputFile', | 634 parser.add_argument('--inputFile', |
628 help='Tabular File with three columns: ab or ba, tag and family size.') | 635 help='Tabular File with three columns: ab or ba, tag and family size.') |
629 parser.add_argument('--inputName1') | 636 parser.add_argument('--inputName1') |
630 parser.add_argument('--inputFile2',default=None, | 637 parser.add_argument('--inputFile2', default=None, |
631 help='Tabular File with three columns: ab or ba, tag and family size.') | 638 help='Tabular File with three columns: ab or ba, tag and family size.') |
632 parser.add_argument('--inputName2') | 639 parser.add_argument('--inputName2') |
633 parser.add_argument('--sample_size', default=1000,type=int, | 640 parser.add_argument('--sample_size', default=1000, type=int, |
634 help='Sample size of Hamming distance analysis.') | 641 help='Sample size of Hamming distance analysis.') |
635 parser.add_argument('--sep', default=",", | 642 parser.add_argument('--subset_tag', default=0, type=int, |
636 help='Separator in the csv file.') | |
637 parser.add_argument('--subset_tag', default=0,type=int, | |
638 help='The tag is shortened to the given number.') | 643 help='The tag is shortened to the given number.') |
639 parser.add_argument('--nproc', default=4,type=int, | 644 parser.add_argument('--nproc', default=4, type=int, |
640 help='The tool runs with the given number of processors.') | 645 help='The tool runs with the given number of processors.') |
641 parser.add_argument('--only_DCS', action="store_false", # default=False, type=bool, | 646 parser.add_argument('--only_DCS', action="store_false", |
642 help='Only tags of the DCSs are included in the HD analysis') | 647 help='Only tags of the DCSs are included in the HD analysis') |
643 | 648 |
644 parser.add_argument('--minFS', default=1, type=int, | 649 parser.add_argument('--minFS', default=1, type=int, |
645 help='Only tags which have a family size greater than or equal to the specified value are included in the HD analysis') | 650 help='Only tags which have a family size greater than or equal to the specified value are included in the HD analysis') |
646 parser.add_argument('--maxFS', default=0, type=int, | 651 parser.add_argument('--maxFS', default=0, type=int, |
647 help='Only tags which have a family size smaller than or equal to the specified value are included in the HD analysis') | 652 help='Only tags which have a family size smaller than or equal to the specified value are included in the HD analysis') |
648 parser.add_argument('--nr_above_bars', action="store_true", # default=False, type=bool, | 653 parser.add_argument('--nr_above_bars', action="store_true", |
649 help='If no, values above bars in the histograms are removed') | 654 help='If no, values above bars in the histograms are removed') |
650 | 655 |
651 parser.add_argument('--output_csv', default="data.csv", type=str, | 656 parser.add_argument('--output_tabular', default="data.tabular", type=str, |
652 help='Name of the csv file.') | 657 help='Name of the tabular file.') |
653 parser.add_argument('--output_pdf', default="data.pdf", type=str, | 658 parser.add_argument('--output_pdf', default="data.pdf", type=str, |
654 help='Name of the pdf file.') | 659 help='Name of the pdf file.') |
655 parser.add_argument('--output_pdf2', default="data2.pdf", type=str, | 660 parser.add_argument('--output_pdf2', default="data2.pdf", type=str, |
656 help='Name of the pdf file.') | 661 help='Name of the pdf file.') |
657 parser.add_argument('--output_csv2', default="data2.csv", type=str, | 662 parser.add_argument('--output_tabular2', default="data2.tabular", type=str, |
658 help='Name of the csv file.') | 663 help='Name of the tabular file.') |
659 | 664 |
660 return parser | 665 return parser |
666 | |
661 | 667 |
662 def Hamming_Distance_Analysis(argv): | 668 def Hamming_Distance_Analysis(argv): |
663 parser = make_argparser() | 669 parser = make_argparser() |
664 args = parser.parse_args(argv[1:]) | 670 args = parser.parse_args(argv[1:]) |
665 | 671 |
671 | 677 |
672 index_size = args.sample_size | 678 index_size = args.sample_size |
673 title_savedFile_pdf = args.output_pdf | 679 title_savedFile_pdf = args.output_pdf |
674 title_savedFile_pdf2 = args.output_pdf2 | 680 title_savedFile_pdf2 = args.output_pdf2 |
675 | 681 |
676 title_savedFile_csv = args.output_csv | 682 title_savedFile_csv = args.output_tabular |
677 title_savedFile_csv2 = args.output_csv2 | 683 title_savedFile_csv2 = args.output_tabular2 |
678 | 684 |
679 sep = args.sep | 685 sep = "\t" |
680 onlyDuplicates = args.only_DCS | 686 onlyDuplicates = args.only_DCS |
681 minFS = args.minFS | 687 minFS = args.minFS |
682 maxFS = args.maxFS | 688 maxFS = args.maxFS |
683 nr_above_bars = args.nr_above_bars | 689 nr_above_bars = args.nr_above_bars |
684 | |
685 | 690 |
686 subset = args.subset_tag | 691 subset = args.subset_tag |
687 nproc = args.nproc | 692 nproc = args.nproc |
688 | 693 |
689 ### input checks | 694 # input checks |
690 if index_size < 0: | 695 if index_size < 0: |
691 print("index_size is a negative integer.") | 696 print("index_size is a negative integer.") |
692 exit(2) | 697 exit(2) |
693 | 698 |
694 if nproc <= 0: | 699 if nproc <= 0: |
695 print("nproc is smaller or equal zero") | 700 print("nproc is smaller or equal zero") |
696 exit(3) | 701 exit(3) |
697 | 702 |
698 if type(sep) is not str or len(sep)>1: | |
699 print("sep must be a single character.") | |
700 exit(4) | |
701 | |
702 if subset < 0: | 703 if subset < 0: |
703 print("subset_tag is smaller or equal zero.") | 704 print("subset_tag is smaller or equal zero.") |
704 exit(5) | 705 exit(5) |
705 | 706 |
706 ### PLOT ### | 707 # PLOT |
707 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color | 708 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color |
708 plt.rcParams['xtick.labelsize'] = 14 | 709 plt.rcParams['xtick.labelsize'] = 14 |
709 plt.rcParams['ytick.labelsize'] = 14 | 710 plt.rcParams['ytick.labelsize'] = 14 |
710 plt.rcParams['patch.edgecolor'] = "#000000" | 711 plt.rcParams['patch.edgecolor'] = "#000000" |
711 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format | 712 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format |
761 data_array = numpy.column_stack((duplTags, duplTags_seq)) | 762 data_array = numpy.column_stack((duplTags, duplTags_seq)) |
762 data_array = numpy.column_stack((data_array, duplTags_tag)) | 763 data_array = numpy.column_stack((data_array, duplTags_tag)) |
763 integers = numpy.array(data_array[:, 0]).astype(int) | 764 integers = numpy.array(data_array[:, 0]).astype(int) |
764 print("DCS in whole dataset", len(data_array)) | 765 print("DCS in whole dataset", len(data_array)) |
765 | 766 |
766 ## HD analysis for a subset of the tag | 767 # HD analysis for a subset of the tag |
767 if subset > 0: | 768 if subset > 0: |
768 tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) | 769 tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) |
769 tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) | 770 tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) |
770 | 771 |
771 flanking_region_float = float((len(tag1[0]) - subset)) / 2 | 772 flanking_region_float = float((len(tag1[0]) - subset)) / 2 |
787 print("length of tag= ", len(data_array[0, 1])) | 788 print("length of tag= ", len(data_array[0, 1])) |
788 # select sample: if no size given --> all vs. all comparison | 789 # select sample: if no size given --> all vs. all comparison |
789 if index_size == 0: | 790 if index_size == 0: |
790 result = numpy.arange(0, len(data_array), 1) | 791 result = numpy.arange(0, len(data_array), 1) |
791 else: | 792 else: |
792 result = numpy.random.choice(len(integers), size=index_size, | 793 result = numpy.random.choice(len(integers), size=index_size, replace=False) # array of random sequences of size=index.size |
793 replace=False) # array of random sequences of size=index.size | 794 |
794 | 795 # with open("index_result1_{}.pkl".format(app_f), "wb") as o: |
795 # with open("index_result1_{}.pkl".format(app_f), "wb") as o: | 796 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) |
796 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) | |
797 | 797 |
798 # comparison random tags to whole dataset | 798 # comparison random tags to whole dataset |
799 result1 = data_array[result, 1] # random tags | 799 result1 = data_array[result, 1] # random tags |
800 result2 = data_array[:, 1] # all tags | 800 result2 = data_array[:, 1] # all tags |
801 print("size of the whole dataset= ", len(result2)) | 801 print("size of the whole dataset= ", len(result2)) |
806 chunks_sample = numpy.array_split(result1, nproc) | 806 chunks_sample = numpy.array_split(result1, nproc) |
807 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) | 807 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) |
808 proc_pool.close() | 808 proc_pool.close() |
809 proc_pool.join() | 809 proc_pool.join() |
810 ham = numpy.concatenate(ham).astype(int) | 810 ham = numpy.concatenate(ham).astype(int) |
811 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: | 811 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: |
812 # for h, tag in zip(ham, result1): | 812 # for h, tag in zip(ham, result1): |
813 # output_file1.write("{}\t{}\n".format(tag, h)) | 813 # output_file1.write("{}\t{}\n".format(tag, h)) |
814 | 814 |
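The parallelisation pattern above (and repeated below for the chimera analysis) is: split the sampled tags into nproc chunks with numpy.array_split, fix the reference array with functools.partial, map the worker over the chunks in a multiprocessing Pool, and concatenate the per-chunk results. A self-contained sketch of the same pattern with a toy worker; the worker and data are placeholders, not part of the tool:

from functools import partial
from multiprocessing.pool import Pool

import numpy

def count_matches(chunk, reference):
    # toy worker: for each value in the chunk, count how many reference entries equal it
    return [int(numpy.sum(reference == value)) for value in chunk]

if __name__ == "__main__":
    data = numpy.array([1, 2, 2, 3, 3, 3, 4])
    sample = numpy.array([2, 3, 4, 1])
    nproc = 2
    chunks = numpy.array_split(sample, nproc)  # one piece per worker
    pool = Pool(nproc)
    parts = pool.map(partial(count_matches, reference=data), chunks)
    pool.close()
    pool.join()
    result = numpy.concatenate(parts)  # [2, 3, 1, 1]
    print(result)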
815 # HD analysis for chimeric reads | 815 # HD analysis for chimeric reads |
816 proc_pool_b = Pool(nproc) | 816 proc_pool_b = Pool(nproc) |
817 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) | 817 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) |
818 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) | 818 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) |
832 numpy.concatenate([item_b[5] for item_b in diff_list_b]))) | 832 numpy.concatenate([item_b[5] for item_b in diff_list_b]))) |
833 diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]), | 833 diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]), |
834 numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int) | 834 numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int) |
835 minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]), | 835 minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]), |
836 numpy.concatenate([item_b[7] for item_b in diff_list_b]))) | 836 numpy.concatenate([item_b[7] for item_b in diff_list_b]))) |
837 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), | 837 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) |
838 numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) | |
839 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), | 838 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), |
840 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) | 839 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) |
841 # with open("HD_within tag_{}.txt".format(app_f), "w") as output_file2: | |
842 # for d, s1, s2, hd, rel_d, tag in zip(diff, HDhalf1, HDhalf2, minHDs, rel_Diff, minHD_tags): | |
843 # output_file2.write( | |
844 # "{}\t{}\t{}\t{}\t{}\t{}\n".format(tag, hd, s1, s2, d, rel_d)) | |
845 | 840 |
846 lenTags = len(data_array) | 841 lenTags = len(data_array) |
847 | 842 |
848 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags | 843 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags |
849 seq = numpy.array(data_array[result, 1]) # tags of sample | 844 seq = numpy.array(data_array[result, 1]) # tags of sample |
854 seq = numpy.tile(seq, 2) | 849 seq = numpy.tile(seq, 2) |
855 ham = numpy.tile(ham, 2) | 850 ham = numpy.tile(ham, 2) |
856 | 851 |
857 # prepare data for different kinds of plots | 852 # prepare data for different kinds of plots |
858 # distribution of FSs separated after HD | 853 # distribution of FSs separated after HD |
859 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham,rel=False) | 854 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) |
860 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS | 855 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS |
861 | 856 |
862 ## get FS for all tags with min HD of analysis of chimeric reads | 857 # get FS for all tags with min HD of analysis of chimeric reads |
863 # there are more tags than sample size in the plot, because one tag can have multiple minima | 858 # there are more tags than sample size in the plot, because one tag can have multiple minima |
864 seqDic = dict(zip(seq, quant)) | 859 seqDic = dict(zip(seq, quant)) |
865 lst_minHD_tags = [] | 860 lst_minHD_tags = [] |
866 for i in minHD_tags: | 861 for i in minHD_tags: |
867 lst_minHD_tags.append(seqDic.get(i)) | 862 lst_minHD_tags.append(seqDic.get(i)) |
881 if len(minHD_tags_zeros) != 0: | 876 if len(minHD_tags_zeros) != 0: |
882 lst_minHD_tags_zeros = [] | 877 lst_minHD_tags_zeros = [] |
883 for i in minHD_tags_zeros: | 878 for i in minHD_tags_zeros: |
884 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads | 879 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads |
885 | 880 |
886 # histogram with HD of non-identical half | 881 # histogram with HD of non-identical half |
887 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( | 882 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros) |
888 lst_minHD_tags_zeros, diff_zeros) | 883 # family size distribution of non-identical half |
889 # family size distribution of non-identical half | 884 familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False) |
890 familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD( | 885 |
891 lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False) | 886 # plot Hamming Distance with Family size distribution |
892 | 887 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, subtitle="Hamming distance separated by family size", title_file1=name_file, lenTags=lenTags, xlabel="HD", nr_above_bars=nr_above_bars) |
893 ##################################################################################################################### | 888 |
894 ################## plot Hamming Distance with Family size distribution ############################## | 889 # Plot FSD with separation after |
895 ##################################################################################################################### | |
896 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, | |
897 subtitle="Hamming distance separated by family size", title_file1=name_file, | |
898 lenTags=lenTags,xlabel="HD", nr_above_bars=nr_above_bars) | |
899 | |
900 ########################## Plot FSD with separation after ############################################### | |
901 ###################################################################################################################### | |
902 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, | 890 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, |
903 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", | 891 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", |
904 pdf=pdf,relative=False, title_file1=name_file, diff=False) | 892 pdf=pdf, relative=False, title_file1=name_file, diff=False) |
905 | 893 |
906 ########################## Plot HD within tags ######################################################## | 894 # Plot HD within tags |
907 ###################################################################################################################### | 895 plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file) |
908 # plotHDwithinSeq_Sum2(HDhalf1, HDhalf2, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file) | 896 |
909 plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2min , HDhalf2, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file) | 897 # Plot difference between HD's separated after FSD |
910 | |
911 | |
912 ########################## Plot difference between HD's separated after FSD #################################### | |
913 ###################################################################################################################### | |
914 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, | 898 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, |
915 subtitle="Delta Hamming distance within tags", | 899 subtitle="Delta Hamming distance within tags", |
916 title_file1=name_file, lenTags=lenTags, | 900 title_file1=name_file, lenTags=lenTags, |
917 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars) | 901 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars) |
918 | 902 |
919 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, | 903 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, |
920 subtitle="Chimera Analysis: relative delta Hamming distances", | 904 subtitle="Chimera Analysis: relative delta Hamming distances", |
921 title_file1=name_file, lenTags=lenTags, | 905 title_file1=name_file, lenTags=lenTags, |
922 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars) | 906 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars) |
923 | 907 |
924 #################### Plot FSD separated after difference between HD's ##################################### | |
925 ######################################################################################################################## | |
926 # plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff, | |
927 # subtitle="Family size distribution separated by delta Hamming distances within the tags", | |
928 # pdf=pdf,relative=False, diff=True, title_file1=name_file, originalCounts=quant) | |
929 | |
930 # plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, originalCounts=quant, pdf=pdf, | |
931 # subtitle="Family size distribution separated by delta Hamming distances within the tags", | |
932 # relative=True, diff=True, title_file1=name_file) | |
933 | |
934 | |
935 # plots for chimeric reads | 908 # plots for chimeric reads |
936 if len(minHD_tags_zeros) != 0: | 909 if len(minHD_tags_zeros) != 0: |
937 ## HD | 910 # HD |
938 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, | 911 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, |
939 subtitle="Hamming distance of the non-identical half of chimeras", | 912 subtitle="Hamming distance of the non-identical half of chimeras", |
940 title_file1=name_file, lenTags=lenTags,xlabel="HD", relative=False, nr_above_bars=nr_above_bars) | 913 title_file1=name_file, lenTags=lenTags, xlabel="HD", relative=False, nr_above_bars=nr_above_bars) |
941 | 914 |
942 ## FSD | 915 # print all data to a CSV file |
943 # plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros, | 916 # HD |
944 # originalCounts=quant, pdf=pdf, | |
945 # subtitle="Family size distribution separated by Hamming distance of the non-identical half of chimeras", | |
946 # relative=False, diff=False, title_file1=name_file) | |
947 | |
948 ### print all data to a CSV file | |
949 #### HD #### | |
950 summary, sumCol = createTableHD(list1, "HD=") | 917 summary, sumCol = createTableHD(list1, "HD=") |
951 overallSum = sum(sumCol) # sum of columns in table | 918 overallSum = sum(sumCol) # sum of columns in table |
952 | 919 |
953 #### FSD #### | 920 # FSD |
954 summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False) | 921 summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False) |
955 overallSum5 = sum(sumCol5) | 922 overallSum5 = sum(sumCol5) |
956 | 923 |
957 ### HD of both parts of the tag #### | 924 # HD of both parts of the tag |
958 summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf2,numpy.array(minHDs)]) | 925 summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)]) |
959 overallSum9 = sum(sumCol9) | 926 overallSum9 = sum(sumCol9) |
960 | 927 |
961 ## HD | 928 # HD |
962 # absolute difference | 929 # absolute difference |
963 summary11, sumCol11 = createTableHD(listDifference1, "diff=") | 930 summary11, sumCol11 = createTableHD(listDifference1, "diff=") |
964 overallSum11 = sum(sumCol11) | 931 overallSum11 = sum(sumCol11) |
965 # relative difference and all tags | 932 # relative difference and all tags |
966 summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") | 933 summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") |
967 overallSum13 = sum(sumCol13) | 934 overallSum13 = sum(sumCol13) |
968 | 935 |
969 ## FSD | |
970 # absolute difference | |
971 # summary19, sumCol19 = createTableFSD2(familySizeList1_diff) | |
972 # overallSum19 = sum(sumCol19) | |
973 # relative difference | |
974 # summary21, sumCol21 = createTableFSD2(familySizeList1_reldiff) | |
975 # overallSum21 = sum(sumCol21) | |
976 | |
977 # chimeric reads | 936 # chimeric reads |
978 if len(minHD_tags_zeros) != 0: | 937 if len(minHD_tags_zeros) != 0: |
979 # absolute difference and tags where at least one half has HD=0 | 938 # absolute difference and tags where at least one half has HD=0 |
980 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=") | 939 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=") |
981 overallSum15 = sum(sumCol15) | 940 overallSum15 = sum(sumCol15) |
982 # absolute difference and tags where at least one half has HD=0 | |
983 # summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False) | |
984 # overallSum23 = sum(sumCol23) | |
985 | 941 |
986 output_file.write("{}\n".format(name_file)) | 942 output_file.write("{}\n".format(name_file)) |
987 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( | 943 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( |
988 numpy.concatenate(list1)), lenTags, lenTags)) | 944 numpy.concatenate(list1)), lenTags, lenTags)) |
989 | 945 |
990 ### HD ### | 946 # HD |
991 createFileHD(summary, sumCol, overallSum, output_file, | 947 createFileHD(summary, sumCol, overallSum, output_file, |
992 "Hamming distance separated by family size", sep) | 948 "Hamming distance separated by family size", sep) |
993 ### FSD ### | 949 # FSD |
994 createFileFSD2(summary5, sumCol5, overallSum5, output_file, | 950 createFileFSD2(summary5, sumCol5, overallSum5, output_file, |
995 "Family size distribution separated by Hamming distance", sep, | 951 "Family size distribution separated by Hamming distance", sep, |
996 diff=False) | 952 diff=False) |
997 | 953 |
998 count = numpy.bincount(quant) | 954 count = numpy.bincount(quant) |
999 #output_file.write("{}{}\n".format(sep, name_file)) | 955 # output_file.write("{}{}\n".format(sep, name_file)) |
1000 output_file.write("\n") | 956 output_file.write("\n") |
1001 output_file.write("max. family size:{}{}\n".format(sep, max(quant))) | 957 output_file.write("max. family size:{}{}\n".format(sep, max(quant))) |
1002 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) | 958 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) |
1003 output_file.write( | 959 output_file.write( |
1004 "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) | 960 "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) |
1005 | 961 |
1006 ### HD within tags ### | 962 # HD within tags |
1007 output_file.write( | 963 output_file.write( |
1008 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" | 964 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" |
1009 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n") | 965 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n") |
1010 output_file.write( | 966 output_file.write( |
1011 "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format( | 967 "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format( |
1017 createFileHD(summary11, sumCol11, overallSum11, output_file, | 973 createFileHD(summary11, sumCol11, overallSum11, output_file, |
1018 "Absolute delta Hamming distances within the tag", sep) | 974 "Absolute delta Hamming distances within the tag", sep) |
1019 createFileHD(summary13, sumCol13, overallSum13, output_file, | 975 createFileHD(summary13, sumCol13, overallSum13, output_file, |
1020 "Chimera analysis: relative delta Hamming distances", sep) | 976 "Chimera analysis: relative delta Hamming distances", sep) |
1021 | 977 |
1022 # createFileFSD2(summary19, sumCol19, overallSum19, output_file, | |
1023 # "Family size distribution separated by absolute delta Hamming distance", | |
1024 # sep) | |
1025 # createFileFSD2(summary21, sumCol21, overallSum21, output_file, | |
1026 # "Family size distribution separated by relative delta Hamming distance", | |
1027 # sep, rel=True) | |
1028 | |
1029 if len(minHD_tags_zeros) != 0: | 978 if len(minHD_tags_zeros) != 0: |
1030 output_file.write( | 979 output_file.write( |
1031 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") | 980 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") |
1032 createFileHD(summary15, sumCol15, overallSum15, output_file, | 981 createFileHD(summary15, sumCol15, overallSum15, output_file, |
1033 "Hamming distances of non-zero half", sep) | 982 "Hamming distances of non-zero half", sep) |
1034 # createFileFSD2(summary23, sumCol23, overallSum23, output_file, | |
1035 # "Family size distribution separated by Hamming distance of non-zero half", | |
1036 # sep, diff=False) | |
1037 output_file.write("\n") | 983 output_file.write("\n") |
1038 | |
1039 | 984 |
1040 | 985 |
1041 if __name__ == '__main__': | 986 if __name__ == '__main__': |
1042 sys.exit(Hamming_Distance_Analysis(sys.argv)) | 987 sys.exit(Hamming_Distance_Analysis(sys.argv)) |