Mercurial > repos > mheinzl > hd
comparison hd.py @ 0:239c4448a163 draft
planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/hd commit 6055f8c5c052f528ff85fb5e0d43b4500830637a
author | mheinzl |
---|---|
date | Thu, 10 May 2018 07:30:27 -0400 |
parents | |
children | 7414792e1cb8 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:239c4448a163 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Hamming distance analysis of SSCSs | |
4 # | |
5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) | |
6 # Contact: monika.heinzl@edumail.at | |
7 # | |
8 # Takes at least one TABULAR file with tags before the alignment to the SSCS and optionally a second TABULAR file as input. | |
9 # The program produces a plot which shows a histogram of Hamming distances separated after family sizes, | |
10 # a family size distribution separated after Hamming distances for all (sample_size=0) or a given sample of SSCSs or SSCSs, which form a DCS. | |
# In addition, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads
12 # and finally a CSV file with the data of the plots. | |
13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. | |
14 # The tool can run on a certain number of processors, which can be defined by the user. | |
15 | |
# USAGE: python HDnew6_1Plot_FINAL.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" /
#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --output_csv outputfile_name_csv --output_pdf outputfile_name_pdf
18 | |
19 import numpy | |
20 import itertools | |
21 import operator | |
22 import matplotlib.pyplot as plt | |
23 import os.path | |
24 import cPickle as pickle | |
25 from multiprocessing.pool import Pool | |
26 from functools import partial | |
27 from HDAnalysis_plots.plot_HDwithFSD import plotHDwithFSD | |
28 from HDAnalysis_plots.plot_FSDwithHD2 import plotFSDwithHD2 | |
29 from HDAnalysis_plots.plot_HDwithinSeq_Sum2 import plotHDwithinSeq_Sum2 | |
30 from HDAnalysis_plots.table_HD import createTableHD, createFileHD, createTableHDwithTags, createFileHDwithinTag | |
31 from HDAnalysis_plots.table_FSD import createTableFSD2, createFileFSD2 | |
32 import argparse | |
33 import sys | |
34 import os | |
35 from matplotlib.backends.backend_pdf import PdfPages | |
36 | |
def hamming(array1, array2):
    """Return, for every tag in array1, the minimum non-zero Hamming distance to array2.

    Parameters
    ----------
    array1 : sequence of str
        Sample of tags to evaluate.
    array2 : sequence of str
        Reference tags; duplicates are removed before comparison. All tags are
        assumed to have equal length.

    Returns
    -------
    numpy.ndarray of float
        One minimum HD (> 0) per tag of array1, in input order.
    """
    res = 99 * numpy.ones(len(array1))
    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
    for i, a in enumerate(array1):
        # itertools.imap was removed in Python 3; the builtin map is equivalent here
        dist = numpy.array([sum(map(operator.ne, a, b)) for b in array2])  # fastest
        # pick min distance greater than zero (a tag is never reported at HD 0 to itself)
        # NOTE(review): raises on an empty selection if every distance is 0 — assumes
        # array2 contains at least one tag different from `a`; confirm with callers.
        res[i] = numpy.amin(dist[dist > 0])
    return res
47 | |
def hamming_difference(array1, array2, mate_b):
    """Chimera analysis: compare each half of every tag to the same half of all other tags.

    For each tag in array1, the minimum HD of one half ("a" when mate_b is False,
    "b" when True) against the corresponding halves of all other tags is found;
    for every tag achieving that minimum, the HD of the partner halves is also
    computed. One result row is appended per minimum-HD partner, so a tag can
    contribute several rows.

    Parameters
    ----------
    array1 : sequence of str
        Sample of tags (each of even length; split in the middle).
    array2 : sequence of str
        Reference tags; duplicates removed. The tag identical to the query is
        excluded so it is never compared to itself.
    mate_b : bool
        False: minimize over the first half; True: minimize over the second half.

    Returns
    -------
    list
        [abs diff of half-HDs, HD of half a, HD of half b, summed HD,
         tags, relative diff, abs diff where one half is identical,
         tags where one half is identical]
    """
    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
    # // keeps integer halving on both Python 2 and 3 (plain / became float division)
    array1_half = numpy.array([i[0:len(i) // 2] for i in array1])   # mate1 part1
    array1_half2 = numpy.array([i[len(i) // 2:len(i)] for i in array1])  # mate1 part 2

    array2_half = numpy.array([i[0:len(i) // 2] for i in array2])   # mate2 part1
    array2_half2 = numpy.array([i[len(i) // 2:len(i)] for i in array2])  # mate2 part2

    diff11 = []
    relativeDiffList = []
    ham1 = []
    ham2 = []
    min_valueList = []
    min_tagsList = []
    diff11_zeros = []
    min_tagsList_zeros = []

    if mate_b:  # HD calculation for all b's: swap which half is minimized over
        half1_mate1 = array1_half2
        half2_mate1 = array1_half
        half1_mate2 = array2_half2
        half2_mate2 = array2_half
    else:  # HD calculation for all a's
        half1_mate1 = array1_half
        half2_mate1 = array1_half2
        half1_mate2 = array2_half
        half2_mate2 = array2_half2

    for a, b, tag in zip(half1_mate1, half2_mate1, array1):
        # exclude identical tag from array2, to prevent comparison to itself
        sameTag = numpy.where(array2 == tag)
        indexArray2 = numpy.arange(0, len(array2), 1)
        index_withoutSame = numpy.delete(indexArray2, sameTag)  # delete identical tag from the data

        # all tags without identical tag
        array2_half_withoutSame = half1_mate2[index_withoutSame]
        array2_half2_withoutSame = half2_mate2[index_withoutSame]

        # HD of "a" in the tag to all "a's" (or "b" to all "b's"); builtin map
        # replaces the Python-2-only itertools.imap
        dist = numpy.array([sum(map(operator.ne, a, c)) for c in array2_half_withoutSame])
        min_index = numpy.where(dist == dist.min())  # get index of min HD
        min_value = dist[min_index]  # get minimum HDs
        # all partner halves of the tags that achieved the minimum HD
        min_tag_half2 = array2_half2_withoutSame[min_index]

        # HD of "b" to all "b's" (or "a" to all "a's") of the min-HD tags
        dist2 = numpy.array([sum(map(operator.ne, b, e)) for e in min_tag_half2])
        for d_1, d_2 in zip(min_value, dist2):
            # normalize so d is always the HD of half a and d2 of half b
            if mate_b:
                d, d2 = d_2, d_1
            else:
                d, d2 = d_1, d_2
            min_valueList.append(d + d2)
            min_tagsList.append(tag)
            ham1.append(d)
            ham2.append(d2)
            difference1 = abs(d - d2)
            diff11.append(difference1)
            # NOTE(review): divides by d + d2 — raises ZeroDivisionError if both
            # halves have HD 0 (exact duplicate halves under a different tag);
            # confirm the input data excludes that case.
            rel_difference = round(float(difference1) / (d + d2), 1)
            relativeDiffList.append(rel_difference)

            # tags which have identical parts (candidate chimeras)
            if d == 0 or d2 == 0:
                min_tagsList_zeros.append(tag)
                diff11_zeros.append(abs(d - d2))
    return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros])
120 | |
def readFileReferenceFree(file):
    """Read a tab-separated tag file into a string matrix.

    Parameters
    ----------
    file : str
        Path to a tabular file whose first column is an integer family size;
        lines starting with '#' are skipped.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Integer family sizes (column 0) and the full 2-D string array.
    """
    with open(file, 'r') as dest_f:
        # dtype=str replaces the Python-2-only dtype='string' (rejected by
        # modern NumPy); on py2 str is bytes, so behavior is unchanged there
        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype=str)
    integers = numpy.array(data_array[:, 0]).astype(int)
    return (integers, data_array)
126 | |
def hammingDistanceWithFS(quant, ham):
    """Group Hamming distances into family-size classes.

    Parameters
    ----------
    quant : sequence of int
        Family size per tag.
    ham : sequence
        Hamming distance (or HD difference) per tag, same length as quant.

    Returns
    -------
    (list, max, min)
        Six arrays of HDs for the family-size classes
        FS=1, 2, 3, 4, 5-10 and >10, plus the overall max/min of ham.
    """
    maximum = max(ham)
    minimum = min(ham)
    quant = numpy.asarray(quant)
    ham = numpy.asarray(ham)

    # one boolean mask per family-size class, in fixed display order
    masks = [
        quant == 1,                     # singletons
        quant == 2,
        quant == 3,
        quant == 4,
        (quant >= 5) & (quant <= 10),
        quant > 10,                     # big families
    ]
    list1 = [ham[numpy.where(m)[0]] for m in masks]
    return (list1, maximum, minimum)
153 | |
def familySizeDistributionWithHD(fs, ham, diff=False, rel=True):
    """Group family sizes into Hamming-distance classes.

    Parameters
    ----------
    fs : sequence of int
        Family size per tag; values above 19 are capped at 20.
    ham : sequence
        HD (rel=False), relative HD difference (rel=True) per tag.
    diff : bool
        If True, prepend an extra class for HD == 0.
    rel : bool
        Selects the relative (0.1 .. 0.8) or absolute (1 .. 8) class bounds.

    Returns
    -------
    (list, numpy.ndarray, max, min)
        Family sizes per HD class, unique HD values, max/min capped FS.
    """
    hammingDistances = numpy.unique(ham)
    fs = numpy.asarray(fs)
    ham = numpy.asarray(ham)

    # NOTE(review): when fs comes in as an ndarray, asarray returns the same
    # object and the capping below mutates the caller's array — confirm callers
    # rely on / tolerate this.
    oversized = numpy.where(fs > 19)[0]
    if len(oversized) != 0:
        fs[oversized] = 20
    maximum = max(fs)
    minimum = min(fs)

    # identical bucketing for both modes; only the boundary values change
    if rel is True:
        marks = (0.1, 0.2, 0.3, 0.4, 0.5, 0.8)
    else:
        marks = (1, 2, 3, 4, 5, 8)

    list1 = []
    if diff is True:
        list1.append(fs[numpy.where(ham == 0)[0]])
    for m in marks[:4]:
        list1.append(fs[numpy.where(ham == m)[0]])
    list1.append(fs[numpy.where((ham >= marks[4]) & (ham <= marks[5]))[0]])
    list1.append(fs[numpy.where(ham > marks[5])[0]])

    return (list1, hammingDistances, maximum, minimum)
210 | |
def make_argparser():
    """Build the command-line parser for the Hamming distance analysis.

    Returns
    -------
    argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data')
    parser.add_argument('--inputFile',
                        help='Tabular File with three columns: ab or ba, tag and family size.')
    parser.add_argument('--inputName1')
    # default is the *string* "None": the analysis compares inputFile2 against
    # str(None) to decide between the one-file and two-file branch, so a real
    # None default would wrongly select the two-file branch when the option is
    # omitted.
    parser.add_argument('--inputFile2', default=str(None),
                        help='Tabular File with three columns: ab or ba, tag and family size.')
    parser.add_argument('--inputName2')
    parser.add_argument('--sample_size', default=1000, type=int,
                        help='Sample size of Hamming distance analysis.')
    parser.add_argument('--sep', default=",",
                        help='Separator in the csv file.')
    parser.add_argument('--subset_tag', default=0, type=int,
                        help='The tag is shortened to the given number.')
    parser.add_argument('--nproc', default=4, type=int,
                        help='The tool runs with the given number of processors.')
    # store_true (was store_false): passing --only_DCS must ENABLE the
    # DCS-only analysis; with store_false the flag was inverted (True by
    # default, switched off by passing it).
    parser.add_argument('--only_DCS', action="store_true",
                        help='Only tags of the DCSs are included in the HD analysis')

    parser.add_argument('--minFS', default=1, type=int,
                        help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis')
    parser.add_argument('--maxFS', default=0, type=int,
                        help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis')

    parser.add_argument('--output_csv', default="data.csv", type=str,
                        help='Name of the csv file.')
    parser.add_argument('--output_pdf', default="data.pdf", type=str,
                        help='Name of the pdf file.')
    parser.add_argument('--output_pdf2', default="data2.pdf", type=str,
                        help='Name of the pdf file.')
    parser.add_argument('--output_csv2', default="data2.csv", type=str,
                        help='Name of the csv file.')

    return parser
245 | |
def Hamming_Distance_Analysis(argv):
    """Run the full HD analysis for one or two tag files.

    Parses the command line, validates the options, then for each input file:
    reads the tags, optionally restricts them to DCS tags / a family-size
    window / a shortened tag, computes Hamming distances of a (random) sample
    against the whole dataset in parallel, and writes all histograms to a PDF
    and all tables to a CSV file.

    Parameters: argv — full sys.argv-style list (argv[0] is skipped).
    Exits with status 2-5 on invalid option values.
    """
    parser = make_argparser()
    args = parser.parse_args(argv[1:])

    file1 = args.inputFile
    name1 = args.inputName1

    file2 = args.inputFile2
    name2 = args.inputName2

    index_size = args.sample_size
    title_savedFile_pdf = args.output_pdf
    title_savedFile_pdf2 = args.output_pdf2

    title_savedFile_csv = args.output_csv
    title_savedFile_csv2 = args.output_csv2

    sep = args.sep
    onlyDuplicates = args.only_DCS
    minFS = args.minFS
    maxFS = args.maxFS

    subset = args.subset_tag
    nproc = args.nproc

    ### input checks: each failure prints a message and exits with its own code
    if index_size < 0:
        print("index_size is a negative integer.")
        exit(2)

    if nproc <= 0:
        print("nproc is smaller or equal zero")
        exit(3)

    if type(sep) is not str or len(sep)>1:
        print("sep must be a single character.")
        exit(4)

    # NOTE(review): message says "smaller or equal zero" but only negative
    # values are rejected (subset == 0 means "no shortening")
    if subset < 0:
        print("subset_tag is smaller or equal zero.")
        exit(5)

    ### PLOT ### global matplotlib defaults for every figure in the PDFs
    plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
    plt.rcParams['xtick.labelsize'] = 12
    plt.rcParams['ytick.labelsize'] = 12
    plt.rcParams['patch.edgecolor'] = "#000000"
    plt.rc('figure', figsize=(11.69, 8.27))  # A4 format

    # NOTE(review): file2 is compared against the STRING "None" (str(None)) —
    # this only selects the single-file branch if the caller passes the literal
    # text "None" (as the Galaxy wrapper does); an omitted option whose default
    # is the real None would fall into the two-file branch.
    if file2 != str(None):
        files = [file1, file2]
        name1 = name1.split(".tabular")[0]  # strip the Galaxy ".tabular" suffix for display
        name2 = name2.split(".tabular")[0]
        names = [name1, name2]
        pdf_files = [title_savedFile_pdf, title_savedFile_pdf2]
        csv_files = [title_savedFile_csv, title_savedFile_csv2]
    else:
        files = [file1]
        name1 = name1.split(".tabular")[0]
        names = [name1]
        pdf_files = [title_savedFile_pdf]
        csv_files = [title_savedFile_csv]

    print(type(onlyDuplicates))
    print(onlyDuplicates)

    # each input file gets its own CSV and PDF output
    for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files):
        with open(csv_f, "w") as output_file, PdfPages(pdf_f) as pdf:
            print("dataset: ", name_file)
            integers, data_array = readFileReferenceFree(f)
            data_array = numpy.array(data_array)
            # keep only families with size >= minFS (columns: 0=FS, 1=tag, 2=ab/ba)
            int_f = numpy.array(data_array[:, 0]).astype(int)
            data_array = data_array[numpy.where(int_f >= minFS)]
            integers = integers[integers >= minFS]

            # select family size for tags: optional upper bound on FS
            if maxFS > 0:
                int_f2 = numpy.array(data_array[:, 0]).astype(int)
                data_array = data_array[numpy.where(int_f2 <= maxFS)]
                integers = integers[integers <= maxFS]

            print("min FS", min(integers))
            print("max FS", max(integers))

            tags = data_array[:, 2]
            seq = data_array[:, 1]

            if onlyDuplicates is True:
                # restrict to tags seen twice (ab + ba strand = one DCS)
                # find all unique tags and get the indices for ALL tags, but only once
                u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
                d = u[c > 1]

                # get family sizes, tag for duplicates; even/odd positions are
                # the two strands of the same DCS
                duplTags_double = integers[numpy.in1d(seq, d)]
                duplTags = duplTags_double[0::2]  # ab of DCS
                duplTagsBA = duplTags_double[1::2]  # ba of DCS

                duplTags_tag = tags[numpy.in1d(seq, d)][0::2]  # ab
                duplTags_seq = seq[numpy.in1d(seq, d)][0::2]  # ab - tags

                # rebuild data_array with one row per DCS (ab strand only)
                data_array = numpy.column_stack((duplTags, duplTags_seq))
                data_array = numpy.column_stack((data_array, duplTags_tag))
                integers = numpy.array(data_array[:, 0]).astype(int)
                print("DCS in whole dataset", len(data_array))

            ## HD analysis for a subset of the tag: trim each half symmetrically
            ## so that `subset` bases per half remain
            if subset > 0:
                # NOTE(review): Python-2 integer division on len(i) / 2 — on
                # Python 3 these slices would get float indices and fail
                tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]])
                tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]])

                flanking_region_float = float((len(tag1[0]) - subset)) / 2
                flanking_region = int(flanking_region_float)
                if flanking_region_float % 2 == 0:
                    # even flank: trim the same amount from both ends
                    tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1])
                    tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2])
                else:
                    # odd flank: trim one base more at the end than at the start
                    flanking_region_rounded = int(round(flanking_region, 1))
                    flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded
                    tag1_shorten = numpy.array(
                        [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1])
                    tag2_shorten = numpy.array(
                        [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2])

                # re-join the shortened halves into whole tags
                data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
                data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))

            print("length of tag= ", len(data_array[0, 1]))
            # select sample: if no size given --> all vs. all comparison
            if index_size == 0:
                result = numpy.arange(0, len(data_array), 1)
            else:
                result = numpy.random.choice(len(integers), size=index_size,
                                             replace=False)  # array of random sequences of size=index.size

            # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
            #     pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)

            # comparison random tags to whole dataset
            result1 = data_array[result, 1]  # random tags
            result2 = data_array[:, 1]  # all tags
            print("size of the whole dataset= ", len(result2))
            print("sample size= ", len(result1))

            # HD analysis of whole tag: split the sample across nproc workers
            proc_pool = Pool(nproc)
            chunks_sample = numpy.array_split(result1, nproc)
            ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample)
            proc_pool.close()
            proc_pool.join()
            ham = numpy.concatenate(ham).astype(int)
            # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1:
            #     for h, tag in zip(ham, result1):
            #         output_file1.write("{}\t{}\n".format(tag, h))

            # HD analysis for chimeric reads: once per half of the tag
            proc_pool_b = Pool(nproc)
            diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
            diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
            proc_pool_b.close()
            proc_pool_b.join()
            # flatten the 8 parallel result lists of both runs (see
            # hamming_difference's return order) into single arrays
            diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]),
                                      numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int)
            HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]),
                                         numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int)
            HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]),
                                         numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int)
            minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]),
                                        numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int)
            minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]),
                                            numpy.concatenate([item_b[4] for item_b in diff_list_b])))
            rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]),
                                          numpy.concatenate([item_b[5] for item_b in diff_list_b])))
            diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]),
                                            numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int)
            minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]),
                                                  numpy.concatenate([item_b[7] for item_b in diff_list_b])))

            # with open("HD_within tag_{}.txt".format(app_f), "w") as output_file2:
            #     for d, s1, s2, hd, rel_d, tag in zip(diff, HDhalf1, HDhalf2, minHDs, rel_Diff, minHD_tags):
            #         output_file2.write(
            #             "{}\t{}\t{}\t{}\t{}\t{}\n".format(tag, hd, s1, s2, d, rel_d))

            lenTags = len(data_array)

            quant = numpy.array(data_array[result, 0]).astype(int)  # family size for sample of tags
            seq = numpy.array(data_array[result, 1])  # tags of sample
            ham = numpy.asarray(ham)  # HD for sample of tags

            if onlyDuplicates is True:  # ab and ba strands of DCSs
                quant = numpy.concatenate((quant, duplTagsBA[result]))
                seq = numpy.tile(seq, 2)
                ham = numpy.tile(ham, 2)

            # prepare data for different kinds of plots
            list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham)  # histogram of HDs separated after FS
            # distribution of FSs separated after HD
            # NOTE(review): familySizeDistributionWithHD caps quant at 20
            # in place; the CSV summary below therefore reports at most 20 as
            # the max. family size — confirm this is intended
            familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham,
                                                                                                     rel=False)

            ## get FS for all tags with min HD of analysis of chimeric reads
            # there are more tags than sample size in the plot, because one tag can have multiple minimas
            seqDic = dict(zip(seq, quant))
            lst_minHD_tags = []
            for i in minHD_tags:
                lst_minHD_tags.append(seqDic.get(i))

            # histogram with absolute and relative difference between HDs of both parts of the tag
            listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
            listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
                                                                                                     rel_Diff)

            # family size distribution separated after the difference between HDs of both parts of the tag
            familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD(
                lst_minHD_tags, diff, diff=True, rel=False)
            familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD(
                lst_minHD_tags, rel_Diff, diff=True, rel=True)

            # chimeric read analysis: tags which have HD=0 in one of the halfs
            if len(minHD_tags_zeros) != 0:
                lst_minHD_tags_zeros = []
                for i in minHD_tags_zeros:
                    lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads

                # histogram with HD of non-identical half
                listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
                    lst_minHD_tags_zeros, diff_zeros)
                # family size distribution of non-identical half
                familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(
                    lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False)

            ########################## Plot HD within tags ########################################################
            ######################################################################################################
            plotHDwithinSeq_Sum2(HDhalf1, HDhalf2, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file)

            ################## plot Hamming Distance with Family size distribution ###############################
            plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
                          subtitle="Overall hamming distance with separation after family size", title_file1=name_file,
                          lenTags=lenTags,xlabel="Hamming distance")

            ########################## Plot FSD with separation after HD #########################################
            plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
                           quant=quant, subtitle="Family size distribution with separation after hamming distance",
                           pdf=pdf,relative=False, title_file1=name_file, diff=False)

            ########################## Plot difference between HD's separated after FSD ##########################
            plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
                          subtitle="Delta Hamming distances within tags with separation after family size",
                          title_file1=name_file, lenTags=lenTags,
                          xlabel="absolute delta Hamming distance", relative=False)

            plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
                          subtitle="Relative delta Hamming distances within tags with separation after family size",
                          title_file1=name_file, lenTags=lenTags,
                          xlabel="relative delta Hamming distance", relative=True)

            #################### Plot FSD separated after difference between HD's ################################
            plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff,
                           subtitle="Family size distribution with separation after delta Hamming distances within the tags",
                           pdf=pdf,relative=False, diff=True, title_file1=name_file, quant=quant)

            plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, quant=quant, pdf=pdf,
                           subtitle="Family size distribution with separation after delta Hamming distances within the tags",
                           relative=True, diff=True, title_file1=name_file)

            # plots for chimeric reads
            if len(minHD_tags_zeros) != 0:
                ## HD
                plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
                              subtitle="Hamming Distance of the non-identical half with separation after family size"
                                       "\n(at least one half is identical with the half of the min. tag)\n",
                              title_file1=name_file, lenTags=lenTags,xlabel="Hamming distance", relative=False)

                ## FSD
                plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros,
                               quant=quant, pdf=pdf,
                               subtitle="Family size distribution with separation after hamming distances from the non-identical half\n"
                                        "(at least one half is identical with the half of the min. tag)\n",
                               relative=False, diff=False, title_file1=name_file)

            ### print all data to a CSV file
            #### HD ####
            summary, sumCol = createTableHD(list1, "HD=")
            overallSum = sum(sumCol)  # sum of columns in table

            #### FSD ####
            summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False)
            overallSum5 = sum(sumCol5)

            ### HD of both parts of the tag ####
            summary9, sumCol9 = createTableHDwithTags([numpy.array(minHDs), HDhalf1, HDhalf2])
            overallSum9 = sum(sumCol9)

            ## HD
            # absolute difference
            summary11, sumCol11 = createTableHD(listDifference1, "diff=")
            overallSum11 = sum(sumCol11)
            # relative difference and all tags
            summary13, sumCol13 = createTableHD(listRelDifference1, "diff=")
            overallSum13 = sum(sumCol13)

            ## FSD
            # absolute difference
            summary19, sumCol19 = createTableFSD2(familySizeList1_diff)
            overallSum19 = sum(sumCol19)
            # relative difference
            summary21, sumCol21 = createTableFSD2(familySizeList1_reldiff)
            overallSum21 = sum(sumCol21)

            # chimeric reads
            if len(minHD_tags_zeros) != 0:
                # absolute difference and tags where at least one half has HD=0
                summary15, sumCol15 = createTableHD(listDifference1_zeros, "diff=")
                overallSum15 = sum(sumCol15)
                # FSD of tags where at least one half has HD=0
                summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False)
                overallSum23 = sum(sumCol23)

            output_file.write("{}\n".format(f))
            output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len(
                numpy.concatenate(list1)), lenTags, lenTags))

            ### HD ###
            createFileHD(summary, sumCol, overallSum, output_file,
                         "Hamming distance with separation after family size: file1", sep)
            ### FSD ###
            createFileFSD2(summary5, sumCol5, overallSum5, output_file,
                           "Family size distribution with separation after hamming distances: file1", sep,
                           diff=False)

            # summary of the largest (capped) family size and its frequency
            count = numpy.bincount(quant)
            output_file.write("{}{}\n".format(sep, f))
            output_file.write("max. family size:{}{}\n".format(sep, max(quant)))
            output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
            output_file.write(
                "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))

            ### HD within tags ###
            output_file.write(
                "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n"
                "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n")
            output_file.write(
                "file 1: actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format(
                    len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1))))
            output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))

            createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
                                  "Hamming distance of each half in the tag: file1", sep)
            createFileHD(summary11, sumCol11, overallSum11, output_file,
                         "Absolute delta Hamming distances within the tag: file1", sep)
            createFileHD(summary13, sumCol13, overallSum13, output_file,
                         "Relative delta Hamming distances within the tag: file1", sep)

            createFileFSD2(summary19, sumCol19, overallSum19, output_file,
                           "Family size distribution with separation after absolute delta Hamming distances: file1",
                           sep)
            createFileFSD2(summary21, sumCol21, overallSum21, output_file,
                           "Family size distribution with separation after relative delta Hamming distances: file1",
                           sep, rel=True)

            if len(minHD_tags_zeros) != 0:
                output_file.write(
                    "All tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n")
                createFileHD(summary15, sumCol15, overallSum15, output_file,
                             "Hamming distances of non-zero half: file1", sep)
                createFileFSD2(summary23, sumCol23, overallSum23, output_file,
                               "Family size distribution with separation after Hamming distances of non-zero half: file1",
                               sep, diff=False)
            output_file.write("\n")
621 | |
622 | |
# Entry point: forward the full argv to the analysis; its return value
# (None on success) becomes the process exit status.
if __name__ == '__main__':
    sys.exit(Hamming_Distance_Analysis(sys.argv))