Mercurial > repos > mheinzl > hd
comparison hd.py @ 22:7e570ba56b83 draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty
author | mheinzl |
---|---|
date | Wed, 27 Feb 2019 04:50:56 -0500 |
parents | 9919024d7778 |
children | 9e384b0741f1 |
comparison
equal
deleted
inserted
replaced
21:9919024d7778 | 22:7e570ba56b83 |
---|---|
11 # In additon, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads | 11 # In additon, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads |
12 # and finally a CSV file with the data of the plots. | 12 # and finally a CSV file with the data of the plots. |
13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. | 13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. |
14 # The tool can run on a certain number of processors, which can be defined by the user. | 14 # The tool can run on a certain number of processors, which can be defined by the user. |
15 | 15 |
16 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int/0 --sep "characterWhichSeparatesCSVFile" / | 16 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int / |
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outptufile_name_tabular | 17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_pdf outputfile_name_pdf --output_tabular outputfile_name_tabular --output_chimeras_tabular outputfile_name_chimeras_tabular |
18 | 18 |
19 import argparse | 19 import argparse |
20 import itertools | 20 import itertools |
21 import operator | 21 import operator |
22 import sys | 22 import sys |
23 from collections import Counter | 23 from collections import Counter, defaultdict |
24 from functools import partial | 24 from functools import partial |
25 from multiprocessing.pool import Pool | 25 from multiprocessing.pool import Pool |
26 | 26 |
27 import matplotlib.pyplot as plt | 27 import matplotlib.pyplot as plt |
28 import numpy | 28 import numpy |
92 | 92 |
93 pdf.savefig(fig, bbox_inches="tight") | 93 pdf.savefig(fig, bbox_inches="tight") |
94 plt.close("all") | 94 plt.close("all") |
95 | 95 |
96 | 96 |
97 def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, title_file1, pdf, xlabel, relative=False, nr_above_bars=True): | 97 def plotHDwithFSD(list1, maximumX, minimumX, subtitle, lenTags, title_file1, pdf, xlabel, relative=False, nr_above_bars=True, nr_unique_chimeras=0, len_sample=0): |
98 if relative is True: | 98 if relative is True: |
99 step = 0.1 | 99 step = 0.1 |
100 else: | 100 else: |
101 step = 1 | 101 step = 1 |
102 | 102 |
142 xy=(label, x_label + len(con_list1) * 0.01), | 142 xy=(label, x_label + len(con_list1) * 0.01), |
143 xycoords="data", color="#000066", fontsize=10) | 143 xycoords="data", color="#000066", fontsize=10) |
144 | 144 |
145 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags) | 145 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags) |
146 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) | 146 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) |
147 if nr_unique_chimeras != 0 and len_sample != 0: | |
148 if relative == True: | |
149 legend = "nr. of unique chimeric tags= {:,} ({:.5f}) (rel.diff=1)".format(nr_unique_chimeras, | |
150 int(nr_unique_chimeras) / float(len_sample)) | |
151 else: | |
152 legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(nr_unique_chimeras, int(nr_unique_chimeras) / float(len_sample)) | |
153 plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure) | |
147 | 154 |
148 pdf.savefig(fig, bbox_inches="tight") | 155 pdf.savefig(fig, bbox_inches="tight") |
149 plt.close("all") | 156 plt.close("all") |
150 plt.clf() | 157 plt.clf() |
151 | 158 |
156 | 163 |
157 ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags | 164 ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags |
158 | 165 |
159 maximumX = numpy.amax(numpy.concatenate(ham_partial)) | 166 maximumX = numpy.amax(numpy.concatenate(ham_partial)) |
160 minimumX = numpy.amin(numpy.concatenate(ham_partial)) | 167 minimumX = numpy.amin(numpy.concatenate(ham_partial)) |
161 maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham_partial)))) | 168 maximumY = numpy.amax(numpy.array(numpy.concatenate(map(lambda x: numpy.bincount(x), ham_partial)))) |
162 | 169 |
163 if len(range(minimumX, maximumX)) == 0: | 170 if len(range(minimumX, maximumX)) == 0: |
164 range1 = minimumX | 171 range1 = minimumX |
165 else: | 172 else: |
166 range1 = range(minimumX, maximumX + 2) | 173 range1 = range(minimumX, maximumX + 2) |
446 return res | 453 return res |
447 | 454 |
448 | 455 |
449 def hamming_difference(array1, array2, mate_b): | 456 def hamming_difference(array1, array2, mate_b): |
450 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time | 457 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time |
458 | |
451 array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1 | 459 array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1 |
452 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 | 460 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 |
453 | 461 |
454 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 | 462 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 |
455 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 | 463 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 |
471 ham2min = [] | 479 ham2min = [] |
472 min_valueList = [] | 480 min_valueList = [] |
473 min_tagsList = [] | 481 min_tagsList = [] |
474 diff11_zeros = [] | 482 diff11_zeros = [] |
475 min_tagsList_zeros = [] | 483 min_tagsList_zeros = [] |
484 max_tag_list = [] | |
476 i = 0 # counter, only used to see how many HDs of tags were already calculated | 485 i = 0 # counter, only used to see how many HDs of tags were already calculated |
477 if mate_b is False: # HD calculation for all a's | 486 if mate_b is False: # HD calculation for all a's |
478 half1_mate1 = array1_half | 487 half1_mate1 = array1_half |
479 half2_mate1 = array1_half2 | 488 half2_mate1 = array1_half2 |
480 half1_mate2 = array2_half | 489 half1_mate2 = array2_half |
485 half1_mate2 = array2_half2 | 494 half1_mate2 = array2_half2 |
486 half2_mate2 = array2_half | 495 half2_mate2 = array2_half |
487 | 496 |
488 for a, b, tag in zip(half1_mate1, half2_mate1, array1): | 497 for a, b, tag in zip(half1_mate1, half2_mate1, array1): |
489 # exclude identical tag from array2, to prevent comparison to itself | 498 # exclude identical tag from array2, to prevent comparison to itself |
490 sameTag = numpy.where(array2 == tag) | 499 sameTag = numpy.where(array2 == tag)[0] |
491 indexArray2 = numpy.arange(0, len(array2), 1) | 500 indexArray2 = numpy.arange(0, len(array2), 1) |
492 index_withoutSame = numpy.delete(indexArray2, sameTag) # delete identical tag from the data | 501 index_withoutSame = numpy.delete(indexArray2, sameTag) # delete identical tag from the data |
493 | 502 |
494 # all tags without identical tag | 503 # all tags without identical tag |
495 array2_half_withoutSame = half1_mate2[index_withoutSame] | 504 array2_half_withoutSame = half1_mate2[index_withoutSame] |
496 array2_half2_withoutSame = half2_mate2[index_withoutSame] | 505 array2_half2_withoutSame = half2_mate2[index_withoutSame] |
497 # array2_withoutSame = array2[index_withoutSame] # whole tag (=not splitted into 2 halfs) | 506 array2_withoutSame = array2[index_withoutSame] # whole tag (=not splitted into 2 halfs) |
498 | 507 |
499 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in | 508 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in |
500 array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" | 509 array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" |
501 min_index = numpy.where(dist == dist.min()) # get index of min HD | 510 min_index = numpy.where(dist == dist.min())[0] # get index of min HD |
502 min_value = dist[min_index] # get minimum HDs | 511 min_value = dist[min_index] # get minimum HDs |
503 min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD | 512 min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD |
504 # min_tag = array2_withoutSame[min_index] # get whole tag with min HD | 513 min_tag_array2 = array2_withoutSame[min_index] # get whole tag with min HD |
505 | 514 |
506 dist2 = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in | 515 dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in |
507 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" | 516 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" |
517 dist2 = [dist_second_half.max()] | |
518 min_value = [dist.min()] | |
519 max_index = numpy.where(dist_second_half == dist_second_half.max())[0] # get index of max HD | |
520 max_tag = min_tag_array2[max_index] | |
521 | |
508 for d, d2 in zip(min_value, dist2): | 522 for d, d2 in zip(min_value, dist2): |
509 if mate_b is True: # half2, corrects the variable of the HD from both halfs if it is a or b | 523 if mate_b is True: # half2, corrects the variable of the HD from both halfs if it is a or b |
510 ham2.append(d) | 524 ham2.append(d) |
511 ham2min.append(d2) | 525 ham2min.append(d2) |
512 else: # half1, corrects the variable of the HD from both halfs if it is a or b | 526 else: # half1, corrects the variable of the HD from both halfs if it is a or b |
520 rel_difference = round(float(difference1) / (d + d2), 1) | 534 rel_difference = round(float(difference1) / (d + d2), 1) |
521 relativeDiffList.append(rel_difference) | 535 relativeDiffList.append(rel_difference) |
522 | 536 |
523 # tags which have identical parts: | 537 # tags which have identical parts: |
524 if d == 0 or d2 == 0: | 538 if d == 0 or d2 == 0: |
525 min_tagsList_zeros.append(tag) | 539 min_tagsList_zeros.append(numpy.array(tag)) |
526 difference1_zeros = abs(d - d2) | 540 difference1_zeros = abs(d - d2) # hd of non-identical part |
527 diff11_zeros.append(difference1_zeros) | 541 diff11_zeros.append(difference1_zeros) |
542 max_tag_list.append(numpy.array(max_tag)) | |
543 | |
528 i += 1 | 544 i += 1 |
529 | 545 |
530 # print(i) | 546 # print(i) |
531 # diff11 = [st for st in diff11 if st != 999] | 547 # diff11 = [st for st in diff11 if st != 999] |
532 # ham1 = [st for st in ham1 if st != 999] | 548 # ham1 = [st for st in ham1 if st != 999] |
534 # min_valueList = [st for st in min_valueList if st != 999] | 550 # min_valueList = [st for st in min_valueList if st != 999] |
535 # min_tagsList = [st for st in min_tagsList if st != 999] | 551 # min_tagsList = [st for st in min_tagsList if st != 999] |
536 # relativeDiffList = [st for st in relativeDiffList if st != 999] | 552 # relativeDiffList = [st for st in relativeDiffList if st != 999] |
537 # diff11_zeros = [st for st in diff11_zeros if st != 999] | 553 # diff11_zeros = [st for st in diff11_zeros if st != 999] |
538 # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] | 554 # min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] |
539 | 555 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min, max_tag_list]) |
540 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min]) | |
541 | 556 |
542 | 557 |
543 def readFileReferenceFree(file): | 558 def readFileReferenceFree(file): |
544 with open(file, 'r') as dest_f: | 559 with open(file, 'r') as dest_f: |
545 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') | 560 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') |
636 def make_argparser(): | 651 def make_argparser(): |
637 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data') | 652 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data') |
638 parser.add_argument('--inputFile', | 653 parser.add_argument('--inputFile', |
639 help='Tabular File with three columns: ab or ba, tag and family size.') | 654 help='Tabular File with three columns: ab or ba, tag and family size.') |
640 parser.add_argument('--inputName1') | 655 parser.add_argument('--inputName1') |
641 # parser.add_argument('--inputFile2', default=None, | |
642 # help='Tabular File with three columns: ab or ba, tag and family size.') | |
643 # parser.add_argument('--inputName2') | |
644 parser.add_argument('--sample_size', default=1000, type=int, | 656 parser.add_argument('--sample_size', default=1000, type=int, |
645 help='Sample size of Hamming distance analysis.') | 657 help='Sample size of Hamming distance analysis.') |
646 parser.add_argument('--subset_tag', default=0, type=int, | 658 parser.add_argument('--subset_tag', default=0, type=int, |
647 help='The tag is shortened to the given number.') | 659 help='The tag is shortened to the given number.') |
648 parser.add_argument('--nproc', default=4, type=int, | 660 parser.add_argument('--nproc', default=4, type=int, |
659 | 671 |
660 parser.add_argument('--output_tabular', default="data.tabular", type=str, | 672 parser.add_argument('--output_tabular', default="data.tabular", type=str, |
661 help='Name of the tabular file.') | 673 help='Name of the tabular file.') |
662 parser.add_argument('--output_pdf', default="data.pdf", type=str, | 674 parser.add_argument('--output_pdf', default="data.pdf", type=str, |
663 help='Name of the pdf file.') | 675 help='Name of the pdf file.') |
664 # parser.add_argument('--output_pdf2', default="data2.pdf", type=str, | 676 parser.add_argument('--output_chimeras_tabular', default="data_chimeras.tabular", type=str, |
665 # help='Name of the pdf file.') | 677 help='Name of the tabular file with all chimeric tags.') |
666 # parser.add_argument('--output_tabular2', default="data2.tabular", type=str, | |
667 # help='Name of the tabular file.') | |
668 | |
669 return parser | 678 return parser |
670 | 679 |
671 | 680 |
672 def Hamming_Distance_Analysis(argv): | 681 def Hamming_Distance_Analysis(argv): |
673 parser = make_argparser() | 682 parser = make_argparser() |
674 args = parser.parse_args(argv[1:]) | 683 args = parser.parse_args(argv[1:]) |
675 | |
676 file1 = args.inputFile | 684 file1 = args.inputFile |
677 name1 = args.inputName1 | 685 name1 = args.inputName1 |
678 | |
679 # file2 = args.inputFile2 | |
680 # name2 = args.inputName2 | |
681 | |
682 index_size = args.sample_size | 686 index_size = args.sample_size |
683 title_savedFile_pdf = args.output_pdf | 687 title_savedFile_pdf = args.output_pdf |
684 # title_savedFile_pdf2 = args.output_pdf2 | |
685 | |
686 title_savedFile_csv = args.output_tabular | 688 title_savedFile_csv = args.output_tabular |
687 # title_savedFile_csv2 = args.output_tabular2 | 689 output_chimeras_tabular = args.output_chimeras_tabular |
688 | 690 |
689 sep = "\t" | 691 sep = "\t" |
690 onlyDuplicates = args.only_DCS | 692 onlyDuplicates = args.only_DCS |
691 minFS = args.minFS | 693 minFS = args.minFS |
692 maxFS = args.maxFS | 694 maxFS = args.maxFS |
693 nr_above_bars = args.nr_above_bars | 695 nr_above_bars = args.nr_above_bars |
694 | |
695 subset = args.subset_tag | 696 subset = args.subset_tag |
696 nproc = args.nproc | 697 nproc = args.nproc |
697 | 698 |
698 # input checks | 699 # input checks |
699 if index_size < 0: | 700 if index_size < 0: |
713 plt.rcParams['xtick.labelsize'] = 14 | 714 plt.rcParams['xtick.labelsize'] = 14 |
714 plt.rcParams['ytick.labelsize'] = 14 | 715 plt.rcParams['ytick.labelsize'] = 14 |
715 plt.rcParams['patch.edgecolor'] = "#000000" | 716 plt.rcParams['patch.edgecolor'] = "#000000" |
716 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format | 717 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format |
717 | 718 |
718 # if file2 != str(None): | |
719 # files = [file1, file2] | |
720 # name1 = name1.split(".tabular")[0] | |
721 # name2 = name2.split(".tabular")[0] | |
722 # names = [name1, name2] | |
723 # pdf_files = [title_savedFile_pdf, title_savedFile_pdf2] | |
724 # csv_files = [title_savedFile_csv, title_savedFile_csv2] | |
725 # else: | |
726 # f = file1 | |
727 name1 = name1.split(".tabular")[0] | 719 name1 = name1.split(".tabular")[0] |
728 # name_file = name1 | 720 |
729 # pdf_f = title_savedFile_pdf | |
730 # csv_f = title_savedFile_csv | |
731 | |
732 #for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files): | |
733 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: | 721 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: |
734 print("dataset: ", name1) | 722 print("dataset: ", name1) |
735 integers, data_array = readFileReferenceFree(file1) | 723 integers, data_array = readFileReferenceFree(file1) |
736 data_array = numpy.array(data_array) | 724 data_array = numpy.array(data_array) |
725 print("total nr of tags:", len(data_array)) | |
726 n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x] | |
727 if len(n) != 0: # delete tags with N in the tag from data | |
728 print("nr of tags with N's within tag:", len(n), float(len(n)) / len(data_array)) | |
729 index_whole_array = numpy.arange(0, len(data_array), 1) | |
730 index_withoutN_inTag = numpy.delete(index_whole_array, n) | |
731 data_array = data_array[index_withoutN_inTag, :] | |
732 integers = integers[index_withoutN_inTag] | |
733 print("total nr of tags without Ns:", len(data_array)) | |
734 | |
735 data_array_whole_dataset = data_array | |
736 | |
737 int_f = numpy.array(data_array[:, 0]).astype(int) | 737 int_f = numpy.array(data_array[:, 0]).astype(int) |
738 data_array = data_array[numpy.where(int_f >= minFS)] | 738 data_array = data_array[numpy.where(int_f >= minFS)] |
739 integers = integers[integers >= minFS] | 739 integers = integers[integers >= minFS] |
740 | 740 |
741 # select family size for tags | 741 # select family size for tags |
787 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2]) | 787 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2]) |
788 | 788 |
789 data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)]) | 789 data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)]) |
790 data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2])) | 790 data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2])) |
791 | 791 |
792 print("length of tag= ", len(data_array[0, 1])) | 792 print("length of tag:", len(data_array[0, 1])) |
793 # select sample: if no size given --> all vs. all comparison | 793 # select sample: if no size given --> all vs. all comparison |
794 if index_size == 0: | 794 if index_size == 0: |
795 result = numpy.arange(0, len(data_array), 1) | 795 result = numpy.arange(0, len(data_array), 1) |
796 else: | 796 else: |
797 result = numpy.random.choice(len(integers), size=index_size, | 797 result = numpy.random.choice(len(integers), size=index_size, |
798 replace=False) # array of random sequences of size=index.size | 798 replace=False) # array of random sequences of size=index.size |
799 # unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags | |
800 # result = numpy.random.choice(unique_indices, size=index_size, | |
801 # replace=False) # array of random sequences of size=index.size | |
799 | 802 |
800 # with open("index_result1_{}.pkl".format(app_f), "wb") as o: | 803 # with open("index_result1_{}.pkl".format(app_f), "wb") as o: |
801 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) | 804 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) |
802 | 805 |
803 # comparison random tags to whole dataset | 806 # comparison random tags to whole dataset |
804 result1 = data_array[result, 1] # random tags | 807 result1 = data_array[result, 1] # random tags |
805 result2 = data_array[:, 1] # all tags | 808 result2 = data_array[:, 1] # all tags |
806 print("size of the whole dataset= ", len(result2)) | 809 print("sample size:", len(result1)) |
807 print("sample size= ", len(result1)) | |
808 | 810 |
809 # HD analysis of whole tag | 811 # HD analysis of whole tag |
810 proc_pool = Pool(nproc) | 812 proc_pool = Pool(nproc) |
811 chunks_sample = numpy.array_split(result1, nproc) | 813 chunks_sample = numpy.array_split(result1, nproc) |
812 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) | 814 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) |
816 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: | 818 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: |
817 # for h, tag in zip(ham, result1): | 819 # for h, tag in zip(ham, result1): |
818 # output_file1.write("{}\t{}\n".format(tag, h)) | 820 # output_file1.write("{}\t{}\n".format(tag, h)) |
819 | 821 |
820 # HD analysis for chimeric reads | 822 # HD analysis for chimeric reads |
823 # result2 = data_array_whole_dataset[:,1] | |
824 | |
821 proc_pool_b = Pool(nproc) | 825 proc_pool_b = Pool(nproc) |
822 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) | 826 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) |
823 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) | 827 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) |
824 proc_pool_b.close() | 828 proc_pool_b.close() |
825 proc_pool_b.join() | 829 proc_pool_b.join() |
842 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), | 846 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), |
843 numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) | 847 numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) |
844 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), | 848 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), |
845 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) | 849 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) |
846 | 850 |
851 chimera_tags = numpy.concatenate(([item[10] for item in diff_list_a], | |
852 [item_b[10] for item_b in diff_list_b])) | |
853 | |
854 chimera_tags = [x for x in chimera_tags if x != []] | |
855 chimera_tags_new = [] | |
856 | |
857 for i in chimera_tags: | |
858 if len(i) > 1: | |
859 for t in i: | |
860 chimera_tags_new.append(t) | |
861 else: | |
862 chimera_tags_new.extend(i) | |
863 | |
864 chimeras_dic = defaultdict(list) | |
865 for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new): | |
866 chimeras_dic[t1].append(t2) | |
867 | |
868 lst_unique_chimeras = [] | |
869 with open(output_chimeras_tabular, "w") as output_file1: | |
870 unique_chimeras = numpy.unique(minHD_tags_zeros) | |
871 sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras]) # mate1 part1 | |
872 sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras]) # mate1 part 2 | |
873 | |
874 output_file1.write("sample tag\tsimilar tag\n") | |
875 for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b): | |
876 max_tags = numpy.concatenate(chimeras_dic.get(tag1)) | |
877 | |
878 if tag1 in chimeras_dic.values(): | |
879 continue | |
880 else: | |
881 lst_unique_chimeras.append(tag1) | |
882 | |
883 chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags]) # mate1 part1 | |
884 chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags]) # mate1 part 2 | |
885 | |
886 new_format = [] | |
887 for i in range(len(max_tags)): | |
888 if a == chimera_half_a[i]: | |
889 max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i]) | |
890 new_format.append(max_tag) | |
891 | |
892 elif b == chimera_half_b[i]: | |
893 max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i]) | |
894 new_format.append(max_tag) | |
895 | |
896 sample_tag = "{} {}".format(a, b) | |
897 output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format))) | |
898 output_file1.write( | |
899 "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n " | |
900 "The tags were separated by an empty space into their halves and the * marks the identical half.") | |
901 | |
902 nr_chimeric_tags = len(lst_unique_chimeras) | |
903 print("nr of unique chimeras:", nr_chimeric_tags) | |
904 | |
847 lenTags = len(data_array) | 905 lenTags = len(data_array) |
906 len_sample = len(result1) | |
848 | 907 |
849 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags | 908 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags |
850 seq = numpy.array(data_array[result, 1]) # tags of sample | 909 seq = numpy.array(data_array[result, 1]) # tags of sample |
851 ham = numpy.asarray(ham) # HD for sample of tags | 910 ham = numpy.asarray(ham) # HD for sample of tags |
852 | 911 |
853 if onlyDuplicates is True: # ab and ba strands of DCSs | 912 if onlyDuplicates is True: # ab and ba strands of DCSs |
854 quant = numpy.concatenate((quant, duplTagsBA[result])) | 913 quant = numpy.concatenate((quant, duplTagsBA[result])) |
855 seq = numpy.tile(seq, 2) | 914 seq = numpy.tile(seq, 2) |
856 ham = numpy.tile(ham, 2) | 915 ham = numpy.tile(ham, 2) |
916 diff = numpy.tile(diff, 2) | |
917 rel_Diff = numpy.tile(rel_Diff, 2) | |
918 diff_zeros = numpy.tile(diff_zeros, 2) | |
857 | 919 |
858 # prepare data for different kinds of plots | 920 # prepare data for different kinds of plots |
859 # distribution of FSs separated after HD | 921 # distribution of FSs separated after HD |
860 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) | 922 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) |
861 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS | 923 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS |
862 | 924 |
863 # get FS for all tags with min HD of analysis of chimeric reads | 925 # get FS for all tags with min HD of analysis of chimeric reads |
864 # there are more tags than sample size in the plot, because one tag can have multiple minimas | 926 # there are more tags than sample size in the plot, because one tag can have multiple minimas |
865 seqDic = dict(zip(seq, quant)) | 927 if onlyDuplicates: |
928 seqDic = defaultdict(list) | |
929 for s, q in zip(seq, quant): | |
930 seqDic[s].append(q) | |
931 else: | |
932 seqDic = dict(zip(seq, quant)) | |
933 | |
934 | |
866 lst_minHD_tags = [] | 935 lst_minHD_tags = [] |
867 for i in minHD_tags: | 936 for i in minHD_tags: |
868 lst_minHD_tags.append(seqDic.get(i)) | 937 lst_minHD_tags.append(seqDic.get(i)) |
938 | |
939 if onlyDuplicates: | |
940 lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], | |
941 [item_b[1] for item_b in lst_minHD_tags])).astype(int) | |
942 # else: | |
943 # lst_minHD_tags = numpy.concatenate(lst_minHD_tags) | |
869 | 944 |
870 # histogram with absolute and relative difference between HDs of both parts of the tag | 945 # histogram with absolute and relative difference between HDs of both parts of the tag |
871 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) | 946 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) |
872 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, | 947 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, |
873 rel_Diff) | 948 rel_Diff) |
874 | 949 |
875 # family size distribution separated after the difference between HDs of both parts of the tag | 950 # family size distribution separated after the difference between HDs of both parts of the tag |
876 familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD( | 951 # familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD( |
877 lst_minHD_tags, diff, diff=True, rel=False) | 952 # lst_minHD_tags, diff, diff=True, rel=False) |
878 familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD( | 953 # familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD( |
879 lst_minHD_tags, rel_Diff, diff=True, rel=True) | 954 # lst_minHD_tags, rel_Diff, diff=True, rel=True) |
880 | 955 |
881 # chimeric read analysis: tags which have HD=0 in one of the halfs | 956 # chimeric read analysis: tags which have HD=0 in one of the halfs |
882 if len(minHD_tags_zeros) != 0: | 957 if len(minHD_tags_zeros) != 0: |
883 lst_minHD_tags_zeros = [] | 958 lst_minHD_tags_zeros = [] |
884 for i in minHD_tags_zeros: | 959 for i in minHD_tags_zeros: |
885 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads | 960 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads |
886 | 961 if onlyDuplicates: |
887 # histogram with HD of non-identical half | 962 lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], |
888 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( | 963 [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int) |
964 | |
965 # histogram with HD of non-identical half | |
966 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( | |
889 lst_minHD_tags_zeros, diff_zeros) | 967 lst_minHD_tags_zeros, diff_zeros) |
890 # family size distribution of non-identical half | 968 |
891 familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD( | |
892 lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False) | |
893 | |
894 # plot Hamming Distance with Family size distribution | 969 # plot Hamming Distance with Family size distribution |
895 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, | 970 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, |
896 subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags, | 971 subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags, |
897 xlabel="HD", nr_above_bars=nr_above_bars) | 972 xlabel="HD", nr_above_bars=nr_above_bars) |
898 | 973 |
912 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars) | 987 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars) |
913 | 988 |
914 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, | 989 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, |
915 subtitle="Chimera Analysis: relative delta Hamming distances", | 990 subtitle="Chimera Analysis: relative delta Hamming distances", |
916 title_file1=name1, lenTags=lenTags, | 991 title_file1=name1, lenTags=lenTags, |
917 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars) | 992 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) |
918 | 993 |
919 # plots for chimeric reads | 994 # plots for chimeric reads |
920 if len(minHD_tags_zeros) != 0: | 995 if len(minHD_tags_zeros) != 0: |
921 # HD | 996 # HD |
922 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, | 997 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,subtitle="Hamming distance of the non-identical half of chimeras", |
923 subtitle="Hamming distance of the non-identical half of chimeras", | |
924 title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False, | 998 title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False, |
925 nr_above_bars=nr_above_bars) | 999 nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) |
926 | 1000 |
927 # print all data to a CSV file | 1001 # print all data to a CSV file |
928 # HD | 1002 # HD |
929 summary, sumCol = createTableHD(list1, "HD=") | 1003 summary, sumCol = createTableHD(list1, "HD=") |
930 overallSum = sum(sumCol) # sum of columns in table | 1004 overallSum = sum(sumCol) # sum of columns in table |
972 "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs))) | 1046 "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs))) |
973 | 1047 |
974 # HD within tags | 1048 # HD within tags |
975 output_file.write( | 1049 output_file.write( |
976 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" | 1050 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" |
977 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n") | 1051 "Since this calculation was repeated, but starting with the second half to find all possible chimeras in the data, the actual number of tags in the plots differs from the sample size entered by the user.\n" |
978 output_file.write( | 1052 "In addition, both family sizes of one tag will be included in the plots if only tags of reads that can form a DCS were allowed.\n") |
979 "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format( | 1053 |
980 len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1)))) | |
981 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) | 1054 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) |
982 | 1055 |
983 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, | 1056 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, |
984 "Hamming distance of each half in the tag", sep) | 1057 "Hamming distance of each half in the tag", sep) |
985 createFileHD(summary11, sumCol11, overallSum11, output_file, | 1058 createFileHD(summary11, sumCol11, overallSum11, output_file, |
986 "Absolute delta Hamming distances within the tag", sep) | 1059 "Absolute delta Hamming distances within the tag", sep) |
1060 | |
987 createFileHD(summary13, sumCol13, overallSum13, output_file, | 1061 createFileHD(summary13, sumCol13, overallSum13, output_file, |
988 "Chimera analysis: relative delta Hamming distances", sep) | 1062 "Chimera analysis: relative delta Hamming distances", sep) |
989 | 1063 |
990 if len(minHD_tags_zeros) != 0: | 1064 if len(minHD_tags_zeros) != 0: |
991 output_file.write( | 1065 output_file.write( |
992 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") | 1066 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the Hamming distance of the non-identical half is shown.\n") |
1067 output_file.write( | |
1068 "Be aware that the real number of chimeric tags (where rel. diff = 1) is not shown in the plot because of the above reasons.\n") | |
1069 output_file.write("real number of chimeric tags{}{}{}{}\n".format(sep, nr_chimeric_tags, sep, int(nr_chimeric_tags) / float(len_sample))) | |
993 createFileHD(summary15, sumCol15, overallSum15, output_file, | 1070 createFileHD(summary15, sumCol15, overallSum15, output_file, |
994 "Hamming distances of non-zero half", sep) | 1071 "Hamming distances of non-zero half", sep) |
1072 | |
995 output_file.write("\n") | 1073 output_file.write("\n") |
996 | 1074 |
997 | 1075 |
998 if __name__ == '__main__': | 1076 if __name__ == '__main__': |
999 sys.exit(Hamming_Distance_Analysis(sys.argv)) | 1077 sys.exit(Hamming_Distance_Analysis(sys.argv)) |
1078 |