Mercurial > repos > mheinzl > hd
comparison hd.py @ 25:9e384b0741f1 draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty
author | mheinzl |
---|---|
date | Tue, 14 May 2019 03:29:37 -0400 |
parents | 7e570ba56b83 |
children | df1fc5cedc8b |
comparison
equal
deleted
inserted
replaced
24:3bc67ac46740 | 25:9e384b0741f1 |
---|---|
12 # and finally a CSV file with the data of the plots. | 12 # and finally a CSV file with the data of the plots. |
13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. | 13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. |
14 # The tool can run on a certain number of processors, which can be defined by the user. | 14 # The tool can run on a certain number of processors, which can be defined by the user. |
15 | 15 |
16 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int / | 16 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int / |
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_pdf outputfile_name_pdf --output_tabular outputfile_name_tabular --output_chimeras_tabular outputfile_name_chimeras_tabular | 17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outptufile_name_tabular |
18 | 18 |
19 import argparse | 19 import argparse |
20 import itertools | 20 import itertools |
21 import operator | 21 import operator |
22 import sys | 22 import sys |
23 from collections import Counter, defaultdict | 23 from collections import Counter, defaultdict |
24 from functools import partial | 24 from functools import partial |
25 from multiprocessing.pool import Pool | 25 from multiprocessing.pool import Pool |
26 import random | |
27 import os | |
26 | 28 |
27 import matplotlib.pyplot as plt | 29 import matplotlib.pyplot as plt |
28 import numpy | 30 import numpy |
29 from matplotlib.backends.backend_pdf import PdfPages | 31 from matplotlib.backends.backend_pdf import PdfPages |
30 | 32 |
140 else: | 142 else: |
141 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1), | 143 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1), |
142 xy=(label, x_label + len(con_list1) * 0.01), | 144 xy=(label, x_label + len(con_list1) * 0.01), |
143 xycoords="data", color="#000066", fontsize=10) | 145 xycoords="data", color="#000066", fontsize=10) |
144 | 146 |
145 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags) | 147 legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, sum(counts)) |
146 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) | 148 plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure) |
147 if nr_unique_chimeras != 0 and len_sample != 0: | 149 |
148 if relative == True: | 150 # if nr_unique_chimeras != 0 and len_sample != 0: |
149 legend = "nr. of unique chimeric tags= {:,} ({:.5f}) (rel.diff=1)".format(nr_unique_chimeras, | 151 # if relative == True: |
150 int(nr_unique_chimeras) / float(len_sample)) | 152 # legend = "nr. of unique chimeric tags= {:,} ({:.5f}) (rel.diff=1)".format(nr_unique_chimeras, |
151 else: | 153 # int(nr_unique_chimeras) / float(len_sample)) |
152 legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(nr_unique_chimeras, int(nr_unique_chimeras) / float(len_sample)) | 154 # else: |
153 plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure) | 155 # legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(nr_unique_chimeras, int(nr_unique_chimeras) / float(len_sample)) |
156 # plt.text(0.14, -0.09, legend, size=12, transform=plt.gcf().transFigure) | |
154 | 157 |
155 pdf.savefig(fig, bbox_inches="tight") | 158 pdf.savefig(fig, bbox_inches="tight") |
156 plt.close("all") | 159 plt.close("all") |
157 plt.clf() | 160 plt.clf() |
158 | 161 |
159 | 162 |
160 def plotHDwithinSeq_Sum2(sum1, sum1min, sum2, sum2min, min_value, lenTags, title_file1, pdf): | 163 def plotHDwithinSeq_Sum2(sum1, sum1min, sum2, sum2min, min_value, lenTags, title_file1, pdf, len_sample): |
161 fig = plt.figure(figsize=(6, 8)) | 164 fig = plt.figure(figsize=(6, 8)) |
162 plt.subplots_adjust(bottom=0.1) | 165 plt.subplots_adjust(bottom=0.1) |
163 | 166 |
164 ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags | 167 ham_partial = [sum1, sum1min, sum2, sum2min, numpy.array(min_value)] # new hd within tags |
165 | 168 |
170 if len(range(minimumX, maximumX)) == 0: | 173 if len(range(minimumX, maximumX)) == 0: |
171 range1 = minimumX | 174 range1 = minimumX |
172 else: | 175 else: |
173 range1 = range(minimumX, maximumX + 2) | 176 range1 = range(minimumX, maximumX + 2) |
174 | 177 |
175 plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, label=[ "HD a", "HD b'", "HD b", "HD a'", "HD a+b"], bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], edgecolor='black', linewidth=1) | 178 plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b"], bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], edgecolor='black', linewidth=1) |
176 | 179 |
177 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) | 180 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) |
178 plt.suptitle('Hamming distances within tags', fontsize=14) | 181 plt.suptitle('Hamming distances within tags', fontsize=14) |
179 # plt.title(title_file1, fontsize=12) | 182 # plt.title(title_file1, fontsize=12) |
180 plt.xlabel("HD", fontsize=14) | 183 plt.xlabel("HD", fontsize=14) |
182 plt.grid(b=True, which='major', color='#424242', linestyle=':') | 185 plt.grid(b=True, which='major', color='#424242', linestyle=':') |
183 | 186 |
184 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2)) | 187 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2)) |
185 plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) | 188 plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) |
186 # plt.ylim(0, maximumY * 1.2) | 189 # plt.ylim(0, maximumY * 1.2) |
187 | 190 legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, len(numpy.concatenate(ham_partial))) |
188 legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags) | 191 |
189 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) | 192 # legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags) |
193 plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure) | |
190 pdf.savefig(fig, bbox_inches="tight") | 194 pdf.savefig(fig, bbox_inches="tight") |
191 plt.close("all") | 195 plt.close("all") |
192 plt.clf() | 196 plt.clf() |
193 | 197 |
194 | 198 |
491 elif mate_b is True: # HD calculation for all b's | 495 elif mate_b is True: # HD calculation for all b's |
492 half1_mate1 = array1_half2 | 496 half1_mate1 = array1_half2 |
493 half2_mate1 = array1_half | 497 half2_mate1 = array1_half |
494 half1_mate2 = array2_half2 | 498 half1_mate2 = array2_half2 |
495 half2_mate2 = array2_half | 499 half2_mate2 = array2_half |
500 # half1_mate1, index_halves = numpy.unique(half1_mate1, return_index=True) | |
501 # print(len(half1_mate1)) | |
502 # half2_mate1 = half2_mate1[index_halves] | |
503 # array1 = array1[index_halves] | |
496 | 504 |
497 for a, b, tag in zip(half1_mate1, half2_mate1, array1): | 505 for a, b, tag in zip(half1_mate1, half2_mate1, array1): |
498 # exclude identical tag from array2, to prevent comparison to itself | 506 # exclude identical tag from array2, to prevent comparison to itself |
499 sameTag = numpy.where(array2 == tag)[0] | 507 sameTag = numpy.where(array2 == tag)[0] |
500 indexArray2 = numpy.arange(0, len(array2), 1) | 508 indexArray2 = numpy.arange(0, len(array2), 1) |
506 array2_withoutSame = array2[index_withoutSame] # whole tag (=not splitted into 2 halfs) | 514 array2_withoutSame = array2[index_withoutSame] # whole tag (=not splitted into 2 halfs) |
507 | 515 |
508 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in | 516 dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in |
509 array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" | 517 array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" |
510 min_index = numpy.where(dist == dist.min())[0] # get index of min HD | 518 min_index = numpy.where(dist == dist.min())[0] # get index of min HD |
511 min_value = dist[min_index] # get minimum HDs | 519 min_value = dist.min() |
520 # min_value = dist[min_index] # get minimum HDs | |
512 min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD | 521 min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD |
513 min_tag_array2 = array2_withoutSame[min_index] # get whole tag with min HD | 522 min_tag_array2 = array2_withoutSame[min_index] # get whole tag with min HD |
514 | 523 |
515 dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in | 524 dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" |
516 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" | 525 max_value = dist_second_half.max() |
517 dist2 = [dist_second_half.max()] | |
518 min_value = [dist.min()] | |
519 max_index = numpy.where(dist_second_half == dist_second_half.max())[0] # get index of max HD | 526 max_index = numpy.where(dist_second_half == dist_second_half.max())[0] # get index of max HD |
520 max_tag = min_tag_array2[max_index] | 527 max_tag = min_tag_array2[max_index] |
521 | 528 |
522 for d, d2 in zip(min_value, dist2): | 529 # for d, d2 in zip(min_value, max_value): |
523 if mate_b is True: # half2, corrects the variable of the HD from both halfs if it is a or b | 530 if mate_b is True: # half2, corrects the variable of the HD from both halfs if it is a or b |
524 ham2.append(d) | 531 ham2.append(min_value) |
525 ham2min.append(d2) | 532 ham2min.append(max_value) |
526 else: # half1, corrects the variable of the HD from both halfs if it is a or b | 533 else: # half1, corrects the variable of the HD from both halfs if it is a or b |
527 ham1.append(d) | 534 ham1.append(min_value) |
528 ham1min.append(d2) | 535 ham1min.append(max_value) |
529 | 536 |
530 min_valueList.append(d + d2) | 537 min_valueList.append(min_value + max_value) |
531 min_tagsList.append(tag) | 538 min_tagsList.append(tag) |
532 difference1 = abs(d - d2) | 539 difference1 = abs(min_value - max_value) |
533 diff11.append(difference1) | 540 diff11.append(difference1) |
534 rel_difference = round(float(difference1) / (d + d2), 1) | 541 rel_difference = round(float(difference1) / (min_value + max_value), 1) |
535 relativeDiffList.append(rel_difference) | 542 relativeDiffList.append(rel_difference) |
536 | 543 |
537 # tags which have identical parts: | 544 # tags which have identical parts: |
538 if d == 0 or d2 == 0: | 545 if min_value == 0 or max_value == 0: |
539 min_tagsList_zeros.append(numpy.array(tag)) | 546 min_tagsList_zeros.append(numpy.array(tag)) |
540 difference1_zeros = abs(d - d2) # hd of non-identical part | 547 difference1_zeros = abs(min_value - max_value) # hd of non-identical part |
541 diff11_zeros.append(difference1_zeros) | 548 diff11_zeros.append(difference1_zeros) |
542 max_tag_list.append(numpy.array(max_tag)) | 549 max_tag_list.append(max_tag) |
550 else: | |
551 min_tagsList_zeros.append(None) | |
552 diff11_zeros.append(None) | |
553 max_tag_list.append(numpy.array(["None"])) | |
554 | |
555 # max_tag_list.append(numpy.array(max_tag)) | |
543 | 556 |
544 i += 1 | 557 i += 1 |
545 | 558 |
546 # print(i) | 559 # print(i) |
547 # diff11 = [st for st in diff11 if st != 999] | 560 # diff11 = [st for st in diff11 if st != 999] |
665 parser.add_argument('--minFS', default=1, type=int, | 678 parser.add_argument('--minFS', default=1, type=int, |
666 help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis') | 679 help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis') |
667 parser.add_argument('--maxFS', default=0, type=int, | 680 parser.add_argument('--maxFS', default=0, type=int, |
668 help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis') | 681 help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis') |
669 parser.add_argument('--nr_above_bars', action="store_true", | 682 parser.add_argument('--nr_above_bars', action="store_true", |
670 help='If no, values above bars in the histrograms are removed') | 683 help='If no, values above bars in the histograms are removed') |
671 | 684 |
672 parser.add_argument('--output_tabular', default="data.tabular", type=str, | 685 parser.add_argument('--output_tabular', default="data.tabular", type=str, |
673 help='Name of the tabular file.') | 686 help='Name of the tabular file.') |
674 parser.add_argument('--output_pdf', default="data.pdf", type=str, | 687 parser.add_argument('--output_pdf', default="data.pdf", type=str, |
675 help='Name of the pdf file.') | 688 help='Name of the pdf file.') |
676 parser.add_argument('--output_chimeras_tabular', default="data_chimeras.tabular", type=str, | 689 parser.add_argument('--output_chimeras_tabular', default="data.tabular", type=str, |
677 help='Name of the tabular file with all chimeric tags.') | 690 help='Name of the tabular file with all chimeric tags.') |
691 | |
678 return parser | 692 return parser |
679 | 693 |
680 | 694 |
681 def Hamming_Distance_Analysis(argv): | 695 def Hamming_Distance_Analysis(argv): |
696 | |
682 parser = make_argparser() | 697 parser = make_argparser() |
683 args = parser.parse_args(argv[1:]) | 698 args = parser.parse_args(argv[1:]) |
699 | |
684 file1 = args.inputFile | 700 file1 = args.inputFile |
685 name1 = args.inputName1 | 701 name1 = args.inputName1 |
702 | |
686 index_size = args.sample_size | 703 index_size = args.sample_size |
687 title_savedFile_pdf = args.output_pdf | 704 title_savedFile_pdf = args.output_pdf |
688 title_savedFile_csv = args.output_tabular | 705 title_savedFile_csv = args.output_tabular |
689 output_chimeras_tabular = args.output_chimeras_tabular | 706 output_chimeras_tabular = args.output_chimeras_tabular |
690 | 707 |
691 sep = "\t" | 708 sep = "\t" |
692 onlyDuplicates = args.only_DCS | 709 onlyDuplicates = args.only_DCS |
693 minFS = args.minFS | 710 minFS = args.minFS |
694 maxFS = args.maxFS | 711 maxFS = args.maxFS |
695 nr_above_bars = args.nr_above_bars | 712 nr_above_bars = args.nr_above_bars |
713 | |
696 subset = args.subset_tag | 714 subset = args.subset_tag |
697 nproc = args.nproc | 715 nproc = args.nproc |
698 | 716 |
699 # input checks | 717 # input checks |
700 if index_size < 0: | 718 if index_size < 0: |
720 | 738 |
721 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: | 739 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: |
722 print("dataset: ", name1) | 740 print("dataset: ", name1) |
723 integers, data_array = readFileReferenceFree(file1) | 741 integers, data_array = readFileReferenceFree(file1) |
724 data_array = numpy.array(data_array) | 742 data_array = numpy.array(data_array) |
725 print("total nr of tags:", len(data_array)) | 743 print("total nr of tags with Ns:", len(data_array)) |
726 n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x] | 744 n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x] |
727 if len(n) != 0: # delete tags with N in the tag from data | 745 if len(n) != 0: # delete tags with N in the tag from data |
728 print("nr of tags with N's within tag:", len(n), float(len(n)) / len(data_array)) | 746 print("nr of tags with N's within tag:", len(n), float(len(n)) / len(data_array)) |
729 index_whole_array = numpy.arange(0, len(data_array), 1) | 747 index_whole_array = numpy.arange(0, len(data_array), 1) |
730 index_withoutN_inTag = numpy.delete(index_whole_array, n) | 748 index_withoutN_inTag = numpy.delete(index_whole_array, n) |
731 data_array = data_array[index_withoutN_inTag, :] | 749 data_array = data_array[index_withoutN_inTag, :] |
732 integers = integers[index_withoutN_inTag] | 750 integers = integers[index_withoutN_inTag] |
733 print("total nr of tags without Ns:", len(data_array)) | 751 print("total nr of tags without Ns:", len(data_array)) |
734 | 752 |
735 data_array_whole_dataset = data_array | |
736 | |
737 int_f = numpy.array(data_array[:, 0]).astype(int) | 753 int_f = numpy.array(data_array[:, 0]).astype(int) |
738 data_array = data_array[numpy.where(int_f >= minFS)] | 754 data_array = data_array[numpy.where(int_f >= minFS)] |
739 integers = integers[integers >= minFS] | 755 integers = integers[integers >= minFS] |
740 | 756 |
741 # select family size for tags | 757 # select family size for tags |
742 if maxFS > 0: | 758 if maxFS > 0: |
743 int_f2 = numpy.array(data_array[:, 0]).astype(int) | 759 int_f2 = numpy.array(data_array[:, 0]).astype(int) |
744 data_array = data_array[numpy.where(int_f2 <= maxFS)] | 760 data_array = data_array[numpy.where(int_f2 <= maxFS)] |
745 integers = integers[integers <= maxFS] | 761 integers = integers[integers <= maxFS] |
746 | 762 |
747 print("min FS", min(integers)) | |
748 print("max FS", max(integers)) | |
749 | |
750 tags = data_array[:, 2] | |
751 seq = data_array[:, 1] | |
752 | |
753 if onlyDuplicates is True: | 763 if onlyDuplicates is True: |
764 tags = data_array[:, 2] | |
765 seq = data_array[:, 1] | |
766 | |
754 # find all unique tags and get the indices for ALL tags, but only once | 767 # find all unique tags and get the indices for ALL tags, but only once |
755 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) | 768 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) |
756 d = u[c > 1] | 769 d = u[c > 1] |
757 | 770 |
758 # get family sizes, tag for duplicates | 771 # get family sizes, tag for duplicates |
761 duplTagsBA = duplTags_double[1::2] # ba of DCS | 774 duplTagsBA = duplTags_double[1::2] # ba of DCS |
762 | 775 |
763 duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab | 776 duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab |
764 duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags | 777 duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags |
765 | 778 |
779 if minFS > 1: | |
780 duplTags_tag = duplTags_tag[(duplTags >= 3) & (duplTagsBA >= 3)] | |
781 duplTags_seq = duplTags_seq[(duplTags >= 3) & (duplTagsBA >= 3)] | |
782 duplTags = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3 | |
783 | |
766 data_array = numpy.column_stack((duplTags, duplTags_seq)) | 784 data_array = numpy.column_stack((duplTags, duplTags_seq)) |
767 data_array = numpy.column_stack((data_array, duplTags_tag)) | 785 data_array = numpy.column_stack((data_array, duplTags_tag)) |
768 integers = numpy.array(data_array[:, 0]).astype(int) | 786 integers = numpy.array(data_array[:, 0]).astype(int) |
769 print("DCS in whole dataset", len(data_array)) | 787 print("DCS in whole dataset", len(data_array)) |
788 | |
789 print("min FS", min(integers)) | |
790 print("max FS", max(integers)) | |
770 | 791 |
771 # HD analysis for a subset of the tag | 792 # HD analysis for a subset of the tag |
772 if subset > 0: | 793 if subset > 0: |
773 tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) | 794 tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) |
774 tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) | 795 tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) |
787 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2]) | 808 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2]) |
788 | 809 |
789 data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)]) | 810 data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)]) |
790 data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2])) | 811 data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2])) |
791 | 812 |
792 print("length of tag:", len(data_array[0, 1])) | 813 print("length of tag= ", len(data_array[0, 1])) |
793 # select sample: if no size given --> all vs. all comparison | 814 # select sample: if no size given --> all vs. all comparison |
794 if index_size == 0: | 815 if index_size == 0: |
795 result = numpy.arange(0, len(data_array), 1) | 816 result = numpy.arange(0, len(data_array), 1) |
796 else: | 817 else: |
797 result = numpy.random.choice(len(integers), size=index_size, | 818 numpy.random.shuffle(data_array) |
798 replace=False) # array of random sequences of size=index.size | 819 unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags |
799 # unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags | 820 result = numpy.random.choice(unique_indices, size=index_size, replace=False) # array of random sequences of size=index.size |
800 # result = numpy.random.choice(unique_indices, size=index_size, | 821 |
801 # replace=False) # array of random sequences of size=index.size | 822 # result = numpy.random.choice(len(integers), size=index_size, |
823 # replace=False) # array of random sequences of size=index.size | |
824 # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0] | |
802 | 825 |
803 # with open("index_result1_{}.pkl".format(app_f), "wb") as o: | 826 # with open("index_result1_{}.pkl".format(app_f), "wb") as o: |
804 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) | 827 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) |
805 | 828 |
806 # comparison random tags to whole dataset | 829 # comparison random tags to whole dataset |
807 result1 = data_array[result, 1] # random tags | 830 result1 = data_array[result, 1] # random tags |
808 result2 = data_array[:, 1] # all tags | 831 result2 = data_array[:, 1] # all tags |
809 print("sample size:", len(result1)) | 832 print("sample size= ", len(result1)) |
810 | 833 |
811 # HD analysis of whole tag | 834 # HD analysis of whole tag |
812 proc_pool = Pool(nproc) | 835 proc_pool = Pool(nproc) |
813 chunks_sample = numpy.array_split(result1, nproc) | 836 chunks_sample = numpy.array_split(result1, nproc) |
814 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) | 837 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) |
817 ham = numpy.concatenate(ham).astype(int) | 840 ham = numpy.concatenate(ham).astype(int) |
818 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: | 841 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: |
819 # for h, tag in zip(ham, result1): | 842 # for h, tag in zip(ham, result1): |
820 # output_file1.write("{}\t{}\n".format(tag, h)) | 843 # output_file1.write("{}\t{}\n".format(tag, h)) |
821 | 844 |
822 # HD analysis for chimeric reads | 845 # # HD analysis for chimeric reads |
823 # result2 = data_array_whole_dataset[:,1] | 846 # result2 = data_array_whole_dataset[:,1] |
824 | 847 |
825 proc_pool_b = Pool(nproc) | 848 proc_pool_b = Pool(nproc) |
826 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) | 849 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) |
827 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) | 850 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) |
828 proc_pool_b.close() | 851 proc_pool_b.close() |
829 proc_pool_b.join() | 852 proc_pool_b.join() |
830 diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]), | |
831 numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int) | |
832 HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]), | 853 HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]), |
833 numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int) | 854 numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int) |
834 HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]), | 855 HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]), |
835 numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int) | 856 numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int) |
836 minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]), | 857 minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]), |
837 numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int) | 858 numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int) |
838 minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]), | |
839 numpy.concatenate([item_b[4] for item_b in diff_list_b]))) | |
840 rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]), | |
841 numpy.concatenate([item_b[5] for item_b in diff_list_b]))) | |
842 diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]), | |
843 numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int) | |
844 minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]), | |
845 numpy.concatenate([item_b[7] for item_b in diff_list_b]))) | |
846 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), | 859 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), |
847 numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) | 860 numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) |
848 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), | 861 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), |
849 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) | 862 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) |
850 | 863 |
851 chimera_tags = numpy.concatenate(([item[10] for item in diff_list_a], | 864 rel_Diff1 = numpy.concatenate([item[5] for item in diff_list_a]) |
852 [item_b[10] for item_b in diff_list_b])) | 865 rel_Diff2 = numpy.concatenate([item[5] for item in diff_list_b]) |
853 | 866 diff1 = numpy.concatenate([item[0] for item in diff_list_a]) |
854 chimera_tags = [x for x in chimera_tags if x != []] | 867 diff2 = numpy.concatenate([item[0] for item in diff_list_b]) |
855 chimera_tags_new = [] | 868 |
856 | 869 diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a]) |
857 for i in chimera_tags: | 870 diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b]) |
858 if len(i) > 1: | 871 minHD_tags = numpy.concatenate([item[4] for item in diff_list_a]) |
859 for t in i: | 872 minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a]) |
860 chimera_tags_new.append(t) | 873 minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b]) |
861 else: | 874 chim_tags = [item[10] for item in diff_list_a] |
862 chimera_tags_new.extend(i) | 875 chim_tags2 = [item[10] for item in diff_list_b] |
863 | 876 chimera_tags1 = [ii if isinstance(i, list) else i for i in chim_tags for ii in i] |
864 chimeras_dic = defaultdict(list) | 877 chimera_tags2 = [ii if isinstance(i, list) else i for i in chim_tags2 for ii in i] |
865 for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new): | 878 |
866 chimeras_dic[t1].append(t2) | 879 rel_Diff = [] |
867 | 880 diff_zeros = [] |
868 lst_unique_chimeras = [] | 881 minHD_tags_zeros = [] |
882 diff = [] | |
883 chimera_tags = [] | |
884 for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \ | |
885 zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2, chimera_tags1, chimera_tags2): | |
886 rel_Diff.append(max(rel1, rel2)) | |
887 diff.append(max(d1, d2)) | |
888 | |
889 if all(i is not None for i in [zeros1, zeros2]): | |
890 diff_zeros.append(max(zeros1, zeros2)) | |
891 minHD_tags_zeros.append(str(tag1)) | |
892 tags = [ctag1, ctag2] | |
893 chimera_tags.append(tags) | |
894 elif zeros1 is not None and zeros2 is None: | |
895 diff_zeros.append(zeros1) | |
896 minHD_tags_zeros.append(str(tag1)) | |
897 chimera_tags.append(ctag1) | |
898 elif zeros1 is None and zeros2 is not None: | |
899 diff_zeros.append(zeros2) | |
900 minHD_tags_zeros.append(str(tag2)) | |
901 chimera_tags.append(ctag2) | |
902 | |
903 chimera_tags_new = chimera_tags | |
904 #data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new)) | |
905 # chimeras_dic = defaultdict(list) | |
906 # | |
907 # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new): | |
908 # if len(t2) >1 and type(t2) is not numpy.ndarray: | |
909 # t2 = numpy.concatenate(t2) | |
910 # chimeras_dic[t1].append(t2) | |
911 | |
869 with open(output_chimeras_tabular, "w") as output_file1: | 912 with open(output_chimeras_tabular, "w") as output_file1: |
870 unique_chimeras = numpy.unique(minHD_tags_zeros) | 913 output_file1.write("chimera tag\tsimilar tag with HD=0\n") |
871 sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras]) # mate1 part1 | 914 for i in range(len(minHD_tags_zeros)): |
872 sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras]) # mate1 part 2 | 915 tag1 = minHD_tags_zeros[i] |
873 | 916 sample_half_a = tag1[0:(len(tag1)) / 2] |
874 output_file1.write("sample tag\tsimilar tag\n") | 917 sample_half_b = tag1[len(tag1) / 2:len(tag1)] |
875 for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b): | 918 |
876 max_tags = numpy.concatenate(chimeras_dic.get(tag1)) | 919 max_tags = chimera_tags_new[i] |
877 | 920 if isinstance(max_tags, list) and len(max_tags) > 1: |
878 if tag1 in chimeras_dic.values(): | 921 max_tags = numpy.concatenate(max_tags) |
879 continue | 922 #if isinstance(max_tags, list): #and type(max_tags) is not numpy.ndarray: |
880 else: | 923 # print(max_tags) |
881 lst_unique_chimeras.append(tag1) | 924 # max_tags = numpy.concatenate(max_tags) |
925 max_tags = numpy.unique(max_tags) | |
882 | 926 |
883 chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags]) # mate1 part1 | 927 chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags]) # mate1 part1 |
884 chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags]) # mate1 part 2 | 928 chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags]) # mate1 part 2 |
885 | 929 |
886 new_format = [] | 930 new_format = [] |
887 for i in range(len(max_tags)): | 931 for j in range(len(max_tags)): |
888 if a == chimera_half_a[i]: | 932 if sample_half_a == chimera_half_a[j]: |
889 max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i]) | 933 max_tag = "*{}* {}".format(chimera_half_a[j], chimera_half_b[j]) |
890 new_format.append(max_tag) | 934 new_format.append(max_tag) |
891 | 935 |
892 elif b == chimera_half_b[i]: | 936 elif sample_half_b == chimera_half_b[j]: |
893 max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i]) | 937 max_tag = "{} *{}*".format(chimera_half_a[j], chimera_half_b[j]) |
894 new_format.append(max_tag) | 938 new_format.append(max_tag) |
895 | 939 |
896 sample_tag = "{} {}".format(a, b) | 940 sample_tag = "{} {}".format(sample_half_a, sample_half_b) |
897 output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format))) | 941 output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format))) |
898 output_file1.write( | 942 output_file1.write( |
899 "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n " | 943 "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n " |
900 "The tags were separated by an empty space into their halves and the * marks the identical half.") | 944 "The tags were separated by an empty space into their halves and the * marks the identical half.") |
901 | 945 |
902 nr_chimeric_tags = len(lst_unique_chimeras) | 946 # unique_chimeras = numpy.array(minHD_tags_zeros) |
903 print("nr of unique chimeras:", nr_chimeric_tags) | 947 # |
948 # sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras]) # mate1 part1 | |
949 # sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras]) # mate1 part 2 | |
950 # | |
951 # output_file1.write("sample tag\tsimilar tag\n") | |
952 # for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b): | |
953 # max_tags = numpy.concatenate(chimeras_dic.get(tag1)) | |
954 # max_tags = numpy.unique(max_tags) | |
955 # | |
956 # chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags]) # mate1 part1 | |
957 # chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags]) # mate1 part 2 | |
958 # | |
959 # new_format = [] | |
960 # for i in range(len(max_tags)): | |
961 # if a == chimera_half_a[i]: | |
962 # max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i]) | |
963 # new_format.append(max_tag) | |
964 # | |
965 # elif b == chimera_half_b[i]: | |
966 # max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i]) | |
967 # new_format.append(max_tag) | |
968 # | |
969 # sample_tag = "{} {}".format(a, b) | |
970 # output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format))) | |
971 # output_file1.write( | |
972 # "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n " | |
973 # "The tags were separated by an empty space into their halves and the * marks the identical half.") | |
974 | |
975 nr_chimeric_tags = len(minHD_tags_zeros) | |
976 print("nr of unique chimeras", nr_chimeric_tags) | |
904 | 977 |
905 lenTags = len(data_array) | 978 lenTags = len(data_array) |
906 len_sample = len(result1) | 979 len_sample = len(result1) |
907 | 980 |
908 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags | 981 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags |
929 for s, q in zip(seq, quant): | 1002 for s, q in zip(seq, quant): |
930 seqDic[s].append(q) | 1003 seqDic[s].append(q) |
931 else: | 1004 else: |
932 seqDic = dict(zip(seq, quant)) | 1005 seqDic = dict(zip(seq, quant)) |
933 | 1006 |
934 | |
935 lst_minHD_tags = [] | 1007 lst_minHD_tags = [] |
936 for i in minHD_tags: | 1008 for i in minHD_tags: |
937 lst_minHD_tags.append(seqDic.get(i)) | 1009 lst_minHD_tags.append(seqDic.get(i)) |
938 | 1010 |
939 if onlyDuplicates: | 1011 if onlyDuplicates: |
940 lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], | 1012 lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], [item_b[1] for item_b in lst_minHD_tags])).astype(int) |
941 [item_b[1] for item_b in lst_minHD_tags])).astype(int) | |
942 # else: | |
943 # lst_minHD_tags = numpy.concatenate(lst_minHD_tags) | |
944 | 1013 |
945 # histogram with absolute and relative difference between HDs of both parts of the tag | 1014 # histogram with absolute and relative difference between HDs of both parts of the tag |
946 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) | 1015 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) |
947 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, | 1016 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, |
948 rel_Diff) | 1017 rel_Diff) |
949 | |
950 # family size distribution separated after the difference between HDs of both parts of the tag | |
951 # familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD( | |
952 # lst_minHD_tags, diff, diff=True, rel=False) | |
953 # familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD( | |
954 # lst_minHD_tags, rel_Diff, diff=True, rel=True) | |
955 | |
956 # chimeric read analysis: tags which have HD=0 in one of the halfs | 1018 # chimeric read analysis: tags which have HD=0 in one of the halfs |
957 if len(minHD_tags_zeros) != 0: | 1019 if len(minHD_tags_zeros) != 0: |
958 lst_minHD_tags_zeros = [] | 1020 lst_minHD_tags_zeros = [] |
959 for i in minHD_tags_zeros: | 1021 for i in minHD_tags_zeros: |
960 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads | 1022 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads |
961 if onlyDuplicates: | 1023 if onlyDuplicates: |
962 lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], | 1024 lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int) |
963 [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int) | 1025 |
964 | |
965 # histogram with HD of non-identical half | 1026 # histogram with HD of non-identical half |
966 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( | 1027 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros) |
967 lst_minHD_tags_zeros, diff_zeros) | 1028 |
968 | |
969 # plot Hamming Distance with Family size distribution | 1029 # plot Hamming Distance with Family size distribution |
970 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, | 1030 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, |
971 subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags, | 1031 subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags, |
972 xlabel="HD", nr_above_bars=nr_above_bars) | 1032 xlabel="HD", nr_above_bars=nr_above_bars, len_sample=len_sample) |
973 | 1033 |
974 # Plot FSD with separation after | 1034 # Plot FSD with separation after |
975 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, | 1035 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, |
976 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", | 1036 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", |
977 pdf=pdf, relative=False, title_file1=name1, diff=False) | 1037 pdf=pdf, relative=False, title_file1=name1, diff=False) |
978 | 1038 |
979 # Plot HD within tags | 1039 # Plot HD within tags |
980 plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, | 1040 plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, |
981 title_file1=name1) | 1041 title_file1=name1, len_sample=len_sample) |
982 | 1042 |
983 # Plot difference between HD's separated after FSD | 1043 # Plot difference between HD's separated after FSD |
984 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, | 1044 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, |
985 subtitle="Delta Hamming distance within tags", | 1045 subtitle="Delta Hamming distance within tags", |
986 title_file1=name1, lenTags=lenTags, | 1046 title_file1=name1, lenTags=lenTags, |
987 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars) | 1047 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample) |
988 | 1048 |
989 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, | 1049 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, |
990 subtitle="Chimera Analysis: relative delta Hamming distances", | 1050 subtitle="Chimera Analysis: relative delta Hamming distances", |
991 title_file1=name1, lenTags=lenTags, | 1051 title_file1=name1, lenTags=lenTags, |
992 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) | 1052 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) |
993 | 1053 |
994 # plots for chimeric reads | 1054 # plots for chimeric reads |
995 if len(minHD_tags_zeros) != 0: | 1055 if len(minHD_tags_zeros) != 0: |
996 # HD | 1056 # HD |
997 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,subtitle="Hamming distance of the non-identical half of chimeras", | 1057 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, subtitle="Hamming distance of chimeras", |
998 title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False, | 1058 title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False, |
999 nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) | 1059 nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample) |
1000 | 1060 |
1001 # print all data to a CSV file | 1061 # print all data to a CSV file |
1002 # HD | 1062 # HD |
1045 output_file.write( | 1105 output_file.write( |
1046 "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs))) | 1106 "relative frequency:{}{}\n\n".format(sep, float(max_fs[len(max_fs) - 1]) / sum(max_fs))) |
1047 | 1107 |
1048 # HD within tags | 1108 # HD within tags |
1049 output_file.write( | 1109 output_file.write( |
1050 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" | 1110 "The Hamming distances were calculated by comparing the first halve against all halves and selected the minimum value (HD a).\n" |
1051 "Since this calculation was repeated, but starting with the second half to find all possible chimeras in the data, the actual number of tags in the plots differs from the sample size entered by the user.\n" | 1111 "For the second half of the tag, we compared them against all tags which resulted in the minimum HD of the previous step and selected the maximum value (HD b').\n" |
1052 "In addition, both family sizes of one tag will be included in the plots if only tags of reads that can form a DCS were allowed.\n") | 1112 "Finally, it was possible to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).\n" |
1113 "These calculations were repeated, but starting with the second half in the first step to find all possible chimeras in the data (HD b and HD For simplicity we used the maximum value between the delta values in the end.\n" | |
1114 "When only tags that can form DCS were allowed in the analysis, family sizes for the forward and reverse (ab and ba) will be included in the plots.\n") | |
1053 | 1115 |
1054 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) | 1116 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) |
1055 | 1117 |
1056 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, | 1118 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, |
1057 "Hamming distance of each half in the tag", sep) | 1119 "Hamming distance of each half in the tag", sep) |
1061 createFileHD(summary13, sumCol13, overallSum13, output_file, | 1123 createFileHD(summary13, sumCol13, overallSum13, output_file, |
1062 "Chimera analysis: relative delta Hamming distances", sep) | 1124 "Chimera analysis: relative delta Hamming distances", sep) |
1063 | 1125 |
1064 if len(minHD_tags_zeros) != 0: | 1126 if len(minHD_tags_zeros) != 0: |
1065 output_file.write( | 1127 output_file.write( |
1066 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the Hamming distance of the non-identical half is shown.\n") | 1128 "Chimeras:\nAll tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, had a relative delta of 1 were kept. These tags are considered as chimeric.\nSo the Hamming distances of the chimeric tags are shown.\n") |
1067 output_file.write( | |
1068 "Be aware that the real number of chimeric tags (where rel. diff = 1) is not shown in the plot because of the above reasons.\n") | |
1069 output_file.write("real number of chimeric tags{}{}{}{}\n".format(sep, nr_chimeric_tags, sep, int(nr_chimeric_tags) / float(len_sample))) | |
1070 createFileHD(summary15, sumCol15, overallSum15, output_file, | 1129 createFileHD(summary15, sumCol15, overallSum15, output_file, |
1071 "Hamming distances of non-zero half", sep) | 1130 "Hamming distances of chimeras", sep) |
1072 | 1131 |
1073 output_file.write("\n") | 1132 output_file.write("\n") |
1074 | 1133 |
1075 | 1134 |
1076 if __name__ == '__main__': | 1135 if __name__ == '__main__': |
1077 sys.exit(Hamming_Distance_Analysis(sys.argv)) | 1136 sys.exit(Hamming_Distance_Analysis(sys.argv)) |
1078 |