Mercurial > repos > mheinzl > hd
comparison hd.py @ 20:b084b6a8e3ac draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit e76960d95c059a78d880ed5ecd6202f54b091025
author | mheinzl |
---|---|
date | Fri, 14 Dec 2018 04:31:21 -0500 |
parents | 2e9f7ea7ae93 |
children | 9919024d7778 |
comparison
equal
deleted
inserted
replaced
19:2e9f7ea7ae93 | 20:b084b6a8e3ac |
---|---|
11 # In addition, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads | 11 # In addition, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads |
12 # and finally a CSV file with the data of the plots. | 12 # and finally a CSV file with the data of the plots. |
13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. | 13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. |
14 # The tool can run on a certain number of processors, which can be defined by the user. | 14 # The tool can run on a certain number of processors, which can be defined by the user. |
15 | 15 |
16 # USAGE: python hd.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" / | 16 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int/0 --sep "characterWhichSeparatesCSVFile" / |
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf | 17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outputfile_name_tabular |
18 | 18 |
19 import argparse | 19 import argparse |
20 import itertools | 20 import itertools |
21 import operator | 21 import operator |
22 import sys | 22 import sys |
172 | 172 |
173 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2)) | 173 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2)) |
174 plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) | 174 plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) |
175 # plt.ylim(0, maximumY * 1.2) | 175 # plt.ylim(0, maximumY * 1.2) |
176 | 176 |
177 legend = "sample size= {:,} against {:,}".format(sum(ham_partial[4]), lenTags) | 177 legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags) |
178 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) | 178 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) |
179 pdf.savefig(fig, bbox_inches="tight") | 179 pdf.savefig(fig, bbox_inches="tight") |
180 plt.close("all") | 180 plt.close("all") |
181 plt.clf() | 181 plt.clf() |
182 | 182 |
632 def make_argparser(): | 632 def make_argparser(): |
633 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data') | 633 parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data') |
634 parser.add_argument('--inputFile', | 634 parser.add_argument('--inputFile', |
635 help='Tabular File with three columns: ab or ba, tag and family size.') | 635 help='Tabular File with three columns: ab or ba, tag and family size.') |
636 parser.add_argument('--inputName1') | 636 parser.add_argument('--inputName1') |
637 parser.add_argument('--inputFile2', default=None, | 637 # parser.add_argument('--inputFile2', default=None, |
638 help='Tabular File with three columns: ab or ba, tag and family size.') | 638 # help='Tabular File with three columns: ab or ba, tag and family size.') |
639 parser.add_argument('--inputName2') | 639 # parser.add_argument('--inputName2') |
640 parser.add_argument('--sample_size', default=1000, type=int, | 640 parser.add_argument('--sample_size', default=1000, type=int, |
641 help='Sample size of Hamming distance analysis.') | 641 help='Sample size of Hamming distance analysis.') |
642 parser.add_argument('--subset_tag', default=0, type=int, | 642 parser.add_argument('--subset_tag', default=0, type=int, |
643 help='The tag is shortened to the given number.') | 643 help='The tag is shortened to the given number.') |
644 parser.add_argument('--nproc', default=4, type=int, | 644 parser.add_argument('--nproc', default=4, type=int, |
655 | 655 |
656 parser.add_argument('--output_tabular', default="data.tabular", type=str, | 656 parser.add_argument('--output_tabular', default="data.tabular", type=str, |
657 help='Name of the tabular file.') | 657 help='Name of the tabular file.') |
658 parser.add_argument('--output_pdf', default="data.pdf", type=str, | 658 parser.add_argument('--output_pdf', default="data.pdf", type=str, |
659 help='Name of the pdf file.') | 659 help='Name of the pdf file.') |
660 parser.add_argument('--output_pdf2', default="data2.pdf", type=str, | 660 # parser.add_argument('--output_pdf2', default="data2.pdf", type=str, |
661 help='Name of the pdf file.') | 661 # help='Name of the pdf file.') |
662 parser.add_argument('--output_tabular2', default="data2.tabular", type=str, | 662 # parser.add_argument('--output_tabular2', default="data2.tabular", type=str, |
663 help='Name of the tabular file.') | 663 # help='Name of the tabular file.') |
664 | 664 |
665 return parser | 665 return parser |
666 | 666 |
667 | 667 |
668 def Hamming_Distance_Analysis(argv): | 668 def Hamming_Distance_Analysis(argv): |
670 args = parser.parse_args(argv[1:]) | 670 args = parser.parse_args(argv[1:]) |
671 | 671 |
672 file1 = args.inputFile | 672 file1 = args.inputFile |
673 name1 = args.inputName1 | 673 name1 = args.inputName1 |
674 | 674 |
675 file2 = args.inputFile2 | 675 # file2 = args.inputFile2 |
676 name2 = args.inputName2 | 676 # name2 = args.inputName2 |
677 | 677 |
678 index_size = args.sample_size | 678 index_size = args.sample_size |
679 title_savedFile_pdf = args.output_pdf | 679 title_savedFile_pdf = args.output_pdf |
680 title_savedFile_pdf2 = args.output_pdf2 | 680 # title_savedFile_pdf2 = args.output_pdf2 |
681 | 681 |
682 title_savedFile_csv = args.output_tabular | 682 title_savedFile_csv = args.output_tabular |
683 title_savedFile_csv2 = args.output_tabular2 | 683 # title_savedFile_csv2 = args.output_tabular2 |
684 | 684 |
685 sep = "\t" | 685 sep = "\t" |
686 onlyDuplicates = args.only_DCS | 686 onlyDuplicates = args.only_DCS |
687 minFS = args.minFS | 687 minFS = args.minFS |
688 maxFS = args.maxFS | 688 maxFS = args.maxFS |
709 plt.rcParams['xtick.labelsize'] = 14 | 709 plt.rcParams['xtick.labelsize'] = 14 |
710 plt.rcParams['ytick.labelsize'] = 14 | 710 plt.rcParams['ytick.labelsize'] = 14 |
711 plt.rcParams['patch.edgecolor'] = "#000000" | 711 plt.rcParams['patch.edgecolor'] = "#000000" |
712 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format | 712 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format |
713 | 713 |
714 if file2 != str(None): | 714 # if file2 != str(None): |
715 files = [file1, file2] | 715 # files = [file1, file2] |
716 name1 = name1.split(".tabular")[0] | 716 # name1 = name1.split(".tabular")[0] |
717 name2 = name2.split(".tabular")[0] | 717 # name2 = name2.split(".tabular")[0] |
718 names = [name1, name2] | 718 # names = [name1, name2] |
719 pdf_files = [title_savedFile_pdf, title_savedFile_pdf2] | 719 # pdf_files = [title_savedFile_pdf, title_savedFile_pdf2] |
720 csv_files = [title_savedFile_csv, title_savedFile_csv2] | 720 # csv_files = [title_savedFile_csv, title_savedFile_csv2] |
721 else: | 721 # else: |
722 files = [file1] | 722 # f = file1 |
723 name1 = name1.split(".tabular")[0] | 723 name1 = name1.split(".tabular")[0] |
724 names = [name1] | 724 # name_file = name1 |
725 pdf_files = [title_savedFile_pdf] | 725 # pdf_f = title_savedFile_pdf |
726 csv_files = [title_savedFile_csv] | 726 # csv_f = title_savedFile_csv |
727 | 727 |
728 for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files): | 728 #for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files): |
729 with open(csv_f, "w") as output_file, PdfPages(pdf_f) as pdf: | 729 with open(title_savedFile_csv, "w") as output_file, PdfPages(title_savedFile_pdf) as pdf: |
730 print("dataset: ", name_file) | 730 print("dataset: ", name1) |
731 integers, data_array = readFileReferenceFree(f) | 731 integers, data_array = readFileReferenceFree(file1) |
732 data_array = numpy.array(data_array) | 732 data_array = numpy.array(data_array) |
733 int_f = numpy.array(data_array[:, 0]).astype(int) | 733 int_f = numpy.array(data_array[:, 0]).astype(int) |
734 data_array = data_array[numpy.where(int_f >= minFS)] | 734 data_array = data_array[numpy.where(int_f >= minFS)] |
735 integers = integers[integers >= minFS] | 735 integers = integers[integers >= minFS] |
736 | 736 |
737 # select family size for tags | 737 # select family size for tags |
738 if maxFS > 0: | 738 if maxFS > 0: |
739 int_f2 = numpy.array(data_array[:, 0]).astype(int) | 739 int_f2 = numpy.array(data_array[:, 0]).astype(int) |
740 data_array = data_array[numpy.where(int_f2 <= maxFS)] | 740 data_array = data_array[numpy.where(int_f2 <= maxFS)] |
741 integers = integers[integers <= maxFS] | 741 integers = integers[integers <= maxFS] |
742 | 742 |
743 print("min FS", min(integers)) | 743 print("min FS", min(integers)) |
744 print("max FS", max(integers)) | 744 print("max FS", max(integers)) |
745 | 745 |
746 tags = data_array[:, 2] | 746 tags = data_array[:, 2] |
747 seq = data_array[:, 1] | 747 seq = data_array[:, 1] |
748 | 748 |
749 if onlyDuplicates is True: | 749 if onlyDuplicates is True: |
750 # find all unique tags and get the indices for ALL tags, but only once | 750 # find all unique tags and get the indices for ALL tags, but only once |
751 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) | 751 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) |
752 d = u[c > 1] | 752 d = u[c > 1] |
753 | 753 |
754 # get family sizes, tag for duplicates | 754 # get family sizes, tag for duplicates |
755 duplTags_double = integers[numpy.in1d(seq, d)] | 755 duplTags_double = integers[numpy.in1d(seq, d)] |
756 duplTags = duplTags_double[0::2] # ab of DCS | 756 duplTags = duplTags_double[0::2] # ab of DCS |
757 duplTagsBA = duplTags_double[1::2] # ba of DCS | 757 duplTagsBA = duplTags_double[1::2] # ba of DCS |
758 | 758 |
759 duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab | 759 duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab |
760 duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags | 760 duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags |
761 | 761 |
762 data_array = numpy.column_stack((duplTags, duplTags_seq)) | 762 data_array = numpy.column_stack((duplTags, duplTags_seq)) |
763 data_array = numpy.column_stack((data_array, duplTags_tag)) | 763 data_array = numpy.column_stack((data_array, duplTags_tag)) |
764 integers = numpy.array(data_array[:, 0]).astype(int) | 764 integers = numpy.array(data_array[:, 0]).astype(int) |
765 print("DCS in whole dataset", len(data_array)) | 765 print("DCS in whole dataset", len(data_array)) |
766 | 766 |
767 # HD analysis for a subset of the tag | 767 # HD analysis for a subset of the tag |
768 if subset > 0: | 768 if subset > 0: |
769 tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) | 769 tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) |
770 tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) | 770 tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) |
771 | 771 |
772 flanking_region_float = float((len(tag1[0]) - subset)) / 2 | 772 flanking_region_float = float((len(tag1[0]) - subset)) / 2 |
773 flanking_region = int(flanking_region_float) | 773 flanking_region = int(flanking_region_float) |
774 if flanking_region_float % 2 == 0: | 774 if flanking_region_float % 2 == 0: |
775 tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1]) | 775 tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1]) |
776 tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2]) | 776 tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2]) |
777 else: | |
778 flanking_region_rounded = int(round(flanking_region, 1)) | |
779 flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded | |
780 tag1_shorten = numpy.array( | |
781 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1]) | |
782 tag2_shorten = numpy.array( | |
783 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2]) | |
784 | |
785 data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)]) | |
786 data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2])) | |
787 | |
788 print("length of tag= ", len(data_array[0, 1])) | |
789 # select sample: if no size given --> all vs. all comparison | |
790 if index_size == 0: | |
791 result = numpy.arange(0, len(data_array), 1) | |
792 else: | 777 else: |
793 result = numpy.random.choice(len(integers), size=index_size, replace=False) # array of random sequences of size=index.size | 778 flanking_region_rounded = int(round(flanking_region, 1)) |
794 | 779 flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded |
795 # with open("index_result1_{}.pkl".format(app_f), "wb") as o: | 780 tag1_shorten = numpy.array( |
796 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) | 781 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1]) |
797 | 782 tag2_shorten = numpy.array( |
798 # comparison random tags to whole dataset | 783 [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2]) |
799 result1 = data_array[result, 1] # random tags | 784 |
800 result2 = data_array[:, 1] # all tags | 785 data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)]) |
801 print("size of the whole dataset= ", len(result2)) | 786 data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2])) |
802 print("sample size= ", len(result1)) | 787 |
803 | 788 print("length of tag= ", len(data_array[0, 1])) |
804 # HD analysis of whole tag | 789 # select sample: if no size given --> all vs. all comparison |
805 proc_pool = Pool(nproc) | 790 if index_size == 0: |
806 chunks_sample = numpy.array_split(result1, nproc) | 791 result = numpy.arange(0, len(data_array), 1) |
807 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) | 792 else: |
808 proc_pool.close() | 793 result = numpy.random.choice(len(integers), size=index_size, |
809 proc_pool.join() | 794 replace=False) # array of random sequences of size=index.size |
810 ham = numpy.concatenate(ham).astype(int) | 795 |
811 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: | 796 # with open("index_result1_{}.pkl".format(app_f), "wb") as o: |
812 # for h, tag in zip(ham, result1): | 797 # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) |
813 # output_file1.write("{}\t{}\n".format(tag, h)) | 798 |
814 | 799 # comparison random tags to whole dataset |
815 # HD analysis for chimeric reads | 800 result1 = data_array[result, 1] # random tags |
816 proc_pool_b = Pool(nproc) | 801 result2 = data_array[:, 1] # all tags |
817 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) | 802 print("size of the whole dataset= ", len(result2)) |
818 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) | 803 print("sample size= ", len(result1)) |
819 proc_pool_b.close() | 804 |
820 proc_pool_b.join() | 805 # HD analysis of whole tag |
821 diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]), | 806 proc_pool = Pool(nproc) |
822 numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int) | 807 chunks_sample = numpy.array_split(result1, nproc) |
823 HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]), | 808 ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) |
824 numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int) | 809 proc_pool.close() |
825 HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]), | 810 proc_pool.join() |
826 numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int) | 811 ham = numpy.concatenate(ham).astype(int) |
827 minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]), | 812 # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: |
828 numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int) | 813 # for h, tag in zip(ham, result1): |
829 minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]), | 814 # output_file1.write("{}\t{}\n".format(tag, h)) |
830 numpy.concatenate([item_b[4] for item_b in diff_list_b]))) | 815 |
831 rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]), | 816 # HD analysis for chimeric reads |
832 numpy.concatenate([item_b[5] for item_b in diff_list_b]))) | 817 proc_pool_b = Pool(nproc) |
833 diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]), | 818 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) |
834 numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int) | 819 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) |
835 minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]), | 820 proc_pool_b.close() |
836 numpy.concatenate([item_b[7] for item_b in diff_list_b]))) | 821 proc_pool_b.join() |
837 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) | 822 diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]), |
838 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), | 823 numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int) |
839 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) | 824 HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]), |
840 | 825 numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int) |
841 lenTags = len(data_array) | 826 HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]), |
842 | 827 numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int) |
843 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags | 828 minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]), |
844 seq = numpy.array(data_array[result, 1]) # tags of sample | 829 numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int) |
845 ham = numpy.asarray(ham) # HD for sample of tags | 830 minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]), |
846 | 831 numpy.concatenate([item_b[4] for item_b in diff_list_b]))) |
847 if onlyDuplicates is True: # ab and ba strands of DCSs | 832 rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]), |
848 quant = numpy.concatenate((quant, duplTagsBA[result])) | 833 numpy.concatenate([item_b[5] for item_b in diff_list_b]))) |
849 seq = numpy.tile(seq, 2) | 834 diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]), |
850 ham = numpy.tile(ham, 2) | 835 numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int) |
851 | 836 minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]), |
852 # prepare data for different kinds of plots | 837 numpy.concatenate([item_b[7] for item_b in diff_list_b]))) |
853 # distribution of FSs separated after HD | 838 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), |
854 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) | 839 numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) |
855 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS | 840 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), |
856 | 841 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) |
857 # get FS for all tags with min HD of analysis of chimeric reads | 842 |
858 # there are more tags than sample size in the plot, because one tag can have multiple minima | 843 lenTags = len(data_array) |
859 seqDic = dict(zip(seq, quant)) | 844 |
860 lst_minHD_tags = [] | 845 quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags |
861 for i in minHD_tags: | 846 seq = numpy.array(data_array[result, 1]) # tags of sample |
862 lst_minHD_tags.append(seqDic.get(i)) | 847 ham = numpy.asarray(ham) # HD for sample of tags |
863 | 848 |
864 # histogram with absolute and relative difference between HDs of both parts of the tag | 849 if onlyDuplicates is True: # ab and ba strands of DCSs |
865 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) | 850 quant = numpy.concatenate((quant, duplTagsBA[result])) |
866 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, | 851 seq = numpy.tile(seq, 2) |
867 rel_Diff) | 852 ham = numpy.tile(ham, 2) |
868 | 853 |
869 # family size distribution separated after the difference between HDs of both parts of the tag | 854 # prepare data for different kinds of plots |
870 familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD( | 855 # distribution of FSs separated after HD |
871 lst_minHD_tags, diff, diff=True, rel=False) | 856 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, rel=False) |
872 familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD( | 857 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS |
873 lst_minHD_tags, rel_Diff, diff=True, rel=True) | 858 |
874 | 859 # get FS for all tags with min HD of analysis of chimeric reads |
875 # chimeric read analysis: tags which have HD=0 in one of the halves | 860 # there are more tags than sample size in the plot, because one tag can have multiple minima |
876 if len(minHD_tags_zeros) != 0: | 861 seqDic = dict(zip(seq, quant)) |
877 lst_minHD_tags_zeros = [] | 862 lst_minHD_tags = [] |
878 for i in minHD_tags_zeros: | 863 for i in minHD_tags: |
879 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads | 864 lst_minHD_tags.append(seqDic.get(i)) |
880 | 865 |
881 # histogram with HD of non-identical half | 866 # histogram with absolute and relative difference between HDs of both parts of the tag |
882 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros) | 867 listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) |
883 # family size distribution of non-identical half | 868 listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, |
884 familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False) | 869 rel_Diff) |
885 | 870 |
886 # plot Hamming Distance with Family size distribution | 871 # family size distribution separated after the difference between HDs of both parts of the tag |
887 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, subtitle="Hamming distance separated by family size", title_file1=name_file, lenTags=lenTags, xlabel="HD", nr_above_bars=nr_above_bars) | 872 familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD( |
888 | 873 lst_minHD_tags, diff, diff=True, rel=False) |
889 # Plot FSD with separation after | 874 familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD( |
890 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, | 875 lst_minHD_tags, rel_Diff, diff=True, rel=True) |
891 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", | 876 |
892 pdf=pdf, relative=False, title_file1=name_file, diff=False) | 877 # chimeric read analysis: tags which have HD=0 in one of the halves |
893 | 878 if len(minHD_tags_zeros) != 0: |
894 # Plot HD within tags | 879 lst_minHD_tags_zeros = [] |
895 plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file) | 880 for i in minHD_tags_zeros: |
896 | 881 lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads |
897 # Plot difference between HD's separated after FSD | 882 |
898 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, | 883 # histogram with HD of non-identical half |
899 subtitle="Delta Hamming distance within tags", | 884 listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( |
900 title_file1=name_file, lenTags=lenTags, | 885 lst_minHD_tags_zeros, diff_zeros) |
901 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars) | 886 # family size distribution of non-identical half |
902 | 887 familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD( |
903 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, | 888 lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False) |
904 subtitle="Chimera Analysis: relative delta Hamming distances", | 889 |
905 title_file1=name_file, lenTags=lenTags, | 890 # plot Hamming Distance with Family size distribution |
906 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars) | 891 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, |
907 | 892 subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags, |
908 # plots for chimeric reads | 893 xlabel="HD", nr_above_bars=nr_above_bars) |
909 if len(minHD_tags_zeros) != 0: | 894 |
910 # HD | 895 # Plot FSD with separation after |
911 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, | 896 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, |
912 subtitle="Hamming distance of the non-identical half of chimeras", | 897 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", |
913 title_file1=name_file, lenTags=lenTags, xlabel="HD", relative=False, nr_above_bars=nr_above_bars) | 898 pdf=pdf, relative=False, title_file1=name1, diff=False) |
914 | 899 |
915 # print all data to a CSV file | 900 # Plot HD within tags |
901 plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, | |
902 title_file1=name1) | |
903 | |
904 # Plot difference between HD's separated after FSD | |
905 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, | |
906 subtitle="Delta Hamming distance within tags", | |
907 title_file1=name1, lenTags=lenTags, | |
908 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars) | |
909 | |
910 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, | |
911 subtitle="Chimera Analysis: relative delta Hamming distances", | |
912 title_file1=name1, lenTags=lenTags, | |
913 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars) | |
914 | |
915 # plots for chimeric reads | |
916 if len(minHD_tags_zeros) != 0: | |
916 # HD | 917 # HD |
917 summary, sumCol = createTableHD(list1, "HD=") | 918 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, |
918 overallSum = sum(sumCol) # sum of columns in table | 919 subtitle="Hamming distance of the non-identical half of chimeras", |
919 | 920 title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False, |
920 # FSD | 921 nr_above_bars=nr_above_bars) |
921 summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False) | 922 |
922 overallSum5 = sum(sumCol5) | 923 # print all data to a CSV file |
923 | 924 # HD |
924 # HD of both parts of the tag | 925 summary, sumCol = createTableHD(list1, "HD=") |
925 summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)]) | 926 overallSum = sum(sumCol) # sum of columns in table |
926 overallSum9 = sum(sumCol9) | 927 |
927 | 928 # FSD |
928 # HD | 929 summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False) |
929 # absolute difference | 930 overallSum5 = sum(sumCol5) |
930 summary11, sumCol11 = createTableHD(listDifference1, "diff=") | 931 |
931 overallSum11 = sum(sumCol11) | 932 # HD of both parts of the tag |
932 # relative difference and all tags | 933 summary9, sumCol9 = createTableHDwithTags([HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, numpy.array(minHDs)]) |
933 summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") | 934 overallSum9 = sum(sumCol9) |
934 overallSum13 = sum(sumCol13) | 935 |
935 | 936 # HD |
936 # chimeric reads | 937 # absolute difference |
937 if len(minHD_tags_zeros) != 0: | 938 summary11, sumCol11 = createTableHD(listDifference1, "diff=") |
938 # absolute difference and tags where at least one half has HD=0 | 939 overallSum11 = sum(sumCol11) |
939 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=") | 940 # relative difference and all tags |
940 overallSum15 = sum(sumCol15) | 941 summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") |
941 | 942 overallSum13 = sum(sumCol13) |
942 output_file.write("{}\n".format(name_file)) | 943 |
943 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( | 944 # chimeric reads |
944 numpy.concatenate(list1)), lenTags, lenTags)) | 945 if len(minHD_tags_zeros) != 0: |
945 | 946 # absolute difference and tags where at least one half has HD=0 |
946 # HD | 947 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=") |
947 createFileHD(summary, sumCol, overallSum, output_file, | 948 overallSum15 = sum(sumCol15) |
948 "Hamming distance separated by family size", sep) | 949 |
949 # FSD | 950 output_file.write("{}\n".format(name1)) |
950 createFileFSD2(summary5, sumCol5, overallSum5, output_file, | 951 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( |
951 "Family size distribution separated by Hamming distance", sep, | 952 numpy.concatenate(list1)), lenTags, lenTags)) |
952 diff=False) | 953 |
953 | 954 # HD |
954 count = numpy.bincount(quant) | 955 createFileHD(summary, sumCol, overallSum, output_file, |
955 # output_file.write("{}{}\n".format(sep, name_file)) | 956 "Hamming distance separated by family size", sep) |
956 output_file.write("\n") | 957 # FSD |
957 output_file.write("max. family size:{}{}\n".format(sep, max(quant))) | 958 createFileFSD2(summary5, sumCol5, overallSum5, output_file, |
958 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) | 959 "Family size distribution separated by Hamming distance", sep, |
960 diff=False) | |
961 | |
962 count = numpy.bincount(quant) | |
963 # output_file.write("{}{}\n".format(sep, name1)) | |
964 output_file.write("\n") | |
965 output_file.write("max. family size:{}{}\n".format(sep, max(quant))) | |
966 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) | |
967 output_file.write( | |
968 "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) | |
969 | |
970 # HD within tags | |
971 output_file.write( | |
972 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" | |
973 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n") | |
974 output_file.write( | |
975 "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format( | |
976 len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1)))) | |
977 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) | |
978 | |
979 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, | |
980 "Hamming distance of each half in the tag", sep) | |
981 createFileHD(summary11, sumCol11, overallSum11, output_file, | |
982 "Absolute delta Hamming distances within the tag", sep) | |
983 createFileHD(summary13, sumCol13, overallSum13, output_file, | |
984 "Chimera analysis: relative delta Hamming distances", sep) | |
985 | |
986 if len(minHD_tags_zeros) != 0: | |
959 output_file.write( | 987 output_file.write( |
960 "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) | 988 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") |
961 | 989 createFileHD(summary15, sumCol15, overallSum15, output_file, |
962 # HD within tags | 990 "Hamming distances of non-zero half", sep) |
963 output_file.write( | 991 output_file.write("\n") |
964 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" | |
965 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n") | |
966 output_file.write( | |
967 "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format( | |
968 len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1)))) | |
969 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) | |
970 | |
971 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, | |
972 "Hamming distance of each half in the tag", sep) | |
973 createFileHD(summary11, sumCol11, overallSum11, output_file, | |
974 "Absolute delta Hamming distances within the tag", sep) | |
975 createFileHD(summary13, sumCol13, overallSum13, output_file, | |
976 "Chimera analysis: relative delta Hamming distances", sep) | |
977 | |
978 if len(minHD_tags_zeros) != 0: | |
979 output_file.write( | |
980 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") | |
981 createFileHD(summary15, sumCol15, overallSum15, output_file, | |
982 "Hamming distances of non-zero half", sep) | |
983 output_file.write("\n") | |
984 | 992 |
985 | 993 |
986 if __name__ == '__main__': | 994 if __name__ == '__main__': |
987 sys.exit(Hamming_Distance_Analysis(sys.argv)) | 995 sys.exit(Hamming_Distance_Analysis(sys.argv)) |