Mercurial > repos > mheinzl > hd
comparison hd.py @ 14:883e6381ba29 draft
planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/hd commit 38f5c032262361131c645812dd3dc639be6a5f4e
author | mheinzl |
---|---|
date | Wed, 23 May 2018 14:14:10 -0400 |
parents | 5b0a95f205ad |
children | cf7874bb4934 |
comparison
equal
deleted
inserted
replaced
13:5b0a95f205ad | 14:883e6381ba29 |
---|---|
12 # and finally a CSV file with the data of the plots. | 12 # and finally a CSV file with the data of the plots. |
13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. | 13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input. |
14 # The tool can run on a certain number of processors, which can be defined by the user. | 14 # The tool can run on a certain number of processors, which can be defined by the user. |
15 | 15 |
16 # USAGE: python HDnew6_1Plot_FINAL.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" / | 16 # USAGE: python HDnew6_1Plot_FINAL.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" / |
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --output_csv outputfile_name_csv --output_pdf outputfile_name_pdf | 17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_csv outputfile_name_csv --output_pdf outputfile_name_pdf |
18 | 18 |
19 import numpy | 19 import numpy |
20 import itertools | 20 import itertools |
21 import operator | 21 import operator |
22 import matplotlib.pyplot as plt | 22 import matplotlib.pyplot as plt |
90 | 90 |
91 pdf.savefig(fig, bbox_inches="tight") | 91 pdf.savefig(fig, bbox_inches="tight") |
92 plt.close("all") | 92 plt.close("all") |
93 | 93 |
94 def plotHDwithFSD(list1,maximumX,minimumX, subtitle, lenTags, title_file1,pdf, | 94 def plotHDwithFSD(list1,maximumX,minimumX, subtitle, lenTags, title_file1,pdf, |
95 xlabel,relative=False): | 95 xlabel,relative=False, nr_above_bars = True): |
96 if relative is True: | 96 if relative is True: |
97 step = 0.1 | 97 step = 0.1 |
98 else: | 98 else: |
99 step = 1 | 99 step = 1 |
100 | 100 |
128 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) | 128 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) |
129 plt.xticks(numpy.arange(0, maximumX + step, step)) | 129 plt.xticks(numpy.arange(0, maximumX + step, step)) |
130 | 130 |
131 plt.ylim((0, maximumY * 1.2)) | 131 plt.ylim((0, maximumY * 1.2)) |
132 | 132 |
133 bin_centers = -0.4 * numpy.diff(bins) + bins[:-1] | 133 if nr_above_bars is True: |
134 for x_label, label in zip(counts, bin_centers): # labels for values | 134 bin_centers = -0.4 * numpy.diff(bins) + bins[:-1] |
135 if x_label == 0: | 135 for x_label, label in zip(counts, bin_centers): # labels for values |
136 continue | 136 if x_label == 0: |
137 else: | 137 continue |
138 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1), | 138 else: |
139 xy=(label, x_label + len(con_list1) * 0.01), | 139 plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1), |
140 xycoords="data", color="#000066",fontsize=10) | 140 xy=(label, x_label + len(con_list1) * 0.01), |
141 | 141 xycoords="data", color="#000066",fontsize=10) |
142 | |
142 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags) | 143 legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags) |
143 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) | 144 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) |
144 | 145 |
145 pdf.savefig(fig, bbox_inches="tight") | 146 pdf.savefig(fig, bbox_inches="tight") |
146 plt.close("all") | 147 plt.close("all") |
147 plt.clf() | 148 plt.clf() |
148 | 149 |
149 def plotHDwithinSeq_Sum2(sum1, sum2,min_value, lenTags, title_file1, pdf): | 150 def plotHDwithinSeq_Sum2(sum1, sum2,sum1min, sum2min, min_value, lenTags, title_file1, pdf): |
150 fig = plt.figure(figsize=(6, 8)) | 151 fig = plt.figure(figsize=(6, 8)) |
151 plt.subplots_adjust(bottom=0.1) | 152 plt.subplots_adjust(bottom=0.1) |
152 | 153 |
153 ham = [sum1, sum2,numpy.array(min_value)] # new hd within tags | 154 #ham = [sum1, sum2,numpy.array(min_value)] # new hd within tags |
155 ham = [sum1, sum2, sum1min, sum2min, numpy.array(min_value)] # new hd within tags | |
156 | |
154 | 157 |
155 maximumX = numpy.amax(numpy.concatenate(ham)) | 158 maximumX = numpy.amax(numpy.concatenate(ham)) |
156 minimumX = numpy.amin(numpy.concatenate(ham)) | 159 minimumX = numpy.amin(numpy.concatenate(ham)) |
157 maximumY = numpy.amax(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham))) | 160 maximumY = numpy.amax(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham))) |
158 | 161 |
160 range1 = minimumX | 163 range1 = minimumX |
161 else: | 164 else: |
162 range1 = range(minimumX, maximumX + 2) | 165 range1 = range(minimumX, maximumX + 2) |
163 | 166 |
164 counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False, | 167 counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False, |
165 label=[ "HD a", "HD b","HD a+b"], | 168 # label=[ "HD a", "HD b","HD a+b"], |
166 bins=range1, color=[ "#58ACFA", "#FA5858","#585858"], edgecolor='black', linewidth=1) | 169 label=[ "HD a","HD b'", "HD b", "HD a'", "HD a+b"], |
170 #bins=range1, color=[ "#58ACFA", "#FA5858","#585858"], | |
171 color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], | |
172 edgecolor='black', linewidth=1) | |
167 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) | 173 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) |
168 plt.suptitle('Hamming distances within tags', fontsize=14) | 174 plt.suptitle('Hamming distances within tags', fontsize=14) |
169 #plt.title(title_file1, fontsize=12) | 175 #plt.title(title_file1, fontsize=12) |
170 plt.xlabel("Hamming Distance", fontsize=14) | 176 plt.xlabel("HD", fontsize=14) |
171 plt.ylabel("Absolute Frequency", fontsize=14) | 177 plt.ylabel("Absolute Frequency", fontsize=14) |
172 plt.grid(b=True, which='major', color='#424242', linestyle=':') | 178 plt.grid(b=True, which='major', color='#424242', linestyle=':') |
173 | 179 |
174 | 180 |
175 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.1)) | 181 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.1)) |
446 | 452 |
447 diff11 = [] | 453 diff11 = [] |
448 relativeDiffList = [] | 454 relativeDiffList = [] |
449 ham1 = [] | 455 ham1 = [] |
450 ham2 = [] | 456 ham2 = [] |
457 ham1min = [] | |
458 ham2min = [] | |
451 min_valueList = [] | 459 min_valueList = [] |
452 min_tagsList = [] | 460 min_tagsList = [] |
453 diff11_zeros = [] | 461 diff11_zeros = [] |
454 min_tagsList_zeros = [] | 462 min_tagsList_zeros = [] |
455 i = 0 # counter, only used to see how many HDs of tags were already calculated | 463 i = 0 # counter, only used to see how many HDs of tags were already calculated |
486 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" | 494 min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" |
487 for d_1, d_2 in zip(min_value, dist2): | 495 for d_1, d_2 in zip(min_value, dist2): |
488 if mate_b is True: # half2, corrects the variable of the HD from both halfs if it is a or b | 496 if mate_b is True: # half2, corrects the variable of the HD from both halfs if it is a or b |
489 d = d_2 | 497 d = d_2 |
490 d2 = d_1 | 498 d2 = d_1 |
499 ham2.append(d) | |
500 ham2min.append(d2) | |
491 else: # half1, corrects the variable of the HD from both halfs if it is a or b | 501 else: # half1, corrects the variable of the HD from both halfs if it is a or b |
492 d = d_1 | 502 d = d_1 |
493 d2 = d_2 | 503 d2 = d_2 |
504 ham1.append(d) | |
505 ham1min.append(d2) | |
506 | |
494 min_valueList.append(d + d2) | 507 min_valueList.append(d + d2) |
495 min_tagsList.append(tag) | 508 min_tagsList.append(tag) |
496 ham1.append(d) | 509 # ham1.append(d) |
497 ham2.append(d2) | 510 # ham2.append(d2) |
498 difference1 = abs(d - d2) | 511 difference1 = abs(d - d2) |
499 diff11.append(difference1) | 512 diff11.append(difference1) |
500 rel_difference = round(float(difference1) / (d + d2), 1) | 513 rel_difference = round(float(difference1) / (d + d2), 1) |
501 relativeDiffList.append(rel_difference) | 514 relativeDiffList.append(rel_difference) |
502 | 515 |
515 #min_tagsList = [st for st in min_tagsList if st != 999] | 528 #min_tagsList = [st for st in min_tagsList if st != 999] |
516 #relativeDiffList = [st for st in relativeDiffList if st != 999] | 529 #relativeDiffList = [st for st in relativeDiffList if st != 999] |
517 #diff11_zeros = [st for st in diff11_zeros if st != 999] | 530 #diff11_zeros = [st for st in diff11_zeros if st != 999] |
518 #min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] | 531 #min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] |
519 | 532 |
520 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros]) | 533 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros, ham1min, ham2min]) |
521 | 534 |
522 def readFileReferenceFree(file): | 535 def readFileReferenceFree(file): |
523 with open(file, 'r') as dest_f: | 536 with open(file, 'r') as dest_f: |
524 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') | 537 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') |
525 integers = numpy.array(data_array[:, 0]).astype(int) | 538 integers = numpy.array(data_array[:, 0]).astype(int) |
630 | 643 |
631 parser.add_argument('--minFS', default=1, type=int, | 644 parser.add_argument('--minFS', default=1, type=int, |
632 help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis') | 645 help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis') |
633 parser.add_argument('--maxFS', default=0, type=int, | 646 parser.add_argument('--maxFS', default=0, type=int, |
634 help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis') | 647 help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis') |
635 | 648 parser.add_argument('--nr_above_bars', action="store_true", # default=False, type=bool, |
649 help='If no, values above bars in the histrograms are removed') | |
650 | |
636 parser.add_argument('--output_csv', default="data.csv", type=str, | 651 parser.add_argument('--output_csv', default="data.csv", type=str, |
637 help='Name of the csv file.') | 652 help='Name of the csv file.') |
638 parser.add_argument('--output_pdf', default="data.pdf", type=str, | 653 parser.add_argument('--output_pdf', default="data.pdf", type=str, |
639 help='Name of the pdf file.') | 654 help='Name of the pdf file.') |
640 parser.add_argument('--output_pdf2', default="data2.pdf", type=str, | 655 parser.add_argument('--output_pdf2', default="data2.pdf", type=str, |
663 | 678 |
664 sep = args.sep | 679 sep = args.sep |
665 onlyDuplicates = args.only_DCS | 680 onlyDuplicates = args.only_DCS |
666 minFS = args.minFS | 681 minFS = args.minFS |
667 maxFS = args.maxFS | 682 maxFS = args.maxFS |
683 nr_above_bars = args.nr_above_bars | |
684 | |
668 | 685 |
669 subset = args.subset_tag | 686 subset = args.subset_tag |
670 nproc = args.nproc | 687 nproc = args.nproc |
671 | 688 |
672 ### input checks | 689 ### input checks |
815 numpy.concatenate([item_b[5] for item_b in diff_list_b]))) | 832 numpy.concatenate([item_b[5] for item_b in diff_list_b]))) |
816 diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]), | 833 diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]), |
817 numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int) | 834 numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int) |
818 minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]), | 835 minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]), |
819 numpy.concatenate([item_b[7] for item_b in diff_list_b]))) | 836 numpy.concatenate([item_b[7] for item_b in diff_list_b]))) |
820 | 837 HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]), |
838 numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int) | |
839 HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]), | |
840 numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int) | |
821 # with open("HD_within tag_{}.txt".format(app_f), "w") as output_file2: | 841 # with open("HD_within tag_{}.txt".format(app_f), "w") as output_file2: |
822 # for d, s1, s2, hd, rel_d, tag in zip(diff, HDhalf1, HDhalf2, minHDs, rel_Diff, minHD_tags): | 842 # for d, s1, s2, hd, rel_d, tag in zip(diff, HDhalf1, HDhalf2, minHDs, rel_Diff, minHD_tags): |
823 # output_file2.write( | 843 # output_file2.write( |
824 # "{}\t{}\t{}\t{}\t{}\t{}\n".format(tag, hd, s1, s2, d, rel_d)) | 844 # "{}\t{}\t{}\t{}\t{}\t{}\n".format(tag, hd, s1, s2, d, rel_d)) |
825 | 845 |
868 lst_minHD_tags_zeros, diff_zeros) | 888 lst_minHD_tags_zeros, diff_zeros) |
869 # family size distribution of non-identical half | 889 # family size distribution of non-identical half |
870 familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD( | 890 familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD( |
871 lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False) | 891 lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False) |
872 | 892 |
873 ########################## Plot HD within tags ######################################################## | |
874 ###################################################################################################################### | |
875 plotHDwithinSeq_Sum2(HDhalf1, HDhalf2, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file) | |
876 | |
877 ##################################################################################################################### | 893 ##################################################################################################################### |
878 ################## plot Hamming Distance with Family size distribution ############################## | 894 ################## plot Hamming Distance with Family size distribution ############################## |
879 ##################################################################################################################### | 895 ##################################################################################################################### |
880 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, | 896 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, |
881 subtitle="Hamming distance separated by family size", title_file1=name_file, | 897 subtitle="Hamming distance separated by family size", title_file1=name_file, |
882 lenTags=lenTags,xlabel="Hamming distance") | 898 lenTags=lenTags,xlabel="HD", nr_above_bars=nr_above_bars) |
883 | 899 |
884 ########################## Plot FSD with separation after HD ############################################### | 900 ########################## Plot FSD with separation after HD ############################################### |
885 ######################################################################################################################## | 901 ###################################################################################################################### |
886 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, | 902 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, |
887 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", | 903 originalCounts=quant, subtitle="Family size distribution separated by Hamming distance", |
888 pdf=pdf,relative=False, title_file1=name_file, diff=False) | 904 pdf=pdf,relative=False, title_file1=name_file, diff=False) |
889 | 905 |
890 ########################## Plot difference between HD's separated after FSD ########################################## | 906 ########################## Plot HD within tags ######################################################## |
891 ######################################################################################################################## | 907 ###################################################################################################################### |
908 # plotHDwithinSeq_Sum2(HDhalf1, HDhalf2, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file) | |
909 plotHDwithinSeq_Sum2(HDhalf1, HDhalf2, HDhalf1min, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file) | |
910 | |
911 | |
912 ########################## Plot difference between HD's separated after FSD #################################### | |
913 ###################################################################################################################### | |
892 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, | 914 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, |
893 subtitle="Delta Hamming distance within tags", | 915 subtitle="Delta Hamming distance within tags", |
894 title_file1=name_file, lenTags=lenTags, | 916 title_file1=name_file, lenTags=lenTags, |
895 xlabel="absolute delta Hamming distance", relative=False) | 917 xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars) |
896 | 918 |
897 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, | 919 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, |
898 subtitle="Relative delta Hamming distances within tags", | 920 subtitle="Chimera Analysis: relative delta Hamming distances", |
899 title_file1=name_file, lenTags=lenTags, | 921 title_file1=name_file, lenTags=lenTags, |
900 xlabel="relative delta Hamming distance", relative=True) | 922 xlabel="relative delta HD", relative=True, nr_above_bars=nr_above_bars) |
901 | 923 |
902 #################### Plot FSD separated after difference between HD's ##################################### | 924 #################### Plot FSD separated after difference between HD's ##################################### |
903 ######################################################################################################################## | 925 ######################################################################################################################## |
904 plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff, | 926 # plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff, |
905 subtitle="Family size distribution separated by delta Hamming distances within the tags", | 927 # subtitle="Family size distribution separated by delta Hamming distances within the tags", |
906 pdf=pdf,relative=False, diff=True, title_file1=name_file, originalCounts=quant) | 928 # pdf=pdf,relative=False, diff=True, title_file1=name_file, originalCounts=quant) |
907 | 929 |
908 plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, originalCounts=quant, pdf=pdf, | 930 # plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, originalCounts=quant, pdf=pdf, |
909 subtitle="Family size distribution separated by delta Hamming distances within the tags", | 931 # subtitle="Family size distribution separated by delta Hamming distances within the tags", |
910 relative=True, diff=True, title_file1=name_file) | 932 # relative=True, diff=True, title_file1=name_file) |
911 | 933 |
912 | 934 |
913 # plots for chimeric reads | 935 # plots for chimeric reads |
914 if len(minHD_tags_zeros) != 0: | 936 if len(minHD_tags_zeros) != 0: |
915 ## HD | 937 ## HD |
916 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, | 938 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, |
917 subtitle="Hamming distance of the non-identical half of chimeras", | 939 subtitle="Hamming distance of the non-identical half of chimeras", |
918 title_file1=name_file, lenTags=lenTags,xlabel="Hamming distance", relative=False) | 940 title_file1=name_file, lenTags=lenTags,xlabel="HD", relative=False, nr_above_bars=nr_above_bars) |
919 | 941 |
920 ## FSD | 942 ## FSD |
921 plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros, | 943 # plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros, |
922 originalCounts=quant, pdf=pdf, | 944 # originalCounts=quant, pdf=pdf, |
923 subtitle="Family size distribution separated by Hamming distance of the non-identical half of chimeras", | 945 # subtitle="Family size distribution separated by Hamming distance of the non-identical half of chimeras", |
924 relative=False, diff=False, title_file1=name_file) | 946 # relative=False, diff=False, title_file1=name_file) |
925 | 947 |
926 ### print all data to a CSV file | 948 ### print all data to a CSV file |
927 #### HD #### | 949 #### HD #### |
928 summary, sumCol = createTableHD(list1, "HD=") | 950 summary, sumCol = createTableHD(list1, "HD=") |
929 overallSum = sum(sumCol) # sum of columns in table | 951 overallSum = sum(sumCol) # sum of columns in table |
944 summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") | 966 summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") |
945 overallSum13 = sum(sumCol13) | 967 overallSum13 = sum(sumCol13) |
946 | 968 |
947 ## FSD | 969 ## FSD |
948 # absolute difference | 970 # absolute difference |
949 summary19, sumCol19 = createTableFSD2(familySizeList1_diff) | 971 # summary19, sumCol19 = createTableFSD2(familySizeList1_diff) |
950 overallSum19 = sum(sumCol19) | 972 # overallSum19 = sum(sumCol19) |
951 # relative difference | 973 # relative difference |
952 summary21, sumCol21 = createTableFSD2(familySizeList1_reldiff) | 974 # summary21, sumCol21 = createTableFSD2(familySizeList1_reldiff) |
953 overallSum21 = sum(sumCol21) | 975 # overallSum21 = sum(sumCol21) |
954 | 976 |
955 # chimeric reads | 977 # chimeric reads |
956 if len(minHD_tags_zeros) != 0: | 978 if len(minHD_tags_zeros) != 0: |
957 # absolute difference and tags where at least one half has HD=0 | 979 # absolute difference and tags where at least one half has HD=0 |
958 summary15, sumCol15 = createTableHD(listDifference1_zeros, "diff=") | 980 summary15, sumCol15 = createTableHD(listDifference1_zeros, "diff=") |
959 overallSum15 = sum(sumCol15) | 981 overallSum15 = sum(sumCol15) |
960 # absolute difference and tags where at least one half has HD=0 | 982 # absolute difference and tags where at least one half has HD=0 |
961 summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False) | 983 # summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False) |
962 overallSum23 = sum(sumCol23) | 984 # overallSum23 = sum(sumCol23) |
963 | 985 |
964 output_file.write("{}\n".format(name_file)) | 986 output_file.write("{}\n".format(name_file)) |
965 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( | 987 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( |
966 numpy.concatenate(list1)), lenTags, lenTags)) | 988 numpy.concatenate(list1)), lenTags, lenTags)) |
967 | 989 |
992 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, | 1014 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, |
993 "Hamming distance of each half in the tag", sep) | 1015 "Hamming distance of each half in the tag", sep) |
994 createFileHD(summary11, sumCol11, overallSum11, output_file, | 1016 createFileHD(summary11, sumCol11, overallSum11, output_file, |
995 "Absolute delta Hamming distances within the tag", sep) | 1017 "Absolute delta Hamming distances within the tag", sep) |
996 createFileHD(summary13, sumCol13, overallSum13, output_file, | 1018 createFileHD(summary13, sumCol13, overallSum13, output_file, |
997 "Relative delta Hamming distances within the tag", sep) | 1019 "Chimera analysis: relative delta Hamming distances", sep) |
998 | 1020 |
999 createFileFSD2(summary19, sumCol19, overallSum19, output_file, | 1021 # createFileFSD2(summary19, sumCol19, overallSum19, output_file, |
1000 "Family size distribution separated by absolute delta Hamming distance", | 1022 # "Family size distribution separated by absolute delta Hamming distance", |
1001 sep) | 1023 # sep) |
1002 createFileFSD2(summary21, sumCol21, overallSum21, output_file, | 1024 # createFileFSD2(summary21, sumCol21, overallSum21, output_file, |
1003 "Family size distribution separated by relative delta Hamming distance", | 1025 # "Family size distribution separated by relative delta Hamming distance", |
1004 sep, rel=True) | 1026 # sep, rel=True) |
1005 | 1027 |
1006 if len(minHD_tags_zeros) != 0: | 1028 if len(minHD_tags_zeros) != 0: |
1007 output_file.write( | 1029 output_file.write( |
1008 "Identifiaction of chimeric reads:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") | 1030 "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") |
1009 createFileHD(summary15, sumCol15, overallSum15, output_file, | 1031 createFileHD(summary15, sumCol15, overallSum15, output_file, |
1010 "Hamming distances of non-zero half", sep) | 1032 "Hamming distances of non-zero half", sep) |
1011 createFileFSD2(summary23, sumCol23, overallSum23, output_file, | 1033 # createFileFSD2(summary23, sumCol23, overallSum23, output_file, |
1012 "Family size distribution separated by Hamming distance of non-zero half", | 1034 # "Family size distribution separated by Hamming distance of non-zero half", |
1013 sep, diff=False) | 1035 # sep, diff=False) |
1014 output_file.write("\n") | 1036 output_file.write("\n") |
1015 | 1037 |
1016 | 1038 |
1017 | 1039 |
1018 if __name__ == '__main__': | 1040 if __name__ == '__main__': |