Mercurial > repos > mheinzl > hd
comparison hd.py @ 2:316fbf91dd12 draft
planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/hd commit f9d5547849dabb59a33a5e998bda4730323d62a9
author | mheinzl |
---|---|
date | Tue, 15 May 2018 10:36:34 -0400 |
parents | 7414792e1cb8 |
children | 82eaf30dd089 |
comparison
equal
deleted
inserted
replaced
1:7414792e1cb8 | 2:316fbf91dd12 |
---|---|
61 color=colors, stacked=True, | 61 color=colors, stacked=True, |
62 rwidth=0.8,alpha=1, align="left", | 62 rwidth=0.8,alpha=1, align="left", |
63 edgecolor="None",bins=range1) | 63 edgecolor="None",bins=range1) |
64 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) | 64 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) |
65 | 65 |
66 plt.title(title_file1, fontsize=12) | 66 #plt.title(title_file1, fontsize=12) |
67 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) | 67 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) |
68 plt.xlabel("No. of Family Members", fontsize=12) | 68 plt.xlabel("Family size", fontsize=14) |
69 plt.ylabel("Absolute Frequency", fontsize=12) | 69 plt.ylabel("Absolute Frequency", fontsize=14) |
70 | 70 |
71 ticks = numpy.arange(0, maximumXFS + 1, 1) | 71 ticks = numpy.arange(0, maximumXFS + 1, 1) |
72 ticks1 = map(str, ticks) | 72 ticks1 = map(str, ticks) |
73 if maximumXFS >= 20: | 73 if maximumXFS >= 20: |
74 ticks1[len(ticks1) - 1] = ">=20" | 74 ticks1[len(ticks1) - 1] = ">=20" |
123 range=(0, maximumX + 1)) | 123 range=(0, maximumX + 1)) |
124 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) | 124 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1)) |
125 bins = counts[1] # width of bins | 125 bins = counts[1] # width of bins |
126 counts = numpy.array(map(int, counts[0][5])) | 126 counts = numpy.array(map(int, counts[0][5])) |
127 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) | 127 plt.suptitle(subtitle, y=1, x=0.5, fontsize=14) |
128 plt.title(title_file1, fontsize=12) | 128 # plt.title(title_file1, fontsize=12) |
129 plt.xlabel(xlabel, fontsize=12) | 129 plt.xlabel(xlabel, fontsize=14) |
130 plt.ylabel("Absolute Frequency", fontsize=12) | 130 plt.ylabel("Absolute Frequency", fontsize=14) |
131 | 131 |
132 plt.grid(b=True, which='major', color='#424242', linestyle=':') | 132 plt.grid(b=True, which='major', color='#424242', linestyle=':') |
133 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) | 133 plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1)) |
134 plt.xticks(numpy.arange(0, maximumX + step, step)) | 134 plt.xticks(numpy.arange(0, maximumX + step, step)) |
135 | 135 |
153 | 153 |
154 def plotHDwithinSeq_Sum2(sum1, sum2,min_value, lenTags, title_file1, pdf): | 154 def plotHDwithinSeq_Sum2(sum1, sum2,min_value, lenTags, title_file1, pdf): |
155 fig = plt.figure(figsize=(6, 8)) | 155 fig = plt.figure(figsize=(6, 8)) |
156 plt.subplots_adjust(bottom=0.1) | 156 plt.subplots_adjust(bottom=0.1) |
157 | 157 |
158 ham = [numpy.array(min_value), sum1, sum2] # new hd within tags | 158 ham = [sum1, sum2,numpy.array(min_value)] # new hd within tags |
159 | 159 |
160 maximumX = numpy.amax(numpy.concatenate(ham)) | 160 maximumX = numpy.amax(numpy.concatenate(ham)) |
161 minimumX = numpy.amin(numpy.concatenate(ham)) | 161 minimumX = numpy.amin(numpy.concatenate(ham)) |
162 maximumY = numpy.amax(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham))) | 162 maximumY = numpy.amax(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham))) |
163 | 163 |
165 range1 = minimumX | 165 range1 = minimumX |
166 else: | 166 else: |
167 range1 = range(minimumX, maximumX + 2) | 167 range1 = range(minimumX, maximumX + 2) |
168 | 168 |
169 counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False, | 169 counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False, |
170 label=["HD of whole tag", "tag1 - a\nvs. tag2 - a", "tag1 - b\nvs. tag2 - b"], | 170 label=[ "HD a", "HD b","HD a+b"], |
171 bins=range1, color=["#585858", "#58ACFA", "#FA5858"], edgecolor='black', linewidth=1) | 171 bins=range1, color=["#585858", "#58ACFA", "#FA5858"], edgecolor='black', linewidth=1) |
172 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) | 172 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) |
173 plt.suptitle('Hamming distances within tags', fontsize=14) | 173 plt.suptitle('Hamming distances within tags', fontsize=14) |
174 plt.title(title_file1, fontsize=12) | 174 #plt.title(title_file1, fontsize=12) |
175 plt.xlabel("Hamming Distance", fontsize=12) | 175 plt.xlabel("Hamming Distance", fontsize=14) |
176 plt.ylabel("Absolute Frequency", fontsize=12) | 176 plt.ylabel("Absolute Frequency", fontsize=14) |
177 plt.grid(b=True, which='major', color='#424242', linestyle=':') | 177 plt.grid(b=True, which='major', color='#424242', linestyle=':') |
178 | 178 |
179 | 179 |
180 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.1)) | 180 plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.1)) |
181 plt.xticks(numpy.arange(minimumX - 1, maximumX + 1, 1.0)) | 181 plt.xticks(numpy.arange(0, maximumX + 1, 1.0)) |
182 plt.ylim((0, maximumY * 1.1)) | 182 plt.ylim((0, maximumY * 1.1)) |
183 | 183 |
184 legend = "sample size= {:,} against {:,}".format(len(ham[0]), lenTags, lenTags) | 184 legend = "sample size= {:,} against {:,}".format(len(ham[0]), lenTags, lenTags) |
185 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) | 185 plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure) |
186 pdf.savefig(fig, bbox_inches="tight") | 186 pdf.savefig(fig, bbox_inches="tight") |
403 output_file.write("\n\n") | 403 output_file.write("\n\n") |
404 | 404 |
405 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name,sep): | 405 def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name,sep): |
406 output_file.write(name) | 406 output_file.write(name) |
407 output_file.write("\n") | 407 output_file.write("\n") |
408 output_file.write("{}HD of whole tag;tag1-half1 vs. tag2-half1{}tag1-half2 vs. tag2-half2{}sum{}\n".format(sep,sep,sep,sep)) | 408 output_file.write("{}HD a+b;HD a{}HD b{}sum{}\n".format(sep,sep,sep,sep)) |
409 for item in summary: | 409 for item in summary: |
410 for nr in item: | 410 for nr in item: |
411 if "HD" not in nr: | 411 if "HD" not in nr: |
412 nr = nr.astype(float) | 412 nr = nr.astype(float) |
413 nr = nr.astype(int) | 413 nr = nr.astype(int) |
417 sumCol = map(int, sumCol) | 417 sumCol = map(int, sumCol) |
418 for el in sumCol: | 418 for el in sumCol: |
419 output_file.write("{}{}".format(el,sep)) | 419 output_file.write("{}{}".format(el,sep)) |
420 output_file.write("{}{}".format(overallSum.astype(int),sep)) | 420 output_file.write("{}{}".format(overallSum.astype(int),sep)) |
421 output_file.write("\n\n") | 421 output_file.write("\n\n") |
422 | |
423 | |
424 | 422 |
425 def hamming(array1, array2): | 423 def hamming(array1, array2): |
426 res = 99 * numpy.ones(len(array1)) | 424 res = 99 * numpy.ones(len(array1)) |
427 i = 0 | 425 i = 0 |
428 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time | 426 array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time |
439 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 | 437 array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 |
440 | 438 |
441 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 | 439 array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 |
442 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 | 440 array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 |
443 | 441 |
444 diff11 = [] | 442 diff11 = 999 * numpy.ones(len(array2)) |
445 relativeDiffList = [] | 443 relativeDiffList = 999 * numpy.ones(len(array2)) |
446 ham1 = [] | 444 ham1 = 999 * numpy.ones(len(array2)) |
447 ham2 = [] | 445 ham2 = 999 * numpy.ones(len(array2)) |
448 min_valueList = [] | 446 min_valueList = 999 * numpy.ones(len(array2)) |
449 min_tagsList = [] | 447 min_tagsList = 999 * numpy.ones(len(array2)) |
450 diff11_zeros = [] | 448 diff11_zeros = 999 * numpy.ones(len(array2)) |
451 min_tagsList_zeros = [] | 449 min_tagsList_zeros = 999 * numpy.ones(len(array2)) |
450 | |
451 | |
452 #diff11 = [] | |
453 #relativeDiffList = [] | |
454 #ham1 = [] | |
455 #ham2 = [] | |
456 #min_valueList = [] | |
457 #min_tagsList = [] | |
458 #diff11_zeros = [] | |
459 #min_tagsList_zeros = [] | |
452 i = 0 # counter, only used to see how many HDs of tags were already calculated | 460 i = 0 # counter, only used to see how many HDs of tags were already calculated |
453 if mate_b is False: # HD calculation for all a's | 461 if mate_b is False: # HD calculation for all a's |
454 half1_mate1 = array1_half | 462 half1_mate1 = array1_half |
455 half2_mate1 = array1_half2 | 463 half2_mate1 = array1_half2 |
456 half1_mate2 = array2_half | 464 half1_mate2 = array2_half |
486 d = d_2 | 494 d = d_2 |
487 d2 = d_1 | 495 d2 = d_1 |
488 else: # half1, corrects the variable of the HD from both halfs if it is a or b | 496 else: # half1, corrects the variable of the HD from both halfs if it is a or b |
489 d = d_1 | 497 d = d_1 |
490 d2 = d_2 | 498 d2 = d_2 |
491 min_valueList.append(d + d2) | 499 min_valueList[i] = d + d2 |
492 min_tagsList.append(tag) | 500 min_tagsList[i] = tag |
493 ham1.append(d) | 501 ham1.append[i] = d |
494 ham2.append(d2) | 502 ham2.append[i] = d2 |
495 difference1 = abs(d - d2) | 503 difference1 = abs(d - d2) |
496 diff11.append(difference1) | 504 diff11[i] = difference1 |
497 rel_difference = round(float(difference1) / (d + d2), 1) | 505 rel_difference = round(float(difference1) / (d + d2), 1) |
498 relativeDiffList.append(rel_difference) | 506 relativeDiffList[i] = rel_difference |
499 | 507 |
500 #### tags which have identical parts: | 508 #### tags which have identical parts: |
501 if d == 0 or d2 == 0: | 509 if d == 0 or d2 == 0: |
502 min_tagsList_zeros.append(tag) | 510 min_tagsList_zeros[i] = tag |
503 difference1_zeros = abs(d - d2) | 511 difference1_zeros = abs(d - d2) |
504 diff11_zeros.append(difference1_zeros) | 512 diff11_zeros[i] = difference1_zeros |
513 i += 1 | |
514 | |
505 #print(i) | 515 #print(i) |
506 i += 1 | 516 diff11 = [st for st in diff11 if st != 999] |
517 ham1 = [st for st in ham1 if st != 999] | |
518 ham2 = [st for st in ham2 if st != 999] | |
519 min_valueList = [st for st in min_valueList if st != 999] | |
520 min_tagsList = [st for st in min_tagsList if st != 999] | |
521 relativeDiffList = [st for st in relativeDiffList if st != 999] | |
522 diff11_zeros = [st for st in diff11_zeros if st != 999] | |
523 min_tagsList_zeros = [st for st in min_tagsList_zeros if st != 999] | |
524 | |
507 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros]) | 525 return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros]) |
508 | 526 |
509 def readFileReferenceFree(file): | 527 def readFileReferenceFree(file): |
510 with open(file, 'r') as dest_f: | 528 with open(file, 'r') as dest_f: |
511 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') | 529 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') |
823 quant = numpy.concatenate((quant, duplTagsBA[result])) | 841 quant = numpy.concatenate((quant, duplTagsBA[result])) |
824 seq = numpy.tile(seq, 2) | 842 seq = numpy.tile(seq, 2) |
825 ham = numpy.tile(ham, 2) | 843 ham = numpy.tile(ham, 2) |
826 | 844 |
827 # prepare data for different kinds of plots | 845 # prepare data for different kinds of plots |
846 # distribution of FSs separated after HD | |
847 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham,rel=False) | |
828 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS | 848 list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS |
829 # distribution of FSs separated after HD | 849 |
830 familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, | |
831 rel=False) | |
832 | |
833 ## get FS for all tags with min HD of analysis of chimeric reads | 850 ## get FS for all tags with min HD of analysis of chimeric reads |
834 # there are more tags than sample size in the plot, because one tag can have multiple minimas | 851 # there are more tags than sample size in the plot, because one tag can have multiple minimas |
835 seqDic = dict(zip(seq, quant)) | 852 seqDic = dict(zip(seq, quant)) |
836 lst_minHD_tags = [] | 853 lst_minHD_tags = [] |
837 for i in minHD_tags: | 854 for i in minHD_tags: |
867 | 884 |
868 ##################################################################################################################### | 885 ##################################################################################################################### |
869 ################## plot Hamming Distance with Family size distribution ############################## | 886 ################## plot Hamming Distance with Family size distribution ############################## |
870 ##################################################################################################################### | 887 ##################################################################################################################### |
871 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, | 888 plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, |
872 subtitle="Overall hamming distance with separation after family size", title_file1=name_file, | 889 subtitle="Hamming distance separated by family size", title_file1=name_file, |
873 lenTags=lenTags,xlabel="Hamming distance") | 890 lenTags=lenTags,xlabel="Hamming distance") |
874 | 891 |
875 ########################## Plot FSD with separation after HD ############################################### | 892 ########################## Plot FSD with separation after HD ############################################### |
876 ######################################################################################################################## | 893 ######################################################################################################################## |
877 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, | 894 plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, |
878 quant=quant, subtitle="Family size distribution with separation after hamming distance", | 895 quant=quant, subtitle="Family size distribution separated by Hamming distance", |
879 pdf=pdf,relative=False, title_file1=name_file, diff=False) | 896 pdf=pdf,relative=False, title_file1=name_file, diff=False) |
880 | 897 |
881 ########################## Plot difference between HD's separated after FSD ########################################## | 898 ########################## Plot difference between HD's separated after FSD ########################################## |
882 ######################################################################################################################## | 899 ######################################################################################################################## |
883 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, | 900 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, |
884 subtitle="Delta Hamming distances within tags with separation after family size", | 901 subtitle="Delta Hamming distance within tags", |
885 title_file1=name_file, lenTags=lenTags, | 902 title_file1=name_file, lenTags=lenTags, |
886 xlabel="absolute delta Hamming distance", relative=False) | 903 xlabel="abs delta Hamming distance", relative=False) |
887 | 904 |
888 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, | 905 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, |
889 subtitle="Relative delta Hamming distances within tags with separation after family size", | 906 subtitle="Relative delta Hamming distances within tags", |
890 title_file1=name_file, lenTags=lenTags, | 907 title_file1=name_file, lenTags=lenTags, |
891 xlabel="relative delta Hamming distance", relative=True) | 908 xlabel="rel delta Hamming distance", relative=True) |
892 | 909 |
893 #################### Plot FSD separated after difference between HD's ##################################### | 910 #################### Plot FSD separated after difference between HD's ##################################### |
894 ######################################################################################################################## | 911 ######################################################################################################################## |
895 plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff, | 912 plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff, |
896 subtitle="Family size distribution with separation after delta Hamming distances within the tags", | 913 subtitle="Family size distribution with delta Hamming distances within the tags", |
897 pdf=pdf,relative=False, diff=True, title_file1=name_file, quant=quant) | 914 pdf=pdf,relative=False, diff=True, title_file1=name_file, quant=quant) |
898 | 915 |
899 plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, quant=quant, pdf=pdf, | 916 plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, quant=quant, pdf=pdf, |
900 subtitle="Family size distribution with separation after delta Hamming distances within the tags", | 917 subtitle="Family size distribution with delta Hamming distances within the tags", |
901 relative=True, diff=True, title_file1=name_file) | 918 relative=True, diff=True, title_file1=name_file) |
902 | 919 |
903 | 920 |
904 # plots for chimeric reads | 921 # plots for chimeric reads |
905 if len(minHD_tags_zeros) != 0: | 922 if len(minHD_tags_zeros) != 0: |
906 ## HD | 923 ## HD |
907 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, | 924 plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, |
908 subtitle="Hamming Distance of the non-identical half with separation after family size" | 925 subtitle="Hamming distance of the non-identical half of chimeras", |
909 "\n(at least one half is identical with the half of the min. tag)\n", | |
910 title_file1=name_file, lenTags=lenTags,xlabel="Hamming distance", relative=False) | 926 title_file1=name_file, lenTags=lenTags,xlabel="Hamming distance", relative=False) |
911 | 927 |
912 ## FSD | 928 ## FSD |
913 plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros, | 929 plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros, |
914 quant=quant, pdf=pdf, | 930 quant=quant, pdf=pdf, |
915 subtitle="Family size distribution with separation after hamming distances from the non-identical half\n" | 931 subtitle="Family size distribution with Hamming distance from the non-identical half of chimeras", |
916 "(at least one half is identical with the half of the min. tag)\n", | |
917 relative=False, diff=False, title_file1=name_file) | 932 relative=False, diff=False, title_file1=name_file) |
918 | 933 |
919 ### print all data to a CSV file | 934 ### print all data to a CSV file |
920 #### HD #### | 935 #### HD #### |
921 summary, sumCol = createTableHD(list1, "HD=") | 936 summary, sumCol = createTableHD(list1, "HD=") |
952 overallSum15 = sum(sumCol15) | 967 overallSum15 = sum(sumCol15) |
953 # absolute difference and tags where at least one half has HD=0 | 968 # absolute difference and tags where at least one half has HD=0 |
954 summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False) | 969 summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False) |
955 overallSum23 = sum(sumCol23) | 970 overallSum23 = sum(sumCol23) |
956 | 971 |
957 output_file.write("{}\n".format(f)) | 972 output_file.write("{}\n".format(name_file)) |
958 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( | 973 output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( |
959 numpy.concatenate(list1)), lenTags, lenTags)) | 974 numpy.concatenate(list1)), lenTags, lenTags)) |
960 | 975 |
961 ### HD ### | 976 ### HD ### |
962 createFileHD(summary, sumCol, overallSum, output_file, | 977 createFileHD(summary, sumCol, overallSum, output_file, |
963 "Hamming distance with separation after family size: file1", sep) | 978 "Hamming distance separated by family size", sep) |
964 ### FSD ### | 979 ### FSD ### |
965 createFileFSD2(summary5, sumCol5, overallSum5, output_file, | 980 createFileFSD2(summary5, sumCol5, overallSum5, output_file, |
966 "Family size distribution with separation after hamming distances: file1", sep, | 981 "Family size distribution separated by Hamming distance", sep, |
967 diff=False) | 982 diff=False) |
968 | 983 |
969 count = numpy.bincount(quant) | 984 count = numpy.bincount(quant) |
970 output_file.write("{}{}\n".format(sep, f)) | 985 output_file.write("{}{}\n".format(sep, f)) |
971 output_file.write("max. family size:{}{}\n".format(sep, max(quant))) | 986 output_file.write("max. family size:{}{}\n".format(sep, max(quant))) |
976 ### HD within tags ### | 991 ### HD within tags ### |
977 output_file.write( | 992 output_file.write( |
978 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" | 993 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" |
979 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n") | 994 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n") |
980 output_file.write( | 995 output_file.write( |
981 "file 1: actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format( | 996 "actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format( |
982 len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1)))) | 997 len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1)))) |
983 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) | 998 output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) |
984 | 999 |
985 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, | 1000 createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, |
986 "Hamming distance of each half in the tag: file1", sep) | 1001 "Hamming distance of each half in the tag", sep) |
987 createFileHD(summary11, sumCol11, overallSum11, output_file, | 1002 createFileHD(summary11, sumCol11, overallSum11, output_file, |
988 "Absolute delta Hamming distances within the tag: file1", sep) | 1003 "Absolute delta Hamming distances within the tag", sep) |
989 createFileHD(summary13, sumCol13, overallSum13, output_file, | 1004 createFileHD(summary13, sumCol13, overallSum13, output_file, |
990 "Relative delta Hamming distances within the tag: file1", sep) | 1005 "Relative delta Hamming distances within the tag", sep) |
991 | 1006 |
992 createFileFSD2(summary19, sumCol19, overallSum19, output_file, | 1007 createFileFSD2(summary19, sumCol19, overallSum19, output_file, |
993 "Family size distribution with separation after absolute delta Hamming distances: file1", | 1008 "Family size distribution separated by absolute delta Hamming distance", |
994 sep) | 1009 sep) |
995 createFileFSD2(summary21, sumCol21, overallSum21, output_file, | 1010 createFileFSD2(summary21, sumCol21, overallSum21, output_file, |
996 "Family size distribution with separation after relative delta Hamming distances: file1", | 1011 "Family size distribution separated by relative delta Hamming distance", |
997 sep, rel=True) | 1012 sep, rel=True) |
998 | 1013 |
999 if len(minHD_tags_zeros) != 0: | 1014 if len(minHD_tags_zeros) != 0: |
1000 output_file.write( | 1015 output_file.write( |
1001 "All tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") | 1016 "Identifiaction of chimeric reads:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") |
1002 createFileHD(summary15, sumCol15, overallSum15, output_file, | 1017 createFileHD(summary15, sumCol15, overallSum15, output_file, |
1003 "Hamming distances of non-zero half: file1", sep) | 1018 "Hamming distances of non-zero half", sep) |
1004 createFileFSD2(summary23, sumCol23, overallSum23, output_file, | 1019 createFileFSD2(summary23, sumCol23, overallSum23, output_file, |
1005 "Family size distribution with separation after Hamming distances of non-zero half: file1", | 1020 "Family size distribution separated by Hamming distance of non-zero half", |
1006 sep, diff=False) | 1021 sep, diff=False) |
1007 output_file.write("\n") | 1022 output_file.write("\n") |
1008 | 1023 |
1009 | 1024 |
1010 | 1025 |