Mercurial > repos > mheinzl > hd
diff hd.py @ 0:239c4448a163 draft
planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/hd commit 6055f8c5c052f528ff85fb5e0d43b4500830637a
author | mheinzl |
---|---|
date | Thu, 10 May 2018 07:30:27 -0400 |
parents | |
children | 7414792e1cb8 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hd.py Thu May 10 07:30:27 2018 -0400 @@ -0,0 +1,624 @@ +#!/usr/bin/env python + +# Hamming distance analysis of SSCSs +# +# Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) +# Contact: monika.heinzl@edumail.at +# +# Takes at least one TABULAR file with tags before the alignment to the SSCS and optionally a second TABULAR file as input. +# The program produces a plot which shows a histogram of Hamming distances separated after family sizes, +# a family size distribution separated after Hamming distances for all (sample_size=0) or a given sample of SSCSs or SSCSs, which form a DCS. +# In additon, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads +# and finally a CSV file with the data of the plots. +# It is also possible to perform the HD analysis with shortened tags with given sizes as input. +# The tool can run on a certain number of processors, which can be defined by the user. + +# USAGE: python HDnew6_1Plot_FINAL.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" / +# --only_DCS True --FamilySize3 True --subset_tag True --nproc int --output_csv outptufile_name_csv --output_pdf outptufile_name_pdf + +import numpy +import itertools +import operator +import matplotlib.pyplot as plt +import os.path +import cPickle as pickle +from multiprocessing.pool import Pool +from functools import partial +from HDAnalysis_plots.plot_HDwithFSD import plotHDwithFSD +from HDAnalysis_plots.plot_FSDwithHD2 import plotFSDwithHD2 +from HDAnalysis_plots.plot_HDwithinSeq_Sum2 import plotHDwithinSeq_Sum2 +from HDAnalysis_plots.table_HD import createTableHD, createFileHD, createTableHDwithTags, createFileHDwithinTag +from HDAnalysis_plots.table_FSD import createTableFSD2, createFileFSD2 +import argparse +import sys +import os +from matplotlib.backends.backend_pdf import PdfPages + +def hamming(array1, array2): + res = 99 * numpy.ones(len(array1)) + i = 0 + array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time + for a in array1: + dist = numpy.array([sum(itertools.imap(operator.ne, a, b)) for b in array2]) # fastest + res[i] = numpy.amin(dist[dist > 0]) # pick min distance greater than zero + #print(i) + i += 1 + return res + +def hamming_difference(array1, array2, mate_b): + array2 = numpy.unique(array2) # remove duplicate sequences to decrease running time + array1_half = numpy.array([i[0:(len(i)) / 2] for i in array1]) # mate1 part1 + array1_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array1]) # mate1 part 2 + + array2_half = numpy.array([i[0:(len(i)) / 2] for i in array2]) # mate2 part1 + array2_half2 = numpy.array([i[len(i) / 2:len(i)] for i in array2]) # mate2 part2 + + diff11 = [] + relativeDiffList = [] + ham1 = [] + ham2 = [] + min_valueList = [] + min_tagsList = [] + diff11_zeros = [] + min_tagsList_zeros = [] + i = 0 # counter, only used to see how many HDs of tags were already calculated + if mate_b is False: # HD calculation for all a's + half1_mate1 = array1_half + half2_mate1 = array1_half2 + half1_mate2 = array2_half + half2_mate2 = array2_half2 + elif mate_b is True: # HD calculation for all b's + half1_mate1 = array1_half2 + half2_mate1 = array1_half + half1_mate2 = array2_half2 + half2_mate2 = array2_half + + for a, b, tag in zip(half1_mate1, half2_mate1, array1): + ## exclude identical tag from array2, to prevent comparison to itself + sameTag = numpy.where(array2 == tag) + indexArray2 = numpy.arange(0, len(array2), 1) + index_withoutSame = numpy.delete(indexArray2, sameTag) # delete identical tag from the data + + # all tags without identical tag + array2_half_withoutSame = half1_mate2[index_withoutSame] + array2_half2_withoutSame = half2_mate2[index_withoutSame] + #array2_withoutSame = array2[index_withoutSame] # whole tag (=not splitted into 2 halfs) + + dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in + array2_half_withoutSame]) # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" + min_index = numpy.where(dist == dist.min()) # get index of min HD + min_value = dist[min_index] # get minimum HDs + min_tag_half2 = array2_half2_withoutSame[min_index] # get all "b's" of the tag or all "a's" of the tag with minimum HD + #min_tag = array2_withoutSame[min_index] # get whole tag with min HD + + dist2 = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in + min_tag_half2]) # calculate HD of "b" to all "b's" or "a" to all "a's" + for d_1, d_2 in zip(min_value, dist2): + if mate_b is True: # half2, corrects the variable of the HD from both halfs if it is a or b + d = d_2 + d2 = d_1 + else: # half1, corrects the variable of the HD from both halfs if it is a or b + d = d_1 + d2 = d_2 + min_valueList.append(d + d2) + min_tagsList.append(tag) + ham1.append(d) + ham2.append(d2) + difference1 = abs(d - d2) + diff11.append(difference1) + rel_difference = round(float(difference1) / (d + d2), 1) + relativeDiffList.append(rel_difference) + + #### tags which have identical parts: + if d == 0 or d2 == 0: + min_tagsList_zeros.append(tag) + difference1_zeros = abs(d - d2) + diff11_zeros.append(difference1_zeros) + #print(i) + i += 1 + return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros]) + +def readFileReferenceFree(file): + with open(file, 'r') as dest_f: + data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string') + integers = numpy.array(data_array[:, 0]).astype(int) + return(integers, data_array) + +def hammingDistanceWithFS(quant, ham): + quant = numpy.asarray(quant) + maximum = max(ham) + minimum = min(ham) + ham = numpy.asarray(ham) + + singletons = numpy.where(quant == 1)[0] + data = ham[singletons] + + hd2 = numpy.where(quant == 2)[0] + data2 = ham[hd2] + + hd3 = numpy.where(quant == 3)[0] + data3 = ham[hd3] + + hd4 = numpy.where(quant == 4)[0] + data4 = ham[hd4] + + hd5 = numpy.where((quant >= 5) & (quant <= 10))[0] + data5 = ham[hd5] + + hd6 = numpy.where(quant > 10)[0] + data6 = ham[hd6] + + list1 = [data, data2, data3, data4, data5, data6] + return(list1, maximum, minimum) + +def familySizeDistributionWithHD(fs, ham, diff=False, rel = True): + hammingDistances = numpy.unique(ham) + fs = numpy.asarray(fs) + + ham = numpy.asarray(ham) + bigFamilies2 = numpy.where(fs > 19)[0] + if len(bigFamilies2) != 0: + fs[bigFamilies2] = 20 + maximum = max(fs) + minimum = min(fs) + if diff is True: + hd0 = numpy.where(ham == 0)[0] + data0 = fs[hd0] + + if rel is True: + hd1 = numpy.where(ham == 0.1)[0] + else: + hd1 = numpy.where(ham == 1)[0] + data = fs[hd1] + + if rel is True: + hd2 = numpy.where(ham == 0.2)[0] + else: + hd2 = numpy.where(ham == 2)[0] + data2 = fs[hd2] + + if rel is True: + hd3 = numpy.where(ham == 0.3)[0] + else: + hd3 = numpy.where(ham == 3)[0] + data3 = fs[hd3] + + if rel is True: + hd4 = numpy.where(ham == 0.4)[0] + else: + hd4 = numpy.where(ham == 4)[0] + data4 = fs[hd4] + + if rel is True: + hd5 = numpy.where((ham >= 0.5) & (ham <= 0.8))[0] + else: + hd5 = numpy.where((ham >= 5) & (ham <= 8))[0] + data5 = fs[hd5] + + if rel is True: + hd6 = numpy.where(ham > 0.8)[0] + else: + hd6 = numpy.where(ham > 8)[0] + data6 = fs[hd6] + + if diff is True: + list1 = [data0,data, data2, data3, data4, data5, data6] + else: + list1 = [data, data2, data3, data4, data5, data6] + + return(list1, hammingDistances, maximum, minimum) + +def make_argparser(): + parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data') + parser.add_argument('--inputFile', + help='Tabular File with three columns: ab or ba, tag and family size.') + parser.add_argument('--inputName1') + parser.add_argument('--inputFile2',default=None, + help='Tabular File with three columns: ab or ba, tag and family size.') + parser.add_argument('--inputName2') + parser.add_argument('--sample_size', default=1000,type=int, + help='Sample size of Hamming distance analysis.') + parser.add_argument('--sep', default=",", + help='Separator in the csv file.') + parser.add_argument('--subset_tag', default=0,type=int, + help='The tag is shortened to the given number.') + parser.add_argument('--nproc', default=4,type=int, + help='The tool runs with the given number of processors.') + parser.add_argument('--only_DCS', action="store_false", # default=False, type=bool, + help='Only tags of the DCSs are included in the HD analysis') + + parser.add_argument('--minFS', default=1, type=int, + help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis') + parser.add_argument('--maxFS', default=0, type=int, + help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis') + + parser.add_argument('--output_csv', default="data.csv", type=str, + help='Name of the csv file.') + parser.add_argument('--output_pdf', default="data.pdf", type=str, + help='Name of the pdf file.') + parser.add_argument('--output_pdf2', default="data2.pdf", type=str, + help='Name of the pdf file.') + parser.add_argument('--output_csv2', default="data2.csv", type=str, + help='Name of the csv file.') + + return parser + +def Hamming_Distance_Analysis(argv): + parser = make_argparser() + args = parser.parse_args(argv[1:]) + + file1 = args.inputFile + name1 = args.inputName1 + + file2 = args.inputFile2 + name2 = args.inputName2 + + index_size = args.sample_size + title_savedFile_pdf = args.output_pdf + title_savedFile_pdf2 = args.output_pdf2 + + title_savedFile_csv = args.output_csv + title_savedFile_csv2 = args.output_csv2 + + sep = args.sep + onlyDuplicates = args.only_DCS + minFS = args.minFS + maxFS = args.maxFS + + subset = args.subset_tag + nproc = args.nproc + + ### input checks + if index_size < 0: + print("index_size is a negative integer.") + exit(2) + + if nproc <= 0: + print("nproc is smaller or equal zero") + exit(3) + + if type(sep) is not str or len(sep)>1: + print("sep must be a single character.") + exit(4) + + if subset < 0: + print("subset_tag is smaller or equal zero.") + exit(5) + + ### PLOT ### + plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color + plt.rcParams['xtick.labelsize'] = 12 + plt.rcParams['ytick.labelsize'] = 12 + plt.rcParams['patch.edgecolor'] = "#000000" + plt.rc('figure', figsize=(11.69, 8.27)) # A4 format + + if file2 != str(None): + files = [file1, file2] + name1 = name1.split(".tabular")[0] + name2 = name2.split(".tabular")[0] + names = [name1, name2] + pdf_files = [title_savedFile_pdf, title_savedFile_pdf2] + csv_files = [title_savedFile_csv, title_savedFile_csv2] + else: + files = [file1] + name1 = name1.split(".tabular")[0] + names = [name1] + pdf_files = [title_savedFile_pdf] + csv_files = [title_savedFile_csv] + + print(type(onlyDuplicates)) + print(onlyDuplicates) + + for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files): + with open(csv_f, "w") as output_file, PdfPages(pdf_f) as pdf: + print("dataset: ", name_file) + integers, data_array = readFileReferenceFree(f) + data_array = numpy.array(data_array) + int_f = numpy.array(data_array[:, 0]).astype(int) + data_array = data_array[numpy.where(int_f >= minFS)] + integers = integers[integers >= minFS] + + # select family size for tags + if maxFS > 0: + int_f2 = numpy.array(data_array[:, 0]).astype(int) + data_array = data_array[numpy.where(int_f2 <= maxFS)] + integers = integers[integers <= maxFS] + + print("min FS", min(integers)) + print("max FS", max(integers)) + + tags = data_array[:, 2] + seq = data_array[:, 1] + + if onlyDuplicates is True: + # find all unique tags and get the indices for ALL tags, but only once + u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) + d = u[c > 1] + + # get family sizes, tag for duplicates + duplTags_double = integers[numpy.in1d(seq, d)] + duplTags = duplTags_double[0::2] # ab of DCS + duplTagsBA = duplTags_double[1::2] # ba of DCS + + duplTags_tag = tags[numpy.in1d(seq, d)][0::2] # ab + duplTags_seq = seq[numpy.in1d(seq, d)][0::2] # ab - tags + + data_array = numpy.column_stack((duplTags, duplTags_seq)) + data_array = numpy.column_stack((data_array, duplTags_tag)) + integers = numpy.array(data_array[:, 0]).astype(int) + print("DCS in whole dataset", len(data_array)) + + ## HD analysis for a subset of the tag + if subset > 0: + tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]]) + tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]]) + + flanking_region_float = float((len(tag1[0]) - subset)) / 2 + flanking_region = int(flanking_region_float) + if flanking_region_float % 2 == 0: + tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1]) + tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2]) + else: + flanking_region_rounded = int(round(flanking_region, 1)) + flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded + tag1_shorten = numpy.array( + [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1]) + tag2_shorten = numpy.array( + [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2]) + + data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)]) + data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2])) + + print("length of tag= ", len(data_array[0, 1])) + # select sample: if no size given --> all vs. all comparison + if index_size == 0: + result = numpy.arange(0, len(data_array), 1) + else: + result = numpy.random.choice(len(integers), size=index_size, + replace=False) # array of random sequences of size=index.size + + # with open("index_result1_{}.pkl".format(app_f), "wb") as o: + # pickle.dump(result, o, pickle.HIGHEST_PROTOCOL) + + # comparison random tags to whole dataset + result1 = data_array[result, 1] # random tags + result2 = data_array[:, 1] # all tags + print("size of the whole dataset= ", len(result2)) + print("sample size= ", len(result1)) + + # HD analysis of whole tag + proc_pool = Pool(nproc) + chunks_sample = numpy.array_split(result1, nproc) + ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample) + proc_pool.close() + proc_pool.join() + ham = numpy.concatenate(ham).astype(int) + # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1: + # for h, tag in zip(ham, result1): + # output_file1.write("{}\t{}\n".format(tag, h)) + + # HD analysis for chimeric reads + proc_pool_b = Pool(nproc) + diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) + diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) + proc_pool_b.close() + proc_pool_b.join() + diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]), + numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int) + HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]), + numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int) + HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]), + numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int) + minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]), + numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int) + minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]), + numpy.concatenate([item_b[4] for item_b in diff_list_b]))) + rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]), + numpy.concatenate([item_b[5] for item_b in diff_list_b]))) + diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]), + numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int) + minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]), + numpy.concatenate([item_b[7] for item_b in diff_list_b]))) + + # with open("HD_within tag_{}.txt".format(app_f), "w") as output_file2: + # for d, s1, s2, hd, rel_d, tag in zip(diff, HDhalf1, HDhalf2, minHDs, rel_Diff, minHD_tags): + # output_file2.write( + # "{}\t{}\t{}\t{}\t{}\t{}\n".format(tag, hd, s1, s2, d, rel_d)) + + lenTags = len(data_array) + + quant = numpy.array(data_array[result, 0]).astype(int) # family size for sample of tags + seq = numpy.array(data_array[result, 1]) # tags of sample + ham = numpy.asarray(ham) # HD for sample of tags + + if onlyDuplicates is True: # ab and ba strands of DCSs + quant = numpy.concatenate((quant, duplTagsBA[result])) + seq = numpy.tile(seq, 2) + ham = numpy.tile(ham, 2) + + # prepare data for different kinds of plots + list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham) # histogram of HDs separated after FS + # distribution of FSs separated after HD + familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham, + rel=False) + + ## get FS for all tags with min HD of analysis of chimeric reads + # there are more tags than sample size in the plot, because one tag can have multiple minimas + seqDic = dict(zip(seq, quant)) + lst_minHD_tags = [] + for i in minHD_tags: + lst_minHD_tags.append(seqDic.get(i)) + + # histogram with absolute and relative difference between HDs of both parts of the tag + listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff) + listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags, + rel_Diff) + + # family size distribution separated after the difference between HDs of both parts of the tag + familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD( + lst_minHD_tags, diff, diff=True, rel=False) + familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD( + lst_minHD_tags, rel_Diff, diff=True, rel=True) + + # chimeric read analysis: tags which have HD=0 in one of the halfs + if len(minHD_tags_zeros) != 0: + lst_minHD_tags_zeros = [] + for i in minHD_tags_zeros: + lst_minHD_tags_zeros.append(seqDic.get(i)) # get family size for tags of chimeric reads + + # histogram with HD of non-identical half + listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS( + lst_minHD_tags_zeros, diff_zeros) + # family size distribution of non-identical half + familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD( + lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False) + + ########################## Plot HD within tags ######################################################## + ###################################################################################################################### + plotHDwithinSeq_Sum2(HDhalf1, HDhalf2, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file) + + ##################################################################################################################### + ################## plot Hamming Distance with Family size distribution ############################## + ##################################################################################################################### + plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf, + subtitle="Overall hamming distance with separation after family size", title_file1=name_file, + lenTags=lenTags,xlabel="Hamming distance") + + ########################## Plot FSD with separation after HD ############################################### + ######################################################################################################################## + plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, + quant=quant, subtitle="Family size distribution with separation after hamming distance", + pdf=pdf,relative=False, title_file1=name_file, diff=False) + + ########################## Plot difference between HD's separated after FSD ########################################## + ######################################################################################################################## + plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, + subtitle="Delta Hamming distances within tags with separation after family size", + title_file1=name_file, lenTags=lenTags, + xlabel="absolute delta Hamming distance", relative=False) + + plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, + subtitle="Relative delta Hamming distances within tags with separation after family size", + title_file1=name_file, lenTags=lenTags, + xlabel="relative delta Hamming distance", relative=True) + + #################### Plot FSD separated after difference between HD's ##################################### + ######################################################################################################################## + plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff, + subtitle="Family size distribution with separation after delta Hamming distances within the tags", + pdf=pdf,relative=False, diff=True, title_file1=name_file, quant=quant) + + plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, quant=quant, pdf=pdf, + subtitle="Family size distribution with separation after delta Hamming distances within the tags", + relative=True, diff=True, title_file1=name_file) + + + # plots for chimeric reads + if len(minHD_tags_zeros) != 0: + ## HD + plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, + subtitle="Hamming Distance of the non-identical half with separation after family size" + "\n(at least one half is identical with the half of the min. tag)\n", + title_file1=name_file, lenTags=lenTags,xlabel="Hamming distance", relative=False) + + ## FSD + plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros, + quant=quant, pdf=pdf, + subtitle="Family size distribution with separation after hamming distances from the non-identical half\n" + "(at least one half is identical with the half of the min. tag)\n", + relative=False, diff=False, title_file1=name_file) + + ### print all data to a CSV file + #### HD #### + summary, sumCol = createTableHD(list1, "HD=") + overallSum = sum(sumCol) # sum of columns in table + + #### FSD #### + summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False) + overallSum5 = sum(sumCol5) + + ### HD of both parts of the tag #### + summary9, sumCol9 = createTableHDwithTags([numpy.array(minHDs), HDhalf1, HDhalf2]) + overallSum9 = sum(sumCol9) + + ## HD + # absolute difference + summary11, sumCol11 = createTableHD(listDifference1, "diff=") + overallSum11 = sum(sumCol11) + # relative difference and all tags + summary13, sumCol13 = createTableHD(listRelDifference1, "diff=") + overallSum13 = sum(sumCol13) + + ## FSD + # absolute difference + summary19, sumCol19 = createTableFSD2(familySizeList1_diff) + overallSum19 = sum(sumCol19) + # relative difference + summary21, sumCol21 = createTableFSD2(familySizeList1_reldiff) + overallSum21 = sum(sumCol21) + + # chimeric reads + if len(minHD_tags_zeros) != 0: + # absolute difference and tags where at least one half has HD=0 + summary15, sumCol15 = createTableHD(listDifference1_zeros, "diff=") + overallSum15 = sum(sumCol15) + # absolute difference and tags where at least one half has HD=0 + summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False) + overallSum23 = sum(sumCol23) + + output_file.write("{}\n".format(f)) + output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len( + numpy.concatenate(list1)), lenTags, lenTags)) + + ### HD ### + createFileHD(summary, sumCol, overallSum, output_file, + "Hamming distance with separation after family size: file1", sep) + ### FSD ### + createFileFSD2(summary5, sumCol5, overallSum5, output_file, + "Family size distribution with separation after hamming distances: file1", sep, + diff=False) + + count = numpy.bincount(quant) + output_file.write("{}{}\n".format(sep, f)) + output_file.write("max. family size:{}{}\n".format(sep, max(quant))) + output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) + output_file.write( + "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) + + ### HD within tags ### + output_file.write( + "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n" + "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n") + output_file.write( + "file 1: actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format( + len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1)))) + output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2)) + + createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file, + "Hamming distance of each half in the tag: file1", sep) + createFileHD(summary11, sumCol11, overallSum11, output_file, + "Absolute delta Hamming distances within the tag: file1", sep) + createFileHD(summary13, sumCol13, overallSum13, output_file, + "Relative delta Hamming distances within the tag: file1", sep) + + createFileFSD2(summary19, sumCol19, overallSum19, output_file, + "Family size distribution with separation after absolute delta Hamming distances: file1", + sep) + createFileFSD2(summary21, sumCol21, overallSum21, output_file, + "Family size distribution with separation after relative delta Hamming distances: file1", + sep, rel=True) + + if len(minHD_tags_zeros) != 0: + output_file.write( + "All tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n") + createFileHD(summary15, sumCol15, overallSum15, output_file, + "Hamming distances of non-zero half: file1", sep) + createFileFSD2(summary23, sumCol23, overallSum23, output_file, + "Family size distribution with separation after Hamming distances of non-zero half: file1", + sep, diff=False) + output_file.write("\n") + + + +if __name__ == '__main__': + sys.exit(Hamming_Distance_Analysis(sys.argv))