Mercurial > repos > mheinzl > hd

--- a/hd.py	Wed Feb 27 09:17:04 2019 -0500
+++ b/hd.py	Tue May 14 03:29:37 2019 -0400
@@ -14,7 +14,7 @@
 # The tool can run on a certain number of processors, which can be defined by the user.

 # USAGE: python hd.py --inputFile filename --inputName1 filename --sample_size int /
-#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_pdf outputfile_name_pdf --output_tabular outputfile_name_tabular --output_chimeras_tabular outputfile_name_chimeras_tabular
+#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outputfile_name_tabular

 import argparse
 import itertools
@@ -23,6 +23,8 @@
 from collections import Counter, defaultdict
 from functools import partial
 from multiprocessing.pool import Pool
+import random
+import os

 import matplotlib.pyplot as plt
 import numpy
@@ -142,22 +144,23 @@
                              xy=(label, x_label + len(con_list1) * 0.01),
                              xycoords="data", color="#000066", fontsize=10)

-    legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags)
-    plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
-    if nr_unique_chimeras != 0 and len_sample != 0:
-        if relative == True:
-            legend = "nr. of unique chimeric tags= {:,} ({:.5f}) (rel.diff=1)".format(nr_unique_chimeras,
-                                                                         int(nr_unique_chimeras) / float(len_sample))
-        else:
-            legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(nr_unique_chimeras, int(nr_unique_chimeras) / float(len_sample))
-        plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
+    legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, sum(counts))
+    plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
+
+    # if nr_unique_chimeras != 0 and len_sample != 0:
+    #     if relative == True:
+    #         legend = "nr. of unique chimeric tags= {:,} ({:.5f}) (rel.diff=1)".format(nr_unique_chimeras,
+    #                                                                      int(nr_unique_chimeras) / float(len_sample))
+    #     else:
+    #         legend = "nr. of unique chimeric tags= {:,} ({:.5f})".format(nr_unique_chimeras, int(nr_unique_chimeras) / float(len_sample))
+    #     plt.text(0.14, -0.09, legend, size=12, transform=plt.gcf().transFigure)

     pdf.savefig(fig, bbox_inches="tight")
     plt.close("all")
     plt.clf()


-def plotHDwithinSeq_Sum2(sum1, sum1min, sum2, sum2min, min_value, lenTags, title_file1, pdf):
+def plotHDwithinSeq_Sum2(sum1, sum1min, sum2, sum2min, min_value, lenTags, title_file1, pdf, len_sample):
     fig = plt.figure(figsize=(6, 8))
     plt.subplots_adjust(bottom=0.1)

@@ -172,7 +175,7 @@
     else:
         range1 = range(minimumX, maximumX + 2)

-    plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, label=[ "HD a", "HD b'", "HD b", "HD a'", "HD a+b"], bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], edgecolor='black', linewidth=1)
+    plt.hist(ham_partial, align="left", rwidth=0.8, stacked=False, label=["HD a", "HD b'", "HD b", "HD a'", "HD a+b"], bins=range1, color=["#58ACFA", "#0404B4", "#FE642E", "#B40431", "#585858"], edgecolor='black', linewidth=1)

     plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1))
     plt.suptitle('Hamming distances within tags', fontsize=14)
@@ -184,9 +187,10 @@
     plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.2))
     plt.xticks(numpy.arange(0, maximumX + 1, 1.0))
     # plt.ylim(0, maximumY * 1.2)
+    legend = "nr. of tags = {:,}\nsample size = {:,}\nnr. of data points = {:,}".format(lenTags, len_sample, len(numpy.concatenate(ham_partial)))

-    legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags)
-    plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
+    # legend = "sample size= {:,} against {:,}".format(len(numpy.concatenate(ham_partial)), lenTags)
+    plt.text(0.14, -0.05, legend, size=12, transform=plt.gcf().transFigure)
     pdf.savefig(fig, bbox_inches="tight")
     plt.close("all")
     plt.clf()
@@ -493,6 +497,10 @@
         half2_mate1 = array1_half
         half1_mate2 = array2_half2
         half2_mate2 = array2_half
+    # half1_mate1, index_halves = numpy.unique(half1_mate1, return_index=True)
+    # print(len(half1_mate1))
+    # half2_mate1 = half2_mate1[index_halves]
+    # array1 = array1[index_halves]

     for a, b, tag in zip(half1_mate1, half2_mate1, array1):
         # exclude identical tag from array2, to prevent comparison to itself
@@ -508,38 +516,43 @@
         dist = numpy.array([sum(itertools.imap(operator.ne, a, c)) for c in
                             array2_half_withoutSame])  # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
         min_index = numpy.where(dist == dist.min())[0]  # get index of min HD
-        min_value = dist[min_index]  # get minimum HDs
+        min_value = dist.min()
+        # min_value = dist[min_index]  # get minimum HDs
         min_tag_half2 = array2_half2_withoutSame[min_index]  # get all "b's" of the tag or all "a's" of the tag with minimum HD
         min_tag_array2 = array2_withoutSame[min_index]  # get whole tag with min HD

-        dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in
-                             min_tag_half2])  # calculate HD of "b" to all "b's" or "a" to all "a's"
-        dist2 = [dist_second_half.max()]
-        min_value = [dist.min()]
+        dist_second_half = numpy.array([sum(itertools.imap(operator.ne, b, e)) for e in min_tag_half2])  # calculate HD of "b" to all "b's" or "a" to all "a's"
+        max_value = dist_second_half.max()
         max_index = numpy.where(dist_second_half == dist_second_half.max())[0]  # get index of max HD
         max_tag = min_tag_array2[max_index]

-        for d, d2 in zip(min_value, dist2):
-            if mate_b is True:  # half2, corrects the variable of the HD from both halfs if it is a or b
-                ham2.append(d)
-                ham2min.append(d2)
-            else:  # half1, corrects the variable of the HD from both halfs if it is a or b
-                ham1.append(d)
-                ham1min.append(d2)
+        # for d, d2 in zip(min_value, max_value):
+        if mate_b is True:  # half2, corrects the variable of the HD from both halfs if it is a or b
+            ham2.append(min_value)
+            ham2min.append(max_value)
+        else:  # half1, corrects the variable of the HD from both halfs if it is a or b
+            ham1.append(min_value)
+            ham1min.append(max_value)

-            min_valueList.append(d + d2)
-            min_tagsList.append(tag)
-            difference1 = abs(d - d2)
-            diff11.append(difference1)
-            rel_difference = round(float(difference1) / (d + d2), 1)
-            relativeDiffList.append(rel_difference)
+        min_valueList.append(min_value + max_value)
+        min_tagsList.append(tag)
+        difference1 = abs(min_value - max_value)
+        diff11.append(difference1)
+        rel_difference = round(float(difference1) / (min_value + max_value), 1)
+        relativeDiffList.append(rel_difference)

-            # tags which have identical parts:
-            if d == 0 or d2 == 0:
-                min_tagsList_zeros.append(numpy.array(tag))
-                difference1_zeros = abs(d - d2)  # hd of non-identical part
-                diff11_zeros.append(difference1_zeros)
-                max_tag_list.append(numpy.array(max_tag))
+        # tags which have identical parts:
+        if min_value == 0 or max_value == 0:
+            min_tagsList_zeros.append(numpy.array(tag))
+            difference1_zeros = abs(min_value - max_value)  # hd of non-identical part
+            diff11_zeros.append(difference1_zeros)
+            max_tag_list.append(max_tag)
+        else:
+            min_tagsList_zeros.append(None)
+            diff11_zeros.append(None)
+            max_tag_list.append(numpy.array(["None"]))
+
+            # max_tag_list.append(numpy.array(max_tag))

         i += 1

@@ -667,22 +680,26 @@
     parser.add_argument('--maxFS', default=0, type=int,
                         help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis')
     parser.add_argument('--nr_above_bars', action="store_true",
-                        help='If no, values above bars in the histrograms are removed')
+                        help='If no, values above bars in the histograms are removed')

     parser.add_argument('--output_tabular', default="data.tabular", type=str,
                         help='Name of the tabular file.')
     parser.add_argument('--output_pdf', default="data.pdf", type=str,
                         help='Name of the pdf file.')
-    parser.add_argument('--output_chimeras_tabular', default="data_chimeras.tabular", type=str,
+    parser.add_argument('--output_chimeras_tabular', default="data.tabular", type=str,
                         help='Name of the tabular file with all chimeric tags.')
+
     return parser


 def Hamming_Distance_Analysis(argv):
+
     parser = make_argparser()
     args = parser.parse_args(argv[1:])
+
     file1 = args.inputFile
     name1 = args.inputName1
+
     index_size = args.sample_size
     title_savedFile_pdf = args.output_pdf
     title_savedFile_csv = args.output_tabular
@@ -693,6 +710,7 @@
     minFS = args.minFS
     maxFS = args.maxFS
     nr_above_bars = args.nr_above_bars
+
     subset = args.subset_tag
     nproc = args.nproc

@@ -722,7 +740,7 @@
         print("dataset: ", name1)
         integers, data_array = readFileReferenceFree(file1)
         data_array = numpy.array(data_array)
-        print("total nr of tags:", len(data_array))
+        print("total nr of tags with Ns:", len(data_array))
         n = [i for i, x in enumerate(data_array[:, 1]) if "N" in x]
         if len(n) != 0:  # delete tags with N in the tag from data
             print("nr of tags with N's within tag:", len(n), float(len(n)) / len(data_array))
@@ -732,8 +750,6 @@
             integers = integers[index_withoutN_inTag]
             print("total nr of tags without Ns:", len(data_array))

-        data_array_whole_dataset = data_array
-
         int_f = numpy.array(data_array[:, 0]).astype(int)
         data_array = data_array[numpy.where(int_f >= minFS)]
         integers = integers[integers >= minFS]
@@ -744,13 +760,10 @@
             data_array = data_array[numpy.where(int_f2 <= maxFS)]
             integers = integers[integers <= maxFS]

-        print("min FS", min(integers))
-        print("max FS", max(integers))
+        if onlyDuplicates is True:
+            tags = data_array[:, 2]
+            seq = data_array[:, 1]

-        tags = data_array[:, 2]
-        seq = data_array[:, 1]
-
-        if onlyDuplicates is True:
             # find all unique tags and get the indices for ALL tags, but only once
             u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
             d = u[c > 1]
@@ -763,11 +776,19 @@
             duplTags_tag = tags[numpy.in1d(seq, d)][0::2]  # ab
             duplTags_seq = seq[numpy.in1d(seq, d)][0::2]  # ab - tags

+            if minFS > 1:
+                duplTags_tag = duplTags_tag[(duplTags >= 3) & (duplTagsBA >= 3)]
+                duplTags_seq = duplTags_seq[(duplTags >= 3) & (duplTagsBA >= 3)]
+                duplTags = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)]  # ab+ba with FS>=3
+
             data_array = numpy.column_stack((duplTags, duplTags_seq))
             data_array = numpy.column_stack((data_array, duplTags_tag))
             integers = numpy.array(data_array[:, 0]).astype(int)
             print("DCS in whole dataset", len(data_array))

+        print("min FS", min(integers))
+        print("max FS", max(integers))
+
         # HD analysis for a subset of the tag
         if subset > 0:
             tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]])
@@ -789,16 +810,18 @@
             data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
             data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))

-        print("length of tag:", len(data_array[0, 1]))
+        print("length of tag= ", len(data_array[0, 1]))
         # select sample: if no size given --> all vs. all comparison
         if index_size == 0:
             result = numpy.arange(0, len(data_array), 1)
         else:
-            result = numpy.random.choice(len(integers), size=index_size,
-                                         replace=False)  # array of random sequences of size=index.size
-            # unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True) # get only unique tags
-            # result = numpy.random.choice(unique_indices, size=index_size,
-            #                        replace=False)  # array of random sequences of size=index.size
+            numpy.random.shuffle(data_array)
+            unique_tags, unique_indices = numpy.unique(data_array[:, 1], return_index=True)  # get only unique tags
+            result = numpy.random.choice(unique_indices, size=index_size, replace=False)  # array of random sequences of size=index.size
+
+            # result = numpy.random.choice(len(integers), size=index_size,
+            #                             replace=False)  # array of random sequences of size=index.size
+            # result = numpy.where(numpy.array(random_tags) == numpy.array(data_array[:,1]))[0]

         # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
         #     pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)
@@ -806,7 +829,7 @@
         # comparison random tags to whole dataset
         result1 = data_array[result, 1]  # random tags
         result2 = data_array[:, 1]  # all tags
-        print("sample size:", len(result1))
+        print("sample size= ", len(result1))

         # HD analysis of whole tag
         proc_pool = Pool(nproc)
@@ -819,7 +842,7 @@
         # for h, tag in zip(ham, result1):
         #     output_file1.write("{}\t{}\n".format(tag, h))

-        # HD analysis for chimeric reads
+        # # HD analysis for chimeric reads
         # result2 = data_array_whole_dataset[:,1]

         proc_pool_b = Pool(nproc)
@@ -827,80 +850,130 @@
         diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
         proc_pool_b.close()
         proc_pool_b.join()
-        diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]),
-                                  numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int)
         HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]),
                                      numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int)
         HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]),
                                      numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int)
         minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]),
                                     numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int)
-        minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]),
-                                        numpy.concatenate([item_b[4] for item_b in diff_list_b])))
-        rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]),
-                                      numpy.concatenate([item_b[5] for item_b in diff_list_b])))
-        diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]),
-                                        numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int)
-        minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]),
-                                              numpy.concatenate([item_b[7] for item_b in diff_list_b])))
         HDhalf1min = numpy.concatenate((numpy.concatenate([item[8] for item in diff_list_a]),
                                         numpy.concatenate([item_b[8] for item_b in diff_list_b]))).astype(int)
         HDhalf2min = numpy.concatenate((numpy.concatenate([item[9] for item in diff_list_a]),
                                         numpy.concatenate([item_b[9] for item_b in diff_list_b]))).astype(int)

-        chimera_tags = numpy.concatenate(([item[10] for item in diff_list_a],
-                                       [item_b[10] for item_b in diff_list_b]))
-
-        chimera_tags = [x for x in chimera_tags if x != []]
-        chimera_tags_new = []
+        rel_Diff1 = numpy.concatenate([item[5] for item in diff_list_a])
+        rel_Diff2 = numpy.concatenate([item[5] for item in diff_list_b])
+        diff1 = numpy.concatenate([item[0] for item in diff_list_a])
+        diff2 = numpy.concatenate([item[0] for item in diff_list_b])

-        for i in chimera_tags:
-            if len(i) > 1:
-                for t in i:
-                    chimera_tags_new.append(t)
-            else:
-                chimera_tags_new.extend(i)
+        diff_zeros1 = numpy.concatenate([item[6] for item in diff_list_a])
+        diff_zeros2 = numpy.concatenate([item[6] for item in diff_list_b])
+        minHD_tags = numpy.concatenate([item[4] for item in diff_list_a])
+        minHD_tags_zeros1 = numpy.concatenate([item[7] for item in diff_list_a])
+        minHD_tags_zeros2 = numpy.concatenate([item[7] for item in diff_list_b])
+        chim_tags = [item[10] for item in diff_list_a]
+        chim_tags2 = [item[10] for item in diff_list_b]
+        chimera_tags1 = [ii if isinstance(i, list) else i for i in chim_tags for ii in i]
+        chimera_tags2 = [ii if isinstance(i, list) else i for i in chim_tags2 for ii in i]
+
+        rel_Diff = []
+        diff_zeros = []
+        minHD_tags_zeros = []
+        diff = []
+        chimera_tags = []
+        for d1, d2, rel1, rel2, zeros1, zeros2, tag1, tag2, ctag1, ctag2 in \
+                zip(diff1, diff2, rel_Diff1, rel_Diff2, diff_zeros1, diff_zeros2, minHD_tags_zeros1, minHD_tags_zeros2, chimera_tags1, chimera_tags2):
+            rel_Diff.append(max(rel1, rel2))
+            diff.append(max(d1, d2))

-        chimeras_dic = defaultdict(list)
-        for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new):
-            chimeras_dic[t1].append(t2)
+            if all(i is not None for i in [zeros1, zeros2]):
+                diff_zeros.append(max(zeros1, zeros2))
+                minHD_tags_zeros.append(str(tag1))
+                tags = [ctag1, ctag2]
+                chimera_tags.append(tags)
+            elif zeros1 is not None and zeros2 is None:
+                diff_zeros.append(zeros1)
+                minHD_tags_zeros.append(str(tag1))
+                chimera_tags.append(ctag1)
+            elif zeros1 is None and zeros2 is not None:
+                diff_zeros.append(zeros2)
+                minHD_tags_zeros.append(str(tag2))
+                chimera_tags.append(ctag2)

-        lst_unique_chimeras = []
+        chimera_tags_new = chimera_tags
+        #data_chimeraAnalysis = numpy.column_stack((minHD_tags_zeros, chimera_tags_new))
+        # chimeras_dic = defaultdict(list)
+        #
+        # for t1, t2 in zip(minHD_tags_zeros, chimera_tags_new):
+        #     if len(t2) >1 and type(t2) is not numpy.ndarray:
+        #         t2 = numpy.concatenate(t2)
+        #     chimeras_dic[t1].append(t2)
+
         with open(output_chimeras_tabular, "w") as output_file1:
-            unique_chimeras = numpy.unique(minHD_tags_zeros)
-            sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras])  # mate1 part1
-            sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras])  # mate1 part 2
+            output_file1.write("chimera tag\tsimilar tag with HD=0\n")
+            for i in range(len(minHD_tags_zeros)):
+                tag1 = minHD_tags_zeros[i]
+                sample_half_a = tag1[0:(len(tag1)) / 2]
+                sample_half_b = tag1[len(tag1) / 2:len(tag1)]

-            output_file1.write("sample tag\tsimilar tag\n")
-            for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b):
-                max_tags = numpy.concatenate(chimeras_dic.get(tag1))
-
-                if tag1 in chimeras_dic.values():
-                    continue
-                else:
-                    lst_unique_chimeras.append(tag1)
+                max_tags = chimera_tags_new[i]
+                if isinstance(max_tags, list) and len(max_tags) > 1:
+                    max_tags = numpy.concatenate(max_tags)
+                #if isinstance(max_tags, list): #and type(max_tags) is not numpy.ndarray:
+                #    print(max_tags)
+                #    max_tags = numpy.concatenate(max_tags)
+                max_tags = numpy.unique(max_tags)

                 chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags])  # mate1 part1
                 chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags])  # mate1 part 2

                 new_format = []
-                for i in range(len(max_tags)):
-                    if a == chimera_half_a[i]:
-                        max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i])
+                for j in range(len(max_tags)):
+                    if sample_half_a == chimera_half_a[j]:
+                        max_tag = "*{}* {}".format(chimera_half_a[j], chimera_half_b[j])
                         new_format.append(max_tag)

-                    elif b == chimera_half_b[i]:
-                        max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i])
+                    elif sample_half_b == chimera_half_b[j]:
+                        max_tag = "{} *{}*".format(chimera_half_a[j], chimera_half_b[j])
                         new_format.append(max_tag)

-                sample_tag = "{} {}".format(a, b)
+                sample_tag = "{} {}".format(sample_half_a, sample_half_b)
                 output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
             output_file1.write(
                 "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n "
                 "The tags were separated by an empty space into their halves and the * marks the identical half.")

-        nr_chimeric_tags = len(lst_unique_chimeras)
-        print("nr of unique chimeras:", nr_chimeric_tags)
+            # unique_chimeras = numpy.array(minHD_tags_zeros)
+            #
+            # sample_half_a = numpy.array([i[0:(len(i)) / 2] for i in unique_chimeras])  # mate1 part1
+            # sample_half_b = numpy.array([i[len(i) / 2:len(i)] for i in unique_chimeras])  # mate1 part 2
+            #
+            # output_file1.write("sample tag\tsimilar tag\n")
+            # for tag1, a, b in zip(unique_chimeras, sample_half_a, sample_half_b):
+            #     max_tags = numpy.concatenate(chimeras_dic.get(tag1))
+            #     max_tags = numpy.unique(max_tags)
+            #
+            #     chimera_half_a = numpy.array([i[0:(len(i)) / 2] for i in max_tags])  # mate1 part1
+            #     chimera_half_b = numpy.array([i[len(i) / 2:len(i)] for i in max_tags])  # mate1 part 2
+            #
+            #     new_format = []
+            #     for i in range(len(max_tags)):
+            #         if a == chimera_half_a[i]:
+            #             max_tag = "*{}* {}".format(chimera_half_a[i], chimera_half_b[i])
+            #             new_format.append(max_tag)
+            #
+            #         elif b == chimera_half_b[i]:
+            #             max_tag = "{} *{}*".format(chimera_half_a[i], chimera_half_b[i])
+            #             new_format.append(max_tag)
+            #
+            #     sample_tag = "{} {}".format(a, b)
+            #     output_file1.write("{}\t{}\n".format(sample_tag, ", ".join(new_format)))
+            # output_file1.write(
+            #     "This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.\n "
+            #     "The tags were separated by an empty space into their halves and the * marks the identical half.")
+
+        nr_chimeric_tags = len(minHD_tags_zeros)
+        print("nr of unique chimeras", nr_chimeric_tags)

         lenTags = len(data_array)
         len_sample = len(result1)
@@ -931,45 +1004,32 @@
         else:
             seqDic = dict(zip(seq, quant))

-
         lst_minHD_tags = []
         for i in minHD_tags:
             lst_minHD_tags.append(seqDic.get(i))

         if onlyDuplicates:
-            lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags],
-                                        [item_b[1] for item_b in lst_minHD_tags])).astype(int)
-        # else:
-        #     lst_minHD_tags = numpy.concatenate(lst_minHD_tags)
+            lst_minHD_tags = numpy.concatenate(([item[0] for item in lst_minHD_tags], [item_b[1] for item_b in lst_minHD_tags])).astype(int)

         # histogram with absolute and relative difference between HDs of both parts of the tag
         listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
         listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
                                                                                                  rel_Diff)
-
-        # family size distribution separated after the difference between HDs of both parts of the tag
-        # familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD(
-        #   lst_minHD_tags, diff, diff=True, rel=False)
-        # familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD(
-        #    lst_minHD_tags, rel_Diff, diff=True, rel=True)
-
         # chimeric read analysis: tags which have HD=0 in one of the halfs
         if len(minHD_tags_zeros) != 0:
             lst_minHD_tags_zeros = []
             for i in minHD_tags_zeros:
                 lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads
             if onlyDuplicates:
-                lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros],
-                                                [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)
-
+                lst_minHD_tags_zeros = numpy.concatenate(([item[0] for item in lst_minHD_tags_zeros], [item_b[1] for item_b in lst_minHD_tags_zeros])).astype(int)
+
             # histogram with HD of non-identical half
-            listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
-            lst_minHD_tags_zeros, diff_zeros)
-
+            listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(lst_minHD_tags_zeros, diff_zeros)
+
         # plot Hamming Distance with Family size distribution
         plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
                       subtitle="Hamming distance separated by family size", title_file1=name1, lenTags=lenTags,
-                      xlabel="HD", nr_above_bars=nr_above_bars)
+                      xlabel="HD", nr_above_bars=nr_above_bars, len_sample=len_sample)

         # Plot FSD with separation after
         plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
@@ -978,13 +1038,13 @@

         # Plot HD within tags
         plotHDwithinSeq_Sum2(HDhalf1, HDhalf1min, HDhalf2, HDhalf2min, minHDs, pdf=pdf, lenTags=lenTags,
-                             title_file1=name1)
+                             title_file1=name1, len_sample=len_sample)

         # Plot difference between HD's separated after FSD
         plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
                       subtitle="Delta Hamming distance within tags",
                       title_file1=name1, lenTags=lenTags,
-                      xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars)
+                      xlabel="absolute delta HD", relative=False, nr_above_bars=nr_above_bars, len_sample=len_sample)

         plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
                       subtitle="Chimera Analysis: relative delta Hamming distances",
@@ -994,7 +1054,7 @@
         # plots for chimeric reads
         if len(minHD_tags_zeros) != 0:
             # HD
-            plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,subtitle="Hamming distance of the non-identical half of chimeras",
+            plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf, subtitle="Hamming distance of chimeras",
                           title_file1=name1, lenTags=lenTags, xlabel="HD", relative=False,
                           nr_above_bars=nr_above_bars, nr_unique_chimeras=nr_chimeric_tags, len_sample=len_sample)

@@ -1047,9 +1107,11 @@

         # HD within tags
         output_file.write(
-            "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n"
-            "Since this calculation was repeated, but starting with the second half to find all possible chimeras in the data, the actual number of tags in the plots differs from the sample size entered by the user.\n"
-            "In addition, both family sizes of one tag will be included in the plots if only tags of reads that can form a DCS were allowed.\n")
+            "The Hamming distances were calculated by comparing the first halve against all halves and selected the minimum value (HD a).\n"
+            "For the second half of the tag, we compared them against all tags which resulted in the minimum HD of the previous step and selected the maximum value (HD b').\n"
+            "Finally, it was possible to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).\n"
+            "These calculations were repeated, but starting with the second half in the first step to find all possible chimeras in the data (HD b and HD  For simplicity we used the maximum value between the delta values in the end.\n"
+            "When only tags that can form DCS were allowed in the analysis, family sizes for the forward and reverse (ab and ba) will be included in the plots.\n")

         output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))

@@ -1063,16 +1125,12 @@

         if len(minHD_tags_zeros) != 0:
             output_file.write(
-                "Chimeras:\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the Hamming distance of the non-identical half is shown.\n")
-            output_file.write(
-                "Be aware that the real number of chimeric tags (where rel. diff = 1) is not shown in the plot because of the above reasons.\n")
-            output_file.write("real number of chimeric tags{}{}{}{}\n".format(sep, nr_chimeric_tags, sep, int(nr_chimeric_tags) / float(len_sample)))
+                "Chimeras:\nAll tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, had a relative delta of 1 were kept. These tags are considered as chimeric.\nSo the Hamming distances of the chimeric tags are shown.\n")
             createFileHD(summary15, sumCol15, overallSum15, output_file,
-                         "Hamming distances of non-zero half", sep)
+                         "Hamming distances of chimeras", sep)

         output_file.write("\n")


 if __name__ == '__main__':
     sys.exit(Hamming_Distance_Analysis(sys.argv))
-
--- a/hd.xml	Wed Feb 27 09:17:04 2019 -0500
+++ b/hd.xml	Tue May 14 03:29:37 2019 -0400
@@ -28,11 +28,11 @@
     </outputs>
     <tests>
         <test>
-            <param name="inputFile" value="Test_data.tabular"/>
+            <param name="inputFile" value="hd_data.tab"/>
             <param name="sampleSize" value="0"/>
-            <output name="output_pdf" file="output_file.pdf" lines_diff="6"/>
-            <output name="output_tabular" file="output_file.tabular"/>
-            <output name="output_chimeras_tabular" file="output_file_chimeras.tabular"/>
+            <output name="output_pdf" file="hd_output.pdf" lines_diff="6"/>
+            <output name="output_tabular" file="hd_output.tab"/>
+            <output name="output_chimeras_tabular" file="hd_output_chimeras.tab"/>
         </test>
     </tests>
     <help> <![CDATA[
--- a/test-data/Test_data.tabular	Wed Feb 27 09:17:04 2019 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,20 +0,0 @@
-1	AAAAAAAAAAAAAACCAAAACTTC	ba
-1	AAAAAAAAAAAAACCAGGCGTCGA	ba
-1	AAAAAAAAAAAAAGCTCCACGTTG	ba
-1	AAAAAAAAAAAAATCGTGGTTTGT	ba
-1	AAAAAAAAAAAAATTCACCCTTGT	ba
-7	AAAAAAAAAAAACACACTTAACTT	ba
-1	AAAAAAAAAAAACAGTGTTGAGAC	ba
-4	AAAAAAAAAAAACCGCTCCTCACA	ba
-1	AAAAAAAAAAAAGGCAACACAGAA	ab
-2	AAAAAAAAAAAATCTTTCTTTGAG	ab
-1	AAAAAAAAAAAATTGGGTTCCTTA	ab
-1	AAAAAAAAAAAGAGTCGCACCCAG	ba
-4	AAAAAAAAAAAGATCGTGGTTTGT	ba
-1	AAAAAAAAAAAGCGCAACACAGAA	ab
-3	AAAAAAAAAAAGGGCAACACAGAA	ab
-1	AAAAAAAAAAAGTAGCCCTAAACG	ab
-1	AAAAAAAAAAAGTCTTTCTTTGAG	ab
-1	AAAAAAAAAAATATCATAGACTCT	ab
-6	AAAAAAAAAAATATTCACCCTTGT	ba
-1	AAAAAAAAAAATATTCGAAAGTTA	ba
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/hd_chimeras_output.tab	Tue May 14 03:29:37 2019 -0400
@@ -0,0 +1,23 @@
+chimera tag	similar tag with HD=0
+AAAAAAAAAAAA AACCAAAACTTC	*AAAAAAAAAAAA* TCTTTCTTTGAG
+AAAAAAAAAAAA ACCAGGCGTCGA	*AAAAAAAAAAAA* AACCAAAACTTC, *AAAAAAAAAAAA* AGCTCCACGTTG, *AAAAAAAAAAAA* CAGTGTTGAGAC, *AAAAAAAAAAAA* TCTTTCTTTGAG, *AAAAAAAAAAAA* TTGGGTTCCTTA
+AAAAAAAAAAAA AGCTCCACGTTG	*AAAAAAAAAAAA* CAGTGTTGAGAC, *AAAAAAAAAAAA* CCGCTCCTCACA
+AAAAAAAAAAAA ATCGTGGTTTGT	*AAAAAAAAAAAA* CAGTGTTGAGAC, AAAAAAAAAAAG *ATCGTGGTTTGT*
+AAAAAAAAAAAA ATTCACCCTTGT	*AAAAAAAAAAAA* CAGTGTTGAGAC, AAAAAAAAAAAT *ATTCACCCTTGT*
+AAAAAAAAAAAA CACACTTAACTT	*AAAAAAAAAAAA* ATTCACCCTTGT, *AAAAAAAAAAAA* CCGCTCCTCACA, *AAAAAAAAAAAA* TCTTTCTTTGAG
+AAAAAAAAAAAA CAGTGTTGAGAC	*AAAAAAAAAAAA* ATCGTGGTTTGT, *AAAAAAAAAAAA* ATTCACCCTTGT, *AAAAAAAAAAAA* CACACTTAACTT
+AAAAAAAAAAAA CCGCTCCTCACA	*AAAAAAAAAAAA* AGCTCCACGTTG, *AAAAAAAAAAAA* CACACTTAACTT
+AAAAAAAAAAAA GGCAACACAGAA	*AAAAAAAAAAAA* ATCGTGGTTTGT, AAAAAAAAAAAG *GGCAACACAGAA*
+AAAAAAAAAAAA TCTTTCTTTGAG	*AAAAAAAAAAAA* AACCAAAACTTC, AAAAAAAAAAAG *TCTTTCTTTGAG*
+AAAAAAAAAAAA TTGGGTTCCTTA	*AAAAAAAAAAAA* ACCAGGCGTCGA, *AAAAAAAAAAAA* GGCAACACAGAA, *AAAAAAAAAAAA* TCTTTCTTTGAG
+AAAAAAAAAAAG AGTCGCACCCAG	*AAAAAAAAAAAG* ATCGTGGTTTGT
+AAAAAAAAAAAG ATCGTGGTTTGT	AAAAAAAAAAAA *ATCGTGGTTTGT*, *AAAAAAAAAAAG* TAGCCCTAAACG
+AAAAAAAAAAAG CGCAACACAGAA	*AAAAAAAAAAAG* ATCGTGGTTTGT
+AAAAAAAAAAAG GGCAACACAGAA	AAAAAAAAAAAA *GGCAACACAGAA*, *AAAAAAAAAAAG* ATCGTGGTTTGT
+AAAAAAAAAAAG TAGCCCTAAACG	*AAAAAAAAAAAG* ATCGTGGTTTGT
+AAAAAAAAAAAG TCTTTCTTTGAG	AAAAAAAAAAAA *TCTTTCTTTGAG*, *AAAAAAAAAAAG* ATCGTGGTTTGT, *AAAAAAAAAAAG* CGCAACACAGAA, *AAAAAAAAAAAG* GGCAACACAGAA
+AAAAAAAAAAAT ATCATAGACTCT	*AAAAAAAAAAAT* ATTCACCCTTGT
+AAAAAAAAAAAT ATTCACCCTTGT	AAAAAAAAAAAA *ATTCACCCTTGT*, *AAAAAAAAAAAT* ATCATAGACTCT
+AAAAAAAAAAAT ATTCGAAAGTTA	*AAAAAAAAAAAT* ATCATAGACTCT, *AAAAAAAAAAAT* ATTCACCCTTGT
+This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.
+ The tags were separated by an empty space into their halves and the * marks the identical half.
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/hd_data.tab	Tue May 14 03:29:37 2019 -0400
@@ -0,0 +1,20 @@
+1	AAAAAAAAAAAAAACCAAAACTTC	ba
+1	AAAAAAAAAAAAACCAGGCGTCGA	ba
+1	AAAAAAAAAAAAAGCTCCACGTTG	ba
+1	AAAAAAAAAAAAATCGTGGTTTGT	ba
+1	AAAAAAAAAAAAATTCACCCTTGT	ba
+7	AAAAAAAAAAAACACACTTAACTT	ba
+1	AAAAAAAAAAAACAGTGTTGAGAC	ba
+4	AAAAAAAAAAAACCGCTCCTCACA	ba
+1	AAAAAAAAAAAAGGCAACACAGAA	ab
+2	AAAAAAAAAAAATCTTTCTTTGAG	ab
+1	AAAAAAAAAAAATTGGGTTCCTTA	ab
+1	AAAAAAAAAAAGAGTCGCACCCAG	ba
+4	AAAAAAAAAAAGATCGTGGTTTGT	ba
+1	AAAAAAAAAAAGCGCAACACAGAA	ab
+3	AAAAAAAAAAAGGGCAACACAGAA	ab
+1	AAAAAAAAAAAGTAGCCCTAAACG	ab
+1	AAAAAAAAAAAGTCTTTCTTTGAG	ab
+1	AAAAAAAAAAATATCATAGACTCT	ab
+6	AAAAAAAAAAATATTCACCCTTGT	ba
+1	AAAAAAAAAAATATTCGAAAGTTA	ba
\ No newline at end of file
Binary file test-data/hd_output.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/hd_output.tab	Tue May 14 03:29:37 2019 -0400
@@ -0,0 +1,77 @@
+hd_data.tab
+number of tags per file	20 (from 20) against 20
+
+Hamming distance separated by family size
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
+HD=1	5	1	1	1	1	0	9
+HD=6	3	0	0	0	0	0	3
+HD=7	4	0	0	0	1	0	5
+HD=8	2	0	0	1	0	0	3
+sum	14	1	1	2	2	0	20
+
+Family size distribution separated by Hamming distance
+	HD=1	HD=2	HD=3	HD=4	HD=5-8	HD>8	sum
+FS=1	5	0	0	0	9	0	14
+FS=2	1	0	0	0	0	0	1
+FS=3	1	0	0	0	0	0	1
+FS=4	1	0	0	0	1	0	2
+FS=6	1	0	0	0	0	0	1
+FS=7	0	0	0	0	1	0	1
+sum	9	0	0	0	11	0	20
+
+
+max. family size in sample:	7
+absolute frequency:	1
+relative frequency:	0.05
+
+The Hamming distances were calculated by comparing the first halve against all halves and selected the minimum value (HD a).
+For the second half of the tag, we compared them against all tags which resulted in the minimum HD of the previous step and selected the maximum value (HD b').
+Finally, it was possible to calculate the absolute and relative differences between the HDs (absolute and relative delta HD).
+These calculations were repeated, but starting with the second half in the first step to find all possible chimeras in the data (HD b and HD  For simplicity we used the maximum value between the delta values in the end.
+When only tags that can form DCS were allowed in the analysis, family sizes for the forward and reverse (ab and ba) will be included in the plots.
+length of one part of the tag = 12
+
+Hamming distance of each half in the tag
+	HD a	HD b'	HD b	HD a'	HD a+b	sum
+HD=0	20	0	8	1	0	29
+HD=1	0	0	1	19	8	28
+HD=2	0	0	0	0	1	1
+HD=5	0	0	3	0	0	3
+HD=6	0	0	2	0	3	5
+HD=7	0	1	6	0	4	11
+HD=8	0	2	0	0	7	9
+HD=9	0	1	0	0	1	2
+HD=10	0	2	0	0	2	4
+HD=11	0	7	0	0	7	14
+HD=12	0	7	0	0	7	14
+sum	20	20	20	20	40	120
+
+Absolute delta Hamming distances within the tag
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
+diff=7	1	0	0	0	0	0	1
+diff=8	1	0	0	0	1	0	2
+diff=9	1	0	0	0	0	0	1
+diff=10	2	0	0	0	0	0	2
+diff=11	4	0	1	1	1	0	7
+diff=12	5	1	0	1	0	0	7
+sum	14	1	1	2	2	0	20
+
+Chimera analysis: relative delta Hamming distances
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
+diff=1.0	14	1	1	2	2	0	20
+sum	14	1	1	2	2	0	20
+
+Chimeras:
+All tags were filtered: only those tags where at least one half was identical (HD=0) and therefore, had a relative delta of 1 were kept. These tags are considered as chimeric.
+So the Hamming distances of the chimeric tags are shown.
+Hamming distances of chimeras
+	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
+HD=7	1	0	0	0	0	0	1
+HD=8	1	0	0	0	1	0	2
+HD=9	1	0	0	0	0	0	1
+HD=10	2	0	0	0	0	0	2
+HD=11	4	0	1	1	1	0	7
+HD=12	5	1	0	1	0	0	7
+sum	14	1	1	2	2	0	20
+
+
Binary file test-data/output_file.pdf has changed
--- a/test-data/output_file.tabular	Wed Feb 27 09:17:04 2019 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,86 +0,0 @@
-Test_data
-number of tags per file	20 (from 20) against 20
-
-Hamming distance separated by family size
-	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
-HD=1	5	1	1	1	1	0	9
-HD=6	3	0	0	0	0	0	3
-HD=7	4	0	0	0	1	0	5
-HD=8	2	0	0	1	0	0	3
-sum	14	1	1	2	2	0	20
-
-Family size distribution separated by Hamming distance
-	HD=1	HD=2	HD=3	HD=4	HD=5-8	HD>8	sum
-FS=1	5	0	0	0	9	0	14
-FS=2	1	0	0	0	0	0	1
-FS=3	1	0	0	0	0	0	1
-FS=4	1	0	0	0	1	0	2
-FS=6	1	0	0	0	0	0	1
-FS=7	0	0	0	0	1	0	1
-sum	9	0	0	0	11	0	20
-
-
-max. family size in sample:	7
-absolute frequency:	1
-relative frequency:	0.05
-
-The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.
-Since this calculation was repeated, but starting with the second half to find all possible chimeras in the data, the actual number of tags in the plots differs from the sample size entered by the user.
-In addition, both family sizes of one tag will be included in the plots if only tags of reads that can form a DCS were allowed.
-length of one part of the tag = 12
-
-Hamming distance of each half in the tag
-	HD a	HD b'	HD b	HD a'	HD a+b	sum
-HD=0	20	0	8	1	0	29
-HD=1	0	0	1	19	8	28
-HD=2	0	0	0	0	1	1
-HD=5	0	0	3	0	0	3
-HD=6	0	0	2	0	3	5
-HD=7	0	1	6	0	4	11
-HD=8	0	2	0	0	7	9
-HD=9	0	1	0	0	1	2
-HD=10	0	2	0	0	2	4
-HD=11	0	7	0	0	7	14
-HD=12	0	7	0	0	7	14
-sum	20	20	20	20	40	120
-
-Absolute delta Hamming distances within the tag
-	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
-diff=0	1	0	0	0	0	0	1
-diff=1	4	1	1	1	1	0	8
-diff=4	3	0	0	0	0	0	3
-diff=5	2	0	0	0	0	0	2
-diff=6	3	0	0	1	1	0	5
-diff=7	2	0	0	0	0	0	2
-diff=8	1	0	0	0	1	0	2
-diff=9	1	0	0	0	0	0	1
-diff=10	2	0	0	0	0	0	2
-diff=11	4	0	1	1	1	0	7
-diff=12	5	1	0	1	0	0	7
-sum	28	2	2	4	4	0	40
-
-Chimera analysis: relative delta Hamming distances
-	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
-diff=0.0	1	0	0	0	0	0	1
-diff=0.7	5	0	0	0	0	0	5
-diff=0.8	3	0	0	1	1	0	5
-diff=1.0	19	2	2	3	3	0	29
-sum	28	2	2	4	4	0	40
-
-Chimeras:
-All tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.
-So the Hamming distance of the non-identical half is shown.
-Be aware that the real number of chimeric tags (where rel. diff = 1) is not shown in the plot because of the above reasons.
-real number of chimeric tags	20	1.0
-Hamming distances of non-zero half
-	FS=1	FS=2	FS=3	FS=4	FS=5-10	FS>10	sum
-HD=1	4	1	1	1	1	0	8
-HD=7	2	0	0	0	0	0	2
-HD=8	1	0	0	0	1	0	2
-HD=9	1	0	0	0	0	0	1
-HD=10	2	0	0	0	0	0	2
-HD=11	4	0	1	1	1	0	7
-HD=12	5	1	0	1	0	0	7
-sum	19	2	2	3	3	0	29
-
-
--- a/test-data/output_file_chimeras.tabular	Wed Feb 27 09:17:04 2019 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,23 +0,0 @@
-sample tag	similar tag
-AAAAAAAAAAAA AACCAAAACTTC	*AAAAAAAAAAAA* TCTTTCTTTGAG
-AAAAAAAAAAAA ACCAGGCGTCGA	*AAAAAAAAAAAA* AACCAAAACTTC, *AAAAAAAAAAAA* AGCTCCACGTTG, *AAAAAAAAAAAA* CAGTGTTGAGAC, *AAAAAAAAAAAA* TCTTTCTTTGAG, *AAAAAAAAAAAA* TTGGGTTCCTTA
-AAAAAAAAAAAA AGCTCCACGTTG	*AAAAAAAAAAAA* CAGTGTTGAGAC, *AAAAAAAAAAAA* CCGCTCCTCACA
-AAAAAAAAAAAA ATCGTGGTTTGT	*AAAAAAAAAAAA* CAGTGTTGAGAC, AAAAAAAAAAAG *ATCGTGGTTTGT*
-AAAAAAAAAAAA ATTCACCCTTGT	*AAAAAAAAAAAA* CAGTGTTGAGAC, AAAAAAAAAAAT *ATTCACCCTTGT*
-AAAAAAAAAAAA CACACTTAACTT	*AAAAAAAAAAAA* ATTCACCCTTGT, *AAAAAAAAAAAA* CCGCTCCTCACA, *AAAAAAAAAAAA* TCTTTCTTTGAG
-AAAAAAAAAAAA CAGTGTTGAGAC	*AAAAAAAAAAAA* ATCGTGGTTTGT, *AAAAAAAAAAAA* ATTCACCCTTGT, *AAAAAAAAAAAA* CACACTTAACTT
-AAAAAAAAAAAA CCGCTCCTCACA	*AAAAAAAAAAAA* AGCTCCACGTTG, *AAAAAAAAAAAA* CACACTTAACTT
-AAAAAAAAAAAA GGCAACACAGAA	*AAAAAAAAAAAA* ATCGTGGTTTGT, AAAAAAAAAAAG *GGCAACACAGAA*
-AAAAAAAAAAAA TCTTTCTTTGAG	*AAAAAAAAAAAA* AACCAAAACTTC, AAAAAAAAAAAG *TCTTTCTTTGAG*
-AAAAAAAAAAAA TTGGGTTCCTTA	*AAAAAAAAAAAA* ACCAGGCGTCGA, *AAAAAAAAAAAA* GGCAACACAGAA, *AAAAAAAAAAAA* TCTTTCTTTGAG
-AAAAAAAAAAAG AGTCGCACCCAG	*AAAAAAAAAAAG* ATCGTGGTTTGT
-AAAAAAAAAAAG ATCGTGGTTTGT	*AAAAAAAAAAAG* TAGCCCTAAACG, AAAAAAAAAAAA *ATCGTGGTTTGT*
-AAAAAAAAAAAG CGCAACACAGAA	*AAAAAAAAAAAG* ATCGTGGTTTGT
-AAAAAAAAAAAG GGCAACACAGAA	*AAAAAAAAAAAG* ATCGTGGTTTGT, AAAAAAAAAAAA *GGCAACACAGAA*
-AAAAAAAAAAAG TAGCCCTAAACG	*AAAAAAAAAAAG* ATCGTGGTTTGT
-AAAAAAAAAAAG TCTTTCTTTGAG	*AAAAAAAAAAAG* ATCGTGGTTTGT, *AAAAAAAAAAAG* CGCAACACAGAA, *AAAAAAAAAAAG* GGCAACACAGAA, AAAAAAAAAAAA *TCTTTCTTTGAG*
-AAAAAAAAAAAT ATCATAGACTCT	*AAAAAAAAAAAT* ATTCACCCTTGT
-AAAAAAAAAAAT ATTCACCCTTGT	*AAAAAAAAAAAT* ATCATAGACTCT, AAAAAAAAAAAA *ATTCACCCTTGT*
-AAAAAAAAAAAT ATTCGAAAGTTA	*AAAAAAAAAAAT* ATCATAGACTCT, *AAAAAAAAAAAT* ATTCACCCTTGT
-This file contains all tags that were identified as chimeras as the first column and the corresponding tags which returned a Hamming distance of zero in either the first or the second half of the sample tag as the second column.
- The tags were separated by an empty space into their halves and the * marks the identical half.
\ No newline at end of file