diff hd.py @ 1:7414792e1cb8 draft

planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/hd commit 90ee23e393deb06fa5c15e3778fa23c39a25f7ce
author mheinzl
date Sat, 12 May 2018 04:52:34 -0400
parents 239c4448a163
children 316fbf91dd12
line wrap: on
line diff
--- a/hd.py	Thu May 10 07:30:27 2018 -0400
+++ b/hd.py	Sat May 12 04:52:34 2018 -0400
@@ -24,16 +24,404 @@
 import cPickle as pickle
 from multiprocessing.pool import Pool
 from functools import partial
-from HDAnalysis_plots.plot_HDwithFSD import plotHDwithFSD
-from HDAnalysis_plots.plot_FSDwithHD2 import plotFSDwithHD2
-from HDAnalysis_plots.plot_HDwithinSeq_Sum2 import plotHDwithinSeq_Sum2
-from HDAnalysis_plots.table_HD import createTableHD, createFileHD, createTableHDwithTags, createFileHDwithinTag
-from HDAnalysis_plots.table_FSD import createTableFSD2, createFileFSD2
+#from HDAnalysis_plots.plot_HDwithFSD import plotHDwithFSD
+#from HDAnalysis_plots.plot_FSDwithHD2 import plotFSDwithHD2
+#from HDAnalysis_plots.plot_HDwithinSeq_Sum2 import plotHDwithinSeq_Sum2
+#from HDAnalysis_plots.table_HD import createTableHD, createFileHD, createTableHDwithTags, createFileHDwithinTag
+#from HDAnalysis_plots.table_FSD import createTableFSD2, createFileFSD2
 import argparse
 import sys
 import os
 from matplotlib.backends.backend_pdf import PdfPages
+from collections import Counter
 
+def plotFSDwithHD2(familySizeList1,maximumXFS,minimumXFS, quant,
+                   title_file1, subtitle, pdf, relative=False, diff = True):
+    if diff is False:
+        colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]
+        labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8","HD>8"]
+    else:
+        colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"]
+        if relative is True:
+            labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"]
+        else:
+            labels = ["d=0","d=1", "d=2", "d=3", "d=4", "d=5-8","d>8"]
+
+    fig = plt.figure(figsize=(6, 7))
+    ax = fig.add_subplot(111)
+    plt.subplots_adjust(bottom=0.1)
+    p1 = numpy.bincount(numpy.concatenate((familySizeList1)))
+    maximumY = numpy.amax(p1)
+
+    if len(range(minimumXFS, maximumXFS)) == 0:
+        range1 = range(minimumXFS - 1, minimumXFS + 2)
+    else:
+        range1 = range(0, maximumXFS + 2)
+    counts = plt.hist(familySizeList1, label=labels,
+                      color=colors, stacked=True,
+                      rwidth=0.8,alpha=1, align="left",
+                      edgecolor="None",bins=range1)
+    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
+
+    plt.title(title_file1, fontsize=12)
+    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
+    plt.xlabel("No. of Family Members", fontsize=12)
+    plt.ylabel("Absolute Frequency", fontsize=12)
+
+    ticks = numpy.arange(0, maximumXFS + 1, 1)
+    ticks1 = map(str, ticks)
+    if maximumXFS >= 20:
+        ticks1[len(ticks1) - 1] = ">=20"
+    plt.xticks(numpy.array(ticks), ticks1)
+    [l.set_visible(False) for (i, l) in enumerate(ax.get_xticklabels()) if i % 5 != 0]
+
+    plt.xlim((0, maximumXFS + 1))
+    if len(numpy.concatenate(familySizeList1)) != 0:
+        plt.ylim((0, max(numpy.bincount(numpy.concatenate(familySizeList1))) * 1.1))
+
+    plt.ylim((0, maximumY * 1.2))
+    legend = "\nmax. family size: \nabsolute frequency: \nrelative frequency: "
+    plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)
+
+    count = numpy.bincount(quant)  # original counts
+    legend1 = "{}\n{}\n{:.5f}" \
+        .format(max(quant), count[len(count) - 1], float(count[len(count) - 1]) / sum(count))
+    plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)
+    legend3 = "singletons\n{:,}\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]),
+                                                float(counts[0][len(counts[0]) - 1][1]) / sum(
+                                                    counts[0][len(counts[0]) - 1]))
+    plt.text(0.7, -0.08, legend3, transform=plt.gcf().transFigure, size=12)
+    plt.grid(b=True, which='major', color='#424242', linestyle=':')
+
+    pdf.savefig(fig, bbox_inches="tight")
+    plt.close("all")
+    
+def plotHDwithFSD(list1,maximumX,minimumX, subtitle, lenTags, title_file1,pdf,
+                   xlabel,relative=False):
+    if relative is True:
+        step = 0.1
+    else:
+        step = 1
+
+    fig = plt.figure(figsize=(6, 8))
+    plt.subplots_adjust(bottom=0.1)
+    con_list1 = numpy.concatenate(list1)
+    p1 = numpy.array([v for k, v in sorted(Counter(con_list1).iteritems())])
+    maximumY = numpy.amax(p1)
+
+    if relative is True:  # relative difference
+        bin1 = numpy.arange(-1, maximumX + 0.2, 0.1)
+    else:
+        bin1 = maximumX + 1
+
+    counts = plt.hist(list1, bins=bin1, edgecolor='black', linewidth=1,
+                      label=["FS=1", "FS=2", "FS=3", "FS=4", "FS=5-10",
+                             "FS>10"], rwidth=0.8,
+                      color=["#808080", "#FFFFCC", "#FFBF00", "#DF0101", "#0431B4", "#86B404"],
+                      stacked=True, alpha=1,
+                      align="left",
+                      range=(0, maximumX + 1))
+    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))
+    bins = counts[1]  # width of bins
+    counts = numpy.array(map(int, counts[0][5]))
+    plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)
+    plt.title(title_file1, fontsize=12)
+    plt.xlabel(xlabel, fontsize=12)
+    plt.ylabel("Absolute Frequency", fontsize=12)
+
+    plt.grid(b=True, which='major', color='#424242', linestyle=':')
+    plt.axis((minimumX - step, maximumX + step, 0, numpy.amax(counts) + sum(counts) * 0.1))
+    plt.xticks(numpy.arange(0, maximumX + step, step))
+
+    plt.ylim((0, maximumY * 1.2))
+
+    bin_centers = -0.4 * numpy.diff(bins) + bins[:-1]
+    for x_label, label in zip(counts, bin_centers):  # labels for values
+        if x_label == 0:
+            continue
+        else:
+            plt.annotate("{:,}\n{:.3f}".format(x_label, float(x_label) / sum(counts), 1),
+                         xy=(label, x_label + len(con_list1) * 0.01),
+                         xycoords="data", color="#000066",fontsize=10)
+
+    legend = "sample size= {:,} against {:,}".format(sum(counts), lenTags)
+    plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
+
+    pdf.savefig(fig, bbox_inches="tight")
+    plt.close("all")
+    plt.clf()
+
+def plotHDwithinSeq_Sum2(sum1, sum2,min_value, lenTags, title_file1, pdf):
+    fig = plt.figure(figsize=(6, 8))
+    plt.subplots_adjust(bottom=0.1)
+
+    ham = [numpy.array(min_value), sum1, sum2]  # new hd within tags
+
+    maximumX = numpy.amax(numpy.concatenate(ham))
+    minimumX = numpy.amin(numpy.concatenate(ham))
+    maximumY = numpy.amax(numpy.concatenate(map(lambda (x): numpy.bincount(x), ham)))
+
+    if len(range(minimumX, maximumX)) == 0:
+        range1 = minimumX
+    else:
+        range1 = range(minimumX, maximumX + 2)
+
+    counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False,
+                      label=["HD of whole tag", "tag1 - a\nvs. tag2 - a", "tag1 - b\nvs. tag2 - b"],
+                      bins=range1, color=["#585858", "#58ACFA", "#FA5858"], edgecolor='black', linewidth=1)
+    plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1))
+    plt.suptitle('Hamming distances within tags', fontsize=14)
+    plt.title(title_file1, fontsize=12)
+    plt.xlabel("Hamming Distance", fontsize=12)
+    plt.ylabel("Absolute Frequency", fontsize=12)
+    plt.grid(b=True, which='major', color='#424242', linestyle=':')
+
+
+    plt.axis((minimumX - 1, maximumX + 1, 0, maximumY * 1.1))
+    plt.xticks(numpy.arange(minimumX - 1, maximumX + 1, 1.0))
+    plt.ylim((0, maximumY * 1.1))
+
+    legend = "sample size= {:,} against {:,}".format(len(ham[0]), lenTags, lenTags)
+    plt.text(0.14, -0.01, legend, size=12, transform=plt.gcf().transFigure)
+    pdf.savefig(fig, bbox_inches="tight")
+    plt.close("all")
+    plt.clf()
+
+def createTableFSD2(list1, diff=True):
+    selfAB = numpy.concatenate(list1)
+    uniqueFS = numpy.unique(selfAB)
+    nr = numpy.arange(0, len(uniqueFS), 1)
+    if diff is False:
+        count = numpy.zeros((len(uniqueFS), 6))
+    else:
+        count = numpy.zeros((len(uniqueFS), 7))
+
+    state = 1
+    for i in list1:
+        counts = list(Counter(i).items())
+        hd = [item[0] for item in counts]
+        c = [item[1] for item in counts]
+        table = numpy.column_stack((hd, c))
+        if len(table) == 0:
+            state = state + 1
+            continue
+        else:
+            if state == 1:
+                for i, l  in zip(uniqueFS, nr):
+                    for j in table:
+                        if j[0] == uniqueFS[l]:
+                            count[l, 0] = j[1]
+            if state == 2:
+                for i, l in zip(uniqueFS, nr):
+                    for j in table:
+                        if j[0] == uniqueFS[l]:
+                            count[l, 1] = j[1]
+
+            if state == 3:
+                for i, l in zip(uniqueFS, nr):
+                    for j in table:
+                        if j[0] == uniqueFS[l]:
+                            count[l, 2] = j[1]
+
+            if state == 4:
+                for i, l in zip(uniqueFS, nr):
+                    for j in table:
+                        if j[0] == uniqueFS[l]:
+                            count[l, 3] = j[1]
+
+            if state == 5:
+                for i, l in zip(uniqueFS, nr):
+                    for j in table:
+                        if j[0] == uniqueFS[l]:
+                            count[l, 4] = j[1]
+
+            if state == 6:
+                for i, l in zip(uniqueFS, nr):
+                    for j in table:
+                        if j[0] == uniqueFS[l]:
+                            count[l, 5] = j[1]
+
+            if state == 7:
+                for i, l in zip(uniqueFS, nr):
+                    for j in table:
+                        if j[0] == uniqueFS[l]:
+                            count[l, 6] = j[1]
+            state = state + 1
+
+        sumRow = count.sum(axis=1)
+        sumCol = count.sum(axis=0)
+
+    uniqueFS = uniqueFS.astype(str)
+    if uniqueFS[len(uniqueFS) - 1] == "20":
+        uniqueFS[len(uniqueFS) - 1] = ">20"
+
+    first = ["FS={}".format(i) for i in uniqueFS]
+    final = numpy.column_stack((first, count, sumRow))
+
+    return (final, sumCol)
+
+def createFileFSD2(summary, sumCol, overallSum, output_file, name,sep, rel=False, diff=True):
+    output_file.write(name)
+    output_file.write("\n")
+    if diff is False:
+        output_file.write("{}HD=1{}HD=2{}HD=3{}HD=4{}HD=5-8{}HD>8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep))
+    else:
+        if rel is False:
+            output_file.write("{}diff=0{}diff=1{}diff=2{}diff=3{}diff=4{}diff=5-8{}diff>8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep,sep))
+        else:
+            output_file.write("{}diff=0{}diff=0.1{}diff=0.2{}diff=0.3{}diff=0.4{}diff=0.5-0.8{}diff>0.8{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep,sep))
+
+    for item in summary:
+        for nr in item:
+            if "FS" not in nr and "diff" not in nr:
+                nr = nr.astype(float)
+                nr = nr.astype(int)
+            output_file.write("{}{}".format(nr,sep))
+        output_file.write("\n")
+    output_file.write("sum{}".format(sep))
+    sumCol = map(int, sumCol)
+    for el in sumCol:
+        output_file.write("{}{}".format(el,sep))
+    output_file.write("{}{}".format(overallSum.astype(int),sep))
+    output_file.write("\n\n")
+
+def createTableHD(list1, row_label):
+    selfAB = numpy.concatenate(list1)
+    uniqueHD = numpy.unique(selfAB)
+    nr = numpy.arange(0, len(uniqueHD), 1)
+    count = numpy.zeros((len(uniqueHD), 6))
+    state = 1
+    for i in list1:
+        counts = list(Counter(i).items())
+        hd = [item[0] for item in counts]
+        c = [item[1] for item in counts]
+        table = numpy.column_stack((hd, c))
+        if len(table) == 0:
+            state = state + 1
+            continue
+        else:
+            if state == 1:
+                for i, l  in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 0] = j[1]
+            if state == 2:
+                for i, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 1] = j[1]
+
+            if state == 3:
+                for i, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 2] = j[1]
+
+            if state == 4:
+                for i, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 3] = j[1]
+
+            if state == 5:
+                for i, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 4] = j[1]
+
+            if state == 6:
+                for i, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 5] = j[1]
+            state = state + 1
+
+        sumRow = count.sum(axis=1)
+        sumCol = count.sum(axis=0)
+        first = ["{}{}".format(row_label,i) for i in uniqueHD]
+        final = numpy.column_stack((first, count, sumRow))
+
+    return (final, sumCol)
+
+def createTableHDwithTags(list1):
+    selfAB = numpy.concatenate(list1)
+    uniqueHD = numpy.unique(selfAB)
+    nr = numpy.arange(0, len(uniqueHD), 1)
+    count = numpy.zeros((len(uniqueHD), 3))
+
+    state = 1
+    for i in list1:
+        counts = list(Counter(i).items())
+        hd = [item[0] for item in counts]
+        c = [item[1] for item in counts]
+        table = numpy.column_stack((hd, c))
+        if len(table) == 0:
+            state = state + 1
+            continue
+        else:
+            if state == 1:
+                for i, l  in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 0] = j[1]
+            if state == 2:
+                for i, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 1] = j[1]
+
+            if state == 3:
+                for i, l in zip(uniqueHD, nr):
+                    for j in table:
+                        if j[0] == uniqueHD[l]:
+                            count[l, 2] = j[1]
+            state = state + 1
+
+        sumRow = count.sum(axis=1)
+        sumCol = count.sum(axis=0)
+        first = ["HD={}".format(i) for i in uniqueHD]
+        final = numpy.column_stack((first, count, sumRow))
+
+    return (final, sumCol)
+
+def createFileHD(summary, sumCol, overallSum, output_file, name,sep):
+    output_file.write(name)
+    output_file.write("\n")
+    output_file.write("{}FS=1{}FS=2{}FS=3{}FS=4{}FS=5-10{}FS>10{}sum{}\n".format(sep,sep,sep,sep,sep,sep,sep,sep))
+    for item in summary:
+        for nr in item:
+            if "HD" not in nr and "diff" not in nr:
+                nr = nr.astype(float)
+                nr = nr.astype(int)
+            output_file.write("{}{}".format(nr,sep))
+        output_file.write("\n")
+    output_file.write("sum{}".format(sep))
+    sumCol = map(int, sumCol)
+    for el in sumCol:
+        output_file.write("{}{}".format(el,sep))
+    output_file.write("{}{}".format(overallSum.astype(int),sep))
+    output_file.write("\n\n")
+
+def createFileHDwithinTag(summary, sumCol, overallSum, output_file, name,sep):
+    output_file.write(name)
+    output_file.write("\n")
+    output_file.write("{}HD of whole tag;tag1-half1 vs. tag2-half1{}tag1-half2 vs. tag2-half2{}sum{}\n".format(sep,sep,sep,sep))
+    for item in summary:
+        for nr in item:
+            if "HD" not in nr:
+                nr = nr.astype(float)
+                nr = nr.astype(int)
+            output_file.write("{}{}".format(nr,sep))
+        output_file.write("\n")
+    output_file.write("sum{}".format(sep))
+    sumCol = map(int, sumCol)
+    for el in sumCol:
+        output_file.write("{}{}".format(el,sep))
+    output_file.write("{}{}".format(overallSum.astype(int),sep))
+    output_file.write("\n\n")
+
+
+    
 def hamming(array1, array2):
     res = 99 * numpy.ones(len(array1))
     i = 0