Mercurial > repos > mheinzl > fsd_regions

diff fsd_regions.py @ 14:6879295d3f11 draft default tip
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty
author: mheinzl
date: Tue, 08 Jan 2019 09:50:01 -0500
parents: 63432e6f6a61
--- a/fsd_regions.py	Fri Dec 07 07:23:07 2018 -0500
+++ b/fsd_regions.py	Tue Jan 08 09:50:01 2019 -0500
@@ -1,270 +1,301 @@
-#!/usr/bin/env python
-
-# Family size distribution of tags which were aligned to the reference genome
-#
-# Author: Monika Heinzl & Gundula Povysil, Johannes-Kepler University Linz (Austria)
-# Contact: monika.heinzl@edumail.at
-#
-# Takes at least one TABULAR file with tags before the alignment to the SSCS,
-# a BAM file with tags of reads that overlap the regions of the reference genome and
-# an optional BED file with chromosome, start and stop position of the regions as input.
-# The program produces a plot which shows the distribution of family sizes of the tags from the input files and
-# a tabular file with the data of the plot.
-
-# USAGE: python FSD_regions.py --inputFile filenameSSCS --inputName1 filenameSSCS
-# --bamFile DCSbamFile --rangesFile BEDfile --output_tabular outptufile_name_tabular
-# --output_pdf outputfile_name_pdf
-
-import argparse
-import collections
-import re
-import sys
-
-import matplotlib.pyplot as plt
-import numpy as np
-import pysam
-from matplotlib.backends.backend_pdf import PdfPages
-
-plt.switch_backend('agg')
-
-
-def readFileReferenceFree(file, delim):
-    with open(file, 'r') as dest_f:
-        data_array = np.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string')
-        return(data_array)
-
-
-def make_argparser():
-    parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome')
-    parser.add_argument('--inputFile', help='Tabular File with three columns: ab or ba, tag and family size.')
-    parser.add_argument('--inputName1')
-    parser.add_argument('--bamFile', help='BAM file with aligned reads.')
-    parser.add_argument('--rangesFile', default=None, help='BED file with chromosome, start and stop positions.')
-    parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf and tabular file.')
-    parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the pdf and tabular file.')
-    return parser
-
-
-def compare_read_families_refGenome(argv):
-    parser = make_argparser()
-    args = parser.parse_args(argv[1:])
-
-    firstFile = args.inputFile
-    name1 = args.inputName1
-    name1 = name1.split(".tabular")[0]
-    bamFile = args.bamFile
-
-    rangesFile = args.rangesFile
-    title_file = args.output_pdf
-    title_file2 = args.output_tabular
-    sep = "\t"
-
-    with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf:
-        data_array = readFileReferenceFree(firstFile, "\t")
-        pysam.index(bamFile)
-
-        bam = pysam.AlignmentFile(bamFile, "rb")
-        qname_dict = collections.OrderedDict()
-
-        if rangesFile != str(None):
-            with open(rangesFile, 'r') as regs:
-                range_array = np.genfromtxt(regs, skip_header=0, delimiter='\t', comments='#', dtype='string')
-
-            if range_array.ndim == 0:
-                print("Error: file has 0 lines")
-                exit(2)
-
-            if range_array.ndim == 1:
-                chrList = range_array[0]
-                start_posList = range_array[1].astype(int)
-                stop_posList = range_array[2].astype(int)
-                chrList = [chrList.tolist()]
-                start_posList = [start_posList.tolist()]
-                stop_posList = [stop_posList.tolist()]
-            else:
-                chrList = range_array[:, 0]
-                start_posList = range_array[:, 1].astype(int)
-                stop_posList = range_array[:, 2].astype(int)
-
-            if len(start_posList) != len(stop_posList):
-                print("start_positions and end_positions do not have the same length")
-                exit(3)
-
-            chrList = np.array(chrList)
-            start_posList = np.array(start_posList).astype(int)
-            stop_posList = np.array(stop_posList).astype(int)
-            for chr, start_pos, stop_pos in zip(chrList, start_posList, stop_posList):
-                chr_start_stop = "{}_{}_{}".format(chr, start_pos, stop_pos)
-                qname_dict[chr_start_stop] = []
-                for read in bam.fetch(chr.tobytes(), start_pos, stop_pos):
-                    if not read.is_unmapped:
-                        if re.search('_', read.query_name):
-                            tags = re.split('_', read.query_name)[0]
-                        else:
-                            tags = read.query_name
-                        qname_dict[chr_start_stop].append(tags)
-
-        else:
-            for read in bam.fetch():
-                if not read.is_unmapped:
-                    if re.search(r'_', read.query_name):
-                        tags = re.split('_', read.query_name)[0]
-                    else:
-                        tags = read.query_name
-
-                    if read.reference_name not in qname_dict.keys():
-                        qname_dict[read.reference_name] = [tags]
-                    else:
-                        qname_dict[read.reference_name].append(tags)
-
-        seq = np.array(data_array[:, 1])
-        tags = np.array(data_array[:, 2])
-        quant = np.array(data_array[:, 0]).astype(int)
-        group = np.array(qname_dict.keys())
-
-        all_ab = seq[np.where(tags == "ab")[0]]
-        all_ba = seq[np.where(tags == "ba")[0]]
-        quant_ab = quant[np.where(tags == "ab")[0]]
-        quant_ba = quant[np.where(tags == "ba")[0]]
-
-        seqDic_ab = dict(zip(all_ab, quant_ab))
-        seqDic_ba = dict(zip(all_ba, quant_ba))
-
-        lst_ab = []
-        lst_ba = []
-        quantAfterRegion = []
-        length_regions = 0
-        for i in group:
-            lst_ab_r = []
-            lst_ba_r = []
-            seq_mut = qname_dict[i]
-            if rangesFile == str(None):
-                seq_mut, seqMut_index = np.unique(np.array(seq_mut), return_index=True)
-            length_regions = length_regions + len(seq_mut) * 2
-            for r in seq_mut:
-                count_ab = seqDic_ab.get(r)
-                count_ba = seqDic_ba.get(r)
-                lst_ab_r.append(count_ab)
-                lst_ab.append(count_ab)
-                lst_ba_r.append(count_ba)
-                lst_ba.append(count_ba)
-
-            dataAB = np.array(lst_ab_r)
-            dataBA = np.array(lst_ba_r)
-            bigFamilies = np.where(dataAB > 20)[0]
-            dataAB[bigFamilies] = 22
-            bigFamilies = np.where(dataBA > 20)[0]
-            dataBA[bigFamilies] = 22
-
-            quantAll = np.concatenate((dataAB, dataBA))
-            quantAfterRegion.append(quantAll)
-
-        quant_ab = np.array(lst_ab)
-        quant_ba = np.array(lst_ba)
-
-        maximumX = np.amax(np.concatenate(quantAfterRegion))
-        minimumX = np.amin(np.concatenate(quantAfterRegion))
-
-        # PLOT
-        plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
-        plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
-        plt.rcParams['xtick.labelsize'] = 14
-        plt.rcParams['ytick.labelsize'] = 14
-        plt.rcParams['patch.edgecolor'] = "black"
-        fig = plt.figure()
-        plt.subplots_adjust(bottom=0.3)
-
-        colors = ["#6E6E6E", "#0431B4", "#5FB404", "#B40431", "#F4FA58", "#DF7401", "#81DAF5"]
-
-        col = []
-        for i in range(0, len(group)):
-            col.append(colors[i])
-
-        counts = plt.hist(quantAfterRegion, bins=range(minimumX, maximumX + 1), stacked=False, label=group,
-                          align="left", alpha=1, color=col, edgecolor="black", linewidth=1)
-        ticks = np.arange(minimumX - 1, maximumX, 1)
-
-        ticks1 = map(str, ticks)
-        ticks1[len(ticks1) - 1] = ">20"
-        plt.xticks(np.array(ticks), ticks1)
-        count = np.bincount(map(int, quant_ab))  # original counts
-
-        legend = "max. family size:\nabsolute frequency:\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)"
-        plt.text(0.15, 0.085, legend, size=11, transform=plt.gcf().transFigure)
-
-        legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}".format(max(map(int, quant_ab)), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), sum(np.array(data_array[:, 0]).astype(int)))
-        plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure)
-
-        count2 = np.bincount(map(int, quant_ba))  # original counts
-
-        legend = "BA\n{}\n{}\n{:.5f}" \
-            .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))
-        plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure)
-
-        plt.text(0.53, 0.2125, "total nr. of tags:", size=11, transform=plt.gcf().transFigure)
-        plt.text(0.85, 0.2125, "{:,} ({:,})".format(length_regions, length_regions / 2), size=11,
-                 transform=plt.gcf().transFigure)
-
-        legend4 = "* In the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n"
-        plt.text(0.1, 0.01, legend4, size=11, transform=plt.gcf().transFigure)
-
-        space = 0
-        for i, count in zip(group, quantAfterRegion):
-            plt.text(0.53, 0.15 - space, "{}:\n".format(i), size=11, transform=plt.gcf().transFigure)
-            plt.text(0.85, 0.15 - space, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure)
-            space = space + 0.02
-
-        plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
-        plt.xlabel("Family size", fontsize=14)
-        plt.ylabel("Absolute Frequency", fontsize=14)
-        plt.grid(b=True, which="major", color="#424242", linestyle=":")
-        plt.margins(0.01, None)
-
-        pdf.savefig(fig, bbox_inch="tight")
-        plt.close()
-
-        output_file.write("Dataset:{}{}\n".format(sep, name1))
-        output_file.write("{}AB{}BA\n".format(sep, sep))
-        output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba))))
-        output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))
-        output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))
-        output_file.write("total nr. of reads{}{}\n".format(sep, sum(np.array(data_array[:, 0]).astype(int))))
-        output_file.write("total nr. of tags{}{} ({})\n".format(sep, length_regions, length_regions / 2))
-        output_file.write("\n\nValues from family size distribution\n")
-        output_file.write("{}".format(sep))
-        for i in group:
-            output_file.write("{}{}".format(i, sep))
-        output_file.write("\n")
-
-        j = 0
-        for fs in counts[1][0:len(counts[1]) - 1]:
-            if fs == 21:
-                fs = ">20"
-            else:
-                fs = "={}".format(fs)
-            output_file.write("FS{}{}".format(fs, sep))
-            if len(group) == 1:
-                output_file.write("{}{}".format(int(counts[0][j]), sep))
-            else:
-                for n in range(len(group)):
-                    output_file.write("{}{}".format(int(counts[0][n][j]), sep))
-            output_file.write("\n")
-            j += 1
-        output_file.write("sum{}".format(sep))
-
-        if len(group) == 1:
-            output_file.write("{}{}".format(int(sum(counts[0])), sep))
-        else:
-            for i in counts[0]:
-                output_file.write("{}{}".format(int(sum(i)), sep))
-        output_file.write("\n")
-        output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n")
-        output_file.write("Region{}total nr. of tags per region\n".format(sep))
-        for i, count in zip(group, quantAfterRegion):
-            output_file.write("{}{}{}\n".format(i, sep, len(count) / 2))
-
-    print("Files successfully created!")
-
-
-if __name__ == '__main__':
-    sys.exit(compare_read_families_refGenome(sys.argv))
+#!/usr/bin/env python
+
+# Family size distribution of tags which were aligned to the reference genome
+#
+# Author: Monika Heinzl & Gundula Povysil, Johannes-Kepler University Linz (Austria)
+# Contact: monika.heinzl@edumail.at
+#
+# Takes at least one TABULAR file with tags before the alignment to the SSCS,
+# a BAM file with tags of reads that overlap the regions of the reference genome and
+# an optional BED file with chromosome, start and stop position of the regions as input.
+# The program produces a plot which shows the distribution of family sizes of the tags from the input files and
+# a tabular file with the data of the plot.
+
+# USAGE: python FSD_regions.py --inputFile filenameSSCS --inputName1 filenameSSCS
+# --bamFile DCSbamFile --rangesFile BEDfile --output_tabular outptufile_name_tabular
+# --output_pdf outputfile_name_pdf
+
+import argparse
+import collections
+import re
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pysam
+from matplotlib.backends.backend_pdf import PdfPages
+
+plt.switch_backend('agg')
+
+
+def readFileReferenceFree(file, delim):
+    with open(file, 'r') as dest_f:
+        data_array = np.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string')
+        return(data_array)
+
+
+def make_argparser():
+    parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome')
+    parser.add_argument('--inputFile', help='Tabular File with three columns: ab or ba, tag and family size.')
+    parser.add_argument('--inputName1')
+    parser.add_argument('--bamFile', help='BAM file with aligned reads.')
+    parser.add_argument('--rangesFile', default=None, help='BED file with chromosome, start and stop positions.')
+    parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf and tabular file.')
+    parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the pdf and tabular file.')
+    return parser
+
+
+def compare_read_families_refGenome(argv):
+    parser = make_argparser()
+    args = parser.parse_args(argv[1:])
+
+    firstFile = args.inputFile
+    name1 = args.inputName1
+    name1 = name1.split(".tabular")[0]
+    bamFile = args.bamFile
+
+    rangesFile = args.rangesFile
+    title_file = args.output_pdf
+    title_file2 = args.output_tabular
+    sep = "\t"
+
+    with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf:
+        data_array = readFileReferenceFree(firstFile, "\t")
+        pysam.index(bamFile)
+
+        bam = pysam.AlignmentFile(bamFile, "rb")
+        qname_dict = collections.OrderedDict()
+
+        if rangesFile != str(None):
+            with open(rangesFile, 'r') as regs:
+                range_array = np.genfromtxt(regs, skip_header=0, delimiter='\t', comments='#', dtype='string')
+
+            if range_array.ndim == 0:
+                print("Error: file has 0 lines")
+                exit(2)
+
+            if range_array.ndim == 1:
+                chrList = range_array[0]
+                start_posList = range_array[1].astype(int)
+                stop_posList = range_array[2].astype(int)
+                chrList = [chrList.tolist()]
+                start_posList = [start_posList.tolist()]
+                stop_posList = [stop_posList.tolist()]
+            else:
+                chrList = range_array[:, 0]
+                start_posList = range_array[:, 1].astype(int)
+                stop_posList = range_array[:, 2].astype(int)
+
+            if len(start_posList) != len(stop_posList):
+                print("start_positions and end_positions do not have the same length")
+                exit(3)
+
+            chrList = np.array(chrList)
+            start_posList = np.array(start_posList).astype(int)
+            stop_posList = np.array(stop_posList).astype(int)
+            for chr, start_pos, stop_pos in zip(chrList, start_posList, stop_posList):
+                chr_start_stop = "{}_{}_{}".format(chr, start_pos, stop_pos)
+                qname_dict[chr_start_stop] = []
+                for read in bam.fetch(chr.tobytes(), start_pos, stop_pos):
+                    if not read.is_unmapped:
+                        if re.search('_', read.query_name):
+                            tags = re.split('_', read.query_name)[0]
+                        else:
+                            tags = read.query_name
+                        qname_dict[chr_start_stop].append(tags)
+
+        else:
+            for read in bam.fetch():
+                if not read.is_unmapped:
+                    if re.search(r'_', read.query_name):
+                        tags = re.split('_', read.query_name)[0]
+                    else:
+                        tags = read.query_name
+
+                    if read.reference_name not in qname_dict.keys():
+                        qname_dict[read.reference_name] = [tags]
+                    else:
+                        qname_dict[read.reference_name].append(tags)
+
+        seq = np.array(data_array[:, 1])
+        tags = np.array(data_array[:, 2])
+        quant = np.array(data_array[:, 0]).astype(int)
+        group = np.array(qname_dict.keys())
+
+        all_ab = seq[np.where(tags == "ab")[0]]
+        all_ba = seq[np.where(tags == "ba")[0]]
+        quant_ab = quant[np.where(tags == "ab")[0]]
+        quant_ba = quant[np.where(tags == "ba")[0]]
+
+        seqDic_ab = dict(zip(all_ab, quant_ab))
+        seqDic_ba = dict(zip(all_ba, quant_ba))
+
+        lst_ab = []
+        lst_ba = []
+        quantAfterRegion = []
+        for i in group:
+            lst_ab_r = []
+            lst_ba_r = []
+            seq_mut = qname_dict[i]
+            if rangesFile == str(None):
+                seq_mut, seqMut_index = np.unique(np.array(seq_mut), return_index=True)
+            for r in seq_mut:
+                if re.search('\.', r):  # BAM file with SSCS tags
+                    splitted_tag = re.split('\.', r)[0]
+                    direction = re.split('\.', r)[1]
+
+                    if direction == "ab":
+                        count_ab = seqDic_ab.get(splitted_tag)
+                        lst_ab_r.append(count_ab)
+                        lst_ab.append(count_ab)
+
+                    else:
+                        count_ba = seqDic_ba.get(splitted_tag)
+                        lst_ba_r.append(count_ba)
+                        lst_ba.append(count_ba)
+                else:  # BAM file with DCS tags
+                    count_ab = seqDic_ab.get(r)
+                    lst_ab_r.append(count_ab)
+                    lst_ab.append(count_ab)
+                    count_ba = seqDic_ba.get(r)
+                    lst_ba_r.append(count_ba)
+                    lst_ba.append(count_ba)
+
+            dataAB = np.array(lst_ab_r)
+            dataBA = np.array(lst_ba_r)
+            bigFamilies = np.where(dataAB > 20)[0]
+            dataAB[bigFamilies] = 22
+            bigFamilies = np.where(dataBA > 20)[0]
+            dataBA[bigFamilies] = 22
+
+            quantAll = np.concatenate((dataAB, dataBA))
+            quantAfterRegion.append(quantAll)
+
+        quant_ab = np.array(lst_ab)
+        quant_ba = np.array(lst_ba)
+        length_regions = len(np.concatenate(quantAfterRegion))
+
+        maximumX = np.amax(np.concatenate(quantAfterRegion))
+        minimumX = np.amin(np.concatenate(quantAfterRegion))
+
+        # PLOT
+        plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
+        plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
+        plt.rcParams['xtick.labelsize'] = 14
+        plt.rcParams['ytick.labelsize'] = 14
+        plt.rcParams['patch.edgecolor'] = "black"
+        fig = plt.figure()
+        plt.subplots_adjust(bottom=0.3)
+
+        colors = ["#6E6E6E", "#0431B4", "#5FB404", "#B40431", "#F4FA58", "#DF7401", "#81DAF5"]
+
+        col = []
+        for i in range(0, len(group)):
+            col.append(colors[i])
+
+        counts = plt.hist(quantAfterRegion, bins=range(minimumX, maximumX + 1), stacked=False, label=group,
+                          align="left", alpha=1, color=col, edgecolor="black", linewidth=1)
+        ticks = np.arange(minimumX - 1, maximumX, 1)
+
+        ticks1 = map(str, ticks)
+        ticks1[len(ticks1) - 1] = ">20"
+        plt.xticks(np.array(ticks), ticks1)
+        count = np.bincount(map(int, quant_ab))  # original counts
+
+        legend = "max. family size:\nabsolute frequency:" \
+                 "\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)"
+        plt.text(0.15, 0.085, legend, size=11, transform=plt.gcf().transFigure)
+
+        legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}".format(max(map(int, quant_ab)), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), sum(np.array(data_array[:, 0]).astype(int)))
+        plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure)
+
+        count2 = np.bincount(map(int, quant_ba))  # original counts
+
+        legend = "BA\n{}\n{}\n{:.5f}" \
+            .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))
+        plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure)
+
+        plt.text(0.53, 0.2125, "total nr. of tags:", size=11, transform=plt.gcf().transFigure)
+        if re.search('\.', r):  # BAM file with SSCS tags
+            plt.text(0.85, 0.2125, "{:,}".format(length_regions), size=11,
+                     transform=plt.gcf().transFigure)
+        else:
+            plt.text(0.85, 0.2125, "{:,} ({:,})".format(length_regions, length_regions / 2), size=11,
+                 transform=plt.gcf().transFigure)
+            legend4 = "* In the plot, both family sizes of the ab and ba strands were used." \
+                      "\nWhereas the total numbers indicate only the single count of the tags per region.\n"
+            plt.text(0.1, 0.01, legend4, size=11, transform=plt.gcf().transFigure)
+
+        space = 0
+        for i, count in zip(group, quantAfterRegion):
+            plt.text(0.53, 0.15 - space, "{}:\n".format(i), size=11, transform=plt.gcf().transFigure)
+            if re.search('\.', r):  # BAM file with SSCS tags
+                plt.text(0.85, 0.15 - space, "{:,}\n".format(len(count)), size=11, transform=plt.gcf().transFigure)
+            else:
+                plt.text(0.85, 0.15 - space, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure)
+            space = space + 0.02
+
+        plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
+        plt.xlabel("Family size", fontsize=14)
+        plt.ylabel("Absolute Frequency", fontsize=14)
+        plt.grid(b=True, which="major", color="#424242", linestyle=":")
+        plt.margins(0.01, None)
+
+        pdf.savefig(fig, bbox_inch="tight")
+        plt.close()
+
+        output_file.write("Dataset:{}{}\n".format(sep, name1))
+        output_file.write("{}AB{}BA\n".format(sep, sep))
+        output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba))))
+        output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))
+        output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))
+        output_file.write("total nr. of reads{}{}\n".format(sep, sum(np.array(data_array[:, 0]).astype(int))))
+        if re.search('\.', r):  # BAM file with SSCS tags
+            output_file.write("total nr. of tags{}{}\n".format(sep, length_regions))
+        else:
+            output_file.write("total nr. of tags{}{} ({})\n".format(sep, length_regions, length_regions / 2))
+        output_file.write("\n\nValues from family size distribution\n")
+        output_file.write("{}".format(sep))
+        for i in group:
+            output_file.write("{}{}".format(i, sep))
+        output_file.write("\n")
+
+        j = 0
+        for fs in counts[1][0:len(counts[1]) - 1]:
+            if fs == 21:
+                fs = ">20"
+            else:
+                fs = "={}".format(fs)
+            output_file.write("FS{}{}".format(fs, sep))
+            if len(group) == 1:
+                output_file.write("{}{}".format(int(counts[0][j]), sep))
+            else:
+                for n in range(len(group)):
+                    output_file.write("{}{}".format(int(counts[0][n][j]), sep))
+            output_file.write("\n")
+            j += 1
+        output_file.write("sum{}".format(sep))
+
+        if len(group) == 1:
+            output_file.write("{}{}".format(int(sum(counts[0])), sep))
+        else:
+            for i in counts[0]:
+                output_file.write("{}{}".format(int(sum(i)), sep))
+        output_file.write("\n")
+        if re.search('\.', r):  # BAM file with SSCS tags
+            output_file.write("\n")
+        else:
+            output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used."
+                              "\nWhereas the total numbers indicate only the single count of the tags per region.\n")
+        output_file.write("Region{}total nr. of tags per region\n".format(sep))
+        for i, count in zip(group, quantAfterRegion):
+            if re.search('\.', r):  # BAM file with SSCS tags
+                output_file.write("{}{}{}\n".format(i, sep, len(count)))
+            else:
+                output_file.write("{}{}{}\n".format(i, sep, len(count) / 2))
+
+    print("Files successfully created!")
+
+
+if __name__ == '__main__':
+    sys.exit(compare_read_families_refGenome(sys.argv))
author	mheinzl
date	Tue, 08 Jan 2019 09:50:01 -0500
parents	63432e6f6a61
children