diff countandcluster/countandcluster.py @ 0:c516cb51d12c draft

Uploaded
author jshay
date Tue, 13 Aug 2019 15:24:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/countandcluster/countandcluster.py	Tue Aug 13 15:24:15 2019 -0400
@@ -0,0 +1,46 @@
+# Julie Shay
+# July 22, 2019
+# This script makes two output files from a bam alignment file: a file with gene counts/coverages
+# and a file with cluster counts/coverages based on the most abundant gene per cluster
+import pysam
+import pandas as pd
+import numpy as np
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-b', dest='bam', type=str, help="""bam alignment file""")
+parser.add_argument('-i', dest='index', type=str, help="""index for bam file""")
+parser.add_argument('-c', dest='clusters', type=str, required=True, help="""Tab-delimited file with genes and clusters""")
+parser.add_argument('-p', dest='minprop', type=float, default=0, help="""minimum proportion of cluster covered""")
+parser.add_argument('-g', dest='geneout', type=str, default="geneout.tsv", help="""output file for gene counts""")
+parser.add_argument('-o', dest='clusterout', type=str, default="clusterout.tsv", help="""output file for cluster counts""")
+args = parser.parse_args()
+
+def gene_count(gene, bam):
+    read_count = bamfile.count(contig=gene, read_callback='all')
+    cov_counts = bamfile.count_coverage(contig=gene)
+    cov_number = 0
+    total_number = 0
+    for a, c, g, t in zip(*cov_counts):   # doesn't matter whether the order is right
+        total_number = total_number + 1
+        if (((a + g) + c) + t) != 0:
+            cov_number = cov_number + 1
+    prop_covered = float(cov_number) / float(total_number)
+    return read_count, prop_covered
+
+
+clusterdf = pd.read_csv(args.clusters, sep="\t")
+genelist = list(clusterdf["ID"])
+bamfile = pysam.AlignmentFile(args.bam, 'rb', index_filename=args.index)
+# for every gene in genelist, get all the reads that mapped to that gene from bamfile
+counts = []
+coverages = []
+for g in genelist:
+    count, coverage = gene_count(g, bamfile)
+    counts.append(count)
+    coverages.append(coverage)
+
+genedf = pd.DataFrame({"Gene": genelist, "Hits": counts, "Gene Fraction": coverages})
+genedf.set_index("Gene").to_csv(args.geneout, sep="\t")
+clusterdf = pd.merge(clusterdf, genedf.rename(columns={"Gene": "ID"}), on="ID").drop("ID", axis=1).groupby("Cluster").aggregate(np.max)
+clusterdf[clusterdf["Gene Fraction"] >= args.minprop].to_csv(args.clusterout, sep="\t")