0
|
1 # Julie Shay
|
|
2 # July 22, 2019
|
|
3 # This script makes two output files from a bam alignment file: a file with gene counts/coverages
|
|
4 # and a file with cluster counts/coverages based on the most abundant gene per cluster
|
|
5 import pysam
|
|
6 import pandas as pd
|
|
7 import numpy as np
|
|
8 import argparse
|
|
9
|
|
10 parser = argparse.ArgumentParser()
|
|
11 parser.add_argument('-b', dest='bam', type=str, help="""bam alignment file""")
|
|
12 parser.add_argument('-i', dest='index', type=str, help="""index for bam file""")
|
|
13 parser.add_argument('-c', dest='clusters', type=str, required=True, help="""Tab-delimited file with genes and clusters""")
|
|
14 parser.add_argument('-p', dest='minprop', type=float, default=0, help="""minimum proportion of cluster covered""")
|
|
15 parser.add_argument('-g', dest='geneout', type=str, default="geneout.tsv", help="""output file for gene counts""")
|
|
16 parser.add_argument('-o', dest='clusterout', type=str, default="clusterout.tsv", help="""output file for cluster counts""")
|
|
17 args = parser.parse_args()
|
|
18
|
|
19 def gene_count(gene, bam):
|
|
20 read_count = bamfile.count(contig=gene, read_callback='all')
|
|
21 cov_counts = bamfile.count_coverage(contig=gene)
|
|
22 cov_number = 0
|
|
23 total_number = 0
|
|
24 for a, c, g, t in zip(*cov_counts): # doesn't matter whether the order is right
|
|
25 total_number = total_number + 1
|
|
26 if (((a + g) + c) + t) != 0:
|
|
27 cov_number = cov_number + 1
|
|
28 prop_covered = float(cov_number) / float(total_number)
|
|
29 return read_count, prop_covered
|
|
30
|
|
31
|
|
32 clusterdf = pd.read_csv(args.clusters, sep="\t")
|
|
33 genelist = list(clusterdf["ID"])
|
|
34 bamfile = pysam.AlignmentFile(args.bam, 'rb', index_filename=args.index)
|
|
35 # for every gene in genelist, get all the reads that mapped to that gene from bamfile
|
|
36 counts = []
|
|
37 coverages = []
|
|
38 for g in genelist:
|
|
39 count, coverage = gene_count(g, bamfile)
|
|
40 counts.append(count)
|
|
41 coverages.append(coverage)
|
|
42
|
|
43 genedf = pd.DataFrame({"Gene": genelist, "Hits": counts, "Gene Fraction": coverages})
|
|
44 genedf.set_index("Gene").to_csv(args.geneout, sep="\t")
|
|
45 clusterdf = pd.merge(clusterdf, genedf.rename(columns={"Gene": "ID"}), on="ID").drop("ID", axis=1).groupby("Cluster").aggregate(np.max)
|
|
46 clusterdf[clusterdf["Gene Fraction"] >= args.minprop].to_csv(args.clusterout, sep="\t")
|