annotate countandcluster/countandcluster.py @ 1:1a2a9b1cfd93 draft default tip

Uploaded
author jshay
date Tue, 20 Aug 2019 11:23:10 -0400
parents c516cb51d12c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c516cb51d12c Uploaded
jshay
parents:
diff changeset
1 # Julie Shay
c516cb51d12c Uploaded
jshay
parents:
diff changeset
2 # July 22, 2019
c516cb51d12c Uploaded
jshay
parents:
diff changeset
3 # This script makes two output files from a bam alignment file: a file with gene counts/coverages
c516cb51d12c Uploaded
jshay
parents:
diff changeset
4 # and a file with cluster counts/coverages based on the most abundant gene per cluster
c516cb51d12c Uploaded
jshay
parents:
diff changeset
5 import pysam
c516cb51d12c Uploaded
jshay
parents:
diff changeset
6 import pandas as pd
c516cb51d12c Uploaded
jshay
parents:
diff changeset
7 import numpy as np
c516cb51d12c Uploaded
jshay
parents:
diff changeset
8 import argparse
c516cb51d12c Uploaded
jshay
parents:
diff changeset
9
c516cb51d12c Uploaded
jshay
parents:
diff changeset
10 parser = argparse.ArgumentParser()
c516cb51d12c Uploaded
jshay
parents:
diff changeset
11 parser.add_argument('-b', dest='bam', type=str, help="""bam alignment file""")
c516cb51d12c Uploaded
jshay
parents:
diff changeset
12 parser.add_argument('-i', dest='index', type=str, help="""index for bam file""")
c516cb51d12c Uploaded
jshay
parents:
diff changeset
13 parser.add_argument('-c', dest='clusters', type=str, required=True, help="""Tab-delimited file with genes and clusters""")
c516cb51d12c Uploaded
jshay
parents:
diff changeset
14 parser.add_argument('-p', dest='minprop', type=float, default=0, help="""minimum proportion of cluster covered""")
c516cb51d12c Uploaded
jshay
parents:
diff changeset
15 parser.add_argument('-g', dest='geneout', type=str, default="geneout.tsv", help="""output file for gene counts""")
c516cb51d12c Uploaded
jshay
parents:
diff changeset
16 parser.add_argument('-o', dest='clusterout', type=str, default="clusterout.tsv", help="""output file for cluster counts""")
c516cb51d12c Uploaded
jshay
parents:
diff changeset
17 args = parser.parse_args()
c516cb51d12c Uploaded
jshay
parents:
diff changeset
18
c516cb51d12c Uploaded
jshay
parents:
diff changeset
19 def gene_count(gene, bam):
c516cb51d12c Uploaded
jshay
parents:
diff changeset
20 read_count = bamfile.count(contig=gene, read_callback='all')
c516cb51d12c Uploaded
jshay
parents:
diff changeset
21 cov_counts = bamfile.count_coverage(contig=gene)
c516cb51d12c Uploaded
jshay
parents:
diff changeset
22 cov_number = 0
c516cb51d12c Uploaded
jshay
parents:
diff changeset
23 total_number = 0
c516cb51d12c Uploaded
jshay
parents:
diff changeset
24 for a, c, g, t in zip(*cov_counts): # doesn't matter whether the order is right
c516cb51d12c Uploaded
jshay
parents:
diff changeset
25 total_number = total_number + 1
c516cb51d12c Uploaded
jshay
parents:
diff changeset
26 if (((a + g) + c) + t) != 0:
c516cb51d12c Uploaded
jshay
parents:
diff changeset
27 cov_number = cov_number + 1
c516cb51d12c Uploaded
jshay
parents:
diff changeset
28 prop_covered = float(cov_number) / float(total_number)
c516cb51d12c Uploaded
jshay
parents:
diff changeset
29 return read_count, prop_covered
c516cb51d12c Uploaded
jshay
parents:
diff changeset
30
c516cb51d12c Uploaded
jshay
parents:
diff changeset
31
c516cb51d12c Uploaded
jshay
parents:
diff changeset
32 clusterdf = pd.read_csv(args.clusters, sep="\t")
c516cb51d12c Uploaded
jshay
parents:
diff changeset
33 genelist = list(clusterdf["ID"])
c516cb51d12c Uploaded
jshay
parents:
diff changeset
34 bamfile = pysam.AlignmentFile(args.bam, 'rb', index_filename=args.index)
c516cb51d12c Uploaded
jshay
parents:
diff changeset
35 # for every gene in genelist, get all the reads that mapped to that gene from bamfile
c516cb51d12c Uploaded
jshay
parents:
diff changeset
36 counts = []
c516cb51d12c Uploaded
jshay
parents:
diff changeset
37 coverages = []
c516cb51d12c Uploaded
jshay
parents:
diff changeset
38 for g in genelist:
c516cb51d12c Uploaded
jshay
parents:
diff changeset
39 count, coverage = gene_count(g, bamfile)
c516cb51d12c Uploaded
jshay
parents:
diff changeset
40 counts.append(count)
c516cb51d12c Uploaded
jshay
parents:
diff changeset
41 coverages.append(coverage)
c516cb51d12c Uploaded
jshay
parents:
diff changeset
42
c516cb51d12c Uploaded
jshay
parents:
diff changeset
43 genedf = pd.DataFrame({"Gene": genelist, "Hits": counts, "Gene Fraction": coverages})
c516cb51d12c Uploaded
jshay
parents:
diff changeset
44 genedf.set_index("Gene").to_csv(args.geneout, sep="\t")
c516cb51d12c Uploaded
jshay
parents:
diff changeset
45 clusterdf = pd.merge(clusterdf, genedf.rename(columns={"Gene": "ID"}), on="ID").drop("ID", axis=1).groupby("Cluster").aggregate(np.max)
c516cb51d12c Uploaded
jshay
parents:
diff changeset
46 clusterdf[clusterdf["Gene Fraction"] >= args.minprop].to_csv(args.clusterout, sep="\t")