Mercurial > repos > jshay > hitsandclusters
comparison countandcluster/countandcluster.py @ 0:c516cb51d12c draft
Uploaded
author | jshay |
---|---|
date | Tue, 13 Aug 2019 15:24:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c516cb51d12c |
---|---|
1 # Julie Shay | |
2 # July 22, 2019 | |
3 # This script makes two output files from a bam alignment file: a file with gene counts/coverages | |
4 # and a file with cluster counts/coverages based on the most abundant gene per cluster | |
5 import pysam | |
6 import pandas as pd | |
7 import numpy as np | |
8 import argparse | |
9 | |
10 parser = argparse.ArgumentParser() | |
11 parser.add_argument('-b', dest='bam', type=str, help="""bam alignment file""") | |
12 parser.add_argument('-i', dest='index', type=str, help="""index for bam file""") | |
13 parser.add_argument('-c', dest='clusters', type=str, required=True, help="""Tab-delimited file with genes and clusters""") | |
14 parser.add_argument('-p', dest='minprop', type=float, default=0, help="""minimum proportion of cluster covered""") | |
15 parser.add_argument('-g', dest='geneout', type=str, default="geneout.tsv", help="""output file for gene counts""") | |
16 parser.add_argument('-o', dest='clusterout', type=str, default="clusterout.tsv", help="""output file for cluster counts""") | |
17 args = parser.parse_args() | |
18 | |
19 def gene_count(gene, bam): | |
20 read_count = bamfile.count(contig=gene, read_callback='all') | |
21 cov_counts = bamfile.count_coverage(contig=gene) | |
22 cov_number = 0 | |
23 total_number = 0 | |
24 for a, c, g, t in zip(*cov_counts): # doesn't matter whether the order is right | |
25 total_number = total_number + 1 | |
26 if (((a + g) + c) + t) != 0: | |
27 cov_number = cov_number + 1 | |
28 prop_covered = float(cov_number) / float(total_number) | |
29 return read_count, prop_covered | |
30 | |
31 | |
32 clusterdf = pd.read_csv(args.clusters, sep="\t") | |
33 genelist = list(clusterdf["ID"]) | |
34 bamfile = pysam.AlignmentFile(args.bam, 'rb', index_filename=args.index) | |
35 # for every gene in genelist, get all the reads that mapped to that gene from bamfile | |
36 counts = [] | |
37 coverages = [] | |
38 for g in genelist: | |
39 count, coverage = gene_count(g, bamfile) | |
40 counts.append(count) | |
41 coverages.append(coverage) | |
42 | |
43 genedf = pd.DataFrame({"Gene": genelist, "Hits": counts, "Gene Fraction": coverages}) | |
44 genedf.set_index("Gene").to_csv(args.geneout, sep="\t") | |
45 clusterdf = pd.merge(clusterdf, genedf.rename(columns={"Gene": "ID"}), on="ID").drop("ID", axis=1).groupby("Cluster").aggregate(np.max) | |
46 clusterdf[clusterdf["Gene Fraction"] >= args.minprop].to_csv(args.clusterout, sep="\t") |