# HG changeset patch # User jshay # Date 1565724255 14400 # Node ID c516cb51d12cab8509e69a92665023fd08138533 Uploaded diff -r 000000000000 -r c516cb51d12c countandcluster/.shed.yml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countandcluster/.shed.yml Tue Aug 13 15:24:15 2019 -0400 @@ -0,0 +1,9 @@ +name: hitsandclusters +owner: jashay90 +description: Count hits per gene and cluster +long_description: Takes a bam alignment file and a tab-delimited file defining the cluster identity of each reference sequence as input. Produces two output files -- a file with counts and proportion covered for each reference sequence, and a file with the maximum count and proportion covered within each cluster. +remote_repository_url: https://github.com/jashay90/galaxytools/tree/master/countandcluster +type: unrestricted +categories: + - Metagenomics + - Sequence Analysis diff -r 000000000000 -r c516cb51d12c countandcluster/countandcluster.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countandcluster/countandcluster.py Tue Aug 13 15:24:15 2019 -0400 @@ -0,0 +1,46 @@ +# Julie Shay +# July 22, 2019 +# This script makes two output files from a bam alignment file: a file with gene counts/coverages +# and a file with cluster counts/coverages based on the most abundant gene per cluster +import pysam +import pandas as pd +import numpy as np +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument('-b', dest='bam', type=str, help="""bam alignment file""") +parser.add_argument('-i', dest='index', type=str, help="""index for bam file""") +parser.add_argument('-c', dest='clusters', type=str, required=True, help="""Tab-delimited file with genes and clusters""") +parser.add_argument('-p', dest='minprop', type=float, default=0, help="""minimum proportion of cluster covered""") +parser.add_argument('-g', dest='geneout', type=str, default="geneout.tsv", help="""output file for gene counts""") +parser.add_argument('-o', dest='clusterout', type=str, default="clusterout.tsv", help="""output file for cluster counts""") +args = parser.parse_args() + +def gene_count(gene, bam): + read_count = bamfile.count(contig=gene, read_callback='all') + cov_counts = bamfile.count_coverage(contig=gene) + cov_number = 0 + total_number = 0 + for a, c, g, t in zip(*cov_counts): # doesn't matter whether the order is right + total_number = total_number + 1 + if (((a + g) + c) + t) != 0: + cov_number = cov_number + 1 + prop_covered = float(cov_number) / float(total_number) + return read_count, prop_covered + + +clusterdf = pd.read_csv(args.clusters, sep="\t") +genelist = list(clusterdf["ID"]) +bamfile = pysam.AlignmentFile(args.bam, 'rb', index_filename=args.index) +# for every gene in genelist, get all the reads that mapped to that gene from bamfile +counts = [] +coverages = [] +for g in genelist: + count, coverage = gene_count(g, bamfile) + counts.append(count) + coverages.append(coverage) + +genedf = pd.DataFrame({"Gene": genelist, "Hits": counts, "Gene Fraction": coverages}) +genedf.set_index("Gene").to_csv(args.geneout, sep="\t") +clusterdf = pd.merge(clusterdf, genedf.rename(columns={"Gene": "ID"}), on="ID").drop("ID", axis=1).groupby("Cluster").aggregate(np.max) +clusterdf[clusterdf["Gene Fraction"] >= args.minprop].to_csv(args.clusterout, sep="\t") diff -r 000000000000 -r c516cb51d12c countandcluster/countandcluster.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countandcluster/countandcluster.xml Tue Aug 13 15:24:15 2019 -0400 @@ -0,0 +1,54 @@ + + + python + pysam + pandas + numpy + + + + + + + + + + + + + + + + + + + + + + + +@misc{githubhitsandclusters, + author = {LastTODO, FirstTODO}, + year = {TODO}, + title = {hitsandclusters}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/jashay90/galaxytools}, +} + + + diff -r 000000000000 -r c516cb51d12c countandcluster/test-data/clusterout.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countandcluster/test-data/clusterout.tsv Tue Aug 13 15:24:15 2019 -0400 @@ -0,0 +1,2 @@ +Cluster Class Group Hits Gene Fraction +1 MLS ERMB 7 0.5460704607046071 diff -r 000000000000 -r c516cb51d12c countandcluster/test-data/geneout.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countandcluster/test-data/geneout.tsv Tue Aug 13 15:24:15 2019 -0400 @@ -0,0 +1,23 @@ +Gene Hits Gene Fraction +1066|unknown_id|unknown_name|MLS|23S_rRNA_methyltransferases|ERMB 5 0.45121951219512196 +1479|HF953351.1|HF953351|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH6 1 0.05854241338112306 +554|FN806789.1|FN806789|MLS|23S_rRNA_methyltransferases|ERMB 5 0.4024390243902439 +605|V01547.1|V01547|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH3-PRIME 3 0.21635220125786164 +AGly|strA_4_NC_003384|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH3-DPRIME 1 0.03980099502487562 +Bla|cfxA6|GQ342996|798-1793|966|betalactams|Class_A_betalactamases|CFX 1 0.07429718875502007 +CARD|phgb|AM180355|2319373-2320111|ARO:3000375|ErmB|MLS|23S_rRNA_methyltransferases|ERM 7 0.45331529093369416 +MLS|ermB_11_M19270|MLS|23S_rRNA_methyltransferases|ERMB 4 0.31978319783197834 +MLS|ermB_12_U18931|MLS|23S_rRNA_methyltransferases|ERMB 7 0.5460704607046071 +MLS|ermB_15U48430|MLS|23S_rRNA_methyltransferases|ERMB 1 0.12749003984063745 +MLS|ermB_16_X82819|MLS|23S_rRNA_methyltransferases|ERMB 1 0.13685636856368563 +MLS|ermB_17_X64695|MLS|23S_rRNA_methyltransferases|ERMB 4 0.4268292682926829 +MLS|ermB_18_X66468|MLS|23S_rRNA_methyltransferases|ERMB 6 0.24146981627296588 +MLS|ermB_1_JN899585|MLS|23S_rRNA_methyltransferases|ERMB 3 0.34146341463414637 +MLS|ermB_20_AF109075|MLS|23S_rRNA_methyltransferases|ERMB 2 0.27371273712737126 +MLS|ermB_2_K00551|MLS|23S_rRNA_methyltransferases|ERMB 1 0.13685636856368563 +MLS|ermB_6_AF242872|MLS|23S_rRNA_methyltransferases|ERMB 5 0.42704149933065594 +MLS|ermB_7_AF368302|MLS|23S_rRNA_methyltransferases|ERMB 4 0.28270042194092826 +MLS|ermB_9_AF299292|MLS|23S_rRNA_methyltransferases|ERMB 3 0.2926829268292683 +Tet|EU434751.1|gene2|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TETW 1 0.03252452245740836 +Tet|JQ740052.1|gene2|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TET40 1 0.0475020475020475 +Tet|tetW_5_AJ427421|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TETW 1 0.030208333333333334 diff -r 000000000000 -r c516cb51d12c countandcluster/test-data/megares_subset.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countandcluster/test-data/megares_subset.tsv Tue Aug 13 15:24:15 2019 -0400 @@ -0,0 +1,23 @@ +ID Cluster Class Group +1066|unknown_id|unknown_name|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +1479|HF953351.1|HF953351|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH6 2 Aminoglycosides APH6 +554|FN806789.1|FN806789|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +605|V01547.1|V01547|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH3-PRIME 2 Aminoglycosides APH3-PRIME +AGly|strA_4_NC_003384|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH3-DPRIME 2 Aminoglycosides APH3-DPRIME +Bla|cfxA6|GQ342996|798-1793|966|betalactams|Class_A_betalactamases|CFX 3 betalactams CFX +CARD|phgb|AM180355|2319373-2320111|ARO:3000375|ErmB|MLS|23S_rRNA_methyltransferases|ERM 1 MLS ERM +MLS|ermB_11_M19270|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_12_U18931|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_15U48430|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_16_X82819|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_17_X64695|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_18_X66468|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_1_JN899585|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_20_AF109075|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_2_K00551|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_6_AF242872|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_7_AF368302|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_9_AF299292|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +Tet|EU434751.1|gene2|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TETW 4 Tetracyclines TETW +Tet|JQ740052.1|gene2|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TET40 4 Tetracyclines TET40 +Tet|tetW_5_AJ427421|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TETW 4 Tetracyclines TETW diff -r 000000000000 -r c516cb51d12c countandcluster/test-data/test.bam Binary file countandcluster/test-data/test.bam has changed diff -r 000000000000 -r c516cb51d12c countandcluster/test-data/test.bam.bai Binary file countandcluster/test-data/test.bam.bai has changed