changeset 0:c516cb51d12c draft

Uploaded
author jshay
date Tue, 13 Aug 2019 15:24:15 -0400
parents
children 1a2a9b1cfd93
files countandcluster/.shed.yml countandcluster/countandcluster.py countandcluster/countandcluster.xml countandcluster/test-data/clusterout.tsv countandcluster/test-data/geneout.tsv countandcluster/test-data/megares_subset.tsv countandcluster/test-data/test.bam countandcluster/test-data/test.bam.bai
diffstat 8 files changed, 157 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/countandcluster/.shed.yml	Tue Aug 13 15:24:15 2019 -0400
@@ -0,0 +1,9 @@
+name: hitsandclusters
+owner: jashay90
+description: Count hits per gene and cluster
+long_description: Takes a bam alignment file and a tab-delimited file defining the cluster identity of each reference sequence as input. Produces two output files -- a file with counts and proportion covered for each reference sequence, and a file with the maximum count and proportion covered within each cluster.
+remote_repository_url: https://github.com/jashay90/galaxytools/tree/master/countandcluster
+type: unrestricted
+categories:
+  - Metagenomics
+  - Sequence Analysis
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/countandcluster/countandcluster.py	Tue Aug 13 15:24:15 2019 -0400
@@ -0,0 +1,46 @@
+# Julie Shay
+# July 22, 2019
+# This script makes two output files from a bam alignment file: a file with gene counts/coverages
+# and a file with cluster counts/coverages based on the most abundant gene per cluster
+import pysam
+import pandas as pd
+import numpy as np
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-b', dest='bam', type=str, help="""bam alignment file""")
+parser.add_argument('-i', dest='index', type=str, help="""index for bam file""")
+parser.add_argument('-c', dest='clusters', type=str, required=True, help="""Tab-delimited file with genes and clusters""")
+parser.add_argument('-p', dest='minprop', type=float, default=0, help="""minimum proportion of cluster covered""")
+parser.add_argument('-g', dest='geneout', type=str, default="geneout.tsv", help="""output file for gene counts""")
+parser.add_argument('-o', dest='clusterout', type=str, default="clusterout.tsv", help="""output file for cluster counts""")
+args = parser.parse_args()
+
+def gene_count(gene, bam):
+    read_count = bamfile.count(contig=gene, read_callback='all')
+    cov_counts = bamfile.count_coverage(contig=gene)
+    cov_number = 0
+    total_number = 0
+    for a, c, g, t in zip(*cov_counts):   # doesn't matter whether the order is right
+        total_number = total_number + 1
+        if (((a + g) + c) + t) != 0:
+            cov_number = cov_number + 1
+    prop_covered = float(cov_number) / float(total_number)
+    return read_count, prop_covered
+
+
+clusterdf = pd.read_csv(args.clusters, sep="\t")
+genelist = list(clusterdf["ID"])
+bamfile = pysam.AlignmentFile(args.bam, 'rb', index_filename=args.index)
+# for every gene in genelist, get all the reads that mapped to that gene from bamfile
+counts = []
+coverages = []
+for g in genelist:
+    count, coverage = gene_count(g, bamfile)
+    counts.append(count)
+    coverages.append(coverage)
+
+genedf = pd.DataFrame({"Gene": genelist, "Hits": counts, "Gene Fraction": coverages})
+genedf.set_index("Gene").to_csv(args.geneout, sep="\t")
+clusterdf = pd.merge(clusterdf, genedf.rename(columns={"Gene": "ID"}), on="ID").drop("ID", axis=1).groupby("Cluster").aggregate(np.max)
+clusterdf[clusterdf["Gene Fraction"] >= args.minprop].to_csv(args.clusterout, sep="\t")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/countandcluster/countandcluster.xml	Tue Aug 13 15:24:15 2019 -0400
@@ -0,0 +1,54 @@
+<tool id="hitsandclusters" name="Count hits per gene and cluster" version="0.1.0">
+	<requirements>
+	<requirement type="package" version="3.6.7">python</requirement>	
+	<requirement type="package" version="0.15.2">pysam</requirement>
+	<requirement type="package" version="0.24.1">pandas</requirement>
+	<requirement type="package" version="1.15.4">numpy</requirement>	
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+	    python '$__tool_directory__/countandcluster.py' -c '$clusters' -b '$bam' -p '$prop' -g '$out1' -i '${bam.metadata.bam_index}' -o '$out2'
+    ]]></command>
+    <inputs>
+	    <param type="data" name="clusters" format="tabular" />
+	    <param type="data" name="bam" format="bam" />
+	    <param type="float" name="prop" value="0.9" />
+    </inputs>
+    <outputs>
+	    <data name="out1" format="tabular" />
+	    <data name="out2" format="tabular" />
+    </outputs>
+    <tests>
+	    <test>
+		    <param name="clusters" value="megares_subset.tsv"/>
+		    <param name="bam" value="test.bam"/>
+		    <param name="prop" value="0.5"/>
+		    <output name="out1" file="geneout.tsv"/>
+		    <output name="out2" file="clusterout.tsv"/>
+	    </test>
+    </tests>
+    <help><![CDATA[
+        usage: countandcluster.py [-h] [-b BAM] [-i INDEX] -c CLUSTERS [-p MINPROP] [-g GENEOUT] [-o CLUSTEROUT]
+
+optional arguments:
+  -h, --help  show this help message and exit
+  -b BAM          bam alignment file
+  -i INDEX	bam index
+  -c CLUSTERS     Tab-delimited file with genes and clusters
+  -p MINPROP      minimum proportion of cluster covered
+  -g GENEOUT     output file for gene counts
+  -o CLUSTEROUT  output file for cluster counts
+
+  ]]></help>
+    <citations>
+	     <citation type="bibtex">
+@misc{githubhitsandclusters,
+  author = {LastTODO, FirstTODO},
+  year = {TODO},
+  title = {hitsandclusters},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  url = {https://github.com/jashay90/galaxytools},
+}</citation>
+    </citations>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/countandcluster/test-data/clusterout.tsv	Tue Aug 13 15:24:15 2019 -0400
@@ -0,0 +1,2 @@
+Cluster	Class	Group	Hits	Gene Fraction
+1	MLS	ERMB	7	0.5460704607046071
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/countandcluster/test-data/geneout.tsv	Tue Aug 13 15:24:15 2019 -0400
@@ -0,0 +1,23 @@
+Gene	Hits	Gene Fraction
+1066|unknown_id|unknown_name|MLS|23S_rRNA_methyltransferases|ERMB	5	0.45121951219512196
+1479|HF953351.1|HF953351|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH6	1	0.05854241338112306
+554|FN806789.1|FN806789|MLS|23S_rRNA_methyltransferases|ERMB	5	0.4024390243902439
+605|V01547.1|V01547|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH3-PRIME	3	0.21635220125786164
+AGly|strA_4_NC_003384|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH3-DPRIME	1	0.03980099502487562
+Bla|cfxA6|GQ342996|798-1793|966|betalactams|Class_A_betalactamases|CFX	1	0.07429718875502007
+CARD|phgb|AM180355|2319373-2320111|ARO:3000375|ErmB|MLS|23S_rRNA_methyltransferases|ERM	7	0.45331529093369416
+MLS|ermB_11_M19270|MLS|23S_rRNA_methyltransferases|ERMB	4	0.31978319783197834
+MLS|ermB_12_U18931|MLS|23S_rRNA_methyltransferases|ERMB	7	0.5460704607046071
+MLS|ermB_15U48430|MLS|23S_rRNA_methyltransferases|ERMB	1	0.12749003984063745
+MLS|ermB_16_X82819|MLS|23S_rRNA_methyltransferases|ERMB	1	0.13685636856368563
+MLS|ermB_17_X64695|MLS|23S_rRNA_methyltransferases|ERMB	4	0.4268292682926829
+MLS|ermB_18_X66468|MLS|23S_rRNA_methyltransferases|ERMB	6	0.24146981627296588
+MLS|ermB_1_JN899585|MLS|23S_rRNA_methyltransferases|ERMB	3	0.34146341463414637
+MLS|ermB_20_AF109075|MLS|23S_rRNA_methyltransferases|ERMB	2	0.27371273712737126
+MLS|ermB_2_K00551|MLS|23S_rRNA_methyltransferases|ERMB	1	0.13685636856368563
+MLS|ermB_6_AF242872|MLS|23S_rRNA_methyltransferases|ERMB	5	0.42704149933065594
+MLS|ermB_7_AF368302|MLS|23S_rRNA_methyltransferases|ERMB	4	0.28270042194092826
+MLS|ermB_9_AF299292|MLS|23S_rRNA_methyltransferases|ERMB	3	0.2926829268292683
+Tet|EU434751.1|gene2|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TETW	1	0.03252452245740836
+Tet|JQ740052.1|gene2|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TET40	1	0.0475020475020475
+Tet|tetW_5_AJ427421|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TETW	1	0.030208333333333334
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/countandcluster/test-data/megares_subset.tsv	Tue Aug 13 15:24:15 2019 -0400
@@ -0,0 +1,23 @@
+ID	Cluster	Class	Group
+1066|unknown_id|unknown_name|MLS|23S_rRNA_methyltransferases|ERMB	1	MLS	ERMB
+1479|HF953351.1|HF953351|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH6	2	Aminoglycosides	APH6
+554|FN806789.1|FN806789|MLS|23S_rRNA_methyltransferases|ERMB	1	MLS	ERMB
+605|V01547.1|V01547|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH3-PRIME	2	Aminoglycosides	APH3-PRIME
+AGly|strA_4_NC_003384|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH3-DPRIME	2	Aminoglycosides	APH3-DPRIME
+Bla|cfxA6|GQ342996|798-1793|966|betalactams|Class_A_betalactamases|CFX	3	betalactams	CFX
+CARD|phgb|AM180355|2319373-2320111|ARO:3000375|ErmB|MLS|23S_rRNA_methyltransferases|ERM	1	MLS	ERM
+MLS|ermB_11_M19270|MLS|23S_rRNA_methyltransferases|ERMB	1	MLS	ERMB
+MLS|ermB_12_U18931|MLS|23S_rRNA_methyltransferases|ERMB	1	MLS	ERMB
+MLS|ermB_15U48430|MLS|23S_rRNA_methyltransferases|ERMB	1	MLS	ERMB
+MLS|ermB_16_X82819|MLS|23S_rRNA_methyltransferases|ERMB	1	MLS	ERMB
+MLS|ermB_17_X64695|MLS|23S_rRNA_methyltransferases|ERMB	1	MLS	ERMB
+MLS|ermB_18_X66468|MLS|23S_rRNA_methyltransferases|ERMB	1	MLS	ERMB
+MLS|ermB_1_JN899585|MLS|23S_rRNA_methyltransferases|ERMB	1	MLS	ERMB
+MLS|ermB_20_AF109075|MLS|23S_rRNA_methyltransferases|ERMB	1	MLS	ERMB
+MLS|ermB_2_K00551|MLS|23S_rRNA_methyltransferases|ERMB	1	MLS	ERMB
+MLS|ermB_6_AF242872|MLS|23S_rRNA_methyltransferases|ERMB	1	MLS	ERMB
+MLS|ermB_7_AF368302|MLS|23S_rRNA_methyltransferases|ERMB	1	MLS	ERMB
+MLS|ermB_9_AF299292|MLS|23S_rRNA_methyltransferases|ERMB	1	MLS	ERMB
+Tet|EU434751.1|gene2|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TETW	4	Tetracyclines	TETW
+Tet|JQ740052.1|gene2|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TET40	4	Tetracyclines	TET40
+Tet|tetW_5_AJ427421|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TETW	4	Tetracyclines	TETW
Binary file countandcluster/test-data/test.bam has changed
Binary file countandcluster/test-data/test.bam.bai has changed