Mercurial > repos > jshay > hitsandclusters
changeset 0:c516cb51d12c draft
Uploaded
author | jshay |
---|---|
date | Tue, 13 Aug 2019 15:24:15 -0400 |
parents | |
children | 1a2a9b1cfd93 |
files | countandcluster/.shed.yml countandcluster/countandcluster.py countandcluster/countandcluster.xml countandcluster/test-data/clusterout.tsv countandcluster/test-data/geneout.tsv countandcluster/test-data/megares_subset.tsv countandcluster/test-data/test.bam countandcluster/test-data/test.bam.bai |
diffstat | 8 files changed, 157 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countandcluster/.shed.yml Tue Aug 13 15:24:15 2019 -0400 @@ -0,0 +1,9 @@ +name: hitsandclusters +owner: jashay90 +description: Count hits per gene and cluster +long_description: Takes a bam alignment file and a tab-delimited file defining the cluster identity of each reference sequence as input. Produces two output files -- a file with counts and proportion covered for each reference sequence, and a file with the maximum count and proportion covered within each cluster. +remote_repository_url: https://github.com/jashay90/galaxytools/tree/master/countandcluster +type: unrestricted +categories: + - Metagenomics + - Sequence Analysis
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countandcluster/countandcluster.py Tue Aug 13 15:24:15 2019 -0400 @@ -0,0 +1,46 @@ +# Julie Shay +# July 22, 2019 +# This script makes two output files from a bam alignment file: a file with gene counts/coverages +# and a file with cluster counts/coverages based on the most abundant gene per cluster +import pysam +import pandas as pd +import numpy as np +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument('-b', dest='bam', type=str, help="""bam alignment file""") +parser.add_argument('-i', dest='index', type=str, help="""index for bam file""") +parser.add_argument('-c', dest='clusters', type=str, required=True, help="""Tab-delimited file with genes and clusters""") +parser.add_argument('-p', dest='minprop', type=float, default=0, help="""minimum proportion of cluster covered""") +parser.add_argument('-g', dest='geneout', type=str, default="geneout.tsv", help="""output file for gene counts""") +parser.add_argument('-o', dest='clusterout', type=str, default="clusterout.tsv", help="""output file for cluster counts""") +args = parser.parse_args() + +def gene_count(gene, bam): + read_count = bamfile.count(contig=gene, read_callback='all') + cov_counts = bamfile.count_coverage(contig=gene) + cov_number = 0 + total_number = 0 + for a, c, g, t in zip(*cov_counts): # doesn't matter whether the order is right + total_number = total_number + 1 + if (((a + g) + c) + t) != 0: + cov_number = cov_number + 1 + prop_covered = float(cov_number) / float(total_number) + return read_count, prop_covered + + +clusterdf = pd.read_csv(args.clusters, sep="\t") +genelist = list(clusterdf["ID"]) +bamfile = pysam.AlignmentFile(args.bam, 'rb', index_filename=args.index) +# for every gene in genelist, get all the reads that mapped to that gene from bamfile +counts = [] +coverages = [] +for g in genelist: + count, coverage = gene_count(g, bamfile) + counts.append(count) + coverages.append(coverage) + +genedf = pd.DataFrame({"Gene": genelist, "Hits": counts, "Gene Fraction": coverages}) +genedf.set_index("Gene").to_csv(args.geneout, sep="\t") +clusterdf = pd.merge(clusterdf, genedf.rename(columns={"Gene": "ID"}), on="ID").drop("ID", axis=1).groupby("Cluster").aggregate(np.max) +clusterdf[clusterdf["Gene Fraction"] >= args.minprop].to_csv(args.clusterout, sep="\t")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countandcluster/countandcluster.xml Tue Aug 13 15:24:15 2019 -0400 @@ -0,0 +1,54 @@ +<tool id="hitsandclusters" name="Count hits per gene and cluster" version="0.1.0"> + <requirements> + <requirement type="package" version="3.6.7">python</requirement> + <requirement type="package" version="0.15.2">pysam</requirement> + <requirement type="package" version="0.24.1">pandas</requirement> + <requirement type="package" version="1.15.4">numpy</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + python '$__tool_directory__/countandcluster.py' -c '$clusters' -b '$bam' -p '$prop' -g '$out1' -i '${bam.metadata.bam_index}' -o '$out2' + ]]></command> + <inputs> + <param type="data" name="clusters" format="tabular" /> + <param type="data" name="bam" format="bam" /> + <param type="float" name="prop" value="0.9" /> + </inputs> + <outputs> + <data name="out1" format="tabular" /> + <data name="out2" format="tabular" /> + </outputs> + <tests> + <test> + <param name="clusters" value="megares_subset.tsv"/> + <param name="bam" value="test.bam"/> + <param name="prop" value="0.5"/> + <output name="out1" file="geneout.tsv"/> + <output name="out2" file="clusterout.tsv"/> + </test> + </tests> + <help><![CDATA[ + usage: countandcluster.py [-h] [-b BAM] [-i INDEX] -c CLUSTERS [-p MINPROP] [-g GENEOUT] [-o CLUSTEROUT] + +optional arguments: + -h, --help show this help message and exit + -b BAM bam alignment file + -i INDEX bam index + -c CLUSTERS Tab-delimited file with genes and clusters + -p MINPROP minimum proportion of cluster covered + -g GENEOUT output file for gene counts + -o CLUSTEROUT output file for cluster counts + + ]]></help> + <citations> + <citation type="bibtex"> +@misc{githubhitsandclusters, + author = {LastTODO, FirstTODO}, + year = {TODO}, + title = {hitsandclusters}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/jashay90/galaxytools}, +}</citation> + </citations> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countandcluster/test-data/clusterout.tsv Tue Aug 13 15:24:15 2019 -0400 @@ -0,0 +1,2 @@ +Cluster Class Group Hits Gene Fraction +1 MLS ERMB 7 0.5460704607046071
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countandcluster/test-data/geneout.tsv Tue Aug 13 15:24:15 2019 -0400 @@ -0,0 +1,23 @@ +Gene Hits Gene Fraction +1066|unknown_id|unknown_name|MLS|23S_rRNA_methyltransferases|ERMB 5 0.45121951219512196 +1479|HF953351.1|HF953351|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH6 1 0.05854241338112306 +554|FN806789.1|FN806789|MLS|23S_rRNA_methyltransferases|ERMB 5 0.4024390243902439 +605|V01547.1|V01547|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH3-PRIME 3 0.21635220125786164 +AGly|strA_4_NC_003384|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH3-DPRIME 1 0.03980099502487562 +Bla|cfxA6|GQ342996|798-1793|966|betalactams|Class_A_betalactamases|CFX 1 0.07429718875502007 +CARD|phgb|AM180355|2319373-2320111|ARO:3000375|ErmB|MLS|23S_rRNA_methyltransferases|ERM 7 0.45331529093369416 +MLS|ermB_11_M19270|MLS|23S_rRNA_methyltransferases|ERMB 4 0.31978319783197834 +MLS|ermB_12_U18931|MLS|23S_rRNA_methyltransferases|ERMB 7 0.5460704607046071 +MLS|ermB_15U48430|MLS|23S_rRNA_methyltransferases|ERMB 1 0.12749003984063745 +MLS|ermB_16_X82819|MLS|23S_rRNA_methyltransferases|ERMB 1 0.13685636856368563 +MLS|ermB_17_X64695|MLS|23S_rRNA_methyltransferases|ERMB 4 0.4268292682926829 +MLS|ermB_18_X66468|MLS|23S_rRNA_methyltransferases|ERMB 6 0.24146981627296588 +MLS|ermB_1_JN899585|MLS|23S_rRNA_methyltransferases|ERMB 3 0.34146341463414637 +MLS|ermB_20_AF109075|MLS|23S_rRNA_methyltransferases|ERMB 2 0.27371273712737126 +MLS|ermB_2_K00551|MLS|23S_rRNA_methyltransferases|ERMB 1 0.13685636856368563 +MLS|ermB_6_AF242872|MLS|23S_rRNA_methyltransferases|ERMB 5 0.42704149933065594 +MLS|ermB_7_AF368302|MLS|23S_rRNA_methyltransferases|ERMB 4 0.28270042194092826 +MLS|ermB_9_AF299292|MLS|23S_rRNA_methyltransferases|ERMB 3 0.2926829268292683 +Tet|EU434751.1|gene2|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TETW 1 0.03252452245740836 +Tet|JQ740052.1|gene2|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TET40 1 0.0475020475020475 +Tet|tetW_5_AJ427421|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TETW 1 0.030208333333333334
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countandcluster/test-data/megares_subset.tsv Tue Aug 13 15:24:15 2019 -0400 @@ -0,0 +1,23 @@ +ID Cluster Class Group +1066|unknown_id|unknown_name|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +1479|HF953351.1|HF953351|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH6 2 Aminoglycosides APH6 +554|FN806789.1|FN806789|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +605|V01547.1|V01547|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH3-PRIME 2 Aminoglycosides APH3-PRIME +AGly|strA_4_NC_003384|Aminoglycosides|Aminoglycoside_O-phosphotransferases|APH3-DPRIME 2 Aminoglycosides APH3-DPRIME +Bla|cfxA6|GQ342996|798-1793|966|betalactams|Class_A_betalactamases|CFX 3 betalactams CFX +CARD|phgb|AM180355|2319373-2320111|ARO:3000375|ErmB|MLS|23S_rRNA_methyltransferases|ERM 1 MLS ERM +MLS|ermB_11_M19270|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_12_U18931|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_15U48430|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_16_X82819|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_17_X64695|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_18_X66468|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_1_JN899585|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_20_AF109075|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_2_K00551|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_6_AF242872|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_7_AF368302|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +MLS|ermB_9_AF299292|MLS|23S_rRNA_methyltransferases|ERMB 1 MLS ERMB +Tet|EU434751.1|gene2|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TETW 4 Tetracyclines TETW +Tet|JQ740052.1|gene2|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TET40 4 Tetracyclines TET40 +Tet|tetW_5_AJ427421|Tetracyclines|Tetracycline_resistance_ribosomal_protection_proteins|TETW 4 Tetracyclines TETW