diff mircounts.py @ 0:da29af78a960 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit d4d8106d66b65679a1a685ab94bfcf99cdb7b959
author artbio
date Mon, 24 Jul 2017 06:27:50 -0400
parents
children da1aa7de2b19
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mircounts.py	Mon Jul 24 06:27:50 2017 -0400
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+import argparse
+
+import pysam
+
+
+def Parser():
+    parser = argparse.ArgumentParser(description='miRNAs counts and coverages')
+    parser.add_argument('-a', '--alignment', metavar='FILE', type=str,
+                        dest='alignment_file', help='Alignment bam file')
+    parser.add_argument('--gff', metavar='FILE', type=str, dest='gff_file',
+                        help='GFF3 describing both pre-miRNAs\
+                              and mature miRNAs')
+    parser.add_argument('-q', '--quality_threshold', type=int,
+                        dest='quality_threshold',
+                        help='Quality threshold for coverage (default=10)',
+                        default=10)
+    parser.add_argument('-p', '--pre_mirs', type=str, dest='pre_mirs',
+                        help='pre-miRNAs count file path', metavar='FILE')
+    parser.add_argument('-m', '--mirs', type=str, dest='mirs',
+                        help='mature miRNA count file path', metavar='FILE')
+    parser.add_argument('--lattice', metavar='FILE', type=str, dest='lattice',
+                        help='Output file for the lattice dataframe.')
+    args = parser.parse_args()
+    return args
+
+
+def get_pre_mir_counts(bamfile):
+    """
+    Takes a AlignmentFile object and returns a dictionary of counts for reads
+    aligning with pre_mirs (as keys)
+    """
+    count = dict()
+    for ref_name in bamfile.references:
+        count[ref_name] = bamfile.count(reference=ref_name)
+    return count
+
+
+def get_pre_mir_coverage(bamfile, quality=10):
+    """
+    Takes a AlignmentFile object and returns a dictionary of lists
+    of coverage along the coordinates of pre_mirs (as keys)
+    """
+    coverage = dict()
+    for ref_name, ref_len in zip(bamfile.references, bamfile.lengths):
+        coverage[ref_name] = bamfile.count_coverage(reference=ref_name,
+                                                    start=0, end=ref_len,
+                                                    quality_threshold=quality)
+        """ Add the 4 coverage values """
+        coverage[ref_name] = [sum(x) for x in
+                              zip(*coverage[ref_name])]
+    return coverage
+
+
+def get_mir_counts(bamfile, gff_file):
+    """
+    Takes a AlignmentFile and a gff file and computes for
+    each 'miRNA' region of the gff the number of reads that hit it
+    returns a dict[mir_name] = count
+    """
+    counts = dict()
+    for line in open(gff_file, 'r'):
+        if line[0] != '#':
+            gff_fields = line[:-1].split("\t")
+            if gff_fields[2] == 'miRNA':
+                mir_name = gff_fields[0]
+                premir_name = gff_fields[8].split('=')[-1]
+                mir_start = int(gff_fields[3])
+                mir_end = int(gff_fields[4])
+                # GFF is 1-based, pysam is 0-based.
+                counts[mir_name] = bamfile.count(reference=premir_name,
+                                                 start=mir_start-1,
+                                                 end=mir_end-1)
+    return counts
+
+
+def write_dataframe_coverage(countdict, outfile):
+    """
+    Takes a dict[pre_mir reference name] = [coverage list]
+    and writes a dataframe with columns:
+    <gene_type name>, offset, normoffset, counts and normcounts
+    in the outfile
+    """
+    F = open(outfile, 'w')
+    F.write('Mir_hairpin\tOffset\tNorm_offset\tCount\tNorm_count\n')
+    for ref in sorted(countdict):
+        """
+        For each reference name in mirs,
+        write the coverage of each of its positions
+        """
+        maximum = max(countdict[ref])
+        reference_length = len(countdict[ref])
+        for pos, c in enumerate(countdict[ref]):
+            """ Compute and write value for each reference position"""
+            F.write('%s\t%s\t%s\t%s\t%s\n' % (ref, str(pos + 1),
+                    str(float(pos+1)/reference_length), str(float(c)),
+                    str(float(c)/maximum) if maximum != 0 else '0'))
+    F.close()
+
+
+def write_counts(countdict, outfile):
+    """
+    Takes a dict[<gene_type name>]=count and
+    writes a count table
+    """
+    F = open(outfile, 'w')
+    F.write('Gene\tCounts\n')
+    for gene in sorted(countdict):
+        F.write('%s\t%s\n' % (gene, str(countdict[gene])))
+    F.close()
+
+
+def main():
+    args = Parser()
+    bamfile = pysam.AlignmentFile(args.alignment_file, 'rb', check_sq=False)
+    if args.pre_mirs:
+        pre_mirs = get_pre_mir_counts(bamfile)
+        write_counts(pre_mirs, args.pre_mirs)
+        if args.lattice:
+            pre_mirs_coverage = get_pre_mir_coverage(bamfile,
+                                                     args.quality_threshold)
+            write_dataframe_coverage(pre_mirs_coverage, args.lattice)
+    if args.mirs:
+        mirs = get_mir_counts(bamfile, args.gff_file)
+        write_counts(mirs, args.mirs)
+
+
+if __name__ == '__main__':
+    main()