comparison mircounts.py @ 0:da29af78a960 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit d4d8106d66b65679a1a685ab94bfcf99cdb7b959
author artbio
date Mon, 24 Jul 2017 06:27:50 -0400
parents
children da1aa7de2b19
comparison
equal deleted inserted replaced
-1:000000000000 0:da29af78a960
1 #!/usr/bin/env python
2 import argparse
3
4 import pysam
5
6
7 def Parser():
8 parser = argparse.ArgumentParser(description='miRNAs counts and coverages')
9 parser.add_argument('-a', '--alignment', metavar='FILE', type=str,
10 dest='alignment_file', help='Alignment bam file')
11 parser.add_argument('--gff', metavar='FILE', type=str, dest='gff_file',
12 help='GFF3 describing both pre-miRNAs\
13 and mature miRNAs')
14 parser.add_argument('-q', '--quality_threshold', type=int,
15 dest='quality_threshold',
16 help='Quality threshold for coverage (default=10)',
17 default=10)
18 parser.add_argument('-p', '--pre_mirs', type=str, dest='pre_mirs',
19 help='pre-miRNAs count file path', metavar='FILE')
20 parser.add_argument('-m', '--mirs', type=str, dest='mirs',
21 help='mature miRNA count file path', metavar='FILE')
22 parser.add_argument('--lattice', metavar='FILE', type=str, dest='lattice',
23 help='Output file for the lattice dataframe.')
24 args = parser.parse_args()
25 return args
26
27
28 def get_pre_mir_counts(bamfile):
29 """
30 Takes a AlignmentFile object and returns a dictionary of counts for reads
31 aligning with pre_mirs (as keys)
32 """
33 count = dict()
34 for ref_name in bamfile.references:
35 count[ref_name] = bamfile.count(reference=ref_name)
36 return count
37
38
39 def get_pre_mir_coverage(bamfile, quality=10):
40 """
41 Takes a AlignmentFile object and returns a dictionary of lists
42 of coverage along the coordinates of pre_mirs (as keys)
43 """
44 coverage = dict()
45 for ref_name, ref_len in zip(bamfile.references, bamfile.lengths):
46 coverage[ref_name] = bamfile.count_coverage(reference=ref_name,
47 start=0, end=ref_len,
48 quality_threshold=quality)
49 """ Add the 4 coverage values """
50 coverage[ref_name] = [sum(x) for x in
51 zip(*coverage[ref_name])]
52 return coverage
53
54
55 def get_mir_counts(bamfile, gff_file):
56 """
57 Takes a AlignmentFile and a gff file and computes for
58 each 'miRNA' region of the gff the number of reads that hit it
59 returns a dict[mir_name] = count
60 """
61 counts = dict()
62 for line in open(gff_file, 'r'):
63 if line[0] != '#':
64 gff_fields = line[:-1].split("\t")
65 if gff_fields[2] == 'miRNA':
66 mir_name = gff_fields[0]
67 premir_name = gff_fields[8].split('=')[-1]
68 mir_start = int(gff_fields[3])
69 mir_end = int(gff_fields[4])
70 # GFF is 1-based, pysam is 0-based.
71 counts[mir_name] = bamfile.count(reference=premir_name,
72 start=mir_start-1,
73 end=mir_end-1)
74 return counts
75
76
77 def write_dataframe_coverage(countdict, outfile):
78 """
79 Takes a dict[pre_mir reference name] = [coverage list]
80 and writes a dataframe with columns:
81 <gene_type name>, offset, normoffset, counts and normcounts
82 in the outfile
83 """
84 F = open(outfile, 'w')
85 F.write('Mir_hairpin\tOffset\tNorm_offset\tCount\tNorm_count\n')
86 for ref in sorted(countdict):
87 """
88 For each reference name in mirs,
89 write the coverage of each of its positions
90 """
91 maximum = max(countdict[ref])
92 reference_length = len(countdict[ref])
93 for pos, c in enumerate(countdict[ref]):
94 """ Compute and write value for each reference position"""
95 F.write('%s\t%s\t%s\t%s\t%s\n' % (ref, str(pos + 1),
96 str(float(pos+1)/reference_length), str(float(c)),
97 str(float(c)/maximum) if maximum != 0 else '0'))
98 F.close()
99
100
101 def write_counts(countdict, outfile):
102 """
103 Takes a dict[<gene_type name>]=count and
104 writes a count table
105 """
106 F = open(outfile, 'w')
107 F.write('Gene\tCounts\n')
108 for gene in sorted(countdict):
109 F.write('%s\t%s\n' % (gene, str(countdict[gene])))
110 F.close()
111
112
113 def main():
114 args = Parser()
115 bamfile = pysam.AlignmentFile(args.alignment_file, 'rb', check_sq=False)
116 if args.pre_mirs:
117 pre_mirs = get_pre_mir_counts(bamfile)
118 write_counts(pre_mirs, args.pre_mirs)
119 if args.lattice:
120 pre_mirs_coverage = get_pre_mir_coverage(bamfile,
121 args.quality_threshold)
122 write_dataframe_coverage(pre_mirs_coverage, args.lattice)
123 if args.mirs:
124 mirs = get_mir_counts(bamfile, args.gff_file)
125 write_counts(mirs, args.mirs)
126
127
128 if __name__ == '__main__':
129 main()