small_rna_maps.py @ 0:6d48150495e3 (draft)
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_maps commit d4d8106d66b65679a1a685ab94bfcf99cdb7b959
author:   artbio
date:     Mon, 24 Jul 2017 06:28:45 -0400
parents:  (none)
children: 40972a8dfab9
import argparse
from collections import defaultdict

import numpy

import pysam


def Parser():
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument('--input', dest='input', required=True,
                            nargs='+', help='input BAM files')
    the_parser.add_argument('--sample_name', dest='sample_name',
                            required=True, nargs='+', help='sample name')
    the_parser.add_argument('--output', action='store',
                            type=str, help='output tabular file')
    the_parser.add_argument('-S', '--sizes', action='store',
                            help='use to output read sizes dataframe')
    args = the_parser.parse_args()
    return args


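# Example invocation (a sketch; the BAM and output file names below are
# hypothetical, not part of the tool):
#
#     python small_rna_maps.py --input sample1.bam sample2.bam \
#                              --sample_name sample1 sample2 \
#                              --output maps.tab \
#                              -S sizes.tab
#
# --input and --sample_name accept one or more values (nargs='+');
# -S/--sizes is optional and adds a second, read-size table output.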
class Map:

    def __init__(self, bam_file, sample, computeSize=False):
        self.sample_name = sample
        self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
        self.chromosomes = dict(zip(self.bam_object.references,
                                    self.bam_object.lengths))
        self.map_dict = self.create_map(self.bam_object)
        self.max = self.compute_max(self.map_dict)
        self.mean = self.compute_mean(self.map_dict)
        self.median = self.compute_median(self.map_dict)
        self.coverage = self.compute_coverage(self.map_dict)
        if computeSize:
            self.size = self.compute_size(self.map_dict)

    def create_map(self, bam_object):
        '''
        Returns a map_dictionary {(chromosome,read_position,polarity):
                                  [read_length, ...]}
        '''
        map_dictionary = defaultdict(list)
        # get empty value for start and end of each chromosome
        for chrom in self.chromosomes:
            map_dictionary[(chrom, 1, 'F')] = []
            map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
        for chrom in self.chromosomes:
            for read in bam_object.fetch(chrom):
                positions = read.positions  # a list of covered positions
                # register every covered position (both polarities) so that
                # it gets a row in the output, even if no read starts there
                for pos in positions:
                    if not map_dictionary[(chrom, pos+1, 'F')]:
                        map_dictionary[(chrom, pos+1, 'F')] = []
                    if not map_dictionary[(chrom, pos+1, 'R')]:
                        map_dictionary[(chrom, pos+1, 'R')] = []
                if read.is_reverse:
                    map_dictionary[(chrom, positions[-1]+1,
                                    'R')].append(read.query_alignment_length)
                else:
                    map_dictionary[(chrom, positions[0]+1,
                                    'F')].append(read.query_alignment_length)
        return map_dictionary

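    # Illustration (hypothetical values) of the structure create_map returns;
    # forward reads are keyed on their leftmost aligned base and reverse reads
    # on their rightmost aligned base, both in 1-based coordinates:
    #
    #     {('chr1', 1, 'F'): [],              # chromosome start placeholder
    #      ('chr1', 101, 'F'): [21, 22, 21],  # three forward reads starting at 101
    #      ('chr1', 250, 'R'): [23]}          # one reverse read ending at 250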
    def compute_max(self, map_dictionary):
        '''
        takes a map_dictionary as input and returns
        a max_dictionary {chromosome:
                          max_number_of_reads_at_any_single_position}
        '''
        merge_keylist = [(i[0], 0) for i in map_dictionary.keys()]
        max_dictionary = dict(merge_keylist)
        for key in map_dictionary:
            if len(map_dictionary[key]) > max_dictionary[key[0]]:
                max_dictionary[key[0]] = len(map_dictionary[key])
        return max_dictionary

    def compute_mean(self, map_dictionary):
        '''
        takes a map_dictionary as input and returns
        a mean_dictionary {(chromosome,read_position,polarity):
                           mean_read_length} (0 when no read maps there)
        '''
        mean_dictionary = dict()
        for key in map_dictionary:
            if len(map_dictionary[key]) == 0:
                mean_dictionary[key] = 0
            else:
                mean_dictionary[key] = round(numpy.mean(map_dictionary[key]),
                                             1)
        return mean_dictionary

    def compute_median(self, map_dictionary):
        '''
        takes a map_dictionary as input and returns
        a median_dictionary {(chromosome,read_position,polarity):
                             median_read_length} (0 when no read maps there)
        '''
        median_dictionary = dict()
        for key in map_dictionary:
            if len(map_dictionary[key]) == 0:
                median_dictionary[key] = 0
            else:
                median_dictionary[key] = numpy.median(map_dictionary[key])
        return median_dictionary

    def compute_coverage(self, map_dictionary, quality=10):
        '''
        takes a map_dictionary as input and returns
        a coverage_dictionary {(chromosome,read_position,polarity):
                               coverage}
        '''
        coverage_dictionary = dict()
        for chrom in self.chromosomes:
            coverage_dictionary[(chrom, 1, 'F')] = 0
            coverage_dictionary[(chrom, self.chromosomes[chrom], 'F')] = 0

        for key in map_dictionary:
            coverage = self.bam_object.count_coverage(
                reference=key[0],
                start=key[1]-1,
                end=key[1],
                quality_threshold=quality)
            # count_coverage returns four arrays (A, C, G, T counts);
            # sum them to get the total depth at this single position
            coverage = [sum(x) for x in zip(*coverage)]
            coverage_dictionary[key] = coverage[0]
            # coverage_dictionary[(key[0], key[1], 'R')] = coverage
        return coverage_dictionary

    def compute_size(self, map_dictionary):
        '''
        Takes a map_dictionary and returns a dictionary of sizes:
        {chrom: {polarity: {size: number_of_reads}}}
        '''
        size_dictionary = defaultdict(lambda: defaultdict(
            lambda: defaultdict(int)))
        # to track empty chromosomes
        for chrom in self.chromosomes:
            if self.bam_object.count(chrom) == 0:
                size_dictionary[chrom]['F'][10] = 0
        for key in map_dictionary:
            for size in map_dictionary[key]:
                size_dictionary[key[0]][key[2]][size] += 1
        return size_dictionary

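    # Illustration (hypothetical values) of the structure compute_size returns:
    #
    #     {'chr1': {'F': {21: 3, 22: 1},   # 3 forward reads of 21 nt, 1 of 22 nt
    #               'R': {23: 2}},         # 2 reverse reads of 23 nt
    #      'chr2': {'F': {10: 0}}}         # placeholder for a chromosome with no read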
    def write_size_table(self, out):
        '''
        Dataset, Chromosome, Polarity, Size, Nbr_reads
        out is an *open* file handler
        '''
        for chrom in sorted(self.size):
            # collect all sizes seen on either strand of this chromosome
            # (list() keeps this working under both Python 2 and Python 3)
            sizes = list(self.size[chrom]['F'].keys())
            sizes.extend(self.size[chrom]['R'].keys())
            for polarity in sorted(self.size[chrom]):
                for size in range(min(sizes), max(sizes)+1):
                    try:
                        line = [self.sample_name, chrom, polarity, size,
                                self.size[chrom][polarity][size]]
                    except KeyError:
                        line = [self.sample_name, chrom, polarity, size, 0]
                    line = [str(i) for i in line]
                    out.write('\t'.join(line) + '\n')

    def write_table(self, out):
        '''
        Dataset, Chromosome, Chrom_length, Coordinate, Nbr_reads,
        Polarity, Max, Mean, Median, Coverage
        out is an *open* file handler
        '''
        for key in sorted(self.map_dict):
            line = [self.sample_name, key[0], self.chromosomes[key[0]],
                    key[1], len(self.map_dict[key]), key[2], self.max[key[0]],
                    self.mean[key], self.median[key], self.coverage[key]]
            line = [str(i) for i in line]
            out.write('\t'.join(line) + '\n')


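# A hypothetical line of the main output table, in the column order produced
# by Map.write_table (values are illustrative only; fields are tab-separated
# in the actual output):
#
#     sample_1  chr1  23011544  101  3  F  12  21.3  21.0  3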
def main(inputs, samples, file_out, size_file_out=''):
    F = open(file_out, 'w')
    header = ["Dataset", "Chromosome", "Chrom_length", "Coordinate",
              "Nbr_reads", "Polarity", "Max", "Mean", "Median", "Coverage"]
    F.write('\t'.join(header) + '\n')
    if size_file_out:
        Fs = open(size_file_out, 'w')
        header = ["Dataset", "Chromosome", "Polarity", "Size", "Nbr_reads"]
        Fs.write('\t'.join(header) + '\n')
        for file, sample in zip(inputs, samples):
            mapobj = Map(file, sample, computeSize=True)
            mapobj.write_table(F)
            mapobj.write_size_table(Fs)
        Fs.close()
    else:
        for file, sample in zip(inputs, samples):
            mapobj = Map(file, sample, computeSize=False)
            mapobj.write_table(F)
    F.close()


if __name__ == "__main__":
    args = Parser()
    # if sample names are not unique, append an index to make them unique
    if len(set(args.sample_name)) != len(args.sample_name):
        args.sample_name = [name + '_' + str(i) for
                            i, name in enumerate(args.sample_name)]
    main(args.input, args.sample_name, args.output, args.sizes)
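# A minimal sketch of using the Map class directly from Python (file names
# below are hypothetical; the BAM file must be coordinate-sorted and indexed
# so that the pysam fetch()/count() calls work):
#
#     mapobj = Map('aligned.bam', 'sample_1', computeSize=True)
#     with open('maps.tab', 'w') as out:
#         mapobj.write_table(out)
#     with open('sizes.tab', 'w') as out:
#         mapobj.write_size_table(out)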