Mercurial > repos > artbio > small_rna_maps
comparison small_rna_maps.py @ 2:507383cce5a8 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_maps commit edbb53cb13b52bf8e71c562fa8acc2c3be2fb270
author | artbio |
---|---|
date | Mon, 14 Aug 2017 05:52:34 -0400 |
parents | 40972a8dfab9 |
children | ed8b0142538d |
comparison
equal
deleted
inserted
replaced
1:40972a8dfab9 | 2:507383cce5a8 |
---|---|
6 import pysam | 6 import pysam |
7 | 7 |
8 | 8 |
9 def Parser(): | 9 def Parser(): |
10 the_parser = argparse.ArgumentParser() | 10 the_parser = argparse.ArgumentParser() |
11 the_parser.add_argument('--input', dest='input', required=True, | 11 the_parser.add_argument('--inputs', dest='inputs', required=True, |
12 nargs='+', help='input BAM files') | 12 nargs='+', help='list of input BAM files') |
13 the_parser.add_argument('--sample_name', dest='sample_name', | 13 the_parser.add_argument('--sample_names', dest='sample_names', |
14 required=True, nargs='+', help='sample name') | 14 required=True, nargs='+', |
15 the_parser.add_argument('--output', action='store', | 15 help='list of sample names') |
16 type=str, help='output tabular file') | 16 the_parser.add_argument('--outputs', nargs='+', action='store', |
17 the_parser.add_argument('-S', '--sizes', action='store', | 17 help='list of two output paths (only two)') |
18 help='use to output read sizes dataframe') | 18 the_parser.add_argument('-M', '--plot_methods', nargs='+', action='store', |
19 help='list of 2 plot methods (only two) among:\ | |
20 Counts, Max, Mean, Median, Coverage and Size') | |
19 args = the_parser.parse_args() | 21 args = the_parser.parse_args() |
20 return args | 22 return args |
21 | 23 |
22 | 24 |
23 class Map: | 25 class Map: |
24 | 26 |
25 def __init__(self, bam_file, sample, computeSize=False): | 27 def __init__(self, bam_file, sample): |
26 self.sample_name = sample | 28 self.sample_name = sample |
27 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') | 29 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') |
28 self.chromosomes = dict(zip(self.bam_object.references, | 30 self.chromosomes = dict(zip(self.bam_object.references, |
29 self.bam_object.lengths)) | 31 self.bam_object.lengths)) |
30 self.map_dict = self.create_map(self.bam_object) | 32 self.map_dict = self.create_map(self.bam_object) |
31 self.max = self.compute_max(self.map_dict) | |
32 self.mean = self.compute_mean(self.map_dict) | |
33 self.median = self.compute_median(self.map_dict) | |
34 self.coverage = self.compute_coverage(self.map_dict) | |
35 if computeSize: | |
36 self.size = self.compute_size(self.map_dict) | |
37 | 33 |
38 def create_map(self, bam_object): | 34 def create_map(self, bam_object): |
39 ''' | 35 ''' |
40 Returns a map_dictionary {(chromosome,read_position,polarity): | 36 Returns a map_dictionary {(chromosome,read_position,polarity): |
41 [read_length, ...]} | 37 [read_length, ...]} |
57 else: | 53 else: |
58 map_dictionary[(chrom, positions[0]+1, | 54 map_dictionary[(chrom, positions[0]+1, |
59 'F')].append(read.query_alignment_length) | 55 'F')].append(read.query_alignment_length) |
60 return map_dictionary | 56 return map_dictionary |
61 | 57 |
62 def compute_max(self, map_dictionary): | 58 def compute_map(self, map_dictionary, out): |
63 ''' | 59 ''' |
64 takes a map_dictionary as input and returns | 60 takes a map_dictionary as input and writes |
61 a readmap_dictionary {(chromosome,read_position,polarity): | |
62 number_of_reads} | |
63 in an open file handler out | |
64 ''' | |
65 readmap_dictionary = dict() | |
66 for key in map_dictionary: | |
67 readmap_dictionary[key] = len(map_dictionary[key]) | |
68 self.write_table(readmap_dictionary, out) | |
69 | |
70 def compute_max(self, map_dictionary, out): | |
71 ''' | |
72 takes a map_dictionary as input and writes | |
65 a max_dictionary {(chromosome,read_position,polarity): | 73 a max_dictionary {(chromosome,read_position,polarity): |
66 max_of_number_of_read_at_any_position} | 74 max_of_number_of_read_at_any_position} |
75 Not clear this function is still required | |
67 ''' | 76 ''' |
68 merge_keylist = [(i[0], 0) for i in map_dictionary.keys()] | 77 merge_keylist = [(i[0], 0) for i in map_dictionary.keys()] |
69 max_dictionary = dict(merge_keylist) | 78 max_dictionary = dict(merge_keylist) |
70 for key in map_dictionary: | 79 for key in map_dictionary: |
71 if len(map_dictionary[key]) > max_dictionary[key[0]]: | 80 if len(map_dictionary[key]) > max_dictionary[key[0]]: |
72 max_dictionary[key[0]] = len(map_dictionary[key]) | 81 max_dictionary[key[0]] = len(map_dictionary[key]) |
73 return max_dictionary | 82 self.write_table(max_dictionary, out) |
74 | 83 |
75 def compute_mean(self, map_dictionary): | 84 def compute_mean(self, map_dictionary, out): |
76 ''' | 85 ''' |
77 takes a map_dictionary as input and returns | 86 takes a map_dictionary as input and returns |
78 a mean_dictionary {(chromosome,read_position,polarity): | 87 a mean_dictionary {(chromosome,read_position,polarity): |
79 mean_value_of_reads} | 88 mean_value_of_reads} |
80 ''' | 89 ''' |
83 if len(map_dictionary[key]) == 0: | 92 if len(map_dictionary[key]) == 0: |
84 mean_dictionary[key] = 0 | 93 mean_dictionary[key] = 0 |
85 else: | 94 else: |
86 mean_dictionary[key] = round(numpy.mean(map_dictionary[key]), | 95 mean_dictionary[key] = round(numpy.mean(map_dictionary[key]), |
87 1) | 96 1) |
88 return mean_dictionary | 97 self.write_table(mean_dictionary, out) |
89 | 98 |
90 def compute_median(self, map_dictionary): | 99 def compute_median(self, map_dictionary, out): |
91 ''' | 100 ''' |
92 takes a map_dictionary as input and returns | 101 takes a map_dictionary as input and returns |
93 a mean_dictionary {(chromosome,read_position,polarity): | 102 a mean_dictionary {(chromosome,read_position,polarity): |
94 mean_value_of_reads} | 103 mean_value_of_reads} |
95 ''' | 104 ''' |
97 for key in map_dictionary: | 106 for key in map_dictionary: |
98 if len(map_dictionary[key]) == 0: | 107 if len(map_dictionary[key]) == 0: |
99 median_dictionary[key] = 0 | 108 median_dictionary[key] = 0 |
100 else: | 109 else: |
101 median_dictionary[key] = numpy.median(map_dictionary[key]) | 110 median_dictionary[key] = numpy.median(map_dictionary[key]) |
102 return median_dictionary | 111 self.write_table(median_dictionary, out) |
103 | 112 |
104 def compute_coverage(self, map_dictionary, quality=10): | 113 def compute_coverage(self, map_dictionary, out, quality=10): |
105 ''' | 114 ''' |
106 takes a map_dictionary as input and returns | 115 takes a map_dictionary as input and returns |
107 a coverage_dictionary {(chromosome,read_position,polarity): | 116 a coverage_dictionary {(chromosome,read_position,polarity): |
108 coverage} | 117 coverage} |
109 ''' | 118 ''' |
118 end=key[1], | 127 end=key[1], |
119 quality_threshold=quality) | 128 quality_threshold=quality) |
120 """ Add the 4 coverage values """ | 129 """ Add the 4 coverage values """ |
121 coverage = [sum(x) for x in zip(*coverage)] | 130 coverage = [sum(x) for x in zip(*coverage)] |
122 coverage_dictionary[key] = coverage[0] | 131 coverage_dictionary[key] = coverage[0] |
123 # coverage_dictionary[(key[0], key[1], 'R')] = coverage | 132 self.write_table(coverage_dictionary, out) |
124 return coverage_dictionary | 133 |
125 | 134 def compute_size(self, map_dictionary, out): |
126 def compute_size(self, map_dictionary): | |
127 ''' | 135 ''' |
128 Takes a map_dictionary and returns a dictionary of sizes: | 136 Takes a map_dictionary and returns a dictionary of sizes: |
129 {chrom: {polarity: {size: nbre of reads}}} | 137 {chrom: {polarity: {size: nbre of reads}}} |
130 ''' | 138 ''' |
131 size_dictionary = defaultdict(lambda: defaultdict( | 139 size_dictionary = defaultdict(lambda: defaultdict( |
135 if self.bam_object.count(chrom) == 0: | 143 if self.bam_object.count(chrom) == 0: |
136 size_dictionary[chrom]['F'][10] = 0 | 144 size_dictionary[chrom]['F'][10] = 0 |
137 for key in map_dictionary: | 145 for key in map_dictionary: |
138 for size in map_dictionary[key]: | 146 for size in map_dictionary[key]: |
139 size_dictionary[key[0]][key[2]][size] += 1 | 147 size_dictionary[key[0]][key[2]][size] += 1 |
140 return size_dictionary | 148 self.write_size_table(size_dictionary, out) |
141 | 149 |
142 def write_size_table(self, out): | 150 def write_table(self, mapdict, out): |
143 ''' | 151 ''' |
144 Dataset, Chromosome, Polarity, Size, Nbr_reads | 152 Generic writer |
153 Dataset, Chromosome, Chrom_length, Coordinate, Polarity, | |
154 <some mapped value> | |
145 out is an *open* file handler | 155 out is an *open* file handler |
146 ''' | 156 ''' |
147 for chrom in sorted(self.size): | 157 for key in sorted(mapdict): |
148 sizes = self.size[chrom]['F'].keys() | 158 line = [self.sample_name, key[0], self.chromosomes[key[0]], |
149 sizes.extend(self.size[chrom]['R'].keys()) | 159 key[1], key[2], mapdict[key]] |
150 for polarity in sorted(self.size[chrom]): | 160 line = [str(i) for i in line] |
161 out.write('\t'.join(line) + '\n') | |
162 | |
163 def write_size_table(self, sizedic, out): | |
164 ''' | |
165 Generic writer of summary values | |
166 Dataset, Chromosome, Chrom_length, <some category>, <some value> | |
167 out is an *open* file handler | |
168 ''' | |
169 for chrom in sorted(sizedic): | |
170 sizes = sizedic[chrom]['F'].keys() | |
171 sizes.extend(sizedic[chrom]['R'].keys()) | |
172 for polarity in sorted(sizedic[chrom]): | |
151 for size in range(min(sizes), max(sizes)+1): | 173 for size in range(min(sizes), max(sizes)+1): |
152 try: | 174 try: |
153 line = [self.sample_name, chrom, polarity, size, | 175 line = [self.sample_name, chrom, polarity, size, |
154 self.size[chrom][polarity][size]] | 176 sizedic[chrom][polarity][size]] |
155 except KeyError: | 177 except KeyError: |
156 line = [self.sample_name, chrom, polarity, size, 0] | 178 line = [self.sample_name, chrom, polarity, size, 0] |
157 line = [str(i) for i in line] | 179 line = [str(i) for i in line] |
158 out.write('\t'.join(line) + '\n') | 180 out.write('\t'.join(line) + '\n') |
159 | 181 |
160 def write_table(self, out): | 182 |
161 ''' | 183 def main(inputs, samples, methods, outputs): |
162 Dataset, Chromosome, Chrom_length, Coordinate, Nbr_reads | 184 for method, output in zip(methods, outputs): |
163 Polarity, Max, Mean, Median, Coverage | 185 F = open(output, 'w') |
164 out is an *open* file handler | 186 if method == 'Size': |
165 ''' | 187 header = ["Dataset", "Chromosome", "Polarity", method, "Count"] |
166 for key in sorted(self.map_dict): | 188 else: |
167 line = [self.sample_name, key[0], self.chromosomes[key[0]], | 189 header = ["Dataset", "Chromosome", "Chrom_length", "Coordinate", |
168 key[1], len(self.map_dict[key]), key[2], self.max[key[0]], | 190 "Polarity", method] |
169 self.mean[key], self.median[key], self.coverage[key]] | 191 F.write('\t'.join(header) + '\n') |
170 line = [str(i) for i in line] | 192 for input, sample in zip(inputs, samples): |
171 out.write('\t'.join(line) + '\n') | 193 mapobj = Map(input, sample) |
172 | 194 token = {"Counts": mapobj.compute_map, |
173 | 195 "Max": mapobj.compute_max, |
174 def main(inputs, samples, file_out, size_file_out=''): | 196 "Mean": mapobj.compute_mean, |
175 F = open(file_out, 'w') | 197 "Median": mapobj.compute_median, |
176 header = ["Dataset", "Chromosome", "Chrom_length", "Coordinate", | 198 "Coverage": mapobj.compute_coverage, |
177 "Nbr_reads", "Polarity", "Max", "Mean", "Median", "Coverage"] | 199 "Size": mapobj.compute_size} |
178 F.write('\t'.join(header) + '\n') | 200 token[method](mapobj.map_dict, F) |
179 if size_file_out: | |
180 Fs = open(size_file_out, 'w') | |
181 header = ["Dataset", "Chromosome", "Polarity", "Size", "Nbr_reads"] | |
182 Fs.write('\t'.join(header) + '\n') | |
183 for file, sample in zip(inputs, samples): | |
184 mapobj = Map(file, sample, computeSize=True) | |
185 mapobj.write_table(F) | |
186 mapobj.write_size_table(Fs) | |
187 Fs.close() | |
188 else: | |
189 for file, sample in zip(inputs, samples): | |
190 mapobj = Map(file, sample, computeSize=False) | |
191 mapobj.write_table(F) | |
192 F.close() | 201 F.close() |
193 | 202 |
194 | 203 |
195 if __name__ == "__main__": | 204 if __name__ == "__main__": |
196 args = Parser() | 205 args = Parser() |
197 # if identical sample names | 206 # if identical sample names |
198 if len(set(args.sample_name)) != len(args.sample_name): | 207 if len(set(args.sample_names)) != len(args.sample_names): |
199 args.sample_name = [name + '_' + str(i) for | 208 args.sample_names = [name + '_' + str(i) for |
200 i, name in enumerate(args.sample_name)] | 209 i, name in enumerate(args.sample_names)] |
201 main(args.input, args.sample_name, args.output, args.sizes) | 210 main(args.inputs, args.sample_names, args.plot_methods, args.outputs) |