comparison small_rna_maps.py @ 2:507383cce5a8 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_maps commit edbb53cb13b52bf8e71c562fa8acc2c3be2fb270
author artbio
date Mon, 14 Aug 2017 05:52:34 -0400
parents 40972a8dfab9
children ed8b0142538d
comparison
equal deleted inserted replaced
1:40972a8dfab9 2:507383cce5a8
6 import pysam 6 import pysam
7 7
8 8
9 def Parser(): 9 def Parser():
10 the_parser = argparse.ArgumentParser() 10 the_parser = argparse.ArgumentParser()
11 the_parser.add_argument('--input', dest='input', required=True, 11 the_parser.add_argument('--inputs', dest='inputs', required=True,
12 nargs='+', help='input BAM files') 12 nargs='+', help='list of input BAM files')
13 the_parser.add_argument('--sample_name', dest='sample_name', 13 the_parser.add_argument('--sample_names', dest='sample_names',
14 required=True, nargs='+', help='sample name') 14 required=True, nargs='+',
15 the_parser.add_argument('--output', action='store', 15 help='list of sample names')
16 type=str, help='output tabular file') 16 the_parser.add_argument('--outputs', nargs='+', action='store',
17 the_parser.add_argument('-S', '--sizes', action='store', 17 help='list of two output paths (only two)')
18 help='use to output read sizes dataframe') 18 the_parser.add_argument('-M', '--plot_methods', nargs='+', action='store',
19 help='list of 2 plot methods (only two) among:\
20 Counts, Max, Mean, Median, Coverage and Size')
19 args = the_parser.parse_args() 21 args = the_parser.parse_args()
20 return args 22 return args
21 23
22 24
23 class Map: 25 class Map:
24 26
25 def __init__(self, bam_file, sample, computeSize=False): 27 def __init__(self, bam_file, sample):
26 self.sample_name = sample 28 self.sample_name = sample
27 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') 29 self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
28 self.chromosomes = dict(zip(self.bam_object.references, 30 self.chromosomes = dict(zip(self.bam_object.references,
29 self.bam_object.lengths)) 31 self.bam_object.lengths))
30 self.map_dict = self.create_map(self.bam_object) 32 self.map_dict = self.create_map(self.bam_object)
31 self.max = self.compute_max(self.map_dict)
32 self.mean = self.compute_mean(self.map_dict)
33 self.median = self.compute_median(self.map_dict)
34 self.coverage = self.compute_coverage(self.map_dict)
35 if computeSize:
36 self.size = self.compute_size(self.map_dict)
37 33
38 def create_map(self, bam_object): 34 def create_map(self, bam_object):
39 ''' 35 '''
40 Returns a map_dictionary {(chromosome,read_position,polarity): 36 Returns a map_dictionary {(chromosome,read_position,polarity):
41 [read_length, ...]} 37 [read_length, ...]}
57 else: 53 else:
58 map_dictionary[(chrom, positions[0]+1, 54 map_dictionary[(chrom, positions[0]+1,
59 'F')].append(read.query_alignment_length) 55 'F')].append(read.query_alignment_length)
60 return map_dictionary 56 return map_dictionary
61 57
62 def compute_max(self, map_dictionary): 58 def compute_map(self, map_dictionary, out):
63 ''' 59 '''
64 takes a map_dictionary as input and returns 60 takes a map_dictionary as input and writes
61 a readmap_dictionary {(chromosome,read_position,polarity):
62 number_of_reads}
63 in an open file handler out
64 '''
65 readmap_dictionary = dict()
66 for key in map_dictionary:
67 readmap_dictionary[key] = len(map_dictionary[key])
68 self.write_table(readmap_dictionary, out)
69
70 def compute_max(self, map_dictionary, out):
71 '''
72 takes a map_dictionary as input and writes
65 a max_dictionary {(chromosome,read_position,polarity): 73 a max_dictionary {(chromosome,read_position,polarity):
66 max_of_number_of_read_at_any_position} 74 max_of_number_of_read_at_any_position}
75 Not clear this function is still required
67 ''' 76 '''
68 merge_keylist = [(i[0], 0) for i in map_dictionary.keys()] 77 merge_keylist = [(i[0], 0) for i in map_dictionary.keys()]
69 max_dictionary = dict(merge_keylist) 78 max_dictionary = dict(merge_keylist)
70 for key in map_dictionary: 79 for key in map_dictionary:
71 if len(map_dictionary[key]) > max_dictionary[key[0]]: 80 if len(map_dictionary[key]) > max_dictionary[key[0]]:
72 max_dictionary[key[0]] = len(map_dictionary[key]) 81 max_dictionary[key[0]] = len(map_dictionary[key])
73 return max_dictionary 82 self.write_table(max_dictionary, out)
74 83
75 def compute_mean(self, map_dictionary): 84 def compute_mean(self, map_dictionary, out):
76 ''' 85 '''
77 takes a map_dictionary as input and returns 86 takes a map_dictionary as input and returns
78 a mean_dictionary {(chromosome,read_position,polarity): 87 a mean_dictionary {(chromosome,read_position,polarity):
79 mean_value_of_reads} 88 mean_value_of_reads}
80 ''' 89 '''
83 if len(map_dictionary[key]) == 0: 92 if len(map_dictionary[key]) == 0:
84 mean_dictionary[key] = 0 93 mean_dictionary[key] = 0
85 else: 94 else:
86 mean_dictionary[key] = round(numpy.mean(map_dictionary[key]), 95 mean_dictionary[key] = round(numpy.mean(map_dictionary[key]),
87 1) 96 1)
88 return mean_dictionary 97 self.write_table(mean_dictionary, out)
89 98
90 def compute_median(self, map_dictionary): 99 def compute_median(self, map_dictionary, out):
91 ''' 100 '''
92 takes a map_dictionary as input and returns 101 takes a map_dictionary as input and returns
93 a mean_dictionary {(chromosome,read_position,polarity): 102 a mean_dictionary {(chromosome,read_position,polarity):
94 mean_value_of_reads} 103 mean_value_of_reads}
95 ''' 104 '''
97 for key in map_dictionary: 106 for key in map_dictionary:
98 if len(map_dictionary[key]) == 0: 107 if len(map_dictionary[key]) == 0:
99 median_dictionary[key] = 0 108 median_dictionary[key] = 0
100 else: 109 else:
101 median_dictionary[key] = numpy.median(map_dictionary[key]) 110 median_dictionary[key] = numpy.median(map_dictionary[key])
102 return median_dictionary 111 self.write_table(median_dictionary, out)
103 112
104 def compute_coverage(self, map_dictionary, quality=10): 113 def compute_coverage(self, map_dictionary, out, quality=10):
105 ''' 114 '''
106 takes a map_dictionary as input and returns 115 takes a map_dictionary as input and returns
107 a coverage_dictionary {(chromosome,read_position,polarity): 116 a coverage_dictionary {(chromosome,read_position,polarity):
108 coverage} 117 coverage}
109 ''' 118 '''
118 end=key[1], 127 end=key[1],
119 quality_threshold=quality) 128 quality_threshold=quality)
120 """ Add the 4 coverage values """ 129 """ Add the 4 coverage values """
121 coverage = [sum(x) for x in zip(*coverage)] 130 coverage = [sum(x) for x in zip(*coverage)]
122 coverage_dictionary[key] = coverage[0] 131 coverage_dictionary[key] = coverage[0]
123 # coverage_dictionary[(key[0], key[1], 'R')] = coverage 132 self.write_table(coverage_dictionary, out)
124 return coverage_dictionary 133
125 134 def compute_size(self, map_dictionary, out):
126 def compute_size(self, map_dictionary):
127 ''' 135 '''
128 Takes a map_dictionary and returns a dictionary of sizes: 136 Takes a map_dictionary and returns a dictionary of sizes:
129 {chrom: {polarity: {size: nbre of reads}}} 137 {chrom: {polarity: {size: nbre of reads}}}
130 ''' 138 '''
131 size_dictionary = defaultdict(lambda: defaultdict( 139 size_dictionary = defaultdict(lambda: defaultdict(
135 if self.bam_object.count(chrom) == 0: 143 if self.bam_object.count(chrom) == 0:
136 size_dictionary[chrom]['F'][10] = 0 144 size_dictionary[chrom]['F'][10] = 0
137 for key in map_dictionary: 145 for key in map_dictionary:
138 for size in map_dictionary[key]: 146 for size in map_dictionary[key]:
139 size_dictionary[key[0]][key[2]][size] += 1 147 size_dictionary[key[0]][key[2]][size] += 1
140 return size_dictionary 148 self.write_size_table(size_dictionary, out)
141 149
142 def write_size_table(self, out): 150 def write_table(self, mapdict, out):
143 ''' 151 '''
144 Dataset, Chromosome, Polarity, Size, Nbr_reads 152 Generic writer
153 Dataset, Chromosome, Chrom_length, Coordinate, Polarity,
154 <some mapped value>
145 out is an *open* file handler 155 out is an *open* file handler
146 ''' 156 '''
147 for chrom in sorted(self.size): 157 for key in sorted(mapdict):
148 sizes = self.size[chrom]['F'].keys() 158 line = [self.sample_name, key[0], self.chromosomes[key[0]],
149 sizes.extend(self.size[chrom]['R'].keys()) 159 key[1], key[2], mapdict[key]]
150 for polarity in sorted(self.size[chrom]): 160 line = [str(i) for i in line]
161 out.write('\t'.join(line) + '\n')
162
163 def write_size_table(self, sizedic, out):
164 '''
165 Generic writer of summary values
166 Dataset, Chromosome, Chrom_length, <some category>, <some value>
167 out is an *open* file handler
168 '''
169 for chrom in sorted(sizedic):
170 sizes = sizedic[chrom]['F'].keys()
171 sizes.extend(sizedic[chrom]['R'].keys())
172 for polarity in sorted(sizedic[chrom]):
151 for size in range(min(sizes), max(sizes)+1): 173 for size in range(min(sizes), max(sizes)+1):
152 try: 174 try:
153 line = [self.sample_name, chrom, polarity, size, 175 line = [self.sample_name, chrom, polarity, size,
154 self.size[chrom][polarity][size]] 176 sizedic[chrom][polarity][size]]
155 except KeyError: 177 except KeyError:
156 line = [self.sample_name, chrom, polarity, size, 0] 178 line = [self.sample_name, chrom, polarity, size, 0]
157 line = [str(i) for i in line] 179 line = [str(i) for i in line]
158 out.write('\t'.join(line) + '\n') 180 out.write('\t'.join(line) + '\n')
159 181
160 def write_table(self, out): 182
161 ''' 183 def main(inputs, samples, methods, outputs):
162 Dataset, Chromosome, Chrom_length, Coordinate, Nbr_reads 184 for method, output in zip(methods, outputs):
163 Polarity, Max, Mean, Median, Coverage 185 F = open(output, 'w')
164 out is an *open* file handler 186 if method == 'Size':
165 ''' 187 header = ["Dataset", "Chromosome", "Polarity", method, "Count"]
166 for key in sorted(self.map_dict): 188 else:
167 line = [self.sample_name, key[0], self.chromosomes[key[0]], 189 header = ["Dataset", "Chromosome", "Chrom_length", "Coordinate",
168 key[1], len(self.map_dict[key]), key[2], self.max[key[0]], 190 "Polarity", method]
169 self.mean[key], self.median[key], self.coverage[key]] 191 F.write('\t'.join(header) + '\n')
170 line = [str(i) for i in line] 192 for input, sample in zip(inputs, samples):
171 out.write('\t'.join(line) + '\n') 193 mapobj = Map(input, sample)
172 194 token = {"Counts": mapobj.compute_map,
173 195 "Max": mapobj.compute_max,
174 def main(inputs, samples, file_out, size_file_out=''): 196 "Mean": mapobj.compute_mean,
175 F = open(file_out, 'w') 197 "Median": mapobj.compute_median,
176 header = ["Dataset", "Chromosome", "Chrom_length", "Coordinate", 198 "Coverage": mapobj.compute_coverage,
177 "Nbr_reads", "Polarity", "Max", "Mean", "Median", "Coverage"] 199 "Size": mapobj.compute_size}
178 F.write('\t'.join(header) + '\n') 200 token[method](mapobj.map_dict, F)
179 if size_file_out:
180 Fs = open(size_file_out, 'w')
181 header = ["Dataset", "Chromosome", "Polarity", "Size", "Nbr_reads"]
182 Fs.write('\t'.join(header) + '\n')
183 for file, sample in zip(inputs, samples):
184 mapobj = Map(file, sample, computeSize=True)
185 mapobj.write_table(F)
186 mapobj.write_size_table(Fs)
187 Fs.close()
188 else:
189 for file, sample in zip(inputs, samples):
190 mapobj = Map(file, sample, computeSize=False)
191 mapobj.write_table(F)
192 F.close() 201 F.close()
193 202
194 203
195 if __name__ == "__main__": 204 if __name__ == "__main__":
196 args = Parser() 205 args = Parser()
197 # if identical sample names 206 # if identical sample names
198 if len(set(args.sample_name)) != len(args.sample_name): 207 if len(set(args.sample_names)) != len(args.sample_names):
199 args.sample_name = [name + '_' + str(i) for 208 args.sample_names = [name + '_' + str(i) for
200 i, name in enumerate(args.sample_name)] 209 i, name in enumerate(args.sample_names)]
201 main(args.input, args.sample_name, args.output, args.sizes) 210 main(args.inputs, args.sample_names, args.plot_methods, args.outputs)