comparison: small_rna_maps.py @ 9:3ea75c573429 (draft)
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_maps commit 6199193c7fe2cb56403eea8af0b40d44f7311fd5
author    artbio
date      Tue, 07 Nov 2017 12:31:28 -0500
parents   1827b74f872b
children  82fedc576024
Comparison of revision 8:1827b74f872b with revision 9:3ea75c573429. Unchanged lines are shown as plain context, lines inserted in 9:3ea75c573429 are prefixed with '+', lines deleted from 8:1827b74f872b with '-', and '[...]' marks unchanged stretches that are not shown.
[...]
                             nargs='+', help='list of input BAM files')
     the_parser.add_argument('--minsize', dest='minsize', type=int,
                             default=0, help='minimal size of reads')
     the_parser.add_argument('--maxsize', dest='maxsize', type=int,
                             default=10000, help='maximal size of reads')
+    the_parser.add_argument('--cluster', dest='cluster', type=int,
+                            default=0, help='clustering distance')
     the_parser.add_argument('--sample_names', dest='sample_names',
                             required=True, nargs='+',
                             help='list of sample names')
     the_parser.add_argument('--outputs', nargs='+', action='store',
                             help='list of two output paths (only two)')
[...]
     return args
 
 
 class Map:
 
-    def __init__(self, bam_file, sample, minsize, maxsize):
+    def __init__(self, bam_file, sample, minsize, maxsize, cluster):
         self.sample_name = sample
         self.minsize = minsize
         self.maxsize = maxsize
+        self.cluster = cluster
         self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
         self.chromosomes = dict(zip(self.bam_object.references,
                                     self.bam_object.lengths))
         self.map_dict = self.create_map(self.bam_object, self.minsize,
                                         self.maxsize)
+        if self.cluster:
+            self.map_dict = self.tile_map(self.map_dict, self.cluster)
 
     def create_map(self, bam_object, minsize, maxsize):
         '''
         Returns a map_dictionary {(chromosome,read_position,polarity):
         [read_length, ...]}
         '''
         map_dictionary = defaultdict(list)
-        # get empty value for start and end of each chromosome
-        for chrom in self.chromosomes:
+        for chrom in self.chromosomes:
+            # get empty value for start and end of each chromosome
             map_dictionary[(chrom, 1, 'F')] = []
             map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
-        for chrom in self.chromosomes:
             for read in bam_object.fetch(chrom):
                 if (read.query_alignment_length >= minsize and
                         read.query_alignment_length <= maxsize):
                     positions = read.positions  # a list of covered positions
                     if read.is_reverse:
                         map_dictionary[(chrom, positions[-1]+1, 'R')].append(
                             read.query_alignment_length)
                     else:
                         map_dictionary[(chrom, positions[0]+1, 'F')].append(
                             read.query_alignment_length)
         return map_dictionary
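
The map_dictionary returned by create_map keys each alignment by (chromosome, position, polarity) and accumulates the lengths of the reads recorded under that key: forward reads are keyed by their leftmost aligned base and reverse reads by their rightmost, both 1-based. A minimal sketch of such a dictionary, with an invented chromosome, coordinates and read lengths:

    # Hypothetical content of the dictionary returned by create_map for a
    # single chromosome "chr1" of length 1000; all values are invented.
    map_dict = {
        ('chr1', 1, 'F'): [],              # empty placeholder at chromosome start
        ('chr1', 1000, 'F'): [],           # empty placeholder at chromosome end
        ('chr1', 120, 'F'): [21, 21, 22],  # three forward reads keyed at position 120
        ('chr1', 305, 'R'): [26],          # one reverse read keyed at position 305
    }
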
+
+    def grouper(self, iterable, clust_distance):
+        prev = None
+        group = []
+        for item in iterable:
+            if not prev or item - prev <= clust_distance:
+                group.append(item)
+            else:
+                yield group
+                group = [item]
+            prev = item
+        if group:
+            yield group
+
+    def tile_map(self, map_dic, clust_distance):
+        '''
+        takes a map_dictionary {(chromosome,read_position,polarity):
+        [read_length, ...]}
+        and returns a map_dictionary with the same structure but with
+        read positions aggregated within clust_distance
+        '''
+        clustered_dic = defaultdict(list)
+        for chrom in self.chromosomes:
+            clustered_dic[(chrom, 1, 'F')] = []
+            clustered_dic[(chrom, self.chromosomes[chrom], 'F')] = []
+            F_chrom_coord = []
+            R_chrom_coord = []
+            for key in map_dic:
+                if key[0] == chrom and key[2] == 'F':
+                    F_chrom_coord.append(key[1])
+                elif key[0] == chrom and key[2] == 'R':
+                    R_chrom_coord.append(key[1])
+            F_chrom_coord = list(set(F_chrom_coord))
+            R_chrom_coord = list(set(R_chrom_coord))
+            F_chrom_coord.sort()
+            R_chrom_coord.sort()
+            F_clust_values = [i for i in self.grouper(F_chrom_coord,
+                                                      clust_distance)]
+            F_clust_keys = [(i[-1]+i[0])/2 for i in F_clust_values]
+            R_clust_values = [i for i in self.grouper(R_chrom_coord,
+                                                      clust_distance)]
+            R_clust_keys = [(i[-1]+i[0])/2 for i in R_clust_values]
+            F_clust_dic = dict(zip(F_clust_keys, F_clust_values))
+            R_clust_dic = dict(zip(R_clust_keys, R_clust_values))
+            # {centered_coordinate: [coord1, coord2, coord3, ..]}
+            for centcoor in F_clust_dic:
+                accumulator = []
+                for coor in F_clust_dic[centcoor]:
+                    accumulator.extend(map_dic[(chrom, coor, 'F')])
+                clustered_dic[(chrom, centcoor, 'F')] = accumulator
+            for centcoor in R_clust_dic:
+                accumulator = []
+                for coor in R_clust_dic[centcoor]:
+                    accumulator.extend(map_dic[(chrom, coor, 'R')])
+                clustered_dic[(chrom, centcoor, 'R')] = accumulator
+        return clustered_dic
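
The new --cluster option drives tile_map, which relies on grouper: grouper walks the sorted read coordinates of a chromosome and opens a new group whenever the gap to the previous coordinate exceeds clust_distance, and tile_map then pools the read lengths of each group under the group's centre coordinate. A standalone sketch of that grouping step, on invented forward-strand coordinates with a clustering distance of 50 (the tool applies the same logic per chromosome and per polarity):

    # Stand-alone copy of the grouping logic introduced by this changeset,
    # applied to invented coordinates for illustration.
    def grouper(iterable, clust_distance):
        # yield runs of coordinates whose consecutive gaps are <= clust_distance
        prev = None
        group = []
        for item in iterable:
            if not prev or item - prev <= clust_distance:
                group.append(item)
            else:
                yield group
                group = [item]
            prev = item
        if group:
            yield group

    coords = [100, 105, 110, 500, 512, 2000]            # invented read positions
    clusters = list(grouper(coords, clust_distance=50))
    # clusters == [[100, 105, 110], [500, 512], [2000]]
    centers = [(c[-1] + c[0]) // 2 for c in clusters]
    # centers == [105, 506, 2000]; tile_map pools the read lengths of each
    # cluster under the key (chromosome, centre, polarity)
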
 
     def compute_readcount(self, map_dictionary, out):
         '''
         takes a map_dictionary as input and writes
         a readmap_dictionary {(chromosome,read_position,polarity):
[...]
                     line = [self.sample_name, chrom, polarity, size, 0]
                     line = [str(i) for i in line]
                     out.write('\t'.join(line) + '\n')
 
 
-def main(inputs, samples, methods, outputs, minsize, maxsize):
+def main(inputs, samples, methods, outputs, minsize, maxsize, cluster):
     for method, output in zip(methods, outputs):
         F = open(output, 'w')
         if method == 'Size':
             header = ["Dataset", "Chromosome", "Polarity", method, "Counts"]
         else:
             header = ["Dataset", "Chromosome", "Chrom_length", "Coordinate",
                       "Polarity", method]
         F.write('\t'.join(header) + '\n')
         for input, sample in zip(inputs, samples):
-            mapobj = Map(input, sample, minsize, maxsize)
+            mapobj = Map(input, sample, minsize, maxsize, cluster)
             token = {"Counts": mapobj.compute_readcount,
                      "Max": mapobj.compute_max,
                      "Mean": mapobj.compute_mean,
                      "Median": mapobj.compute_median,
                      "Coverage": mapobj.compute_coverage,
[...]
     # if identical sample names
     if len(set(args.sample_names)) != len(args.sample_names):
         args.sample_names = [name + '_' + str(i) for
                              i, name in enumerate(args.sample_names)]
     main(args.inputs, args.sample_names, args.plot_methods, args.outputs,
-         args.minsize, args.maxsize)
+         args.minsize, args.maxsize, args.cluster)
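
Taken together, the changeset threads the clustering distance from the command line down to Map and tile_map. A hypothetical direct call of main(), mirroring the values that args.inputs, args.sample_names, args.plot_methods, args.outputs, args.minsize, args.maxsize and args.cluster would carry; all file names, sample names, method choices and output paths below are placeholders, and the call assumes the functions of small_rna_maps.py are in scope:

    # Hypothetical invocation that bypasses argparse; values are illustrative only.
    main(inputs=['sample1.bam', 'sample2.bam'],
         samples=['sample1', 'sample2'],
         methods=['Counts', 'Size'],
         outputs=['counts.tab', 'sizes.tab'],
         minsize=19, maxsize=29,
         cluster=500)

With cluster=500, read positions on the same strand whose consecutive gaps are at most 500 nucleotides are pooled at a single centre coordinate; with the default of 0, clustering is skipped and the per-position map is written as before.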