comparison small_rna_maps.py @ 9:3ea75c573429 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_maps commit 6199193c7fe2cb56403eea8af0b40d44f7311fd5
author artbio
date Tue, 07 Nov 2017 12:31:28 -0500
parents 1827b74f872b
children 82fedc576024
comparison
equal deleted inserted replaced
8:1827b74f872b 9:3ea75c573429
12 nargs='+', help='list of input BAM files') 12 nargs='+', help='list of input BAM files')
13 the_parser.add_argument('--minsize', dest='minsize', type=int, 13 the_parser.add_argument('--minsize', dest='minsize', type=int,
14 default=0, help='minimal size of reads') 14 default=0, help='minimal size of reads')
15 the_parser.add_argument('--maxsize', dest='maxsize', type=int, 15 the_parser.add_argument('--maxsize', dest='maxsize', type=int,
16 default=10000, help='maximal size of reads') 16 default=10000, help='maximal size of reads')
17 the_parser.add_argument('--cluster', dest='cluster', type=int,
18 default=0, help='clustering distance')
17 the_parser.add_argument('--sample_names', dest='sample_names', 19 the_parser.add_argument('--sample_names', dest='sample_names',
18 required=True, nargs='+', 20 required=True, nargs='+',
19 help='list of sample names') 21 help='list of sample names')
20 the_parser.add_argument('--outputs', nargs='+', action='store', 22 the_parser.add_argument('--outputs', nargs='+', action='store',
21 help='list of two output paths (only two)') 23 help='list of two output paths (only two)')
26 return args 28 return args
27 29
28 30
29 class Map: 31 class Map:
30 32
31 def __init__(self, bam_file, sample, minsize, maxsize): 33 def __init__(self, bam_file, sample, minsize, maxsize, cluster):
32 self.sample_name = sample 34 self.sample_name = sample
33 self.minsize = minsize 35 self.minsize = minsize
34 self.maxsize = maxsize 36 self.maxsize = maxsize
37 self.cluster = cluster
35 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') 38 self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
36 self.chromosomes = dict(zip(self.bam_object.references, 39 self.chromosomes = dict(zip(self.bam_object.references,
37 self.bam_object.lengths)) 40 self.bam_object.lengths))
38 self.map_dict = self.create_map(self.bam_object, self.minsize, 41 self.map_dict = self.create_map(self.bam_object, self.minsize,
39 self.maxsize) 42 self.maxsize)
43 if self.cluster:
44 self.map_dict = self.tile_map(self.map_dict, self.cluster)
40 45
41 def create_map(self, bam_object, minsize, maxsize): 46 def create_map(self, bam_object, minsize, maxsize):
42 ''' 47 '''
43 Returns a map_dictionary {(chromosome,read_position,polarity): 48 Returns a map_dictionary {(chromosome,read_position,polarity):
44 [read_length, ...]} 49 [read_length, ...]}
45 ''' 50 '''
46 map_dictionary = defaultdict(list) 51 map_dictionary = defaultdict(list)
47 # get empty value for start and end of each chromosome 52 for chrom in self.chromosomes:
48 for chrom in self.chromosomes: 53 # get empty value for start and end of each chromosome
49 map_dictionary[(chrom, 1, 'F')] = [] 54 map_dictionary[(chrom, 1, 'F')] = []
50 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = [] 55 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
51 for chrom in self.chromosomes:
52 for read in bam_object.fetch(chrom): 56 for read in bam_object.fetch(chrom):
53 if (read.query_alignment_length >= minsize and 57 if (read.query_alignment_length >= minsize and
54 read.query_alignment_length <= maxsize): 58 read.query_alignment_length <= maxsize):
55 positions = read.positions # a list of covered positions 59 positions = read.positions # a list of covered positions
56 if read.is_reverse: 60 if read.is_reverse:
58 read.query_alignment_length) 62 read.query_alignment_length)
59 else: 63 else:
60 map_dictionary[(chrom, positions[0]+1, 'F')].append( 64 map_dictionary[(chrom, positions[0]+1, 'F')].append(
61 read.query_alignment_length) 65 read.query_alignment_length)
62 return map_dictionary 66 return map_dictionary
67
68 def grouper(self, iterable, clust_distance):
69 prev = None
70 group = []
71 for item in iterable:
72 if not prev or item - prev <= clust_distance:
73 group.append(item)
74 else:
75 yield group
76 group = [item]
77 prev = item
78 if group:
79 yield group
80
81 def tile_map(self, map_dic, clust_distance):
82 '''
83 takes a map_dictionary {(chromosome,read_position,polarity):
84 [read_length, ...]}
85 and retur a map_dictionary with same structure but with
86 read positions aggregated by size
87 '''
88 clustered_dic = defaultdict(list)
89 for chrom in self.chromosomes:
90 clustered_dic[(chrom, 1, 'F')] = []
91 clustered_dic[(chrom, self.chromosomes[chrom], 'F')] = []
92 F_chrom_coord = []
93 R_chrom_coord = []
94 for key in map_dic:
95 if key[0] == chrom and key[2] == 'F':
96 F_chrom_coord.append(key[1])
97 elif key[0] == chrom and key[2] == 'R':
98 R_chrom_coord.append(key[1])
99 F_chrom_coord = list(set(F_chrom_coord))
100 R_chrom_coord = list(set(R_chrom_coord))
101 F_chrom_coord.sort()
102 R_chrom_coord.sort()
103 F_clust_values = [i for i in self.grouper(F_chrom_coord,
104 clust_distance)]
105 F_clust_keys = [(i[-1]+i[0])/2 for i in F_clust_values]
106 R_clust_values = [i for i in self.grouper(R_chrom_coord,
107 clust_distance)]
108 R_clust_keys = [(i[-1]+i[0])/2 for i in R_clust_values]
109 F_clust_dic = dict(zip(F_clust_keys, F_clust_values))
110 R_clust_dic = dict(zip(R_clust_keys, R_clust_values))
111 # {centered_coordinate: [coord1, coord2, coord3, ..]}
112 for centcoor in F_clust_dic:
113 accumulator = []
114 for coor in F_clust_dic[centcoor]:
115 accumulator.extend(map_dic[(chrom, coor, 'F')])
116 clustered_dic[(chrom, centcoor, 'F')] = accumulator
117 for centcoor in R_clust_dic:
118 accumulator = []
119 for coor in R_clust_dic[centcoor]:
120 accumulator.extend(map_dic[(chrom, coor, 'R')])
121 clustered_dic[(chrom, centcoor, 'R')] = accumulator
122 return clustered_dic
63 123
64 def compute_readcount(self, map_dictionary, out): 124 def compute_readcount(self, map_dictionary, out):
65 ''' 125 '''
66 takes a map_dictionary as input and writes 126 takes a map_dictionary as input and writes
67 a readmap_dictionary {(chromosome,read_position,polarity): 127 a readmap_dictionary {(chromosome,read_position,polarity):
189 line = [self.sample_name, chrom, polarity, size, 0] 249 line = [self.sample_name, chrom, polarity, size, 0]
190 line = [str(i) for i in line] 250 line = [str(i) for i in line]
191 out.write('\t'.join(line) + '\n') 251 out.write('\t'.join(line) + '\n')
192 252
193 253
194 def main(inputs, samples, methods, outputs, minsize, maxsize): 254 def main(inputs, samples, methods, outputs, minsize, maxsize, cluster):
195 for method, output in zip(methods, outputs): 255 for method, output in zip(methods, outputs):
196 F = open(output, 'w') 256 F = open(output, 'w')
197 if method == 'Size': 257 if method == 'Size':
198 header = ["Dataset", "Chromosome", "Polarity", method, "Counts"] 258 header = ["Dataset", "Chromosome", "Polarity", method, "Counts"]
199 else: 259 else:
200 header = ["Dataset", "Chromosome", "Chrom_length", "Coordinate", 260 header = ["Dataset", "Chromosome", "Chrom_length", "Coordinate",
201 "Polarity", method] 261 "Polarity", method]
202 F.write('\t'.join(header) + '\n') 262 F.write('\t'.join(header) + '\n')
203 for input, sample in zip(inputs, samples): 263 for input, sample in zip(inputs, samples):
204 mapobj = Map(input, sample, minsize, maxsize) 264 mapobj = Map(input, sample, minsize, maxsize, cluster)
205 token = {"Counts": mapobj.compute_readcount, 265 token = {"Counts": mapobj.compute_readcount,
206 "Max": mapobj.compute_max, 266 "Max": mapobj.compute_max,
207 "Mean": mapobj.compute_mean, 267 "Mean": mapobj.compute_mean,
208 "Median": mapobj.compute_median, 268 "Median": mapobj.compute_median,
209 "Coverage": mapobj.compute_coverage, 269 "Coverage": mapobj.compute_coverage,
217 # if identical sample names 277 # if identical sample names
218 if len(set(args.sample_names)) != len(args.sample_names): 278 if len(set(args.sample_names)) != len(args.sample_names):
219 args.sample_names = [name + '_' + str(i) for 279 args.sample_names = [name + '_' + str(i) for
220 i, name in enumerate(args.sample_names)] 280 i, name in enumerate(args.sample_names)]
221 main(args.inputs, args.sample_names, args.plot_methods, args.outputs, 281 main(args.inputs, args.sample_names, args.plot_methods, args.outputs,
222 args.minsize, args.maxsize) 282 args.minsize, args.maxsize, args.cluster)