Mercurial > repos > artbio > small_rna_maps
diff small_rna_maps.py @ 9:3ea75c573429 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_maps commit 6199193c7fe2cb56403eea8af0b40d44f7311fd5
author | artbio |
---|---|
date | Tue, 07 Nov 2017 12:31:28 -0500 |
parents | 1827b74f872b |
children | 82fedc576024 |
line wrap: on
line diff
--- a/small_rna_maps.py Mon Oct 23 08:29:39 2017 -0400 +++ b/small_rna_maps.py Tue Nov 07 12:31:28 2017 -0500 @@ -14,6 +14,8 @@ default=0, help='minimal size of reads') the_parser.add_argument('--maxsize', dest='maxsize', type=int, default=10000, help='maximal size of reads') + the_parser.add_argument('--cluster', dest='cluster', type=int, + default=0, help='clustering distance') the_parser.add_argument('--sample_names', dest='sample_names', required=True, nargs='+', help='list of sample names') @@ -28,15 +30,18 @@ class Map: - def __init__(self, bam_file, sample, minsize, maxsize): + def __init__(self, bam_file, sample, minsize, maxsize, cluster): self.sample_name = sample self.minsize = minsize self.maxsize = maxsize + self.cluster = cluster self.bam_object = pysam.AlignmentFile(bam_file, 'rb') self.chromosomes = dict(zip(self.bam_object.references, self.bam_object.lengths)) self.map_dict = self.create_map(self.bam_object, self.minsize, self.maxsize) + if self.cluster: + self.map_dict = self.tile_map(self.map_dict, self.cluster) def create_map(self, bam_object, minsize, maxsize): ''' @@ -44,11 +49,10 @@ [read_length, ...]} ''' map_dictionary = defaultdict(list) - # get empty value for start and end of each chromosome for chrom in self.chromosomes: + # get empty value for start and end of each chromosome map_dictionary[(chrom, 1, 'F')] = [] map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = [] - for chrom in self.chromosomes: for read in bam_object.fetch(chrom): if (read.query_alignment_length >= minsize and read.query_alignment_length <= maxsize): @@ -61,6 +65,62 @@ read.query_alignment_length) return map_dictionary + def grouper(self, iterable, clust_distance): + prev = None + group = [] + for item in iterable: + if not prev or item - prev <= clust_distance: + group.append(item) + else: + yield group + group = [item] + prev = item + if group: + yield group + + def tile_map(self, map_dic, clust_distance): + ''' + takes a map_dictionary {(chromosome,read_position,polarity): + [read_length, ...]} + and retur a map_dictionary with same structure but with + read positions aggregated by size + ''' + clustered_dic = defaultdict(list) + for chrom in self.chromosomes: + clustered_dic[(chrom, 1, 'F')] = [] + clustered_dic[(chrom, self.chromosomes[chrom], 'F')] = [] + F_chrom_coord = [] + R_chrom_coord = [] + for key in map_dic: + if key[0] == chrom and key[2] == 'F': + F_chrom_coord.append(key[1]) + elif key[0] == chrom and key[2] == 'R': + R_chrom_coord.append(key[1]) + F_chrom_coord = list(set(F_chrom_coord)) + R_chrom_coord = list(set(R_chrom_coord)) + F_chrom_coord.sort() + R_chrom_coord.sort() + F_clust_values = [i for i in self.grouper(F_chrom_coord, + clust_distance)] + F_clust_keys = [(i[-1]+i[0])/2 for i in F_clust_values] + R_clust_values = [i for i in self.grouper(R_chrom_coord, + clust_distance)] + R_clust_keys = [(i[-1]+i[0])/2 for i in R_clust_values] + F_clust_dic = dict(zip(F_clust_keys, F_clust_values)) + R_clust_dic = dict(zip(R_clust_keys, R_clust_values)) + # {centered_coordinate: [coord1, coord2, coord3, ..]} + for centcoor in F_clust_dic: + accumulator = [] + for coor in F_clust_dic[centcoor]: + accumulator.extend(map_dic[(chrom, coor, 'F')]) + clustered_dic[(chrom, centcoor, 'F')] = accumulator + for centcoor in R_clust_dic: + accumulator = [] + for coor in R_clust_dic[centcoor]: + accumulator.extend(map_dic[(chrom, coor, 'R')]) + clustered_dic[(chrom, centcoor, 'R')] = accumulator + return clustered_dic + def compute_readcount(self, map_dictionary, out): ''' takes a map_dictionary as input and writes @@ -191,7 +251,7 @@ out.write('\t'.join(line) + '\n') -def main(inputs, samples, methods, outputs, minsize, maxsize): +def main(inputs, samples, methods, outputs, minsize, maxsize, cluster): for method, output in zip(methods, outputs): F = open(output, 'w') if method == 'Size': @@ -201,7 +261,7 @@ "Polarity", method] F.write('\t'.join(header) + '\n') for input, sample in zip(inputs, samples): - mapobj = Map(input, sample, minsize, maxsize) + mapobj = Map(input, sample, minsize, maxsize, cluster) token = {"Counts": mapobj.compute_readcount, "Max": mapobj.compute_max, "Mean": mapobj.compute_mean, @@ -219,4 +279,4 @@ args.sample_names = [name + '_' + str(i) for i, name in enumerate(args.sample_names)] main(args.inputs, args.sample_names, args.plot_methods, args.outputs, - args.minsize, args.maxsize) + args.minsize, args.maxsize, args.cluster)