Mercurial > repos > artbio > small_rna_signatures
comparison signature.py @ 7:07771982ef9b draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 7276b6b73aef7af4058ad2c1e34c4557e9cccbe0
author | artbio |
---|---|
date | Sun, 10 Sep 2017 13:50:40 -0400 |
parents | a35e6f9c1d34 |
children | 8d3ca9652a5b |
comparison
equal
deleted
inserted
replaced
6:4da23f009c9e | 7:07771982ef9b |
---|---|
38 return args | 38 return args |
39 | 39 |
40 | 40 |
41 class Map: | 41 class Map: |
42 | 42 |
43 def __init__(self, bam_file): | 43 def __init__(self, bam_file, minquery=23, maxquery=29, mintarget=23, |
44 maxtarget=29, minscope=1, maxscope=19, output_h='', | |
45 output_z=''): | |
44 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') | 46 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') |
47 self.query_range = range(minquery, maxquery + 1) | |
48 self.target_range = range(mintarget, maxtarget + 1) | |
49 self.scope = range(minscope, maxscope + 1) | |
50 self.H = open(output_h, 'w') | |
51 self.Z = open(output_z, 'w') | |
45 self.chromosomes = dict(zip(self.bam_object.references, | 52 self.chromosomes = dict(zip(self.bam_object.references, |
46 self.bam_object.lengths)) | 53 self.bam_object.lengths)) |
47 self.map_dict = self.create_map(self.bam_object) | 54 self.map_dict = self.create_map(self.bam_object) |
55 self.query_positions = self.compute_query_positions() | |
56 self.Z.write(self.compute_signature_pairs()) | |
57 self.H.write(self.compute_signature_h()) | |
58 self.H.close() | |
59 self.Z.close() | |
48 | 60 |
49 def create_map(self, bam_object): | 61 def create_map(self, bam_object): |
50 ''' | 62 ''' |
51 Returns a map_dictionary {(chromosome,read_position,polarity): | 63 Returns a map_dictionary {(chromosome,read_position,polarity): |
52 [read_length, ...]} | 64 [read_length, ...]} |
56 for chrom in self.chromosomes: | 68 for chrom in self.chromosomes: |
57 map_dictionary[(chrom, 1, 'F')] = [] | 69 map_dictionary[(chrom, 1, 'F')] = [] |
58 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = [] | 70 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = [] |
59 for chrom in self.chromosomes: | 71 for chrom in self.chromosomes: |
60 for read in bam_object.fetch(chrom): | 72 for read in bam_object.fetch(chrom): |
61 positions = read.positions # a list of covered positions | |
62 if read.is_reverse: | 73 if read.is_reverse: |
63 map_dictionary[(chrom, positions[-1]+1, | 74 map_dictionary[(chrom, read.reference_end, |
64 'R')].append(read.query_alignment_length) | 75 'R')].append(read.query_alignment_length) |
65 else: | 76 else: |
66 map_dictionary[(chrom, positions[0]+1, | 77 map_dictionary[(chrom, read.reference_start+1, |
67 'F')].append(read.query_alignment_length) | 78 'F')].append(read.query_alignment_length) |
68 return map_dictionary | 79 return map_dictionary |
69 | 80 |
70 def signature_tables(self, minquery, maxquery, mintarget, maxtarget): | 81 def compute_query_positions(self): |
71 query_range = range(minquery, maxquery + 1) | 82 ''' this method does not filter on read size, just forward reads |
72 target_range = range(mintarget, maxtarget + 1) | 83 that overlap reverse reads in the overlap range''' |
84 all_query_positions = defaultdict(list) | |
85 for genomicKey in self.map_dict.keys(): | |
86 chrom, coord, pol = genomicKey | |
87 for i in self.scope: | |
88 if pol == 'F' and len(self.map_dict[chrom, | |
89 coord+i-1, | |
90 'R']) > 0: | |
91 all_query_positions[chrom].append(coord) | |
92 break | |
93 for chrom in all_query_positions: | |
94 all_query_positions[chrom] = sorted( | |
95 list(set(all_query_positions[chrom]))) | |
96 return all_query_positions | |
97 | |
98 def countpairs(self, uppers, lowers): | |
99 query_range = self.query_range | |
100 target_range = self.target_range | |
101 uppers = [size for size in uppers if size in query_range or size in | |
102 target_range] | |
103 lowers = [size for size in lowers if size in query_range or size in | |
104 target_range] | |
105 paired = [] | |
106 for upread in uppers: | |
107 for downread in lowers: | |
108 if (upread in query_range and downread in target_range) or ( | |
109 upread in target_range and downread in query_range): | |
110 paired.append(upread) | |
111 lowers.remove(downread) | |
112 break | |
113 return len(paired) | |
114 | |
115 def compute_signature_pairs(self): | |
116 frequency_table = defaultdict(dict) | |
117 scope = self.scope | |
118 for chrom in self.chromosomes: | |
119 for overlap in scope: | |
120 frequency_table[chrom][overlap] = 0 | |
121 for chrom in self.query_positions: | |
122 for coord in self.query_positions[chrom]: | |
123 for overlap in scope: | |
124 uppers = self.map_dict[chrom, coord, 'F'] | |
125 lowers = self.map_dict[chrom, coord+overlap-1, 'R'] | |
126 frequency_table[chrom][overlap] += self.countpairs(uppers, | |
127 lowers) | |
128 # compute overlaps for all chromosomes merged | |
129 for overlap in scope: | |
130 accumulator = [] | |
131 for chrom in frequency_table: | |
132 if chrom != 'all_chromosomes': | |
133 accumulator.append(frequency_table[chrom][overlap]) | |
134 frequency_table['all_chromosomes'][overlap] = sum(accumulator) | |
135 return self.stringify_table(frequency_table) | |
136 | |
137 def signature_tables(self): | |
138 query_range = self.query_range | |
139 target_range = self.target_range | |
73 Query_table = defaultdict(dict) | 140 Query_table = defaultdict(dict) |
74 Target_table = defaultdict(dict) | 141 Target_table = defaultdict(dict) |
75 for key in self.map_dict: | 142 for key in self.map_dict: |
76 for size in self.map_dict[key]: | 143 for size in self.map_dict[key]: |
77 if size in query_range or size in target_range: | 144 if size in query_range or size in target_range: |
85 if size in target_range: | 152 if size in target_range: |
86 Target_table[key[0]][coordinate] = \ | 153 Target_table[key[0]][coordinate] = \ |
87 Target_table[key[0]].get(coordinate, 0) + 1 | 154 Target_table[key[0]].get(coordinate, 0) + 1 |
88 return Query_table, Target_table | 155 return Query_table, Target_table |
89 | 156 |
90 def compute_signature_z(self, minquery, maxquery, mintarget, maxtarget, | 157 def compute_signature_h(self): |
91 scope, zscore="no"): | 158 scope = self.scope |
92 Query_table, Target_table = self.signature_tables(minquery, maxquery, | 159 Query_table, Target_table = self.signature_tables() |
93 mintarget, maxtarget) | |
94 frequency_table = defaultdict(dict) | |
95 for chrom in self.chromosomes: | |
96 for overlap in scope: | |
97 frequency_table[chrom][overlap] = 0 | |
98 for chrom in Query_table: | |
99 for coord in Query_table[chrom]: | |
100 for overlap in scope: | |
101 frequency_table[chrom][overlap] += min( | |
102 Query_table[chrom][coord], | |
103 Target_table[chrom].get(-coord - overlap + 1, 0)) | |
104 # since we want the number of pairs, not the number or paired reads | |
105 # to do: what in complex cases | |
106 # with query and target sizes partially overlap ? | |
107 for chrom in frequency_table: | |
108 for overlap in frequency_table[chrom]: | |
109 frequency_table[chrom][overlap] /= 2 | |
110 # compute overlaps for all chromosomes merged | |
111 for overlap in scope: | |
112 accumulator = [] | |
113 for chrom in frequency_table: | |
114 if chrom != 'all_chromosomes': | |
115 accumulator.append(frequency_table[chrom][overlap]) | |
116 frequency_table['all_chromosomes'][overlap] = sum(accumulator) | |
117 return self.stringify_table(frequency_table) | |
118 | |
119 def compute_signature_h(self, minquery, maxquery, mintarget, | |
120 maxtarget, scope): | |
121 Query_table, Target_table = self.signature_tables(minquery, maxquery, | |
122 mintarget, maxtarget) | |
123 frequency_table = defaultdict(dict) | 160 frequency_table = defaultdict(dict) |
124 for chrom in self.chromosomes: | 161 for chrom in self.chromosomes: |
125 for overlap in scope: | 162 for overlap in scope: |
126 frequency_table[chrom][overlap] = 0 | 163 frequency_table[chrom][overlap] = 0 |
127 for chrom in Query_table: | 164 for chrom in Query_table: |
185 str(frequency_table[chrom][overlap]), | 222 str(frequency_table[chrom][overlap]), |
186 str((frequency_table[chrom][overlap] - z_mean)/z_std))) | 223 str((frequency_table[chrom][overlap] - z_mean)/z_std))) |
187 return ''.join(tablestring) | 224 return ''.join(tablestring) |
188 | 225 |
189 | 226 |
190 | |
191 def main(input, minquery, maxquery, mintarget, maxtarget, minscope, maxscope, | |
192 output_h, output_z, genome_wide=False, zscore="no"): | |
193 H = open(output_h, 'w') | |
194 Z = open(output_z, 'w') | |
195 mapobj = Map(input) | |
196 scope = range(minscope, maxscope + 1) | |
197 Z.write(mapobj.compute_signature_z(minquery, maxquery, mintarget, | |
198 maxtarget, scope, zscore="no")) | |
199 H.write(mapobj.compute_signature_h(minquery, maxquery, mintarget, | |
200 maxtarget, scope)) | |
201 H.close() | |
202 Z.close() | |
203 | |
204 | |
205 if __name__ == "__main__": | 227 if __name__ == "__main__": |
206 args = Parser() | 228 args = Parser() |
207 main(args.input, args.minquery, args.maxquery, args.mintarget, | 229 mapobj = Map(args.input, args.minquery, args.maxquery, args.mintarget, |
208 args.maxtarget, args.minscope, args.maxscope, args.output_h, | 230 args.maxtarget, args.minscope, args.maxscope, args.output_h, |
209 args.output_z) | 231 args.output_z) |