Mercurial > repos > artbio > small_rna_signatures
comparison signature.py @ 0:a35e6f9c1d34 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 6719543c5017d581ae012b864d7c9088f0767fc8
author | artbio |
---|---|
date | Mon, 28 Aug 2017 09:29:47 -0400 |
parents | |
children | 07771982ef9b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a35e6f9c1d34 |
---|---|
1 import argparse | |
2 from collections import defaultdict | |
3 | |
4 import numpy | |
5 | |
6 import pysam | |
7 | |
8 | |
9 def Parser(): | |
10 the_parser = argparse.ArgumentParser() | |
11 the_parser.add_argument( | |
12 '--input', action="store", type=str, help="bam alignment file") | |
13 the_parser.add_argument( | |
14 '--minquery', type=int, | |
15 help="Minimum readsize of query reads (nt) - must be an integer") | |
16 the_parser.add_argument( | |
17 '--maxquery', type=int, | |
18 help="Maximum readsize of query reads (nt) - must be an integer") | |
19 the_parser.add_argument( | |
20 '--mintarget', type=int, | |
21 help="Minimum readsize of target reads (nt) - must be an integer") | |
22 the_parser.add_argument( | |
23 '--maxtarget', type=int, | |
24 help="Maximum readsize of target reads (nt) - must be an integer") | |
25 the_parser.add_argument( | |
26 '--minscope', type=int, | |
27 help="Minimum overlap analyzed (nt) - must be an integer") | |
28 the_parser.add_argument( | |
29 '--maxscope', type=int, | |
30 help="Maximum overlap analyzed (nt) - must be an integer") | |
31 the_parser.add_argument( | |
32 '--output_h', action="store", type=str, | |
33 help="h-signature dataframe") | |
34 the_parser.add_argument( | |
35 '--output_z', action="store", type=str, | |
36 help="z-signature dataframe") | |
37 args = the_parser.parse_args() | |
38 return args | |
39 | |
40 | |
41 class Map: | |
42 | |
43 def __init__(self, bam_file): | |
44 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') | |
45 self.chromosomes = dict(zip(self.bam_object.references, | |
46 self.bam_object.lengths)) | |
47 self.map_dict = self.create_map(self.bam_object) | |
48 | |
49 def create_map(self, bam_object): | |
50 ''' | |
51 Returns a map_dictionary {(chromosome,read_position,polarity): | |
52 [read_length, ...]} | |
53 ''' | |
54 map_dictionary = defaultdict(list) | |
55 # get empty value for start and end of each chromosome | |
56 for chrom in self.chromosomes: | |
57 map_dictionary[(chrom, 1, 'F')] = [] | |
58 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = [] | |
59 for chrom in self.chromosomes: | |
60 for read in bam_object.fetch(chrom): | |
61 positions = read.positions # a list of covered positions | |
62 if read.is_reverse: | |
63 map_dictionary[(chrom, positions[-1]+1, | |
64 'R')].append(read.query_alignment_length) | |
65 else: | |
66 map_dictionary[(chrom, positions[0]+1, | |
67 'F')].append(read.query_alignment_length) | |
68 return map_dictionary | |
69 | |
70 def signature_tables(self, minquery, maxquery, mintarget, maxtarget): | |
71 query_range = range(minquery, maxquery + 1) | |
72 target_range = range(mintarget, maxtarget + 1) | |
73 Query_table = defaultdict(dict) | |
74 Target_table = defaultdict(dict) | |
75 for key in self.map_dict: | |
76 for size in self.map_dict[key]: | |
77 if size in query_range or size in target_range: | |
78 if key[2] == 'F': | |
79 coordinate = key[1] | |
80 else: | |
81 coordinate = -key[1] | |
82 if size in query_range: | |
83 Query_table[key[0]][coordinate] = Query_table[key[0]].get( | |
84 coordinate, 0) + 1 | |
85 if size in target_range: | |
86 Target_table[key[0]][coordinate] = \ | |
87 Target_table[key[0]].get(coordinate, 0) + 1 | |
88 return Query_table, Target_table | |
89 | |
90 def compute_signature_z(self, minquery, maxquery, mintarget, maxtarget, | |
91 scope, zscore="no"): | |
92 Query_table, Target_table = self.signature_tables(minquery, maxquery, | |
93 mintarget, maxtarget) | |
94 frequency_table = defaultdict(dict) | |
95 for chrom in self.chromosomes: | |
96 for overlap in scope: | |
97 frequency_table[chrom][overlap] = 0 | |
98 for chrom in Query_table: | |
99 for coord in Query_table[chrom]: | |
100 for overlap in scope: | |
101 frequency_table[chrom][overlap] += min( | |
102 Query_table[chrom][coord], | |
103 Target_table[chrom].get(-coord - overlap + 1, 0)) | |
104 # since we want the number of pairs, not the number or paired reads | |
105 # to do: what in complex cases | |
106 # with query and target sizes partially overlap ? | |
107 for chrom in frequency_table: | |
108 for overlap in frequency_table[chrom]: | |
109 frequency_table[chrom][overlap] /= 2 | |
110 # compute overlaps for all chromosomes merged | |
111 for overlap in scope: | |
112 accumulator = [] | |
113 for chrom in frequency_table: | |
114 if chrom != 'all_chromosomes': | |
115 accumulator.append(frequency_table[chrom][overlap]) | |
116 frequency_table['all_chromosomes'][overlap] = sum(accumulator) | |
117 return self.stringify_table(frequency_table) | |
118 | |
119 def compute_signature_h(self, minquery, maxquery, mintarget, | |
120 maxtarget, scope): | |
121 Query_table, Target_table = self.signature_tables(minquery, maxquery, | |
122 mintarget, maxtarget) | |
123 frequency_table = defaultdict(dict) | |
124 for chrom in self.chromosomes: | |
125 for overlap in scope: | |
126 frequency_table[chrom][overlap] = 0 | |
127 for chrom in Query_table: | |
128 Total_Query_Numb = 0 | |
129 for coord in Query_table[chrom]: | |
130 Total_Query_Numb += Query_table[chrom][coord] | |
131 for coord in Query_table[chrom]: | |
132 local_table = dict([(overlap, 0) for overlap in scope]) | |
133 number_of_targets = 0 | |
134 for overlap in scope: | |
135 local_table[overlap] += Query_table[chrom][coord] * \ | |
136 Target_table[chrom].get(-coord - overlap + 1, 0) | |
137 number_of_targets += Target_table[chrom].get( | |
138 -coord - overlap + 1, 0) | |
139 for overlap in scope: | |
140 try: | |
141 frequency_table[chrom][overlap] += \ | |
142 local_table[overlap] / number_of_targets \ | |
143 / float(Total_Query_Numb) | |
144 except ZeroDivisionError: | |
145 continue | |
146 # compute overlap probabilities for all chromosomes merged | |
147 general_frequency_table = dict([(overlap, 0) for overlap in scope]) | |
148 total_aligned_reads = 0 | |
149 for chrom in frequency_table: | |
150 for overlap in frequency_table[chrom]: | |
151 total_aligned_reads += self.bam_object.count(chrom) | |
152 for chrom in frequency_table: | |
153 for overlap in frequency_table[chrom]: | |
154 try: | |
155 general_frequency_table[overlap] += \ | |
156 frequency_table[chrom][overlap] / total_aligned_reads \ | |
157 * self.bam_object.count(chrom) | |
158 except ZeroDivisionError: | |
159 continue | |
160 for overlap in general_frequency_table: | |
161 frequency_table['all_chromosomes'][overlap] = \ | |
162 general_frequency_table[overlap] | |
163 return self.stringify_table(frequency_table) | |
164 | |
165 def stringify_table(self, frequency_table): | |
166 ''' | |
167 method both to compute z-score and to return a writable string | |
168 ''' | |
169 tablestring = [] | |
170 for chrom in sorted(frequency_table): | |
171 accumulator = [] | |
172 for overlap in frequency_table[chrom]: | |
173 accumulator.append(frequency_table[chrom][overlap]) | |
174 z_mean = numpy.mean(accumulator) | |
175 z_std = numpy.std(accumulator) | |
176 if z_std == 0: | |
177 for overlap in sorted(frequency_table[chrom]): | |
178 tablestring.append('%s\t%s\t%s\t%s\n' % ( | |
179 chrom, str(overlap), | |
180 str(frequency_table[chrom][overlap]), str(0))) | |
181 else: | |
182 for overlap in sorted(frequency_table[chrom]): | |
183 tablestring.append('%s\t%s\t%s\t%s\n' % ( | |
184 chrom, str(overlap), | |
185 str(frequency_table[chrom][overlap]), | |
186 str((frequency_table[chrom][overlap] - z_mean)/z_std))) | |
187 return ''.join(tablestring) | |
188 | |
189 | |
190 | |
191 def main(input, minquery, maxquery, mintarget, maxtarget, minscope, maxscope, | |
192 output_h, output_z, genome_wide=False, zscore="no"): | |
193 H = open(output_h, 'w') | |
194 Z = open(output_z, 'w') | |
195 mapobj = Map(input) | |
196 scope = range(minscope, maxscope + 1) | |
197 Z.write(mapobj.compute_signature_z(minquery, maxquery, mintarget, | |
198 maxtarget, scope, zscore="no")) | |
199 H.write(mapobj.compute_signature_h(minquery, maxquery, mintarget, | |
200 maxtarget, scope)) | |
201 H.close() | |
202 Z.close() | |
203 | |
204 | |
205 if __name__ == "__main__": | |
206 args = Parser() | |
207 main(args.input, args.minquery, args.maxquery, args.mintarget, | |
208 args.maxtarget, args.minscope, args.maxscope, args.output_h, | |
209 args.output_z) |