Mercurial > repos > xuebing > sharplabtool
comparison tools/stats/aggregate_scores_in_intervals.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 # Greg Von Kuster | |
3 """ | |
4 usage: %prog score_file interval_file chrom start stop [out_file] [options] | |
5 -b, --binned: 'score_file' is actually a directory of binned array files | |
6 -m, --mask=FILE: bed file containing regions not to consider valid | |
7 -c, --chrom_buffer=INT: number of chromosomes (default is 3) to keep in memory when using a user supplied score file | |
8 """ | |
9 | |
10 from __future__ import division | |
11 from galaxy import eggs | |
12 import pkg_resources | |
13 pkg_resources.require( "bx-python" ) | |
14 pkg_resources.require( "lrucache" ) | |
15 try: | |
16 pkg_resources.require( "python-lzo" ) | |
17 except: | |
18 pass | |
19 | |
20 import psyco_full | |
21 import sys | |
22 import os, os.path | |
23 from UserDict import DictMixin | |
24 import bx.wiggle | |
25 from bx.binned_array import BinnedArray, FileBinnedArray | |
26 from bx.bitset import * | |
27 from bx.bitset_builders import * | |
28 from fpconst import isNaN | |
29 from bx.cookbook import doc_optparse | |
30 from galaxy.tools.exception_handling import * | |
31 | |
32 assert sys.version_info[:2] >= ( 2, 4 ) | |
33 | |
34 import tempfile, struct | |
class PositionalScoresOnDisk:
    """
    Sparse, disk-backed sequence of float scores indexed by position.

    Scores are stored as packed 32-bit floats ('f') in an anonymous
    temporary file; positions that were never assigned read back as NaN
    (`default_value`).  Negative indices are supported like a regular
    list.  Used instead of an in-memory BinnedArray once the chromosome
    buffer is exhausted.
    """
    fmt = 'f'
    fmt_size = struct.calcsize( fmt )
    default_value = float( 'nan' )

    def __init__( self ):
        # Anonymous temp file: removed automatically when the object is
        # garbage collected / the file handle is closed.
        self.file = tempfile.TemporaryFile( 'w+b' )
        self.length = 0
    def __getitem__( self, i ):
        # Support negative indices; anything still out of range reads as
        # the default (NaN) rather than raising.
        if i < 0: i = self.length + i
        if i < 0 or i >= self.length: return self.default_value
        try:
            self.file.seek( i * self.fmt_size )
            return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
        except Exception as e:
            # Surface any low-level seek/read/unpack failure as an IndexError.
            raise IndexError( e )
    def __setitem__( self, i, value ):
        if i < 0: i = self.length + i
        if i < 0: raise IndexError( 'Negative assignment index out of range' )
        if i >= self.length:
            # Grow the file, padding the gap with the default (NaN) value.
            self.file.seek( self.length * self.fmt_size )
            self.file.write( struct.pack( self.fmt, self.default_value ) * ( i - self.length ) )
            self.length = i + 1
        self.file.seek( i * self.fmt_size )
        self.file.write( struct.pack( self.fmt, value ) )
    def __len__( self ):
        return self.length
    def __repr__( self ):
        # Single-pass join (the old += loop was quadratic, shadowed the
        # `repr` builtin and had a dead `i = 0`); output is unchanged.
        pieces = [ " %s," % self[i] for i in range( self.length ) ]
        return "[ %s ]" % "".join( pieces )
68 | |
class FileBinnedArrayDir( DictMixin ):
    """
    Dict-like view over a directory of binned array files.

    Lookup of chromosome `key` lazily opens "<key>.ba" from the directory
    as a FileBinnedArray and caches the handle for later lookups; a
    missing file raises KeyError.
    """
    def __init__( self, dir ):
        self.dir = dir
        self.cache = dict()
    def __getitem__( self, key ):
        # Fast path: already opened once.
        if key in self.cache:
            return self.cache[key]
        fname = os.path.join( self.dir, "%s.ba" % key )
        if not os.path.exists( fname ):
            raise KeyError( "File does not exist: " + fname )
        opened = FileBinnedArray( open( fname ) )
        self.cache[key] = opened
        return opened
89 | |
def stop_err(msg):
    """
    Write `msg` to standard error and terminate the program.

    Exits with status 1: the original called sys.exit() with no argument,
    which exits with status 0 and so reported *success* on the error
    path — wrapping pipelines could not detect the failure.
    """
    sys.stderr.write(msg)
    sys.exit(1)
93 | |
def load_scores_wiggle( fname, chrom_buffer_size = 3 ):
    """
    Read a wiggle file and return a dict of per-position score containers
    keyed by chromosome.

    The first `chrom_buffer_size` chromosomes encountered are kept in
    memory as BinnedArray objects; any further chromosomes are spooled to
    disk via PositionalScoresOnDisk.  Malformed data aborts the program
    through stop_err.
    """
    scores_by_chrom = {}
    try:
        reader = bx.wiggle.Reader( UCSCOutWrapper( open( fname ) ) )
        for chrom, pos, val in reader:
            if chrom not in scores_by_chrom:
                # New chromosome: in-memory while the buffer lasts, on
                # disk afterwards.
                if chrom_buffer_size:
                    chrom_buffer_size -= 1
                    scores_by_chrom[chrom] = BinnedArray()
                else:
                    scores_by_chrom[chrom] = PositionalScoresOnDisk()
            scores_by_chrom[chrom][pos] = val
    except UCSCLimitException:
        # Wiggle data was truncated, at the very least need to warn the user.
        sys.stdout.write( 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.' + '\n' )
    except IndexError:
        stop_err( 'Data error: one or more column data values is missing in "%s"' % fname )
    except ValueError:
        stop_err( 'Data error: invalid data type for one or more values in "%s".' % fname )
    return scores_by_chrom
117 | |
def load_scores_ba_dir( dir ):
    """
    Return a dict-like object, keyed by chromosome, whose values are
    FileBinnedArray objects lazily created from the "<key>.ba" files
    found in `dir`.
    """
    return FileBinnedArrayDir( dir )
124 | |
def main():
    """
    For every interval in the interval file, compute the average, minimum
    and maximum of the per-base scores (from a wiggle file, or from a
    directory of binned-array files when --binned is given), and write
    the original columns plus avg/min/max to the output.
    """
    # Parse command line
    options, args = doc_optparse.parse( __doc__ )

    try:
        score_fname = args[0]
        interval_fname = args[1]
        # Column indices arrive 1-based on the command line.
        chrom_col = args[2]
        start_col = args[3]
        stop_col = args[4]
        if len( args ) > 5:
            out_file = open( args[5], 'w' )
        else:
            # No output file given: write results to stdout.
            out_file = sys.stdout
        binned = bool( options.binned )
        mask_fname = options.mask
    except:
        # Any missing/invalid argument: print usage and exit.
        doc_optparse.exit()

    # Galaxy passes the literal string 'None' when no score track matches
    # the dataset's genome build.
    if score_fname == 'None':
        stop_err( 'This tool works with data from genome builds hg16, hg17 or hg18. Click the pencil icon in your history item to set the genome build if appropriate.' )

    try:
        # Convert the 1-based command-line columns to 0-based indices.
        chrom_col = int(chrom_col) - 1
        start_col = int(start_col) - 1
        stop_col = int(stop_col) - 1
    except:
        stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )

    if chrom_col < 0 or start_col < 0 or stop_col < 0:
        stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )

    if binned:
        # 'score_file' is a directory of <chrom>.ba files.
        scores_by_chrom = load_scores_ba_dir( score_fname )
    else:
        try:
            chrom_buffer = int( options.chrom_buffer )
        except:
            # Option absent or non-numeric: keep 3 chromosomes in memory.
            chrom_buffer = 3
        scores_by_chrom = load_scores_wiggle( score_fname, chrom_buffer )

    # Optional bed file of regions whose bases must be ignored.
    if mask_fname:
        masks = binned_bitsets_from_file( open( mask_fname ) )
    else:
        masks = None

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ''

    for i, line in enumerate( open( interval_fname )):
        valid = True
        line = line.rstrip('\r\n')
        if line and not line.startswith( '#' ):
            fields = line.split()

            try:
                chrom, start, stop = fields[chrom_col], int( fields[start_col] ), int( fields[stop_col] )
            except:
                # Missing column or non-integer start/stop: skip this line.
                valid = False
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
            if valid:
                total = 0
                count = 0
                # Sentinels well outside any realistic score range.
                min_score = 100000000
                max_score = -100000000
                for j in range( start, stop ):
                    if chrom in scores_by_chrom:
                        try:
                            # Skip if base is masked
                            if masks and chrom in masks:
                                if masks[chrom][j]:
                                    continue
                            # Get the score, only count if not 'nan'
                            score = scores_by_chrom[chrom][j]
                            if not isNaN( score ):
                                total += score
                                count += 1
                                max_score = max( score, max_score )
                                min_score = min( score, min_score )
                        except:
                            # Any per-base lookup failure: ignore that base.
                            continue
                if count > 0:
                    avg = total/count
                else:
                    # No scored bases in the interval: report "nan" columns.
                    avg = "nan"
                    min_score = "nan"
                    max_score = "nan"

                # Build the resulting line of data
                out_line = []
                for k in range(0, len(fields)):
                    out_line.append(fields[k])
                out_line.append(avg)
                out_line.append(min_score)
                out_line.append(max_score)

                print >> out_file, "\t".join( map( str, out_line ) )
            else:
                # NOTE(review): an invalid line appears to be counted twice —
                # once in the except above and once here; verify against the
                # original file's indentation before changing.
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
        elif line.startswith( '#' ):
            # We'll save the original comments
            print >> out_file, line

    # NOTE(review): when no output file was given this closes sys.stdout —
    # confirm that is intended.
    out_file.close()

    if skipped_lines > 0:
        print 'Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
        # NOTE(review): `i` is the 0-based index of the LAST line, so this
        # compares the skip count to line_count - 1; presumably `i + 1`
        # (every line skipped) was intended — confirm.
        if skipped_lines == i:
            print 'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.'
242 | |
243 if __name__ == "__main__": main() |