0
|
1 #!/usr/bin/env python
|
|
2 # Greg Von Kuster
|
|
3 """
|
|
4 usage: %prog score_file interval_file chrom start stop [out_file] [options]
|
|
5 -b, --binned: 'score_file' is actually a directory of binned array files
|
|
6 -m, --mask=FILE: bed file containing regions not to consider valid
|
|
7 -c, --chrom_buffer=INT: number of chromosomes (default is 3) to keep in memory when using a user supplied score file
|
|
8 """
|
|
9
|
|
10 from __future__ import division
|
|
11 from galaxy import eggs
|
|
12 import pkg_resources
|
|
13 pkg_resources.require( "bx-python" )
|
|
14 pkg_resources.require( "lrucache" )
|
|
15 try:
|
|
16 pkg_resources.require( "python-lzo" )
|
|
17 except:
|
|
18 pass
|
|
19
|
|
20 import psyco_full
|
|
21 import sys
|
|
22 import os, os.path
|
|
23 from UserDict import DictMixin
|
|
24 import bx.wiggle
|
|
25 from bx.binned_array import BinnedArray, FileBinnedArray
|
|
26 from bx.bitset import *
|
|
27 from bx.bitset_builders import *
|
|
28 from fpconst import isNaN
|
|
29 from bx.cookbook import doc_optparse
|
|
30 from galaxy.tools.exception_handling import *
|
|
31
|
|
32 assert sys.version_info[:2] >= ( 2, 4 )
|
|
33
|
|
34 import tempfile, struct
|
|
35 class PositionalScoresOnDisk:
|
|
36 fmt = 'f'
|
|
37 fmt_size = struct.calcsize( fmt )
|
|
38 default_value = float( 'nan' )
|
|
39
|
|
40 def __init__( self ):
|
|
41 self.file = tempfile.TemporaryFile( 'w+b' )
|
|
42 self.length = 0
|
|
43 def __getitem__( self, i ):
|
|
44 if i < 0: i = self.length + i
|
|
45 if i < 0 or i >= self.length: return self.default_value
|
|
46 try:
|
|
47 self.file.seek( i * self.fmt_size )
|
|
48 return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
|
|
49 except Exception, e:
|
|
50 raise IndexError, e
|
|
51 def __setitem__( self, i, value ):
|
|
52 if i < 0: i = self.length + i
|
|
53 if i < 0: raise IndexError, 'Negative assignment index out of range'
|
|
54 if i >= self.length:
|
|
55 self.file.seek( self.length * self.fmt_size )
|
|
56 self.file.write( struct.pack( self.fmt, self.default_value ) * ( i - self.length ) )
|
|
57 self.length = i + 1
|
|
58 self.file.seek( i * self.fmt_size )
|
|
59 self.file.write( struct.pack( self.fmt, value ) )
|
|
60 def __len__( self ):
|
|
61 return self.length
|
|
62 def __repr__( self ):
|
|
63 i = 0
|
|
64 repr = "[ "
|
|
65 for i in xrange( self.length ):
|
|
66 repr = "%s %s," % ( repr, self[i] )
|
|
67 return "%s ]" % ( repr )
|
|
68
|
|
69 class FileBinnedArrayDir( DictMixin ):
|
|
70 """
|
|
71 Adapter that makes a directory of FileBinnedArray files look like
|
|
72 a regular dict of BinnedArray objects.
|
|
73 """
|
|
74 def __init__( self, dir ):
|
|
75 self.dir = dir
|
|
76 self.cache = dict()
|
|
77 def __getitem__( self, key ):
|
|
78 value = None
|
|
79 if key in self.cache:
|
|
80 value = self.cache[key]
|
|
81 else:
|
|
82 fname = os.path.join( self.dir, "%s.ba" % key )
|
|
83 if os.path.exists( fname ):
|
|
84 value = FileBinnedArray( open( fname ) )
|
|
85 self.cache[key] = value
|
|
86 if value is None:
|
|
87 raise KeyError( "File does not exist: " + fname )
|
|
88 return value
|
|
89
|
|
90 def stop_err(msg):
|
|
91 sys.stderr.write(msg)
|
|
92 sys.exit()
|
|
93
|
|
94 def load_scores_wiggle( fname, chrom_buffer_size = 3 ):
|
|
95 """
|
|
96 Read a wiggle file and return a dict of BinnedArray objects keyed
|
|
97 by chromosome.
|
|
98 """
|
|
99 scores_by_chrom = dict()
|
|
100 try:
|
|
101 for chrom, pos, val in bx.wiggle.Reader( UCSCOutWrapper( open( fname ) ) ):
|
|
102 if chrom not in scores_by_chrom:
|
|
103 if chrom_buffer_size:
|
|
104 scores_by_chrom[chrom] = BinnedArray()
|
|
105 chrom_buffer_size -= 1
|
|
106 else:
|
|
107 scores_by_chrom[chrom] = PositionalScoresOnDisk()
|
|
108 scores_by_chrom[chrom][pos] = val
|
|
109 except UCSCLimitException:
|
|
110 # Wiggle data was truncated, at the very least need to warn the user.
|
|
111 print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.'
|
|
112 except IndexError:
|
|
113 stop_err('Data error: one or more column data values is missing in "%s"' %fname)
|
|
114 except ValueError:
|
|
115 stop_err('Data error: invalid data type for one or more values in "%s".' %fname)
|
|
116 return scores_by_chrom
|
|
117
|
|
118 def load_scores_ba_dir( dir ):
|
|
119 """
|
|
120 Return a dict-like object (keyed by chromosome) that returns
|
|
121 FileBinnedArray objects created from "key.ba" files in `dir`
|
|
122 """
|
|
123 return FileBinnedArrayDir( dir )
|
|
124
|
|
125 def main():
|
|
126
|
|
127 # Parse command line
|
|
128 options, args = doc_optparse.parse( __doc__ )
|
|
129
|
|
130 try:
|
|
131 score_fname = args[0]
|
|
132 interval_fname = args[1]
|
|
133 chrom_col = args[2]
|
|
134 start_col = args[3]
|
|
135 stop_col = args[4]
|
|
136 if len( args ) > 5:
|
|
137 out_file = open( args[5], 'w' )
|
|
138 else:
|
|
139 out_file = sys.stdout
|
|
140 binned = bool( options.binned )
|
|
141 mask_fname = options.mask
|
|
142 except:
|
|
143 doc_optparse.exit()
|
|
144
|
|
145 if score_fname == 'None':
|
|
146 stop_err( 'This tool works with data from genome builds hg16, hg17 or hg18. Click the pencil icon in your history item to set the genome build if appropriate.' )
|
|
147
|
|
148 try:
|
|
149 chrom_col = int(chrom_col) - 1
|
|
150 start_col = int(start_col) - 1
|
|
151 stop_col = int(stop_col) - 1
|
|
152 except:
|
|
153 stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
|
|
154
|
|
155 if chrom_col < 0 or start_col < 0 or stop_col < 0:
|
|
156 stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
|
|
157
|
|
158 if binned:
|
|
159 scores_by_chrom = load_scores_ba_dir( score_fname )
|
|
160 else:
|
|
161 try:
|
|
162 chrom_buffer = int( options.chrom_buffer )
|
|
163 except:
|
|
164 chrom_buffer = 3
|
|
165 scores_by_chrom = load_scores_wiggle( score_fname, chrom_buffer )
|
|
166
|
|
167 if mask_fname:
|
|
168 masks = binned_bitsets_from_file( open( mask_fname ) )
|
|
169 else:
|
|
170 masks = None
|
|
171
|
|
172 skipped_lines = 0
|
|
173 first_invalid_line = 0
|
|
174 invalid_line = ''
|
|
175
|
|
176 for i, line in enumerate( open( interval_fname )):
|
|
177 valid = True
|
|
178 line = line.rstrip('\r\n')
|
|
179 if line and not line.startswith( '#' ):
|
|
180 fields = line.split()
|
|
181
|
|
182 try:
|
|
183 chrom, start, stop = fields[chrom_col], int( fields[start_col] ), int( fields[stop_col] )
|
|
184 except:
|
|
185 valid = False
|
|
186 skipped_lines += 1
|
|
187 if not invalid_line:
|
|
188 first_invalid_line = i + 1
|
|
189 invalid_line = line
|
|
190 if valid:
|
|
191 total = 0
|
|
192 count = 0
|
|
193 min_score = 100000000
|
|
194 max_score = -100000000
|
|
195 for j in range( start, stop ):
|
|
196 if chrom in scores_by_chrom:
|
|
197 try:
|
|
198 # Skip if base is masked
|
|
199 if masks and chrom in masks:
|
|
200 if masks[chrom][j]:
|
|
201 continue
|
|
202 # Get the score, only count if not 'nan'
|
|
203 score = scores_by_chrom[chrom][j]
|
|
204 if not isNaN( score ):
|
|
205 total += score
|
|
206 count += 1
|
|
207 max_score = max( score, max_score )
|
|
208 min_score = min( score, min_score )
|
|
209 except:
|
|
210 continue
|
|
211 if count > 0:
|
|
212 avg = total/count
|
|
213 else:
|
|
214 avg = "nan"
|
|
215 min_score = "nan"
|
|
216 max_score = "nan"
|
|
217
|
|
218 # Build the resulting line of data
|
|
219 out_line = []
|
|
220 for k in range(0, len(fields)):
|
|
221 out_line.append(fields[k])
|
|
222 out_line.append(avg)
|
|
223 out_line.append(min_score)
|
|
224 out_line.append(max_score)
|
|
225
|
|
226 print >> out_file, "\t".join( map( str, out_line ) )
|
|
227 else:
|
|
228 skipped_lines += 1
|
|
229 if not invalid_line:
|
|
230 first_invalid_line = i + 1
|
|
231 invalid_line = line
|
|
232 elif line.startswith( '#' ):
|
|
233 # We'll save the original comments
|
|
234 print >> out_file, line
|
|
235
|
|
236 out_file.close()
|
|
237
|
|
238 if skipped_lines > 0:
|
|
239 print 'Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
|
|
240 if skipped_lines == i:
|
|
241 print 'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.'
|
|
242
|
|
243 if __name__ == "__main__": main()
|