Mercurial > repos > xuebing > sharplabtool
comparison tools/stats/aggregate_scores_in_intervals.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9071e359b9a3 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # Greg Von Kuster | |
| 3 """ | |
| 4 usage: %prog score_file interval_file chrom start stop [out_file] [options] | |
| 5 -b, --binned: 'score_file' is actually a directory of binned array files | |
| 6 -m, --mask=FILE: bed file containing regions not to consider valid | |
| 7 -c, --chrom_buffer=INT: number of chromosomes (default is 3) to keep in memory when using a user supplied score file | |
| 8 """ | |
| 9 | |
| 10 from __future__ import division | |
| 11 from galaxy import eggs | |
| 12 import pkg_resources | |
| 13 pkg_resources.require( "bx-python" ) | |
| 14 pkg_resources.require( "lrucache" ) | |
| 15 try: | |
| 16 pkg_resources.require( "python-lzo" ) | |
| 17 except: | |
| 18 pass | |
| 19 | |
| 20 import psyco_full | |
| 21 import sys | |
| 22 import os, os.path | |
| 23 from UserDict import DictMixin | |
| 24 import bx.wiggle | |
| 25 from bx.binned_array import BinnedArray, FileBinnedArray | |
| 26 from bx.bitset import * | |
| 27 from bx.bitset_builders import * | |
| 28 from fpconst import isNaN | |
| 29 from bx.cookbook import doc_optparse | |
| 30 from galaxy.tools.exception_handling import * | |
| 31 | |
| 32 assert sys.version_info[:2] >= ( 2, 4 ) | |
| 33 | |
| 34 import tempfile, struct | |
class PositionalScoresOnDisk:
    """
    Disk-backed, list-like store of per-position float scores.

    Used instead of an in-memory BinnedArray once the chromosome buffer
    is exhausted (see load_scores_wiggle).  Values are kept as packed
    single-precision floats in an anonymous temporary file; positions
    that were never assigned (or are out of range) read back as NaN.
    """
    fmt = 'f'                                 # one float32 record per position
    fmt_size = struct.calcsize( fmt )
    default_value = float( 'nan' )

    def __init__( self ):
        # Backing store: anonymous temp file of packed floats.
        self.file = tempfile.TemporaryFile( 'w+b' )
        self.length = 0
    def __getitem__( self, i ):
        # Support negative indices like a list; out-of-range reads are
        # not an error -- they return the NaN default.
        if i < 0: i = self.length + i
        if i < 0 or i >= self.length: return self.default_value
        try:
            self.file.seek( i * self.fmt_size )
            return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
        except Exception:
            # Translate any seek/read/unpack failure into IndexError,
            # since callers treat this object as a sequence.  (Portable
            # form of the old `except Exception, e: raise IndexError, e`.)
            raise IndexError( sys.exc_info()[1] )
    def __setitem__( self, i, value ):
        if i < 0: i = self.length + i
        if i < 0: raise IndexError( 'Negative assignment index out of range' )
        if i >= self.length:
            # Grow the file, padding the gap with NaN records so reads
            # of never-assigned positions return the default.
            self.file.seek( self.length * self.fmt_size )
            self.file.write( struct.pack( self.fmt, self.default_value ) * ( i - self.length ) )
            self.length = i + 1
        self.file.seek( i * self.fmt_size )
        self.file.write( struct.pack( self.fmt, value ) )
    def __len__( self ):
        return self.length
    def __repr__( self ):
        # Single join instead of the old quadratic string concatenation
        # (which also shadowed the builtin `repr` and left an unused
        # `i = 0`); output format "[  v0, v1, ]" is unchanged.
        return "[ %s ]" % "".join( " %s," % self[i] for i in range( self.length ) )
| 68 | |
class FileBinnedArrayDir( DictMixin ):
    """
    Dict-like adapter over a directory of FileBinnedArray files.

    Looking up key `chrom` opens "<chrom>.ba" in the directory (once --
    opened arrays are cached) and raises KeyError if no such file exists.
    """
    def __init__( self, dir ):
        self.dir = dir
        self.cache = dict()
    def __getitem__( self, key ):
        # Fast path: already opened.
        try:
            return self.cache[key]
        except KeyError:
            pass
        fname = os.path.join( self.dir, "%s.ba" % key )
        if not os.path.exists( fname ):
            raise KeyError( "File does not exist: " + fname )
        opened = FileBinnedArray( open( fname ) )
        self.cache[key] = opened
        return opened
| 89 | |
def stop_err( msg ):
    """Write `msg` to standard error (no newline added) and abort.

    Exits with the default status, exactly as sys.exit() would.
    """
    sys.stderr.write( msg )
    raise SystemExit
| 93 | |
| 94 def load_scores_wiggle( fname, chrom_buffer_size = 3 ): | |
| 95 """ | |
| 96 Read a wiggle file and return a dict of BinnedArray objects keyed | |
| 97 by chromosome. | |
| 98 """ | |
| 99 scores_by_chrom = dict() | |
| 100 try: | |
| 101 for chrom, pos, val in bx.wiggle.Reader( UCSCOutWrapper( open( fname ) ) ): | |
| 102 if chrom not in scores_by_chrom: | |
| 103 if chrom_buffer_size: | |
| 104 scores_by_chrom[chrom] = BinnedArray() | |
| 105 chrom_buffer_size -= 1 | |
| 106 else: | |
| 107 scores_by_chrom[chrom] = PositionalScoresOnDisk() | |
| 108 scores_by_chrom[chrom][pos] = val | |
| 109 except UCSCLimitException: | |
| 110 # Wiggle data was truncated, at the very least need to warn the user. | |
| 111 print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.' | |
| 112 except IndexError: | |
| 113 stop_err('Data error: one or more column data values is missing in "%s"' %fname) | |
| 114 except ValueError: | |
| 115 stop_err('Data error: invalid data type for one or more values in "%s".' %fname) | |
| 116 return scores_by_chrom | |
| 117 | |
def load_scores_ba_dir( dir ):
    """
    Return a dict-like view (keyed by chromosome) that lazily opens
    FileBinnedArray objects from "<key>.ba" files located in `dir`.
    """
    return FileBinnedArrayDir( dir )
| 124 | |
def main():
    """
    Aggregate per-base scores over each interval in a BED-like file.

    For every valid interval line, appends three columns -- average,
    minimum and maximum score over [start, stop) -- and writes the
    result to the output file (or stdout).  Scores come either from a
    directory of binned-array files (-b) or from a wiggle file; bases
    covered by the optional mask bed file (-m) are excluded.
    """
    # Parse command line
    options, args = doc_optparse.parse( __doc__ )

    try:
        score_fname = args[0]
        interval_fname = args[1]
        chrom_col = args[2]
        start_col = args[3]
        stop_col = args[4]
        if len( args ) > 5:
            out_file = open( args[5], 'w' )
        else:
            out_file = sys.stdout
        binned = bool( options.binned )
        mask_fname = options.mask
    except:
        # Any missing/invalid positional arg -> print usage and exit.
        doc_optparse.exit()

    # Galaxy passes the literal string 'None' when no score file is
    # associated with the chosen genome build.
    if score_fname == 'None':
        stop_err( 'This tool works with data from genome builds hg16, hg17 or hg18. Click the pencil icon in your history item to set the genome build if appropriate.' )

    # Column indices arrive 1-based from the command line; convert to 0-based.
    try:
        chrom_col = int(chrom_col) - 1
        start_col = int(start_col) - 1
        stop_col = int(stop_col) - 1
    except:
        stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )

    if chrom_col < 0 or start_col < 0 or stop_col < 0:
        stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )

    # Build the chromosome -> scores mapping from either source.
    if binned:
        scores_by_chrom = load_scores_ba_dir( score_fname )
    else:
        try:
            chrom_buffer = int( options.chrom_buffer )
        except:
            chrom_buffer = 3
        scores_by_chrom = load_scores_wiggle( score_fname, chrom_buffer )

    # Optional mask: per-chromosome bitsets of bases to ignore.
    if mask_fname:
        masks = binned_bitsets_from_file( open( mask_fname ) )
    else:
        masks = None

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ''

    for i, line in enumerate( open( interval_fname )):
        valid = True
        line = line.rstrip('\r\n')
        if line and not line.startswith( '#' ):
            fields = line.split()

            try:
                chrom, start, stop = fields[chrom_col], int( fields[start_col] ), int( fields[stop_col] )
            except:
                # Malformed interval line: remember the first one for the report.
                valid = False
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
            if valid:
                total = 0
                count = 0
                # Sentinels wider than any expected score magnitude.
                min_score = 100000000
                max_score = -100000000
                for j in range( start, stop ):
                    if chrom in scores_by_chrom:
                        try:
                            # Skip if base is masked
                            if masks and chrom in masks:
                                if masks[chrom][j]:
                                    continue
                            # Get the score, only count if not 'nan'
                            score = scores_by_chrom[chrom][j]
                            if not isNaN( score ):
                                total += score
                                count += 1
                                max_score = max( score, max_score )
                                min_score = min( score, min_score )
                        except:
                            # Best-effort: any per-base lookup failure just
                            # skips that base.
                            continue
                if count > 0:
                    avg = total/count
                else:
                    # No usable scores in the interval: report 'nan' columns.
                    avg = "nan"
                    min_score = "nan"
                    max_score = "nan"

                # Build the resulting line of data
                out_line = []
                for k in range(0, len(fields)):
                    out_line.append(fields[k])
                out_line.append(avg)
                out_line.append(min_score)
                out_line.append(max_score)

                print >> out_file, "\t".join( map( str, out_line ) )
            else:
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
        elif line.startswith( '#' ):
            # We'll save the original comments
            print >> out_file, line

    # NOTE(review): this also closes sys.stdout when no output file was given.
    out_file.close()

    if skipped_lines > 0:
        print 'Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
        # NOTE(review): `i` is the last 0-based line index, so the total line
        # count is i + 1; this looks like an off-by-one ("all lines skipped"
        # would be skipped_lines == i + 1) -- confirm intent before changing.
        if skipped_lines == i:
            print 'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.'
| 242 | |
| 243 if __name__ == "__main__": main() |
