diff tools/stats/aggregate_scores_in_intervals.py @ 0:9071e359b9a3
Uploaded
author   | xuebing
date     | Fri, 09 Mar 2012 19:37:19 -0500
parents  |
children |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/aggregate_scores_in_intervals.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,243 @@
+#!/usr/bin/env python
+# Greg Von Kuster
+"""
+usage: %prog score_file interval_file chrom start stop [out_file] [options]
+    -b, --binned: 'score_file' is actually a directory of binned array files
+    -m, --mask=FILE: bed file containing regions not to consider valid
+    -c, --chrom_buffer=INT: number of chromosomes (default is 3) to keep in memory when using a user supplied score file
+"""
+
+from __future__ import division
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+pkg_resources.require( "lrucache" )
+try:
+    pkg_resources.require( "python-lzo" )
+except:
+    pass
+
+import psyco_full
+import sys
+import os, os.path
+from UserDict import DictMixin
+import bx.wiggle
+from bx.binned_array import BinnedArray, FileBinnedArray
+from bx.bitset import *
+from bx.bitset_builders import *
+from fpconst import isNaN
+from bx.cookbook import doc_optparse
+from galaxy.tools.exception_handling import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+import tempfile, struct
+class PositionalScoresOnDisk:
+    fmt = 'f'
+    fmt_size = struct.calcsize( fmt )
+    default_value = float( 'nan' )
+
+    def __init__( self ):
+        self.file = tempfile.TemporaryFile( 'w+b' )
+        self.length = 0
+    def __getitem__( self, i ):
+        if i < 0: i = self.length + i
+        if i < 0 or i >= self.length: return self.default_value
+        try:
+            self.file.seek( i * self.fmt_size )
+            return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
+        except Exception, e:
+            raise IndexError, e
+    def __setitem__( self, i, value ):
+        if i < 0: i = self.length + i
+        if i < 0: raise IndexError, 'Negative assignment index out of range'
+        if i >= self.length:
+            self.file.seek( self.length * self.fmt_size )
+            self.file.write( struct.pack( self.fmt, self.default_value ) * ( i - self.length ) )
+            self.length = i + 1
+        self.file.seek( i * self.fmt_size )
+        self.file.write( struct.pack( self.fmt, value ) )
+    def __len__( self ):
+        return self.length
+    def __repr__( self ):
+        i = 0
+        repr = "[ "
+        for i in xrange( self.length ):
+            repr = "%s %s," % ( repr, self[i] )
+        return "%s ]" % ( repr )
+
+class FileBinnedArrayDir( DictMixin ):
+    """
+    Adapter that makes a directory of FileBinnedArray files look like
+    a regular dict of BinnedArray objects.
+    """
+    def __init__( self, dir ):
+        self.dir = dir
+        self.cache = dict()
+    def __getitem__( self, key ):
+        value = None
+        if key in self.cache:
+            value = self.cache[key]
+        else:
+            fname = os.path.join( self.dir, "%s.ba" % key )
+            if os.path.exists( fname ):
+                value = FileBinnedArray( open( fname ) )
+                self.cache[key] = value
+        if value is None:
+            raise KeyError( "File does not exist: " + fname )
+        return value
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+def load_scores_wiggle( fname, chrom_buffer_size = 3 ):
+    """
+    Read a wiggle file and return a dict of BinnedArray objects keyed
+    by chromosome.
+    """
+    scores_by_chrom = dict()
+    try:
+        for chrom, pos, val in bx.wiggle.Reader( UCSCOutWrapper( open( fname ) ) ):
+            if chrom not in scores_by_chrom:
+                if chrom_buffer_size:
+                    scores_by_chrom[chrom] = BinnedArray()
+                    chrom_buffer_size -= 1
+                else:
+                    scores_by_chrom[chrom] = PositionalScoresOnDisk()
+            scores_by_chrom[chrom][pos] = val
+    except UCSCLimitException:
+        # Wiggle data was truncated, at the very least need to warn the user.
+        print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.'
+    except IndexError:
+        stop_err('Data error: one or more column data values is missing in "%s"' %fname)
+    except ValueError:
+        stop_err('Data error: invalid data type for one or more values in "%s".' %fname)
+    return scores_by_chrom
+
+def load_scores_ba_dir( dir ):
+    """
+    Return a dict-like object (keyed by chromosome) that returns
+    FileBinnedArray objects created from "key.ba" files in `dir`
+    """
+    return FileBinnedArrayDir( dir )
+
+def main():
+
+    # Parse command line
+    options, args = doc_optparse.parse( __doc__ )
+
+    try:
+        score_fname = args[0]
+        interval_fname = args[1]
+        chrom_col = args[2]
+        start_col = args[3]
+        stop_col = args[4]
+        if len( args ) > 5:
+            out_file = open( args[5], 'w' )
+        else:
+            out_file = sys.stdout
+        binned = bool( options.binned )
+        mask_fname = options.mask
+    except:
+        doc_optparse.exit()
+
+    if score_fname == 'None':
+        stop_err( 'This tool works with data from genome builds hg16, hg17 or hg18. Click the pencil icon in your history item to set the genome build if appropriate.' )
+
+    try:
+        chrom_col = int(chrom_col) - 1
+        start_col = int(start_col) - 1
+        stop_col = int(stop_col) - 1
+    except:
+        stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
+
+    if chrom_col < 0 or start_col < 0 or stop_col < 0:
+        stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
+
+    if binned:
+        scores_by_chrom = load_scores_ba_dir( score_fname )
+    else:
+        try:
+            chrom_buffer = int( options.chrom_buffer )
+        except:
+            chrom_buffer = 3
+        scores_by_chrom = load_scores_wiggle( score_fname, chrom_buffer )
+
+    if mask_fname:
+        masks = binned_bitsets_from_file( open( mask_fname ) )
+    else:
+        masks = None
+
+    skipped_lines = 0
+    first_invalid_line = 0
+    invalid_line = ''
+
+    for i, line in enumerate( open( interval_fname )):
+        valid = True
+        line = line.rstrip('\r\n')
+        if line and not line.startswith( '#' ):
+            fields = line.split()
+
+            try:
+                chrom, start, stop = fields[chrom_col], int( fields[start_col] ), int( fields[stop_col] )
+            except:
+                valid = False
+                skipped_lines += 1
+                if not invalid_line:
+                    first_invalid_line = i + 1
+                    invalid_line = line
+            if valid:
+                total = 0
+                count = 0
+                min_score = 100000000
+                max_score = -100000000
+                for j in range( start, stop ):
+                    if chrom in scores_by_chrom:
+                        try:
+                            # Skip if base is masked
+                            if masks and chrom in masks:
+                                if masks[chrom][j]:
+                                    continue
+                            # Get the score, only count if not 'nan'
+                            score = scores_by_chrom[chrom][j]
+                            if not isNaN( score ):
+                                total += score
+                                count += 1
+                                max_score = max( score, max_score )
+                                min_score = min( score, min_score )
+                        except:
+                            continue
+                if count > 0:
+                    avg = total/count
+                else:
+                    avg = "nan"
+                    min_score = "nan"
+                    max_score = "nan"
+
+                # Build the resulting line of data
+                out_line = []
+                for k in range(0, len(fields)):
+                    out_line.append(fields[k])
+                out_line.append(avg)
+                out_line.append(min_score)
+                out_line.append(max_score)
+
+                print >> out_file, "\t".join( map( str, out_line ) )
+            else:
+                skipped_lines += 1
+                if not invalid_line:
+                    first_invalid_line = i + 1
+                    invalid_line = line
+        elif line.startswith( '#' ):
+            # We'll save the original comments
+            print >> out_file, line
+
+    out_file.close()
+
+    if skipped_lines > 0:
+        print 'Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+        if skipped_lines == i:
+            print 'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.'
+
+if __name__ == "__main__": main()
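
For context, the module docstring and the argument handling in main() define the command-line contract: a score source (a wiggle file, or a directory of binned-array files when -b/--binned is given), an interval file, the 1-based chromosome/start/stop column numbers, and an optional output path. A hypothetical invocation (all file names below are placeholders) could look like:

    python aggregate_scores_in_intervals.py scores.wig intervals.bed 1 2 3 out.interval
    python aggregate_scores_in_intervals.py binned_scores_dir intervals.bed 1 2 3 out.interval --binned --mask=excluded_regions.bed

Each output row repeats the input fields and appends the average, minimum, and maximum score over that interval. Note that the script itself still requires the Galaxy/bx-python environment it imports at the top (galaxy.eggs, psyco_full, bx-python, fpconst) to run.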
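The per-interval aggregation in main() is independent of the Galaxy machinery. Below is a minimal, dependency-free sketch of that logic (skip masked bases, ignore NaN scores, report average/min/max, or the string "nan" when no valid base was seen). The dict-backed score store, the mask layout, and the sample values are hypothetical stand-ins for the BinnedArray / FileBinnedArray objects and bitset masks the tool actually uses, and math.isnan stands in for fpconst.isNaN:

    import math

    def aggregate_interval(scores_by_chrom, chrom, start, stop, masks=None):
        # Average/min/max of per-base scores over [start, stop),
        # skipping masked positions and NaN values.
        total = 0.0
        count = 0
        min_score = float('inf')
        max_score = float('-inf')
        for j in range(start, stop):
            if chrom not in scores_by_chrom:
                continue
            if masks and chrom in masks and masks[chrom][j]:
                continue  # base is masked out
            try:
                score = scores_by_chrom[chrom][j]
            except IndexError:
                continue
            if not math.isnan(score):
                total += score
                count += 1
                min_score = min(score, min_score)
                max_score = max(score, max_score)
        if count > 0:
            return total / count, min_score, max_score
        return 'nan', 'nan', 'nan'  # same sentinel strings the tool prints

    if __name__ == '__main__':
        # Hypothetical per-base scores for a tiny chromosome
        scores = {'chr1': [0.1, 0.2, float('nan'), 0.4, 0.8]}
        print(aggregate_interval(scores, 'chr1', 0, 5))  # -> (0.375, 0.1, 0.8)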