sharplabtool: tools/stats/aggregate_scores_in

comparison tools/stats/aggregate_scores_in_intervals.py @ 0:9071e359b9a3

Uploaded

author	xuebing
date	Fri, 09 Mar 2012 19:37:19 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:9071e359b9a3
+#!/usr/bin/env python
+# Greg Von Kuster
+"""
+usage: %prog score_file interval_file chrom start stop [out_file] [options]
+-b, --binned: 'score_file' is actually a directory of binned array files
+-m, --mask=FILE: bed file containing regions not to consider valid
+-c, --chrom_buffer=INT: number of chromosomes (default is 3) to keep in memory when using a user supplied score file
+"""
+from __future__ import division
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+pkg_resources.require( "lrucache" )
+try:
+pkg_resources.require( "python-lzo" )
+except:
+pass
+import psyco_full
+import sys
+import os, os.path
+from UserDict import DictMixin
+import bx.wiggle
+from bx.binned_array import BinnedArray, FileBinnedArray
+from bx.bitset import *
+from bx.bitset_builders import *
+from fpconst import isNaN
+from bx.cookbook import doc_optparse
+from galaxy.tools.exception_handling import *
+assert sys.version_info[:2] >= ( 2, 4 )
+import tempfile, struct
+class PositionalScoresOnDisk:
+fmt = 'f'
+fmt_size = struct.calcsize( fmt )
+default_value = float( 'nan' )
+def __init__( self ):
+self.file = tempfile.TemporaryFile( 'w+b' )
+self.length = 0
+def __getitem__( self, i ):
+if i < 0: i = self.length + i
+if i < 0 or i >= self.length: return self.default_value
+try:
+self.file.seek( i * self.fmt_size )
+return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
+except Exception, e:
+raise IndexError, e
+def __setitem__( self, i, value ):
+if i < 0: i = self.length + i
+if i < 0: raise IndexError, 'Negative assignment index out of range'
+if i >= self.length:
+self.file.seek( self.length * self.fmt_size )
+self.file.write( struct.pack( self.fmt, self.default_value ) * ( i - self.length ) )
+self.length = i + 1
+self.file.seek( i * self.fmt_size )
+self.file.write( struct.pack( self.fmt, value ) )
+def __len__( self ):
+return self.length
+def __repr__( self ):
+i = 0
+repr = "[ "
+for i in xrange( self.length ):
+repr = "%s %s," % ( repr, self[i] )
+return "%s ]" % ( repr )
+class FileBinnedArrayDir( DictMixin ):
+"""
+Adapter that makes a directory of FileBinnedArray files look like
+a regular dict of BinnedArray objects.
+"""
+def __init__( self, dir ):
+self.dir = dir
+self.cache = dict()
+def __getitem__( self, key ):
+value = None
+if key in self.cache:
+value = self.cache[key]
+else:
+fname = os.path.join( self.dir, "%s.ba" % key )
+if os.path.exists( fname ):
+value = FileBinnedArray( open( fname ) )
+self.cache[key] = value
+if value is None:
+raise KeyError( "File does not exist: " + fname )
+return value
+def stop_err(msg):
+sys.stderr.write(msg)
+sys.exit()
+def load_scores_wiggle( fname, chrom_buffer_size = 3 ):
+"""
+Read a wiggle file and return a dict of BinnedArray objects keyed
+by chromosome.
+"""
+scores_by_chrom = dict()
+try:
+for chrom, pos, val in bx.wiggle.Reader( UCSCOutWrapper( open( fname ) ) ):
+if chrom not in scores_by_chrom:
+if chrom_buffer_size:
+scores_by_chrom[chrom] = BinnedArray()
+chrom_buffer_size -= 1
+else:
+scores_by_chrom[chrom] = PositionalScoresOnDisk()
+scores_by_chrom[chrom][pos] = val
+except UCSCLimitException:
+# Wiggle data was truncated, at the very least need to warn the user.
+print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.'
+except IndexError:
+stop_err('Data error: one or more column data values is missing in "%s"' %fname)
+except ValueError:
+stop_err('Data error: invalid data type for one or more values in "%s".' %fname)
+return scores_by_chrom
+def load_scores_ba_dir( dir ):
+"""
+Return a dict-like object (keyed by chromosome) that returns
+FileBinnedArray objects created from "key.ba" files in `dir`
+"""
+return FileBinnedArrayDir( dir )
+def main():
+# Parse command line
+options, args = doc_optparse.parse( __doc__ )
+try:
+score_fname = args[0]
+interval_fname = args[1]
+chrom_col = args[2]
+start_col = args[3]
+stop_col = args[4]
+if len( args ) > 5:
+out_file = open( args[5], 'w' )
+else:
+out_file = sys.stdout
+binned = bool( options.binned )
+mask_fname = options.mask
+except:
+doc_optparse.exit()
+if score_fname == 'None':
+stop_err( 'This tool works with data from genome builds hg16, hg17 or hg18.  Click the pencil icon in your history item to set the genome build if appropriate.' )
+try:
+chrom_col = int(chrom_col) - 1
+start_col = int(start_col) - 1
+stop_col = int(stop_col) - 1
+except:
+stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
+if chrom_col < 0 or start_col < 0 or stop_col < 0:
+stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
+if binned:
+scores_by_chrom = load_scores_ba_dir( score_fname )
+else:
+try:
+chrom_buffer = int( options.chrom_buffer )
+except:
+chrom_buffer = 3
+scores_by_chrom = load_scores_wiggle( score_fname, chrom_buffer )
+if mask_fname:
+masks = binned_bitsets_from_file( open( mask_fname ) )
+else:
+masks = None
+skipped_lines = 0
+first_invalid_line = 0
+invalid_line = ''
+for i, line in enumerate( open( interval_fname )):
+valid = True
+line = line.rstrip('\r\n')
+if line and not line.startswith( '#' ):
+fields = line.split()
+try:
+chrom, start, stop = fields[chrom_col], int( fields[start_col] ), int( fields[stop_col] )
+except:
+valid = False
+skipped_lines += 1
+if not invalid_line:
+first_invalid_line = i + 1
+invalid_line = line
+if valid:
+total = 0
+count = 0
+min_score = 100000000
+max_score = -100000000
+for j in range( start, stop ):
+if chrom in scores_by_chrom:
+try:
+# Skip if base is masked
+if masks and chrom in masks:
+if masks[chrom][j]:
+continue
+# Get the score, only count if not 'nan'
+score = scores_by_chrom[chrom][j]
+if not isNaN( score ):
+total += score
+count += 1
+max_score = max( score, max_score )
+min_score = min( score, min_score )
+except:
+continue
+if count > 0:
+avg = total/count
+else:
+avg = "nan"
+min_score = "nan"
+max_score = "nan"
+# Build the resulting line of data
+out_line = []
+for k in range(0, len(fields)):
+out_line.append(fields[k])
+out_line.append(avg)
+out_line.append(min_score)
+out_line.append(max_score)
+print >> out_file, "\t".join( map( str, out_line ) )
+else:
+skipped_lines += 1
+if not invalid_line:
+first_invalid_line = i + 1
+invalid_line = line
+elif line.startswith( '#' ):
+# We'll save the original comments
+print >> out_file, line
+out_file.close()
+if skipped_lines > 0:
+print 'Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+if skipped_lines == i:
+print 'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.'
+if __name__ == "__main__": main()

Mercurial > repos > xuebing > sharplabtool

comparison tools/stats/aggregate_scores_in_intervals.py @ 0:9071e359b9a3