sharplabtool: tools/extract/phastOdds/get_scores

comparison tools/extract/phastOdds/get_scores_galaxy.py @ 0:9071e359b9a3

Uploaded

author	xuebing
date	Fri, 09 Mar 2012 19:37:19 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:9071e359b9a3
+#!/usr/bin/env python
+"""
+usage: %prog data_file.h5 region_mapping.bed in_file out_file chrom_col start_col end_col [options]
+-p, --perCol: standardize to lod per column
+"""
+from __future__ import division
+import sys
+from galaxy import eggs
+from numpy import *
+from tables import *
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+from bx import intervals
+# ignore wanrnings about NumArray flavor
+from warnings import filterwarnings
+from tables.exceptions import FlavorWarning
+filterwarnings("ignore", category=FlavorWarning)
+assert sys.version_info[:2] >= ( 2, 4 )
+def stop_err( msg ):
+sys.stderr.write(msg)
+sys.exit()
+def main():
+# Parse command line
+options, args = doc_optparse.parse( __doc__ )
+try:
+h5_fname = args[0]
+mapping_fname = args[1]
+in_fname = args[2]
+out_fname = args[3]
+chrom_col, start_col, end_col = map( lambda x: int( x ) - 1, args[4:7] )
+per_col = bool( options.perCol )
+except Exception, e:
+doc_optparse.exception()
+if h5_fname == 'None.h5':
+stop_err( 'Invalid genome build, this tool currently only works with data from build hg17.  Click the pencil icon in your history item to correct the build if appropriate.' )
+# Open the h5 file
+h5 = openFile( h5_fname, mode = "r" )
+# Load intervals and names for the subregions
+intersecters = {}
+for i, line in enumerate( file( mapping_fname ) ):
+line = line.rstrip( '\r\n' )
+if line and not line.startswith( '#' ):
+chr, start, end, name = line.split()[0:4]
+if not intersecters.has_key( chr ):
+intersecters[ chr ] = intervals.Intersecter()
+intersecters[ chr ].add_interval( intervals.Interval( int( start ), int( end ), name ) )
+# Find the subregion containing each input interval
+skipped_lines = 0
+first_invalid_line = 0
+invalid_line = ''
+out_file = open( out_fname, "w" )
+warnings = []
+warning = ''
+for i, line in enumerate( file( in_fname ) ):
+line = line.rstrip( '\r\n' )
+if line.startswith( '#' ):
+if i == 0:
+out_file.write( "%s\tscore\n" % line )
+else:
+out_file.write( "%s\n" % line )
+fields = line.split( "\t" )
+try:
+chr = fields[ chrom_col ]
+start = int( fields[ start_col ] )
+end = int( fields[ end_col ] )
+except:
+warning = "Invalid value for chrom, start or end column."
+warnings.append( warning )
+skipped_lines += 1
+if not invalid_line:
+first_invalid_line = i + 1
+invalid_line = line
+continue
+# Find matching interval
+try:
+matches = intersecters[ chr ].find( start, end )
+except:
+warning = "'%s' is not a valid chrom value for the region. " %chr
+warnings.append( warning )
+skipped_lines += 1
+if not invalid_line:
+first_invalid_line = i + 1
+invalid_line = line
+continue
+if not len( matches ) == 1:
+warning = "Interval must match exactly one target region. "
+warnings.append( warning )
+skipped_lines += 1
+if not invalid_line:
+first_invalid_line = i + 1
+invalid_line = line
+continue
+region = matches[0]
+if not ( start >= region.start and end <= region.end ):
+warning = "Interval must fall entirely within region. "
+warnings.append( warning )
+skipped_lines += 1
+if not invalid_line:
+first_invalid_line = i + 1
+invalid_line = line
+continue
+region_name = region.value
+rel_start = start - region.start
+rel_end = end - region.start
+if not rel_start < rel_end:
+warning = "Region %s is empty, relative start:%d, relative end:%d. " % ( region_name, rel_start, rel_end )
+warnings.append( warning )
+skipped_lines += 1
+if not invalid_line:
+first_invalid_line = i + 1
+invalid_line = line
+continue
+s = h5.getNode( h5.root, "scores_" + region_name )
+c = h5.getNode( h5.root, "counts_" + region_name )
+score = s[rel_end-1]
+count = c[rel_end-1]
+if rel_start > 0:
+score -= s[rel_start-1]
+count -= c[rel_start-1]
+if per_col:
+score /= count
+fields.append( str( score ) )
+out_file.write( "%s\n" % "\t".join( fields ) )
+# Close the file handle
+h5.close()
+out_file.close()
+if warnings:
+warn_msg = "PhastOdds scores are only available for ENCODE regions. %d warnings, 1st is: " % len( warnings )
+warn_msg += warnings[0]
+print warn_msg
+if skipped_lines:
+print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+if __name__ == "__main__": main()

Mercurial > repos > xuebing > sharplabtool

comparison tools/extract/phastOdds/get_scores_galaxy.py @ 0:9071e359b9a3