Mercurial > repos > xuebing > sharplabtool
comparison tools/extract/phastOdds/get_scores_galaxy.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9071e359b9a3 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 """ | |
| 4 usage: %prog data_file.h5 region_mapping.bed in_file out_file chrom_col start_col end_col [options] | |
| 5 -p, --perCol: standardize to lod per column | |
| 6 """ | |
| 7 | |
| 8 from __future__ import division | |
| 9 | |
| 10 import sys | |
| 11 from galaxy import eggs | |
| 12 from numpy import * | |
| 13 from tables import * | |
| 14 | |
| 15 import pkg_resources; pkg_resources.require( "bx-python" ) | |
| 16 from bx.cookbook import doc_optparse | |
| 17 | |
| 18 from bx import intervals | |
| 19 | |
| 20 # ignore wanrnings about NumArray flavor | |
| 21 from warnings import filterwarnings | |
| 22 from tables.exceptions import FlavorWarning | |
| 23 filterwarnings("ignore", category=FlavorWarning) | |
| 24 | |
| 25 assert sys.version_info[:2] >= ( 2, 4 ) | |
| 26 | |
| 27 def stop_err( msg ): | |
| 28 sys.stderr.write(msg) | |
| 29 sys.exit() | |
| 30 | |
| 31 def main(): | |
| 32 # Parse command line | |
| 33 options, args = doc_optparse.parse( __doc__ ) | |
| 34 try: | |
| 35 h5_fname = args[0] | |
| 36 mapping_fname = args[1] | |
| 37 in_fname = args[2] | |
| 38 out_fname = args[3] | |
| 39 chrom_col, start_col, end_col = map( lambda x: int( x ) - 1, args[4:7] ) | |
| 40 per_col = bool( options.perCol ) | |
| 41 except Exception, e: | |
| 42 doc_optparse.exception() | |
| 43 | |
| 44 if h5_fname == 'None.h5': | |
| 45 stop_err( 'Invalid genome build, this tool currently only works with data from build hg17. Click the pencil icon in your history item to correct the build if appropriate.' ) | |
| 46 | |
| 47 # Open the h5 file | |
| 48 h5 = openFile( h5_fname, mode = "r" ) | |
| 49 # Load intervals and names for the subregions | |
| 50 intersecters = {} | |
| 51 for i, line in enumerate( file( mapping_fname ) ): | |
| 52 line = line.rstrip( '\r\n' ) | |
| 53 if line and not line.startswith( '#' ): | |
| 54 chr, start, end, name = line.split()[0:4] | |
| 55 if not intersecters.has_key( chr ): | |
| 56 intersecters[ chr ] = intervals.Intersecter() | |
| 57 intersecters[ chr ].add_interval( intervals.Interval( int( start ), int( end ), name ) ) | |
| 58 | |
| 59 # Find the subregion containing each input interval | |
| 60 skipped_lines = 0 | |
| 61 first_invalid_line = 0 | |
| 62 invalid_line = '' | |
| 63 out_file = open( out_fname, "w" ) | |
| 64 warnings = [] | |
| 65 warning = '' | |
| 66 for i, line in enumerate( file( in_fname ) ): | |
| 67 line = line.rstrip( '\r\n' ) | |
| 68 if line.startswith( '#' ): | |
| 69 if i == 0: | |
| 70 out_file.write( "%s\tscore\n" % line ) | |
| 71 else: | |
| 72 out_file.write( "%s\n" % line ) | |
| 73 fields = line.split( "\t" ) | |
| 74 try: | |
| 75 chr = fields[ chrom_col ] | |
| 76 start = int( fields[ start_col ] ) | |
| 77 end = int( fields[ end_col ] ) | |
| 78 except: | |
| 79 warning = "Invalid value for chrom, start or end column." | |
| 80 warnings.append( warning ) | |
| 81 skipped_lines += 1 | |
| 82 if not invalid_line: | |
| 83 first_invalid_line = i + 1 | |
| 84 invalid_line = line | |
| 85 continue | |
| 86 # Find matching interval | |
| 87 try: | |
| 88 matches = intersecters[ chr ].find( start, end ) | |
| 89 except: | |
| 90 warning = "'%s' is not a valid chrom value for the region. " %chr | |
| 91 warnings.append( warning ) | |
| 92 skipped_lines += 1 | |
| 93 if not invalid_line: | |
| 94 first_invalid_line = i + 1 | |
| 95 invalid_line = line | |
| 96 continue | |
| 97 if not len( matches ) == 1: | |
| 98 warning = "Interval must match exactly one target region. " | |
| 99 warnings.append( warning ) | |
| 100 skipped_lines += 1 | |
| 101 if not invalid_line: | |
| 102 first_invalid_line = i + 1 | |
| 103 invalid_line = line | |
| 104 continue | |
| 105 region = matches[0] | |
| 106 if not ( start >= region.start and end <= region.end ): | |
| 107 warning = "Interval must fall entirely within region. " | |
| 108 warnings.append( warning ) | |
| 109 skipped_lines += 1 | |
| 110 if not invalid_line: | |
| 111 first_invalid_line = i + 1 | |
| 112 invalid_line = line | |
| 113 continue | |
| 114 region_name = region.value | |
| 115 rel_start = start - region.start | |
| 116 rel_end = end - region.start | |
| 117 if not rel_start < rel_end: | |
| 118 warning = "Region %s is empty, relative start:%d, relative end:%d. " % ( region_name, rel_start, rel_end ) | |
| 119 warnings.append( warning ) | |
| 120 skipped_lines += 1 | |
| 121 if not invalid_line: | |
| 122 first_invalid_line = i + 1 | |
| 123 invalid_line = line | |
| 124 continue | |
| 125 s = h5.getNode( h5.root, "scores_" + region_name ) | |
| 126 c = h5.getNode( h5.root, "counts_" + region_name ) | |
| 127 score = s[rel_end-1] | |
| 128 count = c[rel_end-1] | |
| 129 if rel_start > 0: | |
| 130 score -= s[rel_start-1] | |
| 131 count -= c[rel_start-1] | |
| 132 if per_col: | |
| 133 score /= count | |
| 134 fields.append( str( score ) ) | |
| 135 out_file.write( "%s\n" % "\t".join( fields ) ) | |
| 136 # Close the file handle | |
| 137 h5.close() | |
| 138 out_file.close() | |
| 139 | |
| 140 if warnings: | |
| 141 warn_msg = "PhastOdds scores are only available for ENCODE regions. %d warnings, 1st is: " % len( warnings ) | |
| 142 warn_msg += warnings[0] | |
| 143 print warn_msg | |
| 144 if skipped_lines: | |
| 145 print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) | |
| 146 | |
| 147 if __name__ == "__main__": main() |
