diff tools/encode/split_by_partitions.py @ 0:9071e359b9a3
Uploaded

author:    xuebing
date:      Fri, 09 Mar 2012 19:37:19 -0500
parents:
children:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/encode/split_by_partitions.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+# Original script from /home/james/work/encode/feature_partitions/split_by_partitions.py
+
+# Usage: python(2.4) split_by_partitions.py GALAXY_DATA_INDEX_DIR in_file out_file chrCol startCol endCol strandCol
+
+from __future__ import division
+
+import sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.bitset import *
+from bx.bitset_builders import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def main():
+    GALAXY_DATA_INDEX_DIR = sys.argv[1]
+    partition_index = '%s/encode_feature_partitions/partition_list.txt' % GALAXY_DATA_INDEX_DIR
+    partition_offset = "%s/encode_feature_partitions/" % GALAXY_DATA_INDEX_DIR
+
+    warnings = []
+
+    # Load up the partitions
+    partitions = list()
+    try:
+        for line in open( partition_index ):
+            name, score, filename = line.split()
+            partitions.append( ( name, score, binned_bitsets_from_file( open( partition_offset + filename ) ) ) )
+    except:
+        stop_err( "Error loading partitioning dataset." )
+
+    try:
+        in_file = open( sys.argv[2] )
+    except:
+        stop_err( "Bad input data." )
+
+    try:
+        out_file = open( sys.argv[3], "w" )
+    except:
+        stop_err( "Bad output file." )
+
+    try:
+        chrCol = int( sys.argv[4] ) - 1
+    except:
+        stop_err( "Bad chr column: %s" % ( str( sys.argv[4] ) ) )
+    try:
+        startCol = int( sys.argv[5] ) - 1
+    except:
+        stop_err( "Bad start column: %s" % ( str( sys.argv[5] ) ) )
+    try:
+        endCol = int( sys.argv[6] ) - 1
+    except:
+        stop_err( "Bad end column: %s" % ( str( sys.argv[6] ) ) )
+    try:
+        strandCol = int( sys.argv[7] ) - 1
+    except:
+        strandCol = -1
+
+    line_count = 0
+    skipped_lines = 0
+    first_invalid_line = None
+    invalid_line = ''
+    try:
+        for line in in_file:
+            line_count += 1
+            line = line.rstrip( '\r\n' )
+            if line and not line.startswith( '#' ):
+                fields = line.split( '\t' )
+                try:
+                    chr, start, end = fields[chrCol], int( fields[startCol] ), int( fields[endCol] )
+                except:
+                    skipped_lines += 1
+                    if first_invalid_line is None:
+                        first_invalid_line = line_count
+                        invalid_line = line
+                    continue
+                label = "input_line_" + str( line_count ) # if the input were known to be BED, its name column could be used as the label
+
+                if strandCol < 0:
+                    strand = "+"
+                else:
+                    try:
+                        strand = fields[strandCol]
+                    except:
+                        strand = "+"
+
+                # Find which partition it overlaps
+                overlap = 0
+                for name, score, bb in partitions:
+                    # Is there at least 1bp overlap?
+                    if chr in bb:
+                        overlap = bb[chr].count_range( start, end - start )
+                        if overlap > 0:
+                            break
+                else:
+                    # No overlap with any partition. Since the partitions tile the
+                    # ENCODE regions completely, this indicates an interval that does
+                    # not even overlap an ENCODE region.
+                    warning = "warning: Interval (%s, %d, %d) does not overlap any partition, line[%d]. " % ( chr, start, end, line_count )
+                    warnings.append( warning )
+                    name = "no_overlap"
+                    score = 0
+                # Annotate with the name of the partition
+                frac_overlap = overlap / ( end - start )
+                # Output BED6-style columns plus the partition name and fraction of overlap
+                print >>out_file, "%s\t%d\t%d\t%s\t%s\t%s\t%s\t%0.4f" % ( chr, start, end, label, score, strand, name, frac_overlap )
+    except:
+        out_file.close()
+        in_file.close()
+        stop_err( "Unknown error while processing line # %d: %s" % ( line_count, line ) )
+    out_file.close()
+    in_file.close()

+    if warnings:
+        warn_msg = "This tool is useful on ENCODE regions only, %d warnings, 1st is: " % len( warnings )
+        warn_msg += warnings[0]
+        print warn_msg
+    if skipped_lines:
+        print "Skipped %d invalid lines starting at line # %d: %s" % ( skipped_lines, first_invalid_line, invalid_line )
+
+if __name__ == "__main__": main()
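
For orientation: the script annotates each input interval with the first partition that overlaps it by at least one base, plus the fraction of the interval covered by that partition (computed with bx-python bitsets via count_range). The sketch below reproduces only that overlap-fraction logic in plain Python; it does not use Galaxy, bx-python, or the ENCODE partition files, and the partition table and test interval are made-up illustration data, not part of the tool.

# Minimal sketch of the overlap/fraction annotation (illustrative data only;
# the real tool builds per-chromosome bitsets with binned_bitsets_from_file
# and counts covered bases with count_range).
partitions = [
    ( "exon",   "1", { "chr1": [ ( 100, 200 ), ( 500, 650 ) ] } ),
    ( "intron", "2", { "chr1": [ ( 200, 500 ) ] } ),
]

def annotate( chrom, start, end ):
    # Take the first partition with at least 1 bp of overlap, as the script does.
    for name, score, intervals in partitions:
        overlap = sum( max( 0, min( end, e ) - max( start, s ) )
                       for s, e in intervals.get( chrom, [] ) )
        if overlap > 0:
            return name, score, overlap / float( end - start )
    return "no_overlap", 0, 0.0

print annotate( "chr1", 150, 250 )   # -> ('exon', '1', 0.5): 50 of 100 bases covered

In the real script the command line maps positionally: argv[1] is the Galaxy data index directory containing encode_feature_partitions/, argv[2] and argv[3] are the input and output tabular files, and argv[4] through argv[7] are 1-based chrom, start, end, and (optional) strand column numbers.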